Data Science for Bryn Mawr using R

Author

Tae Hyun Lim

This webpage provides the materials for a presentation that was given during a campus visit to Bryn Mawr College on April 12, 2024.

You can download the slides used in the presentation here:

Please feel free to contact me at talim@syr.edu if you have any questions.

General Information

Remember

  • to use RScript or Quarto/RMarkdown
  • to back-up, back-up, and then, back-up
  • CRAN, Google, and ChatGPT (Gemini, Claude, CoPilot…) are your friend–when you use it right!
  • to check data structure and type when error

You can download and install R from here and RStudio and R from here.

You can download the Quarto file and the data sets used in the presentation here:

School Districts Data

setwd(path)
# Load the district_info dataset
district_info <- read_excel("district_info.xlsx")
Error in read_excel("district_info.xlsx"): could not find function "read_excel"
# Attach packages
#install.packages('tidyverse')
#install.packages('readxl')
library(tidyverse)
-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr     1.1.2     v readr     2.1.4
v forcats   1.0.0     v stringr   1.5.0
v ggplot2   3.4.2     v tibble    3.2.1
v lubridate 1.9.2     v tidyr     1.3.0
v purrr     1.0.1     
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Load the district_info dataset
district_info <- read_excel("district_info.xlsx")
# Display top five rows
head(district_info)
# A tibble: 6 x 34
  county  school_district     poverty_rate_children rural_urban population miles
  <chr>   <chr>                               <dbl> <chr>            <dbl> <dbl>
1 Fayette Albert Gallatin Ar~                0.204  Rural            22622 140. 
2 Fayette Brownsville Area SD                0.382  Rural            13741  55.2
3 Fayette Connellsville Area~                0.173  Rural            32001 216. 
4 Fayette Frazier SD                         0.0622 Rural             7633  58.3
5 Fayette Laurel Highlands SD                0.215  Urban            22718  55.5
6 Fayette Uniontown Area SD                  0.275  Rural            22482 247. 
# i 28 more variables: population_density <dbl>, male_perc <dbl>,
#   female_perc <dbl>, under_18 <dbl>, eighteen_thirtyfour <dbl>,
#   thirtyfive_sixtyfour <dbl>, sixtyfive <dbl>, median_age <dbl>,
#   povery_rate_total <dbl>, poverty_rate_working_adults <dbl>,
#   poverty_rate_senior <dbl>, income_55 <dbl>, income_99 <dbl>,
#   income_149 <dbl>, income_150 <dbl>, median_household_income <dbl>,
#   mean_household_income <dbl>, median_family_income <dbl>, ...
# Display variable names
colnames(district_info)
 [1] "county"                        "school_district"              
 [3] "poverty_rate_children"         "rural_urban"                  
 [5] "population"                    "miles"                        
 [7] "population_density"            "male_perc"                    
 [9] "female_perc"                   "under_18"                     
[11] "eighteen_thirtyfour"           "thirtyfive_sixtyfour"         
[13] "sixtyfive"                     "median_age"                   
[15] "povery_rate_total"             "poverty_rate_working_adults"  
[17] "poverty_rate_senior"           "income_55"                    
[19] "income_99"                     "income_149"                   
[21] "income_150"                    "median_household_income"      
[23] "mean_household_income"         "median_family_income"         
[25] "mean_family_income"            "per_capita_income"            
[27] "graduation_rate"               "total_revenue"                
[29] "revenues_local"                "revenues_state"               
[31] "revenues_federal"              "revenues_other"               
[33] "total_expenditure"             "total_expenditure_per_student"
head(district_info$poverty_rate_children)
[1] 0.20370006 0.38150989 0.17266436 0.06222865 0.21487603 0.27539561
is.numeric(district_info$poverty_rate_children)
[1] TRUE
# sorting in descending order
district_info %>%
  arrange(desc(poverty_rate_children))
# A tibble: 500 x 34
   county     school_district poverty_rate_children rural_urban population miles
   <chr>      <chr>                           <dbl> <chr>            <dbl> <dbl>
 1 Mercer     Farrell Area SD                 0.637 Urban             4843  3.19
 2 Beaver     Aliquippa SD                    0.622 Urban             9238  4.20
 3 Cambria    Greater Johnst~                 0.529 Urban            24609 28.2 
 4 Schuylkill Shenandoah Val~                 0.516 Urban             7028 11.9 
 5 Lackawanna Carbondale Are~                 0.479 Urban            11166 18.5 
 6 Allegheny  Sto-Rox SD                      0.470 Urban            12346  3.04
 7 Allegheny  Duquesne City ~                 0.465 Urban             5254  1.82
 8 Dauphin    Harrisburg Cit~                 0.437 Urban            50099  8.12
 9 Allegheny  Wilkinsburg Bo~                 0.426 Urban            14349  2.25
10 Delaware   Chester-Upland~                 0.408 Urban            39753  6.91
# i 490 more rows
# i 28 more variables: population_density <dbl>, male_perc <dbl>,
#   female_perc <dbl>, under_18 <dbl>, eighteen_thirtyfour <dbl>,
#   thirtyfive_sixtyfour <dbl>, sixtyfive <dbl>, median_age <dbl>,
#   povery_rate_total <dbl>, poverty_rate_working_adults <dbl>,
#   poverty_rate_senior <dbl>, income_55 <dbl>, income_99 <dbl>,
#   income_149 <dbl>, income_150 <dbl>, median_household_income <dbl>, ...
# top 1
district_info %>%
  slice_max(poverty_rate_children)
# A tibble: 1 x 34
  county school_district poverty_rate_children rural_urban population miles
  <chr>  <chr>                           <dbl> <chr>            <dbl> <dbl>
1 Mercer Farrell Area SD                 0.637 Urban             4843  3.19
# i 28 more variables: population_density <dbl>, male_perc <dbl>,
#   female_perc <dbl>, under_18 <dbl>, eighteen_thirtyfour <dbl>,
#   thirtyfive_sixtyfour <dbl>, sixtyfive <dbl>, median_age <dbl>,
#   povery_rate_total <dbl>, poverty_rate_working_adults <dbl>,
#   poverty_rate_senior <dbl>, income_55 <dbl>, income_99 <dbl>,
#   income_149 <dbl>, income_150 <dbl>, median_household_income <dbl>,
#   mean_household_income <dbl>, median_family_income <dbl>, ...

Interactive Graph of Top 15

# Attach packages
#install.packages('ggplot2')
library(ggplot2)
head(district_info$poverty_rate_children)
[1] 0.20370006 0.38150989 0.17266436 0.06222865 0.21487603 0.27539561
# Create ggplot object
p <- ggplot(district_info, aes(x = school_district, y = poverty_rate_children)) + 
  geom_bar(stat = "identity", aes(fill = poverty_rate_children))
p

# Select top 15 stations and reorder factor levels in descending order
top_districts <- district_info %>% 
  top_n(15, poverty_rate_children) %>% 
  mutate(school_district = fct_reorder(school_district, desc(poverty_rate_children)))
# Create ggplot object
p <- ggplot(top_districts, aes(x = school_district, y = poverty_rate_children)) +
  geom_bar(stat = "identity", aes(fill = poverty_rate_children)) 
p

# Create ggplot object with custom color scheme
p <- ggplot(top_districts, aes(x = school_district, y = poverty_rate_children, text = paste(school_district, "<br>Poverty Rate: ", poverty_rate_children))) + # text is included for the interactive plot
  geom_bar(stat = "identity", aes(fill = poverty_rate_children)) +
  scale_fill_gradient(low = "#FFF0BE", high = "#03335F") + # change color scheme
  labs(title = "Top 15 School Districts with the Highest Child Poverty Rate in 2021",
       x = "School Districts",
       y = "Child Poverty Rate",
       fill = "Child\nPoverty\nRate") + # change labels
  theme_bw() + # change themes
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) + # change x axis label's orientation
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) # Format y as percentage
p

#install.packages('plotly')
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# Convert ggplot to interactive plot
ggplotly(p)
# modify ggplotly object
ggplotly(p + theme(legend.position = "none"), # delete legend
         tooltip = "text") %>% # set text template for the text balloon when mouse hovers
  layout(hoverlabel = list(bgcolor = c(low = "#FFF0BE", high = "#03335F"))) # set color template

Interactive Map

# Attach packages
#install.packages('sf')
#install.packages('tmap')
#install.packages('leaflet')
library(sf)
library(tmap)
library(leaflet)
# Read in the shapefile for the Pennsylvania School Districts
ad_shapefile <- st_read("PaSchoolDistricts2024_03.geojson") %>% 
  rename(school_district = SCHOOL_DIS) %>% 
  mutate(school_district = case_when(
    school_district == "Dubois Area SD" ~ "DuBois Area SD",
    TRUE ~ school_district # Default case to leave values unchanged
  ))
Reading layer `PaSchoolDistricts2024_03' from data source 
  `C:\Users\limth\Dropbox\BMCV\PaSchoolDistricts2024_03.geojson' 
  using driver `GeoJSON'
Simple feature collection with 500 features and 16 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -80.51939 ymin: 39.71986 xmax: -74.68956 ymax: 42.26935
Geodetic CRS:  WGS 84
district_info <- district_info %>% 
  mutate(school_district = case_when(
    school_district == "Allegheny-Clarion Valley SD" ~ "Allegheny-Clarion Valley SD",
    school_district == "Apollo Ridge SD" ~ "Apollo-Ridge SD",
    school_district == "Center Valley SD" ~ "Central Valley SD",
    school_district == "East Pennsboro SD" ~ "East Pennsboro Area SD",
    school_district == "Eastern Lancaster Co SD" ~ "Eastern Lancaster County SD",
    school_district == "Hollidaysburg SD" ~ "Hollidaysburg Area SD",
    school_district == "Jeanette City SD" ~ "Jeannette City SD",
    school_district == "Lake Lehman SD" ~ "Lake-Lehman SD",
    school_district == "Laurel Area SD" ~ "Laurel SD",
    school_district == "Lower Moreland Twp SD" ~ "Lower Moreland Township SD",
    school_district == "Meyers Dale Area SD" ~ "Meyersdale Area SD",
    school_district == "Northern Bedford County SD" ~ "Northern Bedford County SD",
    school_district == "Philipsburg-Osceola Area SD" ~ "Philipsburg-Osceola Area SD",
    school_district == "Pine Grove SD" ~ "Pine Grove Area SD",
    school_district == "Riverside Beaver County SD" ~ "Riverside Beaver County SD",
    school_district == "Scranton City SD" ~ "Scranton SD",
    school_district == "Shanksville-Stony Creek SD" ~ "Shanksville-Stonycreek SD",
    school_district == "South Fayette Twp SD" ~ "South Fayette Township SD",
    school_district == "South Williamsport SD" ~ "South Williamsport Area SD",
    school_district == "Southern Huntingdon Co SD" ~ "Southern Huntingdon County SD",
    school_district == "Unionville Chadds-Ford SD" ~ "Unionville-Chadds Ford SD",
    school_district == "Upper Dauphin SD" ~ "Upper Dauphin Area SD",
    school_district == "Upper Moreland Twp SD" ~ "Upper Moreland Township SD",
    school_district == "Blairsville-Saltsburg SD" ~ "River Valley SD",
    school_district == "South Butler County SD" ~ "Knoch SD",
    TRUE ~ school_district # Default case to leave values unchanged
  ))
district_info_sf <- full_join(ad_shapefile, district_info, by = "school_district")
district_info_sf <- st_as_sf(district_info_sf, wkt = "geometry", crs = 4326)
# Define the color palette
pal <- leaflet::colorNumeric(palette = c("#FFF0BE", "#03335F"), domain = district_info_sf$poverty_rate_children)

# Create the leaflet map
leaflet <- leaflet() %>%
  addTiles() %>% 
  setView(lng = -75.2, lat = 40, zoom = 11) %>% 
  addProviderTiles("CartoDB.Positron") %>%
  addPolygons(data = district_info_sf, 
              fillColor = ~ifelse(is.na(poverty_rate_children), "#FFFFFF", pal(poverty_rate_children)), 
              weight = 0.3,
              color = ~ifelse(is.na(poverty_rate_children), "#FFFFFF", pal(poverty_rate_children)),  
              opacity = 1.0,
              fillOpacity = 0.5, 
              label = ~paste("District: ", school_district, ", ",
                             "Child Poverty Rate: ", format(round(poverty_rate_children, 4), nsmall = 2)),
              highlightOptions = leaflet::highlightOptions(
                weight = 1.5,
                color = "white",
                fillOpacity = 0,
                bringToFront = TRUE)) %>%
  addLegend("bottomleft", 
            pal = pal, 
            values = district_info_sf$poverty_rate_children,
            title = "Child Poverty Rate", 
            opacity = 0.8) %>%
  addScaleBar(position = "topright")

Child Poverty Rate by School Districts

leaflet

Linear Regression

# Check data type
is.numeric(district_info$graduation_rate)
[1] FALSE
is.numeric(district_info$poverty_rate_children)
[1] TRUE
# Check data type
district_info$graduation_rate <- as.numeric(district_info$graduation_rate)
Warning: NAs introduced by coercion
is.numeric(district_info$graduation_rate)
[1] TRUE
# Check data type
head(district_info$graduation_rate)
[1] 0.8707224 0.7241379 0.7651515 0.9494949 0.9196429 0.7243243
district_info <- district_info %>% 
  mutate(
    graduation_rate = round(graduation_rate * 100, 2),
    poverty_rate_children = round(poverty_rate_children * 100, 2))
head(district_info$graduation_rate)
[1] 87.07 72.41 76.52 94.95 91.96 72.43
# Linear regression
model <- lm(graduation_rate ~ poverty_rate_children, data = district_info)
summary(model)

Call:
lm(formula = graduation_rate ~ poverty_rate_children, data = district_info)

Residuals:
     Min       1Q   Median       3Q      Max 
-23.7120  -2.4321   0.7762   3.2968  24.3069 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)           95.47094    0.44796  213.12   <2e-16 ***
poverty_rate_children -0.34477    0.02518  -13.69   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.502 on 493 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.2755,    Adjusted R-squared:  0.274 
F-statistic: 187.5 on 1 and 493 DF,  p-value: < 2.2e-16

Interactive Graph of the Regression Results

# Attach packages
#install.packages('ggplot2')
#install.packages('plotly')
library(ggplot2)
library(plotly)
# Create ggplot object
ggp <- ggplot(district_info, aes(x = poverty_rate_children, y = graduation_rate)) +
  geom_point() +  # Add points
  theme_minimal() +  # Use a minimal theme
  labs(title = "Child Poverty Rate vs. Graduation Rate",
       x = "Child Poverty Rate (%)",
       y = "Graduation Rate (%)") +
  geom_smooth(method = "lm", se = TRUE, color = "#FFF0BE")  # Add a linear model fit line without confidence interval
ggp
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 5 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 5 rows containing missing values (`geom_point()`).

# Convert to plotly object for interactivity
ggplotly(ggp)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 5 rows containing non-finite values (`stat_smooth()`).

Helpful Resources

Here are some helpful resources learning R: