Research in R for a project looking into the correlation between pollution (levels of PM2.5) and race (levels of diversity) in Los Angeles. This notebook contains code used to calculate spatial and statistical measures and to plot diagrams and figures for the resulting paper 'The effect of exposure to particulate matter on the racial diveristy of populations across Los Angeles'.
install.packages(c('rgeos','rgdal','RColorBrewer','GISTools','raster','gstat','nlme','sp','geosphere','moments','plyr'))
options(warn=-1)
library(rgeos)
library(rgdal)
library(RColorBrewer)
library(GISTools)
library(raster)
library(gstat)
library(nlme)
library(sp)
require(geosphere)
library(spdep)
library(spgwr)
library(GWmodel)
library(moments)
library(plotrix)
library(spatstat)
library(maptools)
library(plyr)
Data has been collected from the California Environmental Protection Agency - these measures of particulate matter were taken from a total of 37 monitoring stations in the South Coast Air Basin, providing an hourly average each day throughout the month of April 2016.
#read pollution data
ozone = read.csv("data/OZONE_PICKDATA_2016-4-30.csv", header=T, sep=",")
partmat = read.csv("data/PM25HR_PICKDATA_2016-4-30.csv", header=T, sep=",")
#read shapefiles of EPA monitoring stations and the air basins
monitor <- readOGR(dsn="data/airmonitoringstations.shp", layer="airmonitoringstations")
Ca.AirBasin <- readOGR(dsn="data/CaAirBasin.shp",layer="CaAirBasin")
#extract points within the South coast
SC.monitor = monitor[monitor$AIRBASIN %in% c("South Coast"),]
SC.AirBasin = Ca.AirBasin[Ca.AirBasin$NAME %in% c("South Coast"),]
#reproject the data to a suitable projection (utm used here because of the scale of the analysis)
SC.monitor.t = spTransform(SC.monitor, CRS("+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 + towgs84=0,0,0"))
SC.AirBasin.t = spTransform(SC.AirBasin, CRS("+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"))
The census data was sourced from the American Community Survey. These frequencies of different races in census tracts throughout Los Angeles County are 5-year estimates based on Census 2010 data, with a 90% margin of error. A new measure of diversity is calculated which is a percentage measure of those in the population who are not in the majority race.
#read california census tracts
all.tracts = shapefile("data/cb_2015_06_tract_500k.shp")
#reproject the data to match the projection of the pollution datasets (utm)
all.tracts.t = spTransform(all.tracts, CRS("+proj=longlat +datum=NAD83 +no_defs + ellps=GRS80 +towgs84=0,0,0"))
#extract census tracts that intersect with the sc air basin
SC.tracts.t <- intersect(all.tracts.t, SC.AirBasin.t)
#read in the census data and merge with census tracts
census.data = read.csv("data/ACS_14_5YR_B02001_with_ann.csv") #race
census2.data = read.csv("data/ACS_14_5YR_B19013_with_ann.csv") #income
#race data set up
race.data = census.data[,c("GEO.id", "HD01_VD01", "HD01_VD02","HD01_VD03", "HD01_VD04","HD01_VD05","HD01_VD06")]
colnames(race.data)[1:7] = c("AFFGEOID","Total","White","Black","AIndian","Asian","Hawaiian")
#delete unneccessary info
race.data <- race.data[c(-1),]
#function for getting diversity measure
getdiversity <- function(rowval) {
n = rowval
tempmax = max(as.numeric(as.character(race.data$White[n])),as.numeric(as.character(race.data$Black[n])),as.numeric(as.character(race.data$AIndian[n])),as.numeric(as.character(race.data$Asian[n])),as.numeric(as.character(race.data$Hawaiian[n])))
diversity = tempmax / as.numeric(as.character(race.data$Total[n]))
return(diversity)
}
#add new measure onto dataset
Diversity <- matrix(, nrow=nrow(race.data))
for (i in 1:nrow(race.data)) {
Diversity[i] = getdiversity(i)
}
race.data = cbind(race.data, Diversity)
#income data set up
income.data = census2.data[,c("GEO.id", "HD01_VD01")]
colnames(income.data)[1:2] = c("AFFGEOID","Income")
#change the name of GEO.id so it can be merged with shapefile
colnames(census.data)[1] = "AFFGEOID"
#merge the sc census tracts with the census data
SC.tracts.t = sp::merge(SC.tracts.t, race.data, by='AFFGEOID')
SC.tracts.t = sp::merge(SC.tracts.t, income.data, by='AFFGEOID')
#remove missing values
SC.tracts.t <- SC.tracts.t[complete.cases(SC.tracts.t$Diversity),]
First let's check out where our South Coast air basins are in LA, and we can project them on top of a choropleth map of our new diversity measure too to see if that is telling us anything interesting.
#map the south coast air basin with census tracts and monitoring station locations
par(mfrow=c(1,1))
plot(SC.tracts.t, border="lightgrey", lwd=0.3, col=SC.tracts.t$Diversity)
plot(SC.AirBasin.t, add=T)
plot(SC.monitor.t, add=T, col="red", cex=0.5, pch=16)
#choropleth map of diversity values
shades = auto.shading(as.numeric(SC.tracts.t$Diversity),cols=brewer.pal(9,"Blues"))
choropleth(SC.tracts.t,SC.tracts.t$Diversity,shading=shades, border=NA)
plot(SC.monitor.t, add=T, col="red", cex=0.5, pch=16)
plot(SC.AirBasin.t, add=T, border="lightgrey")
We can calculate some average ozone recordings for these sites
#calculate mean and max ozone level for each site for all readings
mean.partmat <- aggregate(value ~ site, partmat, mean)
max.partmat <- aggregate(value ~ site, partmat, max)
#join the mean and max ozone values to their respective monitoring stations
#and rename the first col of monitoring data to site for merging purposes
names(SC.monitor.t)[1] <- "site"
#merge SC.monitor with ozone mean and max data
mrg.tab.mean <- sp::merge(SC.monitor.t, mean.partmat, by="site", all.x=FALSE)
mrg.tab.max <- sp::merge(SC.monitor.t, max.partmat, by="site", all.x=FALSE)
#create a max and a mean spatialPointDataFrame from data
partmat.mean.spdf = na.omit(mrg.tab.mean)
partmat.max.spdf = na.omit(mrg.tab.max)
#add the merged moniotring stations to map in blue (there are less stations now as a result of merge)
par(mfrow=c(1,1))
plot(SC.tracts.t, border="lightgrey", col=SC.tracts.t$Diversity)
plot(SC.AirBasin.t, add=T)
plot(SC.monitor.t, add=T, col="red", cex=0.5, pch=16)
plot(partmat.mean.spdf,add=T,col="blue")
head(SC.tracts.t)
DV = as.numeric(SC.tracts.t$White)
INDV = as.numeric(SC.tracts.t$Black)
#plot(DV,INDV)
cor.test(DV,INDV)
#first, plot data and fit linear line through it based on least squares
#lmfit = lm(DV~INDV)
#abline(lmfit)
#conduct regression analysis
linear.model = lm(DV~INDV)
summary(linear.model)
resids = residuals(linear.model)
#prepare data for plotting residuals from the regression analysis
#first convert census tracts to a data frame object (need to not use spdf for now)
tracts.df = as.data.frame(SC.tracts.t)
#combine census data with residuals from census data
tracts.df = cbind(tracts.df[1],resids)
#change col header name
colnames(tracts.df)[2] = "residuals"
#merge the residuals data set with the census data set
SC.tracts.t = merge(SC.tracts.t, tracts.df, by='AFFGEOID')
#set min and max lat/long for mapping residuals
minlat = min(SC.monitor$LATITUDE - 0.1)
maxlat = max(SC.monitor$LATITUDE + 0.1)
minlong = min(SC.monitor$LONGITUDE - 0.1)
maxlong = max(SC.monitor$LONGITUDE + 0.1)
#create a colouring scheme for choropleth map
shades = auto.shading(as.numeric(SC.tracts.t$residuals),cols=brewer.pal(9,"Greens"))
#create a choropleth map
choropleth(SC.tracts.t, SC.tracts.t$residuals, border=NA, shading=shades, xlim=c(minlong,maxlong), ylim=c(minlat,maxlat))