This notebook reads in and processes the following U.S. Census Bureau county-level datasets on population estimates by age, sex and race/ethnicity:
For the 2010-18 period (all counties downloaded)
For the 2000-10 period (only Maryland counties downloaded)
For the 1990-99 period (all counties downloaded)
Notes:
The race/ethnicity categories for the 1990-2000 period are not directly comparable to data from post-2000 period in that the individuals were not presented with the option to self-identify with more than one race until the 2000 Census. Also during the 1990-99 period, "Asian or Pacific Islander" was a race category, while 2000 onwards breaks out "Asian" and "Native Hawaiian or Other Pacific Islander" separately.
In the 1990-90 and 2000-09 periods, the youngest age group is <1 years and the second-youngest age group is 1-4 years. In the 2010-18 period, the youngest age group is 0-4 years. This notebook standardizes the age groups such that the youngest category is 0-4 years.
The raw datasets are saved in the input/
folder. The processed datasets are saved in the output/
folder.
suppressMessages(library('tidyverse'))
suppressMessages(library('reshape2'))
Functions to recode year, age and race variables
recodeyear.18 <- function (df) {
df <- df %>% mutate(year_name = case_when(YEAR == 3 ~ 2010,
YEAR == 4 ~ 2011,
YEAR == 5 ~ 2012,
YEAR == 6 ~ 2013,
YEAR == 7 ~ 2014,
YEAR == 8 ~ 2015,
YEAR == 9 ~ 2016,
YEAR == 10 ~ 2017,
YEAR == 11 ~ 2018))
return(df)
}
recodeyear.00 <- function (df) {
df <- df %>% mutate(year_name = case_when(YEAR == 2 ~ 2000,
YEAR == 3 ~ 2001,
YEAR == 4 ~ 2002,
YEAR == 5 ~ 2003,
YEAR == 6 ~ 2004,
YEAR == 7 ~ 2005,
YEAR == 8 ~ 2006,
YEAR == 9 ~ 2007,
YEAR == 10 ~ 2008,
YEAR == 11 ~ 2009))
return(df)
}
recodeage.18 <- function (df) {
df <- df %>% mutate(age_name = case_when(AGEGRP == 1 ~ 'age_00-04',
AGEGRP == 2 ~ 'age_05-09',
AGEGRP == 3 ~ 'age_10-14',
AGEGRP == 4 ~ 'age_15-19',
AGEGRP == 5 ~ 'age_20-24',
AGEGRP == 6 ~ 'age_25-29',
AGEGRP == 7 ~ 'age_30-34',
AGEGRP == 8 ~ 'age_35-39',
AGEGRP == 9 ~ 'age_40-44',
AGEGRP == 10 ~ 'age_45-49',
AGEGRP == 11 ~ 'age_50-54',
AGEGRP == 12 ~ 'age_55-59',
AGEGRP == 13 ~ 'age_60-64',
AGEGRP == 14 ~ 'age_65-69',
AGEGRP == 15 ~ 'age_70-74',
AGEGRP == 16 ~ 'age_75-79',
AGEGRP == 17 ~ 'age_80-84',
AGEGRP == 18 ~ 'age_85+',
AGEGRP == 0 ~ 'total'))
return(df)
}
recodeage.90.00 <- function (df) {
df <- df %>% mutate(age_name = case_when(AGEGRP == 0 ~ 'age_00-04',
AGEGRP == 1 ~ 'age_00-04',
AGEGRP == 2 ~ 'age_05-09',
AGEGRP == 3 ~ 'age_10-14',
AGEGRP == 4 ~ 'age_15-19',
AGEGRP == 5 ~ 'age_20-24',
AGEGRP == 6 ~ 'age_25-29',
AGEGRP == 7 ~ 'age_30-34',
AGEGRP == 8 ~ 'age_35-39',
AGEGRP == 9 ~ 'age_40-44',
AGEGRP == 10 ~ 'age_45-49',
AGEGRP == 11 ~ 'age_50-54',
AGEGRP == 12 ~ 'age_55-59',
AGEGRP == 13 ~ 'age_60-64',
AGEGRP == 14 ~ 'age_65-69',
AGEGRP == 15 ~ 'age_70-74',
AGEGRP == 16 ~ 'age_75-79',
AGEGRP == 17 ~ 'age_80-84',
AGEGRP == 18 ~ 'age_85+',
AGEGRP == 99 ~ 'total'))
return(df)
}
recoderace <- function (df) {
df <- df %>% mutate(race_name = case_when(race == 'H' ~ 'Hispanic',
race == 'NHAA' ~ 'Asian',
race == 'NHBA' ~ 'Black',
race == 'NHIA' ~ 'American Indian and Alaska Native',
race == 'NHNA' ~ 'Native Hawaiian and Other Pacific Islander',
race == 'NHTOM' ~ 'Multiple',
race == 'NHWA' ~ 'White',
race == 'TOT' ~ 'Total'))
return(df)
}
recoderace.90 <- function (df) {
df <- df %>% mutate(race_sex = case_when(age_sex == 1 ~ 'White_male',
age_sex == 2 ~ 'White_female',
age_sex == 3 ~ 'Black_male',
age_sex == 4 ~ 'Black_female',
age_sex == 5 ~ 'AmericanIndianAlaskanNative_male',
age_sex == 6 ~ 'AmericanIndianAlaskanNative_female',
age_sex == 7 ~ 'AsianPac_male',
age_sex == 8 ~ 'AsianPac_female'),
hisp = case_when(ethnic == 1 ~ 'not_Hispanic',
ethnic == 2 ~ 'Hispanic'))
return(df)
}
2010-18 period
data.18 <- suppressMessages(read_csv('input/CC-EST2018-ALLDATA.csv'))
Select a subset of the data and include just the July population estimates (YEAR > 2, per the file layout documentation). Race/ethnicity categories selected such that the sum of the categories will be the total population.
data.18.subset <- data.18 %>% select(STATE,
STNAME,
COUNTY,
CTYNAME,
AGEGRP,
YEAR,
TOT_POP,
TOT_MALE,
TOT_FEMALE,
NHWA_MALE,
NHWA_FEMALE,
NHBA_MALE,
NHBA_FEMALE,
NHIA_MALE,
NHIA_FEMALE,
NHAA_MALE,
NHAA_FEMALE,
NHNA_MALE,
NHNA_FEMALE,
NHTOM_MALE,
NHTOM_FEMALE,
H_MALE,
H_FEMALE) %>% filter(YEAR > 2)
Recode year and age
data.18.subset <- recodeyear.18(data.18.subset)
data.18.subset <- recodeage(data.18.subset)
Reshape data to long and create a separate column for sex
data.18.subset.long <- melt(data.18.subset, id.vars = c('STATE',
'STNAME',
'COUNTY',
'CTYNAME',
'age_name',
'AGEGRP',
'YEAR',
'year_name'))
data.18.subset.long <- data.18.subset.long %>% separate(variable, into = c('race',
'sex')) %>%
mutate(sex = ifelse(sex == 'POP', 'both', sex),
race = ifelse(race == 'TOT', 'all', race))
Recode race
data.18.subset.long <- recoderace(data.18.subset.long)
Save to CSV
write_csv(data.18.subset.long,
'output/data_2018.csv')
2000-09 period
md.data.00 <- suppressMessages(read_csv('co-est00int-alldata-24.csv'))
Select a subset of the data and include just the July population estimates over the period (YEAR > 1 and YEAR < 12, per the file layout documentation). Race/ethnicity categories selected such that the sum of the categories will be the total population.
md.data.00.subset <- md.data.00 %>% select(COUNTY,
CTYNAME,
AGEGRP,
YEAR,
TOT_POP,
TOT_MALE,
TOT_FEMALE,
NHWA_MALE,
NHWA_FEMALE,
NHBA_MALE,
NHBA_FEMALE,
NHIA_MALE,
NHIA_FEMALE,
NHAA_MALE,
NHAA_FEMALE,
NHNA_MALE,
NHNA_FEMALE,
NHTOM_MALE,
NHTOM_FEMALE,
H_MALE,
H_FEMALE) %>%
filter(YEAR > 1 & YEAR < 12)
Recode year and age
md.data.00.subset <- recodeyear.00(md.data.00.subset)
md.data.00.subset <- recodeage.90.00(md.data.00.subset)
Reshape data to long and create a separate column for sex
md.data.00.subset.long <- melt(md.data.00.subset, id.vars = c('COUNTY',
'CTYNAME',
'age_name',
'AGEGRP',
'YEAR',
'year_name'))
md.data.00.subset.long <- md.data.00.subset.long %>% separate(variable, into = c('race',
'sex')) %>%
mutate(sex = ifelse(sex == 'POP', 'both', sex),
race = ifelse(race == 'TOT', 'all', race))
Group by age categories and sum to get the totals for the 0-4 age category
md.data.00.subset.long.grouped <- md.data.00.subset.long %>%
group_by(year_name,
COUNTY,
CTYNAME,
age_name,
race,
sex,
YEAR) %>%
summarise(value = sum(value))
Recode race
md.data.00.subset.long.grouped <- recoderace(md.data.00.subset.long.grouped)
Save to CSV
write_csv(md.data.00.subset.long.grouped,
'output/data_md_2000.csv')
1990-99 period
data.90 <- suppressMessages(read_csv('input/stch-icen1990.csv'))
Recode race and age
data.90 <- recodeage.90.00(data.90)
data.90 <- recoderace.90(data.90)
Create a separate column for sex
data.90 <- data.90 %>% separate(race_sex, into = c('race', 'sex'))
Group by age categories and sum to get the totals for the 0-4 age category
data.1990.grouped <- data.90 %>%
group_by(year, fips, age_name, race, sex, hisp) %>%
summarise(pop = sum(pop)) %>% ungroup() %>%
mutate(year = paste0('19',year))
write_csv(data.1990.grouped, 'output/data_1990.csv')
head(data.1990.grouped)
year | fips | age_name | race | sex | hisp | pop |
---|---|---|---|---|---|---|
1990 | 1001 | age_00-04 | AmericanIndianAlaskanNative | female | Hispanic | 0 |
1990 | 1001 | age_00-04 | AmericanIndianAlaskanNative | female | not_Hispanic | 4 |
1990 | 1001 | age_00-04 | AmericanIndianAlaskanNative | male | Hispanic | 0 |
1990 | 1001 | age_00-04 | AmericanIndianAlaskanNative | male | not_Hispanic | 0 |
1990 | 1001 | age_00-04 | AsianPac | female | Hispanic | 0 |
1990 | 1001 | age_00-04 | AsianPac | female | not_Hispanic | 4 |