#!/usr/bin/env python # coding: utf-8 # # Population Synthesis # # Notes: # - This notebook is best viewed [on nbviewer](http://nbviewer.jupyter.org/gist/oztalha/674b9cbd7b56254cde33593801710bfd/Population%20Synthesis.ipynb). # - This work is part of a [project](https://socialcomplexity.gmu.edu/projectsdtra/) conducted in the Center for Social Complexity, and is funded by DTRA. # # To simulate ["What Happens If a Nuclear Bomb Goes Off in Manhattan?"](https://www.theatlantic.com/technology/archive/2017/03/nuclear-bomb-manhattan-simulation/519585/) # we first need to create the population (~20 million) living in the area. This notebook shows how we generate our synthetic population on a sample of two counties in New York (Sullivan and Ulster): # # - Create the road topology # - Clean the shapefiles to get a giant connected road network # - Create living spaces # - Houses, work places, and schools # - Create individuals # - Age and sex # - Create relationships # - Households: Families (Husband-wife family, etc.), non-families (Householder living alone, etc.) # - Group quarters (Institutionalized and noninstitutionalized) # - School mate (pre-k, elementary, middle, high school) # - Work colleague (inter-tract and inter-county relations) # # Every operation other than the work assignment happens at census-tract level. # The work place assignment takes place at county level. 
# ## Census Data # # We use three datasets from US Census to synthesize our population: # - Road Network # - Road data from 2010 Census TIGER shapefiles (primary, secondary, and residential roads) # - Demographics # - Census-tract level Demographic Profile (DP) TIGER [shapefiles](https://www.census.gov/geo/maps-data/data/tiger-data.html) (particular [shapefile ZIP](http://www2.census.gov/geo/tiger/TIGER2010DP1/Tract_2010Census_DP1.zip)) # - Commute flows # - Census [Commuting (Journey to Work) Main](https://www.census.gov/hhes/commuting/) # page has a table of [County to County Commuting Flows](https://www.census.gov/hhes/commuting/files/2013/Table%201%20County%20to%20County%20Commuting%20Flows-%20ACS%202009-2013.xlsx) # ## Preprocessing (Cleaning) # # ### Creating Road Network # # We need to get the road shapefiles topologically connected to be able to simulate the agents moving correctly. # The road shapefiles most of the time are not perfect. Roads are not touching each other: either falling short or long when we ideally expect the endpoint of a road to be a vertex on another road. More interestingly, two LineStrings intersecting each other (or having a common internal vertex) spatially does not mean that they are topologically connected (at least according to QGIS and NetworkX). To illustrate (first look at the graphs, then read the printed output, skip the code/input cell), the two (horizontal and vertical) LineStrings below intersects each other according to GIS but they (G1 and G2) are not connected according to NetworkX because it creates the edges (from a shapefile) by looking at the endpoints of the lines. # # Solution to this problem comes after the illustration below (just keep reading!) 
# In[99]:
# Imports and global plotting configuration shared by the whole notebook.
import networkx as nx
from osgeo import ogr
from shapely.prepared import prep
from shapely.ops import snap, linemerge, nearest_points
from shapely.geometry import MultiLineString, LineString, Point, Polygon, GeometryCollection
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from itertools import chain
import sys
import timeit
import os
from io import StringIO
from IPython.display import display, HTML, Image
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)  # reset any rc state left over from earlier runs
import seaborn as sns
plt.style.use('ggplot')
Set1 = plt.get_cmap('Set1')  # qualitative colormap reused by the demo plots below
get_ipython().run_line_magic('matplotlib', 'inline')
mpl.rcParams['savefig.dpi'] = 150
mpl.rcParams['figure.dpi'] = 150
mpl.rcParams['figure.figsize'] = 16, 12
mpl.rcParams['axes.facecolor']='white'
mpl.rcParams['savefig.facecolor']='white'
mpl.rcParams['axes.formatter.useoffset']=False

# In[2]:
# Topology vs Space: show that two LineStrings crossing in space are NOT
# necessarily connected in a graph built from their endpoints.
from shapely.ops import split
lh = LineString([(0,1),(2,1)]) #horizontal line
lv = LineString([(1,0),(1,2)]) #vertical line
print('Do the lines intersect spatially (in GIS terms)?',lh.intersects(lv))
fig, axes = plt.subplots(nrows=1,ncols=3,figsize=(5,1))
[ax.axis('off') for ax in axes]
fig.subplots_adjust(wspace=1)
def plot_vertices(l, ax):
    # Mark the vertices of LineString `l` on axis `ax`.
    # NOTE(review): `ax._get_lines.prop_cycler` is a private matplotlib API
    # (removed in matplotlib 3.8) -- confirm the pinned matplotlib supports it.
    ax.plot(l.xy[0],l.xy[1], 'o', color=next(ax._get_lines.prop_cycler)['color'], markersize=3, alpha=0.75)
    #gpd.GeoSeries([Point(c) for c in l.coords]).plot(ax=ax,color=next(ax._get_lines.prop_cycler)['color'])
print('----------- GRAPH 1 -----------')
print('G1: Construct a Networkx graph by adding the LineStrings as edges')
ll = gpd.GeoSeries([lh,lv],name='G1') #GeoPandas for spatial data management
print('The GeoSeries object from which G1 is created:',ll,sep='\n')
g = nx.Graph() #NetworkX for topological operations
# Each 2-point LineString becomes a single edge between its endpoints; the
# crossing point is not a vertex of either line, so G1 ends up disconnected.
g.add_edges_from(ll.apply(lambda l: l.coords[:]))
print('G1 has',nx.number_of_nodes(g),'nodes and',nx.number_of_edges(g),'edges')
print('Is G1 connected (w.r.t. graph/network theory)?',nx.is_connected(g))
ax = ll.plot(ax=axes[0],cmap=Set1) #plotting with gpd is better/easier since space matters
ll.apply(lambda x: plot_vertices(x, ax))
print('----------- GRAPH 2 -----------')
print('G2: Add the intersection point explicitly to both of the LineStrings')
p = lh.intersection(lv) #Point(1,1)
# Rebuild each line with the intersection point as an interior vertex.
ll = gpd.GeoSeries([LineString([l.coords[0],p,l.coords[1]]) for l in [lh,lv]],name='G2')
print('The GeoSeries object from which G2 is created:',ll,sep='\n')
ll.to_file('ll.shp')
# NOTE(review): nx.read_shp was removed in networkx 3.0 -- confirm the pinned
# networkx version still provides it.
g = nx.read_shp('ll.shp').to_undirected()
print('G2 has',nx.number_of_nodes(g),'nodes and',nx.number_of_edges(g),'edges')
print('Is G2 connected?',nx.is_connected(g))
ax = ll.plot(ax=axes[1],cmap=Set1)
ll.apply(lambda x: plot_vertices(x, ax))
print('----------- GRAPH 3 -----------')
print('G3: Split the two LineStrings by each other')
# Splitting each line by the other yields four half-lines meeting at (1,1).
ll = gpd.GeoSeries([c for c in chain(split(lh,lv),split(lv,lh))],name='G3')
print('The GeoSeries object from which G3 is created:',ll,sep='\n')
ll.to_file('ll.shp')
g = nx.read_shp('ll.shp').to_undirected()
print('G3 has',nx.number_of_nodes(g),'nodes and',nx.number_of_edges(g),'edges')
print('Are they connected?',nx.is_connected(g))
ax = ll.plot(ax=axes[2],cmap=Set1) #plotting with gpd is better/easier since space matters
ll.apply(lambda x: plot_vertices(x, ax))
[ax.set_title('GRAPH '+str(i+1),size=10) for i,ax in enumerate(axes)];

# ### Solution to Road Cleaning: GRASS
#
# "Geographic Resources Analysis Support System, commonly referred to as GRASS,
# is a Geographic Information System (GIS) used for geospatial data management
# and analysis, image processing, graphics/maps production, spatial modeling,
# and visualization". Installing it might be cumbersome; good news: it comes
# with QGIS.
Here is the list of operations I did on my shapefile (in order): # # - snap: snap lines to vertex in threshold # - break: break lines at each intersection # - rmdupl: remove duplicate geometry features # - rmsa: remove small angles between lines at nodes # # The command to achieve this in GRASS (within QGIS) is `v.clean.advanced` [cleaning tools](https://grass.osgeo.org/grass72/manuals/v.clean.html) and `v.net.components` with parameters: # - No snapping when v.in.ogr (QGIS asks this in two places: v.net.clean.advanced and v.net.components), i.e. tolerance value -1 = no snap (default QGIS value) # - `v.clean.advanced`: `snap,break,rmdupl,rmsa` with tolerance values `0.0001,0.0,0.0,0.0` # - once cleaned, we then run `v.net.components` tool in GRASS to get the components. The new layer created does only have a single feature: `comp`. Select the giant component using the expression `comp=1`, and save it as a csv (the `comps.csv` file read in the next step). # # GRASS scales well because it processes the shapefile tile by tile (instead of reading all in once into the memory). # Here is the output of these two operations (yellow roads are part of the giant connected component): # # # # The second step is to save the giant connected component (road network) as a shapefile. # The original road file has too many (204) columns, probably because of this reason, the simple operation of adding (joining) the output of `v.net.components` to the cleaned layer within QGIS crashed it on my machine. So, I joined them using GeoPandas by removing the 197 unneeded columns, and saved it as `comp1.shp`. # # road = gpd.read_file('../data/SU/SU_cleaned.shp') # comp = pd.read_csv('../data/SU/comps.csv', usecols=[0]) # roads = road[['MTFCC','LINEARID','STATEFP','COUNTYFP','geometry']].join(comp) # roads = roads[roads.comp == 1].drop('comp',axis=1) # roads.to_file('../data/SU/comp1.shp') # # Now that we got our connected road network it is time to build spaces on it. 
# ### Cleaning the Commuting Flows
#
# Census [Commuting (Journey to Work) Main](https://www.census.gov/hhes/commuting/)
# page has a table of [County to County Commuting Flows](https://www.census.gov/hhes/commuting/files/2013/Table%201%20County%20to%20County%20Commuting%20Flows-%20ACS%202009-2013.xlsx).
# We use this information in work place assignment. This is one of the three data
# files we read in. The other two are the road shapefile (already cleaned) and the
# tract-level demographic profile shapefile (no cleaning was needed).
#
#     url = 'https://www.census.gov/hhes/commuting/files/2013/Table%201%20County%20to%20County%20Commuting%20Flows-%20ACS%202009-2013.xlsx'
#     wf = pd.read_excel(url, converters={col: str for col in range(12)}, skiprows=range(5))
#     wf['from'] = wf.iloc[:,0] + wf.iloc[:,1] #State FIPS + County FIPS
#     wf['to'] = wf.iloc[:,6].str[1:] + wf.iloc[:,7] #State FIPS + County FIPS
#     wf['flow'] = wf.iloc[:,12].fillna(0).astype(int) #Number of workers (age>16)
#     wf = wf.iloc[:-6,-3:] #last few lines of the Excel file make up a footnote
#     wf.to_csv('workflow.csv',index=False)

# In[3]:
#The table below shows the road features in the CENSUS. We are building the spaces only on S1400 and S1740.
# MTFCC (MAF/TIGER Feature Class Code) reference table, scraped from the Census
# website; displayed here for documentation purposes only.
mtfcc = pd.read_html('https://www.census.gov/geo/reference/mtfcc.html')[0].set_index('MTFCC')
# None = unlimited column width; the old sentinel -1 was removed in pandas 2.0
with pd.option_context('display.max_colwidth', None):
    display(mtfcc[mtfcc.Superclass == 'Road/Path Features'][['Feature Class','Feature Class Description']])

# In[3]:
#read the census and the connected-roads shapefiles, and the workflow table
road = gpd.read_file('../data/SU/comp1.shp') #road file (giant connected component only)
census = gpd.read_file('../data/SU/SU_census.shp') #demographic profiles
wf = pd.read_csv('../data/workflow.csv',dtype=str,converters={'flow':int}) #commute flows

# In[4]:
#Type of roads and their counts
pd.crosstab(road.COUNTYFP,road.MTFCC, margins=True, normalize=False)

# ## Space Creation
# Once the road cleaning is done, the first step of the population generation effort
# involves creating the spaces:
# - the houses in which the households live
# - the workplaces where the employed adults (18+) work
# - the schools that the children (<18) attend
#
# We build all of these spaces onto the residential roads:
# - DP table gives us number of housing per tract (DP0180001)
# - Road shp has residential roads info in MAF/TIGER Feature Class Codes (MTFCC) (S1400 and S1740)
#
# Then, to create the living spaces:
# 1. Get total length of residential roads within a census tract.
# 2. Uniformly distribute the living spaces on these roads.

# In[5]:
def create_spaces(tract,road,num_of_schools=4):
    """Given roads and tract info, generate building locations and types (house, work, school).

    The number of empty houses is assumed to be proportional to the number of
    occupied houses; vacant houses are designated as workplaces.  On average a
    census tract holds ~4000 people, so four schools per tract sounds plausible.

    Returns a GeoDataFrame with a Point `geometry` column and a space-type
    column `stype` ('h' house, 'w' work, 's' school), indexed by
    tract GEOID (minus last 2 chars) + running counter.

    Parameters
    ===========
    tract: US Census tract `GeoSeries` with column DP0180001 (number of houses)
           and DP0130001 (number of households)
    road: road `GeoDataframe` with column MTFCC (read from TIGER/Line Shapefile)
    num_of_schools: an integer (default: 4)

    See: https://www.census.gov/geo/maps-data/data/tiger-data.html
    """
    field = tract.geometry
    # residential (S1400) and private (S1740) roads crossing this tract
    mask = road.intersects(field) & road.MTFCC.str.contains('S1400|S1740')
    rd = road[mask].intersection(field)
    rd = rd[rd.geom_type.isin(['LinearRing', 'LineString', 'MultiLineString'])]

    def hashGeom(x):
        # shapely geometries are not hashable; build a hashable key from the
        # rounded coordinates so duplicate geometries can be dropped.
        if x.geom_type == 'MultiLineString':
            # iterate .geoms explicitly: direct iteration over multi-part
            # geometries was removed in shapely 2.0
            return tuple((round(lat,6),round(lon,6)) for s in x.geoms for lat,lon in s.coords[:])
        return tuple((round(lat,6),round(lon,6)) for lat,lon in x.coords[:])

    rd = rd[~rd.apply(hashGeom).duplicated()]
    #DP0180001: Total housing units, DP0180002: Occupied housing units
    rph = rd.length.sum() / tract.DP0180001 #space between two houses
    # drop one building every `rph` meters along each road segment (gpd.GeoSeries)
    houses = rd.apply(lambda x: pd.Series([x.interpolate(seg) for seg in np.arange(0.00001,x.length,rph)]))
    houses = houses.unstack().dropna().reset_index(drop=True) #gpd.GeoSeries
    houses.index = tract.GEOID10[:-2] + houses.index.to_series().astype(str)
    space = gpd.GeoDataFrame(houses,columns=['geometry'])
    space['stype'] = 'w' #workplace by default; houses and schools are carved out below
    n = tract.DP0130001 #number of households
    if tract.DP0120014 > 0:
        n += 1 #people living in group quarters (not in households) share one space
    # .loc with (rows, column): the original chained `space.stype.loc[...] = ...`
    # does not write through under pandas copy-on-write
    space.loc[space.sample(n=n,random_state=123).index, 'stype'] = 'h' #house
    # random_state added so school placement is reproducible, consistent with
    # every other sample() call in this notebook
    space.loc[space[space.stype=='w'].sample(n=num_of_schools,random_state=123).index, 'stype'] = 's' #school
    return space

# ## Population Generation
#
# For each census-tract, DP table provides the number of individuals for 18 age
# bands for each sex.
# In age creation, we assume a uniform distribution within each band (Note: if
# the number of individuals in a band is not divisible by 5, the remainder is
# added to the last age of the band).

# In[6]:
def create_individuals(tract):
    """Synthesize the tract population as a DataFrame of age and sex.

    The DP-table slice tract[22:59] holds the male then female counts for the
    18 five-year age bands ('DP0010039', the female total, is dropped).  Ages
    are spread uniformly within a band ([0,89] overall); any remainder lands
    on the band's last age.  Rows are shuffled deterministically and indexed
    by tract GEOID (minus last 2 chars) plus a running counter.
    """
    band_counts = tract[22:59].drop('DP0010039')  # 36 counts: 18 male bands, then 18 female
    frames = []
    for code, n_people in enumerate(band_counts):
        band = code % 18                 # band b covers ages [b*5, b*5+4]
        sex = 'm' if code < 18 else 'f'
        first = band * 5
        # n_people // 5 individuals at each of the band's first four ages ...
        ages = [a for a in range(first, first + 4) for _ in range(n_people // 5)]
        # ... and whatever is left over at the band's final age
        ages += [first + 4] * (n_people - len(ages))
        frames.append(pd.DataFrame({'code': code, 'age': ages, 'sex': [sex] * n_people}))
    people = pd.concat(frames).sample(frac=1, random_state=123).reset_index(drop=True)
    people.index = tract.GEOID10[:-2] + people.index.to_series().astype(str)
    return people

# ## Household Generation
#
# Number of households for 11 types are given in the DP table:
#
#     0 h&w (no<18)    1 h&w&ch (ch<18)
#     2 male (no<18)   3 male (ch<18)
#     4 female (no<18) 5 female (ch<18)
#     6 nonfamily group
#     7 lone male <65  8 lone male >65
#     9 lone female<65 10 lone female>65
#
# Using this information as a constraint, we assign individuals to households.
# We also added other constraints such as
# - `-4 < husband.age - wife.age < 9`
# - father (mother) and the child age difference not to exceed 50 (40)
#
# Also, those who live in the group quarters are given one place.
# No assumption is made on their ages since we do not have information about the
# institutions within the tract such as the number of universities (college dorm),
# nursing houses, jails, or military bases.
# In[13]:
def create_households(tract):
    """Build the tract's households as a shuffled DataFrame of household types.

    Eleven household types:
        0 h&w (no<18)    1 h&w&ch (ch<18)
        2 male (no<18)   3 male (ch<18)
        4 female (no<18) 5 female (ch<18)
        6 nonfamily group
        7 lone male <65  8 lone male >65
        9 lone female<65 10 lone female>65

    Family households come first (sorted by type, descending) so the harder
    ones are populated while the candidate pool is still large; nonfamily
    groups (type 6) are appended last.
    """
    householdConstraints = tract[150:166] #HOUSEHOLDS BY TYPE actually is in 151:166, never using [0]
    hh_cnt = pd.Series(np.zeros(11),dtype=int) #11 household types (group quarters are not a household)
    # .iloc makes the positional intent explicit (plain [i] on a labelled Series
    # is label-first and deprecated for positional use)
    # husband/wife families
    hh_cnt[0] = householdConstraints.iloc[4] - householdConstraints.iloc[5]   # husband-wife family
    hh_cnt[1] = householdConstraints.iloc[5]    # husband-wife family, OWN CHILDREN < 18
    # male householders
    hh_cnt[2] = householdConstraints.iloc[6] - householdConstraints.iloc[7]   # single male householder
    hh_cnt[3] = householdConstraints.iloc[7]    # single male householder, OWN CHILDREN < 18
    # female householders
    hh_cnt[4] = householdConstraints.iloc[8] - householdConstraints.iloc[9]   # single female householder
    hh_cnt[5] = householdConstraints.iloc[9]    # single female householder, OWN CHILDREN < 18
    # nonfamily householders
    hh_cnt[6] = householdConstraints.iloc[10] - householdConstraints.iloc[11] # nonfamily group living
    hh_cnt[7] = householdConstraints.iloc[12] - householdConstraints.iloc[13] # lone male < 65
    hh_cnt[8] = householdConstraints.iloc[13]   # lone male >= 65
    hh_cnt[9] = householdConstraints.iloc[14] - householdConstraints.iloc[15] # lone female < 65
    hh_cnt[10] = householdConstraints.iloc[15]  # lone female >= 65
    # expand counts into one row per household, then shuffle deterministically
    it = iter(range(11))
    s = hh_cnt.apply(lambda x: x*[next(it)])
    s = pd.Series([y for x in s for y in x])
    hholds = pd.DataFrame()
    hholds['htype'] = s.sample(frac=1,random_state=123).reset_index(drop=True) #shuffle
    # DataFrame.append was removed in pandas 2.0 -> use pd.concat
    return pd.concat([hholds[hholds.htype != 6].sort_values('htype',ascending=False),
                      hholds[hholds.htype == 6]])

# In[33]:
def populate_households(tract, people, hholds):
    """Assign concrete individuals to the households built by `create_households`.

    Mutates `hholds` in place: adds a `members` column (lists of `people`
    indices) and, when the tract has group-quarters population (DP0120014),
    one extra household of type 11 holding all of them.  `people` must carry
    the `code` and `age` columns produced by `create_individuals`.

    NOTE: child/friend counts come from np.random without a fixed seed, so
    repeated runs differ unless the caller seeds numpy.
    """
    def gen_households(hh_type, mask):
        """Pick the members of one household of type `hh_type`.

        `mask` marks people not yet placed; chosen people are flipped to False.
        """
        members = []
        # eligible `code` ranges (sex x age band) for the householder of each type
        head_ranges = [range(4,18), range(4,14), range(4,18), range(4,14), range(22,36),
                       range(22,30), chain(range(1,18),range(19,36)), range(4,13),
                       range(13,18), range(21,31), range(31,36)]
        #add the householder
        pot = people[mask].code
        iindex = pot[pot.isin(head_ranges[hh_type])].index[0]
        h1 = people.loc[iindex]  # .loc: DataFrame.ix was removed in pandas 1.0
        mask[iindex] = False
        members.append(iindex)
        #if living alone then return the household
        if hh_type > 6:
            return members
        #if husband and wife, add the wife
        if hh_type in (0,1):
            pot = people[mask].code
            # wife's code band is 17-18 after the husband's: -4 < husband.age - wife.age < 9
            iindex = pot[pot.isin(range(h1.code+17,h1.code+19))].index[0]
            h2 = people.loc[iindex]
            mask[iindex] = False
            members.append(iindex)
        # "A child includes a son or daughter by birth (biological child), a
        # stepchild, or an adopted child of the householder, regardless of the
        # child's age or marital status.  The category excludes sons-in-law,
        # daughters-in-law, and foster children."
        #household types with at least one child (18-)
        if hh_type in (1,3,5):
            #https://www.census.gov/hhes/families/files/graphics/FM-3.pdf
            if hh_type == 1:
                num_of_child = max(1,abs(int(np.random.normal(2)))) #gaussian touch
            elif hh_type == 3:
                num_of_child = max(1,abs(int(np.random.normal(1.6)))) #gaussian touch
            else: # hh_type == 5
                num_of_child = max(1,abs(int(np.random.normal(1.8)))) #gaussian touch
            pot = people[mask]
            if hh_type == 1:
                iindices = pot[(pot.age<18) & (45>h2.age-pot.age)].index[:num_of_child]
            else:
                #father (mother) and child age difference not to exceed 50 (40)
                age_diff = 45 if hh_type == 5 else 55
                iindices = pot[(pot.age<18) & (age_diff>h1.age-pot.age)].index[:num_of_child]
            for i in iindices:
                mask[i] = False
                members.append(i)
        #if nonfamily group then either friends or unmarried couples
        if hh_type == 6:
            pot = people[mask].code
            num_of_friends = max(1,abs(int(np.random.normal(1.3)))) #gaussian touch
            # friends fall within +-2 code bands of the householder
            iindices = pot[pot.isin(range(h1.code-2,h1.code+3))].index[:num_of_friends]
            for i in iindices:
                mask[i] = False
                members.append(i)
        return members

    mask = pd.Series(True,index=people.index) #True = not yet assigned to a household
    hholds['members'] = hholds.htype.apply(gen_households,args=(mask,))
    # "The seven types of group quarters are categorized as institutional group
    # quarters (correctional facilities for adults, juvenile facilities,
    # nursing facilities/skilled-nursing facilities, and other institutional
    # facilities) or noninstitutional group quarters (college/university student
    # housing, military quarters, and other noninstitutional facilities)."
    group_population = tract.DP0120014 #people living in group quarters (not in households)
    gq_indices = people[mask].index[:group_population]
    mask.loc[gq_indices] = False
    #now distribute the remaining household guys as relatives
    relatives = people[mask].index
    it = iter(relatives)
    #sample family households with replacement; list.append mutates the
    #`members` lists in place
    relative_hhs = hholds[hholds.htype<7].sample(n=len(relatives),replace=True)
    relative_hhs.members.apply(lambda x: x.append(next(it)))
    mask.loc[relatives]= False
    #add those living in group quarters as one household of the 12th type (11)
    if group_population > 0:
        hholds.loc[len(hholds)] = {'htype':11, 'members':gq_indices}

# In[9]:
def assign_hholds_to_houses(hholds,space,people):
    """Map each household to a house and write house/htype back onto people.

    Relies on `create_spaces` having made exactly len(hholds) spaces of type
    'h' — the column assignment below raises if the counts disagree.
    """
    hholds['house'] = space[space.stype=='h'].sample(frac=1,random_state=123).index
    def hh_2_people(hh,people):
        # copy this household's house id and type to each of its members
        for m in hh.members:
            people.loc[m,'house']=hh.house
            people.loc[m,'htype']=hh.htype
    people['house'] = 'homeless' #defaults; overwritten below for everyone placed
    people['htype'] = 'homeless'
    hholds.apply(hh_2_people,args=(people,),axis=1)

# ## Percentage Errors
# DP table has other info which we can use to test the accuracy of our
# synthesized population against.  These are:
# - average family size
# - number of households with minors (<18)
# - number of households with seniors (65+)
#
# We report our percentage errors, and provide scatter plots further below.
# In[10]:
def get_errors(tract,hholds,people):
    """Return a dict of tract-level stats and percentage errors.

    Errors are 100*(synthesized - actual)/actual for the average family size
    and the counts of houses containing a senior (65+) or a minor (<18),
    measured against the DP-table ground truth.
    """
    def pct_err(synthesized, actual):
        return 100 * (synthesized - actual) / actual

    families = hholds[hholds.htype < 6]   #family households only
    seniors = people[people.age >= 65]
    minors = people[people.age < 18]
    return {
        'tract': tract.GEOID10[:-2],
        'population': tract.DP0010001,
        'in_gq': tract.DP0120014,
        'family_size': pct_err(families.members.apply(len).mean(), tract.DP0170001),
        'senior_hh': pct_err(seniors.house.nunique(), tract.DP0150001),
        'minor_hh': pct_err(minors.house.nunique(), tract.DP0140001),
    }

# ## Assigning Minors to Schools
# - Reminder: A census tract has a population of ~4000 on average.
# - Maryland on average has in total one public school per census tract
#   (total of elementary, middle and high schools):
#   http://msa.maryland.gov/msa/mdmanual/01glance/html/edschools.html
# - And, each tract has four schools, one for each grade level (see the
#   age/grade table that follows).
# | # | Age     | Level of Study              | US Grade                       |
# |---|---------|-----------------------------|--------------------------------|
# | 0 | 3 - 4   | Pre-school                  | N/A                            |
# | 1 | 5 - 10  | Elementary / Primary School | Kindergarten - 5th             |
# | 2 | 11 - 13 | Middle School               | 6th - 8th                      |
# | 3 | 14 - 18 | High School                 | 9th - 12th (Freshman - Senior) |
# Given these, we build one school for each grade-level and assign children
# according to their age.
# Note: pre-k starts from 0 instead of 3 in our case.

# In[11]:
def assign_kids_to_schools(people,space):
    """Assign every minor to one of the tract's four schools by age.

    Each tract has four schools, one for each grade level:
        Age       Level of Study                 US Grade
        3 - 4     Pre-school                     N/A
        5 - 10    Elementary / Primary School    Kindergarten - 5th
        11 - 13   Middle School                  6th - 8th
        14 - 18   High School                    9th - 12th (Freshman - Senior)

    Adds/overwrites the `work` column on `people`: adults stay 'unemployed'
    here (workplaces are assigned later at county level); minors get the
    index of their school space.
    """
    people['work'] = 'unemployed'
    school = space[space.stype=='s']
    # pre-k bucket starts at age 0 (not 3) because we have no finer data
    people.loc[people.age.isin(range(0,5)),'work'] = school.index[0]
    people.loc[people.age.isin(range(5,11)),'work'] = school.index[1]
    people.loc[people.age.isin(range(11,14)),'work'] = school.index[2]
    people.loc[people.age.isin(range(14,18)),'work'] = school.index[3]

# In[34]:
# THIS IS THE CELL THAT APPLIES THE ABOVE SYNTHESIS OPERATIONS ON EACH CENSUS TRACT
pops = []   # populations, one DataFrame per tract
errs = []   # percentage errors, one dict per tract
spaces = [] # home, work, school locations, one GeoDataFrame per tract
def synthesize(tract,pops,spaces,errs,road): #pops=pops,spaces=spaces,errs=errs,road=road):
    # Run the whole per-tract pipeline and append the results to the
    # accumulator lists (appending keeps this usable with Series.apply).
    # tract = tract[1]
    start_time = timeit.default_timer()
    print(tract.GEOID10[:-2],'started...',end=' ')
    space = create_spaces(tract,road) #spaces: houses + workspaces + schools
    people = create_individuals(tract)
    hhold = create_households(tract)
    populate_households(tract, people, hhold)
    assign_hholds_to_houses(hhold,space,people)
    assign_kids_to_schools(people,space)
    err = get_errors(tract,hhold,people)
    pops.append(people)
    spaces.append(space)
    errs.append(err)
    print(tract.GEOID10[:-2],'now ended ({:.1f} secs)'.format(timeit.default_timer() - start_time))
census.apply(synthesize,args=(pops,spaces,errs,road),axis=1);
# synthesize_tracts = partial(synthesize, pops=pops,spaces=spaces,errs=errs,road=road)
# with ThreadPoolExecutor(max_workers=100) as executor:
#     executor.map(synthesize_tracts, census.iterrows())

# ## Tract-level Outputs
#
# At census tract level we created these:
# - Percentage errors table (family size, # of households w/ minor, # of households w/ senior)
# - Spaces table (home, work, school locations)
# - Population table (age, sex, house, school)
#
# Now at county level, employed individuals will be sampled and assigned to
# work places given the commuting flow table.

# In[ ]:
#errors table
err = pd.DataFrame(errs).set_index('tract')
err = err[['population', 'in_gq', 'family_size', 'minor_hh', 'senior_hh']]
err.to_csv('../data/errSU.csv')
err.head()

# In[ ]:
#spaces (h (houses), w (work), s (school))
space = pd.concat(spaces)
space = space.reset_index().rename(columns={'index':'id'})
space.to_file('../data/spaceSU.shp')
print(space.stype.value_counts())
space.head()

# In[ ]:
#population: age, sex, house, school/work
population = pd.concat(pops)
population.to_csv('../data/populationSU.csv',index_label='id')
print('population size:',len(population))
population.tail() #work not assigned yet, we'll do it next

# # Workplace Assignment
#
# Census [Commuting (Journey to Work) Main](https://www.census.gov/hhes/commuting/)
# page has a table of [County to County Commuting Flows](https://www.census.gov/hhes/commuting/files/2013/Table%201%20County%20to%20County%20Commuting%20Flows-%20ACS%202009-2013.xlsx)
# with the following columns: `Residence, Place of Work, Commuting Flow`.
# As discussed above, let's read in the cleaned files:

# In[35]:
#this is the output of the tract-level synthesis above, read back from disk
wf = pd.read_csv('../data/workflow.csv',dtype=str,converters={'flow':int})
population = pd.read_csv('../data/populationSU.csv',dtype=str,converters={'age':int}).set_index('id')
space = gpd.read_file('../data/spaceSU.shp').set_index('id')
errors = pd.read_csv('../data/errSU.csv',index_col='tract')
wf.head()

# In[36]:
def assign_employed_to_workplaces(cfrom,wf,population,space):
    """Distribute county `cfrom`'s workers over workplaces per the commute-flow table.

    Samples `total_workers` adults (18+) of the county, assigns the first
    chunk to home-county workplaces (including flows toward counties outside
    our map) and the rest, chunk by chunk, to each destination county present
    in the map.  Writes the `work` column of `population` in place.

    NOTE(review): `assigned` is a module-level cursor shared with the nested
    helper via `global`; the sample() calls have no random_state, so the
    assignment is not reproducible across runs -- confirm that is intended.
    """
    global assigned
    assigned = 0
    cwf = wf[wf['from'] == cfrom] #commute flows out of county `cfrom`
    total_workers = cwf.flow.sum()
    #GEOID prefix [:5] is the state+county FIPS pair
    cpeople = population[population.index.str[:5] == cfrom]
    workers = cpeople[cpeople.age>=18].sample(n=total_workers).index
    #ctos exclude 'to' counties that are not in our map
    #people working in those counties are assigned to their home county
    cids = population.index.str[:5].unique().values
    ctos = [c for c in cids if c != cfrom]
    cwft = cwf[cwf['to'].isin(ctos)]
    cto = cfrom
    wp = space[(space.stype.isin(['w','s']))] #all workplaces (schools employ adults too)
    cwp = wp[wp.index.str[:5] == cto] #workplaces in cto
    #workers commuting to unmapped counties stay in the home county
    n = total_workers - cwft.flow.sum()
    population.loc[workers[assigned:n+assigned],'work'] = cwp.sample(n=n,replace=True).index
    assigned += n
    def assign_workers(cto,workers,population):
        #assign the next `flow` sampled workers to workplaces in county `cto.to`;
        #closes over `wp` and advances the shared `assigned` cursor
        global assigned
        cwp = wp[wp.index.str[:5] == cto.to] #workplaces in cto
        n = cto.flow
        population.loc[workers[assigned:n+assigned],'work'] = cwp.sample(n=n,replace=True).index
        assigned += n
    cwft.apply(assign_workers,args=(workers,population),axis=1)

# In[37]:
#county level synthesis: one call per county present in the census data
county = pd.Series(census.GEOID10.str[:5].unique())
county.apply(assign_employed_to_workplaces,args=(wf,population,space))
population['hwkt'] = population.house.map(space.geometry) #house location geometry
population['wwkt'] = population.work.map(space.geometry)  #work location geometry
population.to_csv('../data/popSU.csv')
population['age,sex,hwkt,wwkt'.split(',')].to_csv('../data/popSU_wkt.csv')

# # Results
#
# - Population density map: number of people per land-km^2
# - Percentage error scatter plot
# - Mapping living spaces (house,
#   school, work) in a sample tract
# - Social network of a sample tract

# In[107]:
import geoplot as gplt
import geoplot.crs as gcrs
#number of people per land-km^2 (ALAND10 is in m^2, hence the 1e6 factor)
ax = gplt.choropleth(census, hue=1000000*(census.DP0010001/census.ALAND10), figsize=(12, 8), cmap='OrRd', linewidth=0.5, k=5, legend=True,scheme='Quantiles')
ax.set(title='Ulster and Sullivan Counties, New York')
ax.get_legend().set(title='population / km^2')
ax.get_figure().savefig('../data/SU-map.png',bbox_inches='tight')
#def annotate(x): ax.annotate(s=x.GEOID10[5:9], xy=x.geometry.centroid.coords[0], ha='center', size=8)
#census.apply(annotate,axis=1);

# In[91]:
# reset the ggplot style used above, then restore the notebook's rc settings
mpl.rcParams.update(mpl.rcParamsDefault)
import seaborn as sns
mpl.rcParams['savefig.dpi'] = 150
mpl.rcParams['figure.dpi'] = 150
mpl.rcParams['figure.figsize'] = 16, 12
mpl.rcParams['axes.facecolor']='white'
mpl.rcParams['savefig.facecolor']='white'
ax = errors.plot.scatter(x='family_size',y='minor_hh',figsize=(10,5))
ax.set(xlabel='Family size',ylabel='Households with Minor',title="Percentage Errors\n100*(Synthesized - Actual)/Actual");

# In[92]:
ax = errors.plot.scatter(x='family_size',y='senior_hh',figsize=(10,5))
ax.set(xlabel='Family size',ylabel='Households with Senior',title="Percentage Errors\n100*(Synthesized - Actual)/Actual");

# ## Intimate Social Network
#
# Individuals living in the same house, or working in the same workplace, or
# attending the same school are grouped. When a group size is less than 7, a
# complete network is generated for that group (i.e. a clique of group size is
# added). If the number of house members, employees, or students in a group is
# seven or more, then a Newman-Watts-Strogatz small-world graph
# (https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.generators.random_graphs.newman_watts_strogatz_graph.html)
# is generated with `k=4` and `p=0.1` within that group. Then all the networks
# are combined.
# Here is the resulting network (each color represents one of the 18 age bands):

# In[52]:
#create the edges for a census tract
#http://www.sciencedirect.com/science/article/pii/S0375960199007574
from itertools import combinations
def create_edges(x,g,etype=None):
    """Create the edges within group `x` and add them to graph `g`.

    If the group size is < 7, a complete network (clique over the group's
    index) is generated.  Otherwise a Newman-Watts-Strogatz small-world graph
    with k=4 and p=0.1 is generated and relabelled onto the group's member ids.
    When `etype` is given it is stored as an edge attribute.
    """
    if len(x)<7:
        for c in combinations(x.index,2):
            if etype:
                g.add_edge(*c,etype=etype)
            else:
                g.add_edge(*c)
    else:
        sw = nx.newman_watts_strogatz_graph(len(x),4,0.1)
        sw = nx.relabel_nodes(sw,dict(zip(sw.nodes(), x.index.values)))
        if etype:
            g.add_edges_from(sw.edges(),etype=etype)
        else:
            g.add_edges_from(sw.edges())

g = nx.Graph()
#restrict to one sample tract (GEOID prefix)
nodes = population[population.index.str.startswith('361059506')]
grouped = nodes.groupby('house')
grouped.apply(lambda x: create_edges(x,g,etype='hhold'))
grouped = nodes[nodes.age>=18].groupby('work')
grouped.apply(lambda x: create_edges(x,g,etype='work'))
grouped = nodes[nodes.age<18].groupby('work')
grouped.apply(lambda x: create_edges(x,g,etype='school'))

# In[54]:
#get the giant connected component
# nx.connected_component_subgraphs was removed in networkx 2.4; build the
# subgraph of the largest component explicitly instead (same resulting graph)
giant = g.subgraph(max(nx.connected_components(g), key=len)).copy()
nx.write_edgelist(giant,'../data/361059506_edge_list.csv',delimiter=',')
giant.number_of_nodes()

# In[56]:
#unzip the (source, target, attrs) edge triples into parallel tuples
cc = list(zip(*giant.edges(data=True))) #edges
df = pd.DataFrame({'Source':cc[0],'Target':cc[1]})
df = df.join(pd.DataFrame(list(cc[2]))) #edge attributes (etype) as columns
df['type'] = 'undirected' #Gephi-style edge table
df.to_csv('../data/361059506_giant_edges.csv',index=False)

# In[57]:
#node table: keep only people in the giant component, label with age band
node_info = population[population.index.isin(giant.nodes())].copy()
base = (node_info.code.astype(int) %18)* 5 #band start age (code is read back as str)
node_info['band'] = base.astype(str) + '-' + (base+4).astype(str)
node_info.to_csv('../data/361059506_giant_nodes.csv',
                 columns=['band', 'sex', 'house', 'work', 'age', 'htype'])

# ## Network Visualization
# Read the network back for visualization

# In[63]:

# NOTE(review): assumes the index column written above is named 'id' —
# confirm against `population.index.name`.
nodes = pd.read_csv('../data/361059506_giant_nodes.csv', dtype=str).set_index('id')
nodes.head()

# In[64]:

edges = pd.read_csv('../data/361059506_giant_edges.csv', dtype=str)
edges.head()

# In[88]:

#create the graph
# `from_pandas_dataframe` was renamed `from_pandas_edgelist` in NetworkX 2.0;
# fall back to the old name so this cell runs on either major version.
_edge_reader = getattr(nx, 'from_pandas_edgelist', None) or nx.from_pandas_dataframe
g = _edge_reader(edges, 'Source', 'Target', edge_attr='etype')
g.add_nodes_from([(a, b.to_dict()) for a, b in nodes.iterrows()])
# NOTE(review): spring_layout is unseeded, so the saved figure differs per run.
pos = nx.spring_layout(g)

# In[104]:

#draw the graph
fig, ax = plt.subplots()
blues = plt.get_cmap('Blues')
indiv = '361059506557'
#household members of the focal individual `indiv`
hhold = [indiv] + [e[1] for e in g.edges([indiv], data=True) if e[2]['etype'] == 'hhold']
#edges within the household (a clique)
hhold_edges = [e for e in g.edges(hhold, data=True) if e[2]['etype'] == 'hhold']
#edges leaving the household (ties of its members to work/school contacts)
work_edges = set(g.edges(hhold)) - set([(e[0], e[1]) for e in hhold_edges])
#combined network of household members (school and work networks)
nodeset = set([n for m in hhold for n in g.neighbors(m)])
labels = {n[0]: str(n[1]['age']) + ',' + n[1]['sex']
          for n in g.nodes(data=True) if n[0] in nodeset}
#layer 1: the whole tract as small green dots (no edges)
nx.draw_networkx(g, pos=pos, ax=ax, edgelist=[], with_labels=False, alpha=0.5,
                 node_size=30, node_color='green')
#layer 2: the highlighted neighbourhood with age,sex labels
nx.draw_networkx(g, pos, nodelist=nodeset, node_color=[0.3]*len(nodeset),
                 cmap=blues, vmin=0, vmax=1, alpha=0.5, labels=labels,
                 node_size=1800, font_size=10, edgelist=[], ax=ax)
#ax.get_figure().savefig('../data/nodes_only.png',bbox_inches='tight')
nx.draw_networkx_edges(g, pos=pos, edge_color='red', edgelist=hhold_edges,
                       width=3, ax=ax)
#ax.get_figure().savefig('../data/hhold_edges.png',bbox_inches='tight')
nx.draw_networkx_edges(g, pos=pos, edge_color='orange', edgelist=work_edges,
                       width=3, ax=ax)
#ax.get_figure().savefig('../data/work_edges.png',bbox_inches='tight')
ax.get_figure().savefig('../data/all_nodes.png', bbox_inches='tight')

# In[108]:

Image(filename="../data/edge_types.png")

# ## Spaces
#
# In[46]:

#plot the roads and the spaces for one tract
geoid = '361059506'
tract = census[census.GEOID10.str.startswith(geoid)].iloc[0]  #census.iloc[61]
buildings = space[space.index.str.startswith(geoid)]
#clip the road network to the tract polygon
mask = road.intersects(tract.geometry)
rd = road[mask].intersection(tract.geometry)

# In[49]:

#plt.style.use('ggplot')
ax = gpd.GeoDataFrame(tract).T.plot(ls='--', color='white')  #draw tract borders
ax = rd.plot(color='grey', ax=ax)  #draw roads
#draw houses ('h'), workplaces ('w'), and schools ('s')
for stype, size, color in (('h', 3, 'orange'), ('w', 3, 'purple'), ('s', 5, 'red')):
    ax = buildings[buildings.stype == stype].plot(ax=ax, markersize=size, color=color)
ax.get_figure().savefig('../data/361059506-spaces.png', bbox_inches='tight')

# ## Agents Moving
# At this stage we wanted to visualize the output on a MASON model: Gridlock.
# Gridlock expects a csv file of home-edge-ids and work-edge-ids.
# So, we need to reverse engineer from the points to the roads (GEOID10s) they are on.
# But before doing that, let's look at how it works when we randomly assign home
# and work road segments.
#
# Note: the video below will not show on github (as it does not run js); either
# play it on [youtube](https://youtu.be/jvDPzeOpiLA) directly, or open this
# notebook in [nbviewer](http://nbviewer.jupyter.org/gist/oztalha/674b9cbd7b56254cde33593801710bfd/Population%20Synthesis.ipynb).
# In[191]:

#generate data for gridlock
def _sample_road_ids(roads, county_fp, n, seed=None):
    """Sample `n` LINEARIDs (with replacement) from roads in one county.

    Parameters
    ----------
    roads : pandas.DataFrame
        Road table with `COUNTYFP` and `LINEARID` columns.
    county_fp : str
        County FIPS code to draw from (e.g. '105' for Sullivan).
    n : int
        Number of road ids to sample.
    seed : int, optional
        Random state for reproducible draws; None keeps pandas' default
        (the original, unseeded behaviour).

    Returns
    -------
    pandas.Series of sampled LINEARID values.
    """
    return roads[roads.COUNTYFP == county_fp].LINEARID.sample(
        n=n, replace=True, random_state=seed)

gl = pd.DataFrame()
# NOTE(review): 1000 agents for county 105 and 2500 for 111 — presumably
# roughly proportional to county populations; confirm.
h1 = _sample_road_ids(road, '105', 1000)
w1 = _sample_road_ids(road, '105', 1000)
h2 = _sample_road_ids(road, '111', 2500)
w2 = _sample_road_ids(road, '111', 2500)
# `Series.append` was removed in pandas 2.0; `pd.concat` is equivalent and
# works on every pandas version.
gl['home'] = pd.concat([h1, h2], ignore_index=True)
gl['work'] = pd.concat([w1, w2], ignore_index=True)
print(gl.shape)
gl.index.name = 'id_id'
gl.to_csv('../data/gl.csv')
gl.head()

# In[197]:

from IPython.display import YouTubeVideo
YouTubeVideo('82vEQWiUvHI')  #https://youtu.be/82vEQWiUvHI