Retrieves data from the Census Bureau's ZIP Code Business Patterns API for a specific area and summarizes it by ZCTA. Three tables are generated: one for employees that contains employment, establishments, and wages, one for industries that contains counts of establishments by 2-digit sector NAICS codes, and one reference table that correlates sector numbers and names. Initial data retrieved from API is written to json, final output is written to a SQLite database.
Confirmed to work with the 2018 ZBP series
https://www.census.gov/data/developers/data-sets/cbp-nonemp-zbp/zbp-api.html
import pandas as pd, requests, sqlite3, os, json
from IPython.display import clear_output
#Crosswalk files - update only if necessary
uszips_file='zip_to_zcta_2019_uds.csv'
zcta_file='geocorr14_modified.csv'
uszips_path=os.path.join('inputs',uszips_file)
zcta_path=zcta_file=os.path.join('inputs',zcta_file)
#Dump files for api data storage
ejsonpath=os.path.join('outputs', 'emp_data.json')
ijsonpath=os.path.join('outputs', 'ind_data.json')
cjsonpath=os.path.join('outputs', 'codes_data.json')
#API variables - UPDATE THE YEAR
keyfile='census_key.txt'
year='2018'
dsource='zbp'
state='36'
ecols='ESTAB,EMP,PAYQTR1,PAYANN'
icols='ESTAB'
ncodes=['00','11','21','22','23','31-33','42','44-45','48-49',
'51','52','53','54','55','56','61','62','71','72','81',
'99']
#SQL output - UPDATE EACH TABLE NAME
dbname=os.path.join('outputs','testdb.sqlite')
emptable='zbp2018emp'
indtable='zbp2018ind'
codetable='zbp2018indcodes'
Read in the files that relate US ZIP codes to ZCTAs (from JSI) and ZCTAs to counties for local area (from MCDC Geocorr), then join them by ZIP Code to create a ZCTA to ZIP table for the local area
uszips=pd.read_csv(uszips_path, sep=',', dtype={'ZIP_CODE':str, 'ZCTA':str})
uszips.head()
ZIP_CODE | PO_NAME | STATE | ZIP_TYPE | ZCTA | zip_join_type | |
---|---|---|---|---|---|---|
0 | 00501 | Holtsville | NY | Post Office or large volume customer | 11742 | Spatial join to ZCTA |
1 | 00544 | Holtsville | NY | Post Office or large volume customer | 11742 | Spatial join to ZCTA |
2 | 00601 | Adjuntas | PR | Zip Code Area | 00601 | Zip Matches ZCTA |
3 | 00602 | Aguada | PR | Zip Code Area | 00602 | Zip Matches ZCTA |
4 | 00603 | Aguadilla | PR | Zip Code Area | 00603 | Zip Matches ZCTA |
#All ZIP Codes in US
uszips.shape
(41107, 6)
zcta=pd.read_csv(zcta_path, sep=',', dtype={'zcta5':str, 'county14':str})
zcta.head()
zcta5 | county14 | cntyname2 | zipname | pop10 | afact | |
---|---|---|---|---|---|---|
0 | 10001 | 36061 | New York NY | New York, NY | 21102 | 1.0 |
1 | 10002 | 36061 | New York NY | New York, NY | 81410 | 1.0 |
2 | 10003 | 36061 | New York NY | New York, NY | 56024 | 1.0 |
3 | 10004 | 36061 | New York NY | New York, NY | 3089 | 1.0 |
4 | 10005 | 36061 | New York NY | New York, NY | 7135 | 1.0 |
#ZCTAs in local area
zcta.shape
(214, 6)
#Merge ZIP Codes with ZCTAs for local area
zip2zcta = pd.merge(uszips[['ZIP_CODE','ZIP_TYPE','PO_NAME','ZCTA']],zcta[['zcta5','county14']],how='right',
left_on='ZCTA', right_on='zcta5').set_index('ZIP_CODE')
zip2zcta.drop(columns=['ZCTA'],inplace=True)
zip2zcta.head()
ZIP_TYPE | PO_NAME | zcta5 | county14 | |
---|---|---|---|---|
ZIP_CODE | ||||
10001 | Zip Code Area | New York | 10001 | 36061 |
10118 | Post Office or large volume customer | New York | 10001 | 36061 |
10120 | Post Office or large volume customer | New York | 10001 | 36061 |
10122 | Post Office or large volume customer | New York | 10001 | 36061 |
10123 | Post Office or large volume customer | New York | 10001 | 36061 |
#ZIP Codes in local area
zip2zcta.shape
(313, 4)
Request the ZBP data from the Census Bureau for the state, join the ZBP data to the local ZCTA to ZIP table based on ZIP Code, and group the data by ZCTA
with open(keyfile) as key:
api_key=key.read().strip()
base_url = f'https://api.census.gov/data/{year}/{dsource}'
base_url
'https://api.census.gov/data/2018/zbp'
This data is requested in a series of chunks which contain multiple ZIP Codes - do not rerun the requests block if retrieval is successful but subsequent notebook changes are needed. Proceed to the next block and pull data from json dump file.
def chunks(l, n):
# For item i in a range that is a length of l,
for i in range(0, len(l), n):
# Create an index range for l of n items:
yield l[i:i+n]
reqzips=list(chunks(zip2zcta.index.tolist(),48))
print('Number of chunks:',len(reqzips))
Number of chunks: 7
#Code 200 = success, do not rerun this block unless it's necessary
emp_data=[]
for i, v in enumerate (reqzips):
batchzips=','.join(v)
edata_url = f'{base_url}?get={ecols}&EMPSZES=001&for=zipcode:{batchzips}&key={api_key}'
response=requests.get(edata_url)
if response.status_code==200:
clear_output(wait=True)
data=response.json()
if i == 0:
for record in data:
emp_data.append(record)
else:
for record in data[1:]:
emp_data.append(record)
print('Retrieved data for chunk',i)
else:
print('***Problem with retrieval***, response code',response.status_code)
break
with open(ejsonpath, 'w') as f:
json.dump(emp_data, f)
print('Done - Data dumped to json file')
Retrieved data for chunk 6 Done - Data dumped to json file
with open(ejsonpath, 'r') as f:
ejsondata=json.load(f)
zbpemp=pd.DataFrame(ejsondata[1:], columns=ejsondata[0]).rename(columns={'PAYQTR1':'PAYQ1','PAYANN':'PAYAN','zip code':'zipcode'}).set_index('zipcode')
for field in zbpemp.columns:
zbpemp=zbpemp.astype(dtype={field:'int64'})
zbpemp.drop(columns=['EMPSZES'],inplace=True)
zbpemp.head()
ESTAB | EMP | PAYQ1 | PAYAN | |
---|---|---|---|---|
zipcode | ||||
10118 | 326 | 6358 | 202796 | 815737 |
10120 | 70 | 1378 | 41023 | 132076 |
10121 | 69 | 6104 | 199451 | 887899 |
10122 | 177 | 1917 | 40477 | 156824 |
10123 | 180 | 2036 | 51353 | 195513 |
#ZIP Codes retrieved - may differ from zip2zcta as some zips have no businesses
zbpemp.shape
(266, 4)
#Flag columns count the number of establishments for which data is not disclosed
flags=['FLAG_EMP','FLAG_PAYQ1','FLAG_PAYAN']
for flagcol in flags:
datacol=flagcol.split('_')[1]
zbpemp[flagcol]=0
zbpemp.loc[zbpemp[datacol] == 0, flagcol] = zbpemp['ESTAB']
zbpemp.head()
ESTAB | EMP | PAYQ1 | PAYAN | FLAG_EMP | FLAG_PAYQ1 | FLAG_PAYAN | |
---|---|---|---|---|---|---|---|
zipcode | |||||||
10118 | 326 | 6358 | 202796 | 815737 | 0 | 0 | 0 |
10120 | 70 | 1378 | 41023 | 132076 | 0 | 0 | 0 |
10121 | 69 | 6104 | 199451 | 887899 | 0 | 0 | 0 |
10122 | 177 | 1917 | 40477 | 156824 | 0 | 0 | 0 |
10123 | 180 | 2036 | 51353 | 195513 | 0 | 0 | 0 |
#Join to ZIP ZCTA crosswalk
zbpemp2zcta = pd.merge(zip2zcta,zbpemp,how='inner',left_index=True,right_index=True)
zbpemp2zcta.index.name = 'ZIP_CODE'
zbpemp2zcta.head()
ZIP_TYPE | PO_NAME | zcta5 | county14 | ESTAB | EMP | PAYQ1 | PAYAN | FLAG_EMP | FLAG_PAYQ1 | FLAG_PAYAN | |
---|---|---|---|---|---|---|---|---|---|---|---|
ZIP_CODE | |||||||||||
10001 | Zip Code Area | New York | 10001 | 36061 | 7248 | 151769 | 2717186 | 10646611 | 0 | 0 | 0 |
10118 | Post Office or large volume customer | New York | 10001 | 36061 | 326 | 6358 | 202796 | 815737 | 0 | 0 | 0 |
10120 | Post Office or large volume customer | New York | 10001 | 36061 | 70 | 1378 | 41023 | 132076 | 0 | 0 | 0 |
10122 | Post Office or large volume customer | New York | 10001 | 36061 | 177 | 1917 | 40477 | 156824 | 0 | 0 | 0 |
10123 | Post Office or large volume customer | New York | 10001 | 36061 | 180 | 2036 | 51353 | 195513 | 0 | 0 | 0 |
#ZIP codes in the local area that appear in the ZBP data
zbpemp2zcta.shape
(266, 11)
#Aggregate to ZCTAs
zctaemp=zbpemp2zcta[['zcta5','ESTAB','EMP','PAYQ1','PAYAN','FLAG_EMP','FLAG_PAYQ1','FLAG_PAYAN']].groupby(['zcta5'])[['ESTAB','EMP','PAYQ1','PAYAN','FLAG_EMP','FLAG_PAYQ1','FLAG_PAYAN']].sum()
zctaemp.head()
ESTAB | EMP | PAYQ1 | PAYAN | FLAG_EMP | FLAG_PAYQ1 | FLAG_PAYAN | |
---|---|---|---|---|---|---|---|
zcta5 | |||||||
10001 | 8006 | 163513 | 3053219 | 11948301 | 0 | 0 | 0 |
10002 | 2962 | 22737 | 176288 | 764720 | 0 | 0 | 0 |
10003 | 4273 | 98742 | 1830369 | 6916902 | 0 | 0 | 0 |
10004 | 1660 | 69025 | 2337956 | 7349828 | 0 | 0 | 0 |
10005 | 1397 | 48129 | 3043620 | 7586070 | 0 | 0 | 0 |
This data must be requested one record at a time. NOTE that this request can take a LONG TIME, up to one hour to complete for approx 300 ZIP Codes. Once the request is finished the data gets dumped into a json file. If the request is successful but subsequent blocks need to be modified, don't rerun the requests block - pull the data from the json file.
zipcodes=zbpemp2zcta.index.tolist()
len(zipcodes)
266
Retrieving approx 300 ZIP Codes takes 1 hour
NOTE - revise in the future to retrieve chunks of zip codes
#If this block is successful but there are subsequent problems, do not rerun it - start from the following block.
#For industry data, if there are no records for an industry create a blank record with zeros
n=0
z=0
ind_data=[['estab','naics','zipcode']]
for zcode in zipcodes:
#for zcode in zipcodes[0:5]:
clear_output(wait=True)
for naics in ncodes:
idata_url = f'{base_url}?get={icols}&NAICS2017={naics}&for=zipcode:{zcode}&key={api_key}'
try:
response=requests.get(idata_url)
except requests.exceptions.RequestException as e:
print (e)
break
if response.status_code==200:
jsondata=response.json()
ind_data.append(jsondata[1])
n=n+1
elif response.status_code==204:
record=['0',naics,zcode]
ind_data.append(record)
n=n+1
else:
print('Problem retrieving data, status code:',response.status_code)
break
z=z+1
print(n,'records have been retrieved for',z,'ZIP codes...')
print('Done')
with open(ijsonpath, 'w') as f:
json.dump(ind_data, f)
print('Data dumped to json file')
5586 records have been retrieved for 266 ZIP codes... Done Data dumped to json file
with open(ijsonpath, 'r') as f:
ijsondata=json.load(f)
zbpind = pd.DataFrame(ijsondata[1:],columns=ijsondata[0])
zbpind['estab']=zbpind['estab'].astype('int64')
zbpind.head()
estab | naics | zipcode | |
---|---|---|---|
0 | 7248 | 00 | 10001 |
1 | 0 | 11 | 10001 |
2 | 0 | 21 | 10001 |
3 | 5 | 22 | 10001 |
4 | 262 | 23 | 10001 |
#Pivot data to move NAICS to columns
zbpind_tab=zbpind.pivot(index='zipcode', columns='naics', values='estab')
zbpind_tab=zbpind_tab.add_prefix('N')
zbpind_tab.rename(columns=lambda x: x.replace('-', '_'),inplace=True)
zbpind_tab.head()
naics | N00 | N11 | N21 | N22 | N23 | N31_33 | N42 | N44_45 | N48_49 | N51 | ... | N53 | N54 | N55 | N56 | N61 | N62 | N71 | N72 | N81 | N99 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
zipcode | |||||||||||||||||||||
10001 | 7248 | 0 | 0 | 5 | 262 | 188 | 1079 | 654 | 62 | 418 | ... | 500 | 1453 | 153 | 363 | 161 | 405 | 313 | 528 | 506 | 9 |
10002 | 2962 | 0 | 0 | 0 | 100 | 60 | 167 | 520 | 46 | 66 | ... | 274 | 323 | 3 | 88 | 27 | 206 | 100 | 607 | 314 | 3 |
10003 | 4268 | 0 | 0 | 0 | 51 | 35 | 94 | 433 | 5 | 195 | ... | 372 | 672 | 25 | 120 | 75 | 342 | 561 | 658 | 504 | 8 |
10004 | 1600 | 0 | 0 | 0 | 34 | 13 | 68 | 65 | 20 | 109 | ... | 87 | 483 | 12 | 84 | 34 | 85 | 31 | 131 | 134 | 4 |
10005 | 1346 | 0 | 0 | 0 | 26 | 6 | 39 | 43 | 11 | 72 | ... | 78 | 421 | 25 | 74 | 21 | 41 | 30 | 81 | 102 | 3 |
5 rows × 21 columns
#Create column to summarize businesses that were not disclosed and categorized, but that are included in the total
zbpind_tab['NXX']=zbpind_tab.loc[:,'N00'].subtract(zbpind_tab.loc[:,'N11':'N99'].sum(axis=1))
zbpind_tab.head()
naics | N00 | N11 | N21 | N22 | N23 | N31_33 | N42 | N44_45 | N48_49 | N51 | ... | N54 | N55 | N56 | N61 | N62 | N71 | N72 | N81 | N99 | NXX |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
zipcode | |||||||||||||||||||||
10001 | 7248 | 0 | 0 | 5 | 262 | 188 | 1079 | 654 | 62 | 418 | ... | 1453 | 153 | 363 | 161 | 405 | 313 | 528 | 506 | 9 | 0 |
10002 | 2962 | 0 | 0 | 0 | 100 | 60 | 167 | 520 | 46 | 66 | ... | 323 | 3 | 88 | 27 | 206 | 100 | 607 | 314 | 3 | 2 |
10003 | 4268 | 0 | 0 | 0 | 51 | 35 | 94 | 433 | 5 | 195 | ... | 672 | 25 | 120 | 75 | 342 | 561 | 658 | 504 | 8 | 2 |
10004 | 1600 | 0 | 0 | 0 | 34 | 13 | 68 | 65 | 20 | 109 | ... | 483 | 12 | 84 | 34 | 85 | 31 | 131 | 134 | 4 | 1 |
10005 | 1346 | 0 | 0 | 0 | 26 | 6 | 39 | 43 | 11 | 72 | ... | 421 | 25 | 74 | 21 | 41 | 30 | 81 | 102 | 3 | 0 |
5 rows × 22 columns
#Join to ZIP ZCTA crosswalk
zbpind2zcta = pd.merge(zip2zcta[['zcta5']],zbpind_tab,how='inner',left_index=True,right_index=True)
zbpind2zcta.index.name = 'ZIP_CODE'
zbpind2zcta.head()
zcta5 | N00 | N11 | N21 | N22 | N23 | N31_33 | N42 | N44_45 | N48_49 | ... | N54 | N55 | N56 | N61 | N62 | N71 | N72 | N81 | N99 | NXX | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ZIP_CODE | |||||||||||||||||||||
10001 | 10001 | 7248 | 0 | 0 | 5 | 262 | 188 | 1079 | 654 | 62 | ... | 1453 | 153 | 363 | 161 | 405 | 313 | 528 | 506 | 9 | 0 |
10118 | 10001 | 326 | 0 | 0 | 0 | 8 | 0 | 45 | 10 | 5 | ... | 103 | 12 | 17 | 0 | 8 | 29 | 11 | 23 | 0 | 5 |
10120 | 10001 | 70 | 0 | 0 | 0 | 0 | 0 | 8 | 8 | 0 | ... | 18 | 10 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 11 |
10122 | 10001 | 177 | 0 | 0 | 0 | 0 | 0 | 10 | 3 | 0 | ... | 45 | 0 | 7 | 0 | 5 | 53 | 0 | 10 | 0 | 3 |
10123 | 10001 | 180 | 0 | 0 | 0 | 7 | 0 | 14 | 4 | 0 | ... | 77 | 0 | 13 | 7 | 11 | 3 | 0 | 8 | 0 | 6 |
5 rows × 23 columns
zbpind2zcta.shape
(266, 23)
#Aggregate to ZCTAs
zctaind=zbpind2zcta.groupby(['zcta5']).sum()
zctaind.head()
N00 | N11 | N21 | N22 | N23 | N31_33 | N42 | N44_45 | N48_49 | N51 | ... | N54 | N55 | N56 | N61 | N62 | N71 | N72 | N81 | N99 | NXX | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
zcta5 | |||||||||||||||||||||
10001 | 8006 | 0 | 0 | 5 | 277 | 188 | 1156 | 679 | 67 | 450 | ... | 1696 | 175 | 400 | 168 | 432 | 398 | 539 | 547 | 9 | 30 |
10002 | 2962 | 0 | 0 | 0 | 100 | 60 | 167 | 520 | 46 | 66 | ... | 323 | 3 | 88 | 27 | 206 | 100 | 607 | 314 | 3 | 2 |
10003 | 4273 | 0 | 0 | 0 | 51 | 35 | 94 | 433 | 5 | 195 | ... | 672 | 25 | 120 | 75 | 342 | 561 | 658 | 504 | 8 | 7 |
10004 | 1660 | 0 | 0 | 0 | 34 | 13 | 68 | 65 | 20 | 126 | ... | 491 | 12 | 88 | 34 | 85 | 31 | 131 | 137 | 4 | 12 |
10005 | 1397 | 0 | 0 | 0 | 26 | 6 | 39 | 43 | 11 | 72 | ... | 421 | 25 | 74 | 21 | 41 | 30 | 81 | 102 | 3 | 13 |
5 rows × 22 columns
#Generate and calculate percent total columns
ncols=list(zctaind)
for c in ncols[1:]:
pct=c+'_PCT'
zctaind[pct]=((zctaind[c]/zctaind['N00'])*100).round(2)
zctaind.head()
N00 | N11 | N21 | N22 | N23 | N31_33 | N42 | N44_45 | N48_49 | N51 | ... | N54_PCT | N55_PCT | N56_PCT | N61_PCT | N62_PCT | N71_PCT | N72_PCT | N81_PCT | N99_PCT | NXX_PCT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
zcta5 | |||||||||||||||||||||
10001 | 8006 | 0 | 0 | 5 | 277 | 188 | 1156 | 679 | 67 | 450 | ... | 21.18 | 2.19 | 5.00 | 2.10 | 5.40 | 4.97 | 6.73 | 6.83 | 0.11 | 0.37 |
10002 | 2962 | 0 | 0 | 0 | 100 | 60 | 167 | 520 | 46 | 66 | ... | 10.90 | 0.10 | 2.97 | 0.91 | 6.95 | 3.38 | 20.49 | 10.60 | 0.10 | 0.07 |
10003 | 4273 | 0 | 0 | 0 | 51 | 35 | 94 | 433 | 5 | 195 | ... | 15.73 | 0.59 | 2.81 | 1.76 | 8.00 | 13.13 | 15.40 | 11.79 | 0.19 | 0.16 |
10004 | 1660 | 0 | 0 | 0 | 34 | 13 | 68 | 65 | 20 | 126 | ... | 29.58 | 0.72 | 5.30 | 2.05 | 5.12 | 1.87 | 7.89 | 8.25 | 0.24 | 0.72 |
10005 | 1397 | 0 | 0 | 0 | 26 | 6 | 39 | 43 | 11 | 72 | ... | 30.14 | 1.79 | 5.30 | 1.50 | 2.93 | 2.15 | 5.80 | 7.30 | 0.21 | 0.93 |
5 rows × 43 columns
codedict={}
codes_url=f'https://api.census.gov/data/2018/zbp/variables/NAICS2017.json'
response=requests.get(codes_url)
codes_data=response.json()
codedict.update(codes_data['values']['item'])
sectordict=dict((k, codedict[k]) for k in ncodes)
sectordict['XX']='Establishments omitted from classification due to privacy regulations'
codes=pd.DataFrame(list(sectordict.items()), columns=['naics', 'name']).set_index('naics')
codes
name | |
---|---|
naics | |
00 | Total for all sectors |
11 | Agriculture, forestry, fishing and hunting |
21 | Mining, quarrying, and oil and gas extraction |
22 | Utilities |
23 | Construction |
31-33 | Manufacturing |
42 | Wholesale trade |
44-45 | Retail trade |
48-49 | Transportation and warehousing |
51 | Information |
52 | Finance and insurance |
53 | Real estate and rental and leasing |
54 | Professional, scientific, and technical services |
55 | Management of companies and enterprises |
56 | Administrative and support and waste managemen... |
61 | Educational services |
62 | Health care and social assistance |
71 | Arts, entertainment, and recreation |
72 | Accommodation and food services |
81 | Other services (except public administration) |
99 | Industries not classified |
XX | Establishments omitted from classification due... |
#Does sum of industries equal industry total?
indsum=zctaind['N00'].subtract(zctaind.iloc[:,1:22].sum(axis=1))
if indsum.sum()==0:
print (True)
else:
print(indsum.loc[indsum != 0])
True
#Is sum of percent totals approximately 100?
ptotal=zctaind.iloc[:,22:].sum(axis=1)
if ptotal.loc[(ptotal <= 99.05) | (ptotal >= 100.05)].empty:
print(True)
else:
print(ptotal.loc[(ptotal <= 99.05) | (ptotal >= 100.05)])
True
#Do number of ZCTAs in employment table match the industries table?
ecount=zctaemp.shape[0]
icount=zctaind.shape[0]
if ecount == icount:
print (True)
else:
print('Mistmatched count between employment',ecount, 'rows and industry',icount, 'rows')
True
#Does sum of estabslishments from employment table equal establishments in industries table?
estsum=zctaemp['ESTAB'].subtract(zctaind['N00'])
if estsum.sum()==0:
print (True)
else:
print(estsum.loc[estsum != 0])
True
con = sqlite3.connect(dbname)
cur = con.cursor()
#Employment table
cur.execute('DROP TABLE IF EXISTS {};'.format(emptable))
qcreate_emptab="""
CREATE TABLE {}(
zcta5 TEXT NOT NULL PRIMARY KEY,
estab INTEGER,
emp INTEGER,
payq1 INTEGER,
payan INTEGER,
flag_emp INTEGER,
flag_payq1 INTEGER,
flag_payan INTEGER);
""".format(emptable)
cur.execute(qcreate_emptab)
<sqlite3.Cursor at 0x7fac37579570>
#Don't run this block unless you've run the previous one
zctaemp.to_sql(name='{}'.format(emptable), if_exists='append', index=True, con=con)
cur.execute('SELECT COUNT(*) FROM {};'.format(emptable))
rows = cur.fetchone()
print(rows[0], 'records written to', emptable)
212 records written to zbp2018emp
#Replace zeros with nulls, as these values really represent no data
for col in zctaemp.columns[1:]:
qupdate='UPDATE {} SET {} = NULL WHERE {} = 0;'.format(emptable,col,col)
cur.execute(qupdate)
print(cur.rowcount,'records updated for',col)
con.commit()
0 records updated for EMP 0 records updated for PAYQ1 0 records updated for PAYAN 212 records updated for FLAG_EMP 212 records updated for FLAG_PAYQ1 212 records updated for FLAG_PAYAN
#Industry table
cur.execute('DROP TABLE IF EXISTS {}'.format(indtable))
qcreate_indtab="""
CREATE TABLE {} (
zcta5 TEXT NOT NULL PRIMARY KEY,
N00 INTEGER,
N11 INTEGER,
N21 INTEGER,
N22 INTEGER,
N23 INTEGER,
N31_33 INTEGER,
N42 INTEGER,
N44_45 INTEGER,
N48_49 INTEGER,
N51 INTEGER,
N52 INTEGER,
N53 INTEGER,
N54 INTEGER,
N55 INTEGER,
N56 INTEGER,
N61 INTEGER,
N62 INTEGER,
N71 INTEGER,
N72 INTEGER,
N81 INTEGER,
N99 INTEGER,
NXX INTEGER,
N11_PCT REAL,
N21_PCT REAL,
N22_PCT REAL,
N23_PCT REAL,
N31_33_PCT REAL,
N42_PCT REAL,
N44_45_PCT REAL,
N48_49_PCT REAL,
N51_PCT REAL,
N52_PCT REAL,
N53_PCT REAL,
N54_PCT REAL,
N55_PCT REAL,
N56_PCT REAL,
N61_PCT REAL,
N62_PCT REAL,
N71_PCT REAL,
N72_PCT REAL,
N81_PCT REAL,
N99_PCT REAL,
NXX_PCT REAL);
""".format(indtable)
cur.execute(qcreate_indtab)
<sqlite3.Cursor at 0x7fac37579570>
#Don't run this block unless you've run the previous one
zctaind.to_sql(name='{}'.format(indtable), if_exists='append', index=True, con=con)
cur.execute('SELECT COUNT(*) FROM {};'.format(indtable))
rows = cur.fetchone()
print(rows[0], 'records written to', indtable)
212 records written to zbp2018ind
#For percentages, replace zeros with nulls, as these values really represent no data
for col in zctaind.columns[22:]:
qupdate='UPDATE {} SET {} = NULL WHERE {} = 0.0;'.format(indtable,col,col)
cur.execute(qupdate)
print(cur.rowcount,'records updated for',col)
con.commit()
212 records updated for N11_PCT 211 records updated for N21_PCT 201 records updated for N22_PCT 27 records updated for N23_PCT 58 records updated for N31_33_PCT 26 records updated for N42_PCT 18 records updated for N44_45_PCT 35 records updated for N48_49_PCT 39 records updated for N51_PCT 12 records updated for N52_PCT 10 records updated for N53_PCT 7 records updated for N54_PCT 140 records updated for N55_PCT 12 records updated for N56_PCT 39 records updated for N61_PCT 16 records updated for N62_PCT 46 records updated for N71_PCT 17 records updated for N72_PCT 8 records updated for N81_PCT 158 records updated for N99_PCT 19 records updated for NXX_PCT
#For establishments, replace zeros with nulls unless establishments were omitted from classification
for col in zctaind.columns[1:22]:
qupdate='UPDATE {} SET {} = NULL WHERE {} = 0 AND NXX !=0;'.format(indtable,col,col)
cur.execute(qupdate)
print(cur.rowcount,'records updated for',col)
con.commit()
193 records updated for N11 192 records updated for N21 184 records updated for N22 27 records updated for N23 58 records updated for N31_33 26 records updated for N42 18 records updated for N44_45 35 records updated for N48_49 39 records updated for N51 12 records updated for N52 10 records updated for N53 7 records updated for N54 132 records updated for N55 12 records updated for N56 39 records updated for N61 16 records updated for N62 46 records updated for N71 17 records updated for N72 8 records updated for N81 147 records updated for N99 0 records updated for NXX
#NAICS code table
cur.execute('DROP TABLE IF EXISTS {};'.format(codetable))
qcreate_codetab="""
CREATE TABLE {}(
naics TEXT NOT NULL PRIMARY KEY,
name TEXT);
""".format(codetable)
cur.execute(qcreate_codetab)
<sqlite3.Cursor at 0x7fac37579570>
#Don't run this block unless you've run the previous one
codes.to_sql(name='{}'.format(codetable), if_exists='append', index=True, con=con)
cur.execute('SELECT COUNT(*) FROM {};'.format(codetable))
rows = cur.fetchone()
print(rows[0], 'records written to', codetable)
22 records written to zbp2018indcodes
con.close()