Elements of Data Science
Copyright 2021 Allen B. Downey
License: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
Downloaded November 16, 2018 (source URL missing from this copy).
import pandas as pd
import numpy as np
from statadict import parse_stata_dict
# Parse the Stata dictionary file; it supplies the column names and the
# fixed-width column positions passed to read_fwf below.
stata_dict = parse_stata_dict('data/2013_2015_FemPregSetup.dct')

import gzip

# Read the gzip-compressed fixed-width data file.  The context manager closes
# the file handle as soon as the DataFrame has been built, instead of leaving
# it open for the rest of the session.
with gzip.open('data/2013_2015_FemPregData.dat.gz') as fp:
    nsfg = pd.read_fwf(fp,
                       names=stata_dict.names,
                       colspecs=stata_dict.colspecs)

# Preview the first rows of the full table.
nsfg.head()
CASEID | PREGORDR | HOWPREG_N | HOWPREG_P | MOSCURRP | NOWPRGDK | PREGEND1 | PREGEND2 | HOWENDDK | NBRNALIV | ... | SECU | SEST | CMINTVW | CMLSTYR | CMJAN3YR | CMJAN4YR | CMJAN5YR | QUARTER | PHASE | INTVWYEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60418 | 1 | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | 1.0 | ... | 4 | 342 | 1381 | 1369 | 1345 | 1333 | 1321 | 14 | 1 | 2015 |
1 | 60418 | 2 | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | 1.0 | ... | 4 | 342 | 1381 | 1369 | 1345 | 1333 | 1321 | 14 | 1 | 2015 |
2 | 60418 | 3 | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | 1.0 | ... | 4 | 342 | 1381 | 1369 | 1345 | 1333 | 1321 | 14 | 1 | 2015 |
3 | 60419 | 1 | 33.0 | 1.0 | 8.0 | NaN | NaN | NaN | NaN | NaN | ... | 3 | 318 | 1388 | 1376 | 1345 | 1333 | 1321 | 16 | 1 | 2015 |
4 | 60420 | 1 | NaN | NaN | NaN | NaN | 6.0 | NaN | NaN | 1.0 | ... | 1 | 339 | 1388 | 1376 | 1345 | 1333 | 1321 | 16 | 1 | 2015 |
5 rows × 278 columns
import utils
# Load the same dictionary/data pair through the project helper
# (utils.read_stata), asking it to treat the data file as gzip-compressed.
nsfg = utils.read_stata(
    'data/2013_2015_FemPregSetup.dct',
    'data/2013_2015_FemPregData.dat.gz',
    compression='gzip',
)

# Columns to keep; everything else is dropped from the working DataFrame.
variables = [
    'caseid',
    'outcome',
    'birthwgt_lb1',
    'birthwgt_oz1',
    'prglngth',
    'nbrnaliv',
    'agecon',
    'agepreg',
    'birthord',
    'hpagelb',
    'wgt2013_2015',
]
nsfg = nsfg[variables]

# (rows, columns) after the selection.
nsfg.shape
(9358, 11)
nsfg.to_hdf('nsfg.hdf5', 'nsfg')
%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')
CPU times: user 8 ms, sys: 4 ms, total: 12 ms Wall time: 12.1 ms
# Seed NumPy's global random generator before calling the resampling helper.
np.random.seed(18)
# Resample rows using the 'wgt2013_2015' column.
# NOTE(review): presumably rows are drawn with probability proportional to the
# weight and with replacement -- confirm against utils.resample_rows_weighted.
sample = utils.resample_rows_weighted(nsfg, 'wgt2013_2015')
# (rows, columns) of the resampled DataFrame.
sample.shape
(9358, 11)
sample.to_hdf('nsfg_sample.hdf5', 'nsfg')
%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')
CPU times: user 12 ms, sys: 0 ns, total: 12 ms Wall time: 11 ms
# Show the type of the object returned by pd.read_hdf.
type(nsfg)
pandas.core.frame.DataFrame
# (rows, columns) of the reloaded DataFrame.
nsfg.shape
(9358, 11)
# Preview the first rows of the reloaded DataFrame.
nsfg.head()
caseid | outcome | birthwgt_lb1 | birthwgt_oz1 | prglngth | nbrnaliv | agecon | agepreg | birthord | hpagelb | wgt2013_2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60418 | 1 | 5.0 | 4.0 | 40 | 1.0 | 2000 | 2075.0 | 1.0 | 22.0 | 3554.964843 |
1 | 60418 | 1 | 4.0 | 12.0 | 36 | 1.0 | 2291 | 2358.0 | 2.0 | 25.0 | 3554.964843 |
2 | 60418 | 1 | 5.0 | 4.0 | 36 | 1.0 | 3241 | 3308.0 | 3.0 | 52.0 | 3554.964843 |
3 | 60419 | 6 | NaN | NaN | 33 | NaN | 3650 | NaN | NaN | NaN | 2484.535358 |
4 | 60420 | 1 | 8.0 | 13.0 | 41 | 1.0 | 2191 | 2266.0 | 1.0 | 24.0 | 2903.782914 |
# The column labels, as a pandas Index.
nsfg.columns
Index(['caseid', 'outcome', 'birthwgt_lb1', 'birthwgt_oz1', 'prglngth', 'nbrnaliv', 'agecon', 'agepreg', 'birthord', 'hpagelb', 'wgt2013_2015'], dtype='object')
# Print each column name on its own line.  The loop body must be indented;
# without the indent this cell raises IndentationError.
for column in nsfg.columns:
    print(column)
caseid outcome birthwgt_lb1 birthwgt_oz1 prglngth nbrnaliv agecon agepreg birthord hpagelb wgt2013_2015