Preparing the NSFG Data¶

Elements of Data Science

License: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International

Loading¶

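The NSFG 2013–2015 pregnancy data files read below (`data/2013_2015_FemPregSetup.dct` and `data/2013_2015_FemPregData.dat.gz`) were downloaded on November 16, 2018.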

In [1]:

import pandas as pd
import numpy as np

In [18]:

from statadict import parse_stata_dict

# Parse the Stata dictionary (.dct) that accompanies the data file.  The
# result's `names` and `colspecs` attributes are handed to pd.read_fwf when
# the fixed-width data file is read.
stata_dict = parse_stata_dict('data/2013_2015_FemPregSetup.dct')

In [19]:

import gzip

# Read the gzipped fixed-width data file, taking the column names and column
# boundaries from the parsed Stata dictionary.  read_fwf loads the whole file
# inside the `with` block, so the gzip handle can be closed as soon as the
# block exits instead of staying open for the life of the kernel.
with gzip.open('data/2013_2015_FemPregData.dat.gz') as fp:
    nsfg = pd.read_fwf(fp,
                       names=stata_dict.names,
                       colspecs=stata_dict.colspecs)

In [21]:

# Preview the first rows of the frame that read_fwf just produced.
nsfg.head()

Out[21]:

	CASEID	PREGORDR	HOWPREG_N	HOWPREG_P	MOSCURRP	NOWPRGDK	PREGEND1	PREGEND2	HOWENDDK	NBRNALIV	...	SECU	SEST	CMINTVW	CMLSTYR	CMJAN3YR	CMJAN4YR	CMJAN5YR	QUARTER	PHASE	INTVWYEAR
0	60418	1	NaN	NaN	NaN	NaN	5.0	NaN	NaN	1.0	...	4	342	1381	1369	1345	1333	1321	14	1	2015
1	60418	2	NaN	NaN	NaN	NaN	5.0	NaN	NaN	1.0	...	4	342	1381	1369	1345	1333	1321	14	1	2015
2	60418	3	NaN	NaN	NaN	NaN	5.0	NaN	NaN	1.0	...	4	342	1381	1369	1345	1333	1321	14	1	2015
3	60419	1	33.0	1.0	8.0	NaN	NaN	NaN	NaN	NaN	...	3	318	1388	1376	1345	1333	1321	16	1	2015
4	60420	1	NaN	NaN	NaN	NaN	6.0	NaN	NaN	1.0	...	1	339	1388	1376	1345	1333	1321	16	1	2015

5 rows × 278 columns

In [6]:

import utils

# Load the same dictionary/data pair through the project's utils.read_stata
# helper; this rebinds `nsfg`, replacing the frame read with pd.read_fwf.
# NOTE(review): the read_fwf frame has uppercase column names (CASEID, ...),
# while the cells that follow select lowercase names ('caseid', ...) —
# presumably read_stata lowercases them; confirm in utils.
nsfg = utils.read_stata('data/2013_2015_FemPregSetup.dct', 
                        'data/2013_2015_FemPregData.dat.gz',
                        compression='gzip')

In [8]:

# Columns to keep from the full pregnancy file; everything else is dropped.
variables = [
    'caseid',
    'outcome',
    'birthwgt_lb1', 'birthwgt_oz1',
    'prglngth',
    'nbrnaliv',
    'agecon', 'agepreg',
    'birthord',
    'hpagelb',
    'wgt2013_2015',
]

# Narrow the frame to those columns and display its (rows, columns) shape.
nsfg = nsfg[variables]
nsfg.shape

Out[8]:

(9358, 11)

In [9]:

# Store the reduced frame in an HDF5 file under the key 'nsfg'.  `key` is
# passed by keyword because recent pandas releases deprecate positional
# arguments to to_hdf other than the path.
nsfg.to_hdf('nsfg.hdf5', key='nsfg')

In [10]:

%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')

CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 12.1 ms

In [16]:

# Fix NumPy's global random seed before resampling so the draw is repeatable
# (assumes utils.resample_rows_weighted draws from np.random — TODO confirm).
np.random.seed(18)

# Resample the rows using the 'wgt2013_2015' column as the weights; the
# displayed shape shows the sample has the same dimensions as `nsfg`.
sample = utils.resample_rows_weighted(nsfg, 'wgt2013_2015')
sample.shape

Out[16]:

(9358, 11)

In [17]:

# Store the resampled frame in its own HDF5 file, also under the key 'nsfg'.
# `key` is passed by keyword because recent pandas releases deprecate
# positional arguments to to_hdf other than the path.
sample.to_hdf('nsfg_sample.hdf5', key='nsfg')

Loading the unsampled data¶

In [8]:

%time nsfg = pd.read_hdf('nsfg.hdf5', 'nsfg')

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11 ms

In [9]:

# Show what kind of object read_hdf returned.
type(nsfg)

Out[9]:

pandas.core.frame.DataFrame

In [10]:

# (number of rows, number of columns)
nsfg.shape

Out[10]:

(9358, 11)

In [11]:

# Preview the first rows of the reloaded frame.
nsfg.head()

Out[11]:

	caseid	outcome	birthwgt_lb1	birthwgt_oz1	prglngth	nbrnaliv	agecon	agepreg	birthord	hpagelb	wgt2013_2015
0	60418	1	5.0	4.0	40	1.0	2000	2075.0	1.0	22.0	3554.964843
1	60418	1	4.0	12.0	36	1.0	2291	2358.0	2.0	25.0	3554.964843
2	60418	1	5.0	4.0	36	1.0	3241	3308.0	3.0	52.0	3554.964843
3	60419	6	NaN	NaN	33	NaN	3650	NaN	NaN	NaN	2484.535358
4	60420	1	8.0	13.0	41	1.0	2191	2266.0	1.0	24.0	2903.782914

In [12]:

# The column labels, as a pandas Index.
nsfg.columns

Out[12]:

Index(['caseid', 'outcome', 'birthwgt_lb1', 'birthwgt_oz1', 'prglngth',
       'nbrnaliv', 'agecon', 'agepreg', 'birthord', 'hpagelb', 'wgt2013_2015'],
      dtype='object')

In [13]:

# Iterating over the columns Index yields the column names one at a time,
# in the order they appear in the frame.
for column in nsfg.columns:
    print(column)

caseid
outcome
birthwgt_lb1
birthwgt_oz1
prglngth
nbrnaliv
agecon
agepreg
birthord
hpagelb
wgt2013_2015