"""Tabulate HOSPID record volumes for selected DRG codes in a fixed-width file.

The column layout is read from a Stata ``infix`` load script, turned into a
``struct`` format that keeps only the wanted fields, and applied to every line
of the zipped ASCII data file.  Rows whose DRG field matches are counted per
HOSPID, the counts are binned into Low/Medium/High, and the bins are plotted.
"""
import re
import struct
import zipfile

import numpy as np
import pandas as pd


def parseStataCode(statafile):
    """Read variable names and field widths from a Stata ``infix`` script.

    The lines from the first one containing ``infix`` up to (but excluding)
    the first one containing ``using`` are each treated as one variable
    declaration.  On every such line the first word that starts with an
    upper-case letter is the variable name and the first integer preceded by
    a space is its 1-based start column.

    Parameters
    ----------
    statafile : str
        Path of the Stata load script.

    Returns
    -------
    pandas.DataFrame
        Columns ``varnames`` and ``widths``, one row per declared variable.
        A width is the distance from a variable's start column to the next
        variable's start column (any gap is absorbed into the earlier field);
        the last width runs through that variable's inclusive end column.

    Raises
    ------
    ValueError
        If no line contains ``infix`` or no line contains ``using``.
    IndexError
        If a declaration line has no recognisable name or start column.
    """
    with open(statafile, 'r') as f:
        statacode = f.readlines()

    indStart = min(i for i, line in enumerate(statacode) if 'infix' in line)
    indEnd = min(i for i, line in enumerate(statacode) if 'using' in line)
    coldata = statacode[indStart:indEnd]
    # The first declaration shares its line with the ``infix`` keyword.
    coldata[0] = coldata[0].replace('infix', '')

    startcol = [int(re.findall(r' (\d+)', x)[0]) for x in coldata]

    # Only the last variable needs its end column; every other width follows
    # from the next start column.  Matching an explicit "start-end" range
    # (rather than the first digit run) keeps names such as DRG24 from being
    # mistaken for the column spec.  A line without a range is taken to be a
    # single-column field.
    last_range = re.search(r'(\d+)\s*-+\s*(\d+)', coldata[-1])
    last_end = int(last_range.group(2)) if last_range else startcol[-1]
    # infix column ranges are inclusive, so the sentinel "next start" that
    # closes the last field is one past its end column.
    startcol.append(last_end + 1)

    widths = np.diff(np.array(startcol))
    varnames = [re.findall(r'[A-Z]\w+', x)[0] for x in coldata]
    out = pd.DataFrame({'varnames': varnames, 'widths': widths})
    return out


def defineFormat(wt, keep=['DRG', 'HOSPID']):
    """Build a ``struct`` format string that extracts only the *keep* fields.

    Parameters
    ----------
    wt : pandas.DataFrame
        Layout with ``varnames`` and ``widths`` columns, as returned by
        ``parseStataCode``.
    keep : list of str
        Variable names to extract.  The default list is never mutated.

    Returns
    -------
    str
        Space-separated ``struct`` codes in layout order: ``<width>s`` for a
        kept field and ``<width>x`` (pad bytes, no output) for every other.

    Raises
    ------
    ValueError
        If a name in *keep* is not present in ``wt.varnames``.
    """
    widths = list(wt.widths)
    varnames = list(wt.varnames)

    # Track keep/skip explicitly instead of in the sign of the width: a sign
    # flag cannot represent a skipped zero-width field and flips back when a
    # name is listed twice.
    codes = ['x'] * len(widths)
    for s in keep:
        codes[varnames.index(s)] = 's'

    fmtstring = ' '.join('{}{}'.format(w, c) for w, c in zip(widths, codes))
    return fmtstring


def categorize(s, bins=[0, 10, 20, 5000], labels=['Low', 'Medium', 'High']):
    """Bin the values of *s* into labelled categories with ``pandas.cut``.

    Parameters
    ----------
    s : array-like
        Values to bin.
    bins : list of numbers
        Bin edges, forwarded to ``pandas.cut``.
    labels : list of str
        One label per bin, forwarded to ``pandas.cut``.

    Returns
    -------
    Whatever ``pandas.cut`` returns for *s*; values outside the bins are NaN.
    """
    out = pd.cut(s, bins=bins, labels=labels)
    return out


if __name__ == "__main__":
    wt = parseStataCode('StataLoad_NIS_2008_Core.Do')

    fieldstruct = struct.Struct(defineFormat(wt))
    parse = fieldstruct.unpack_from

    dat = []
    drgcodes = ['500']
    with zipfile.ZipFile('NIS_2008_Core.exe', 'r') as zf:
        with zf.open("NIS_2008_Core.ASC", 'r') as f:
            for line in f:
                # Archive members are read as bytes and struct 's' fields are
                # bytes too; decode so they can compare equal to the str
                # codes in drgcodes.  latin-1 maps every byte value, so the
                # decode itself cannot fail.
                x = [field.decode('latin-1') for field in parse(line)]
                # NOTE(review): x[0] is treated as DRG, which assumes DRG
                # precedes HOSPID in the layout -- confirm against wt.
                if x[0] in drgcodes:
                    dat.append(x)
    dat = pd.DataFrame(dat, columns=['DRG', 'HOSPID'])

    # Records per HOSPID, binned by volume, then the number of HOSPIDs per bin.
    volumes = categorize(dat.HOSPID.value_counts())
    freq = volumes.value_counts()
    print(freq)

    try:
        # Equivalent of the ``%matplotlib inline`` magic.  ``get_ipython`` is
        # only defined inside IPython/Jupyter, so a plain interpreter skips it.
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass
    freq.plot(kind='bar', rot=30)