In [1]:
import matplotlib.pyplot as plt
import pandas as pd
# Record which pandas release produced the outputs in this notebook
print(pd.__version__)
0.12.0
/home/aman/Workspace/ENVSYS/lib/python2.7/site-packages/pytz/__init__.py:35: UserWarning: Module numpy was already imported from /usr/local/anaconda/lib/python2.7/site-packages/numpy/__init__.pyc, but /home/aman/Workspace/ENVSYS/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream
In [2]:
# Shell out to `head` to look at the first lines of the raw export before parsing it.
!head 'National Program of Cancer Registries, 1999-2010 Incidence.txt'
# Disabled: would show the last 55 lines of the file. The read_csv call in the next
# cell passes skipfooter=55 -- presumably this tail is non-tabular footer text; TODO confirm.
#!tail -n 55 'National Program of Cancer Registries, 1999-2010 Incidence.txt'









In [3]:
# Parse the tab-separated export into a DataFrame.
incidence_path = 'National Program of Cancer Registries, 1999-2010 Incidence.txt'
raw_df = pd.read_csv(
    incidence_path,
    sep='\t',
    # Drop the last 55 lines of the file instead of parsing them as rows
    skipfooter=55,
    # Treat these two marker strings as missing values (NaN)
    na_values=['Missing', 'Not Applicable'],
)
In [4]:
# Display the column labels as parsed: a quick sanity check that the
# tab-separated file was split into the expected fields.
raw_df.columns
Out[4]:
Index([u'Notes', u'Leading Cancer Sites', u'Leading Cancer Sites Code', u'Year', u'Year Code', u'State', u'State Code', u'Sex', u'Sex Code', u'Count', u'Population', u'Age-Adjusted Rate', u'Age-Adjusted Rate Lower 95% Confidence Interval', u'Age-Adjusted Rate Upper 95% Confidence Interval'], dtype=object)
In [5]:
# Short snake_case labels, listed in the same order as the columns of raw_df
col_names = [
    'notes',
    'cancer_site',
    'cancer_site_code',
    'year',
    'year_code',
    'state',
    'state_code',
    'sex',
    'sex_code',
    'cancer_count',
    'population',
    'age_adj_rate',
    'age_adj_rate_lower_95_confidence',
    'age_adj_rate_upper_95_confidence',
]

# The two counts must agree for a positional relabel of raw_df to be valid
print('%d %d' % (len(col_names), len(raw_df.columns)))
14 14
In [6]:
# Replace the verbose header labels with the short ones (positional assignment,
# so col_names must be in the same order as the existing columns), then show them.
raw_df.columns = col_names
print(raw_df.columns)
Index([u'notes', u'cancer_site', u'cancer_site_code', u'year', u'year_code', u'state', u'state_code', u'sex', u'sex_code', u'cancer_count', u'population', u'age_adj_rate', u'age_adj_rate_lower_95_confidence', u'age_adj_rate_upper_95_confidence'], dtype=object)
In [7]:
# Keep only the columns used in the analysis below
desired_cols = [
    'cancer_site', 'cancer_site_code', 'year', 'state', 'state_code',
    'sex_code', 'cancer_count', 'population', 'age_adj_rate',
]
df = raw_df[desired_cols]

# Rows whose site code is '00' carry the combined data; split them off so that
# df is left with the rows for individual cancer sites only.
is_combined = df.cancer_site_code == '00'
combined_df = df[is_combined]
df = df[~is_combined]
In [8]:
# Number of distinct cancer_site values remaining in df
len(df['cancer_site'].unique())
Out[8]:
22
In [9]:
# Summary statistics of the age-adjusted rate across all rows of df
print(df['age_adj_rate'].describe())
count    18873.000000
mean        24.193403
std         34.180612
min          0.000000
25%          6.900000
50%         11.300000
75%         22.000000
max        244.800000
dtype: float64
In [10]:
"""
Plotting cancer occurrences in each cancer site, by sex
"""
# Working subset for the plots. Selecting columns with a list returns a new
# DataFrame (a copy), not a view onto df.
tt = df[['cancer_site', 'age_adj_rate', 'sex_code']]

# One figure per cancer site, with the age-adjusted rate boxed by sex code.
for site, site_rows in tt.groupby('cancer_site'):
    # boxplot(by=...) opens its own figure, so no plt.figure() call is made here;
    # calling it first only left an empty figure behind for every site.
    site_rows.boxplot(column='age_adj_rate', by='sex_code')
    # Same y range on every figure so the sites can be compared by eye
    plt.ylim((0, 100))
    plt.title(site)
In [11]:
"""
Take a closer look at the shape of the boxplot for the Urinary Bladder site
"""
# Scratch frame for the convenience plots: four columns of df, missing values
# replaced by 0, restricted to the 'Urinary Bladder' rows.
# NOTE(review): fillna(0) also turns missing age_adj_rate values into 0, which
# feeds zeros into the boxplots -- confirm this is intended.
bladder_cols = ['age_adj_rate', 'state', 'cancer_site', 'sex_code']
tt = df[bladder_cols].fillna(0)
tt = tt[tt['cancer_site'] == 'Urinary Bladder']

# Horizontal boxes, one per sex code
tt.boxplot(by='sex_code', figsize=(8, 4), vert=False)
plt.title("Urinary Bladder, adjusted rate by sex")
Out[11]:
<matplotlib.text.Text at 0x4babd10>
In [12]:
# Horizontal boxes, one per state. tt holds age_adj_rate rather than
# cancer_count, so the title names the adjusted rate (it previously said "count").
tt.boxplot(by='state', figsize=(8, 12), vert=False)
plt.title("Urinary Bladder, adjusted rate by state")
Out[12]:
<matplotlib.text.Text at 0x4bc5810>
In [19]:
"""
Redraw the per-state boxplots, sorted by each state's total age-adjusted rate.
"""
# Rank the states by their summed age-adjusted rate. The bare .sort() call
# relies on Series.sort() reordering the Series in place.
sorted_df = tt.groupby('state')['age_adj_rate'].sum()
sorted_df.sort()
sorted_states = sorted_df.index.tolist()

# One subplot row per state, all sharing the x axis
fig, axarr = plt.subplots(nrows=len(sorted_states), sharex=True)
axarr[0].set_title("Urinary Bladder occurrence, age adjusted\nby State")

for row, (state_ax, state) in enumerate(zip(axarr, sorted_states)):
    state_rows = tt[tt.state == state]
    state_rows.boxplot(
        ax=state_ax,
        positions=[row + 1],
        vert=False,
        rot=90,
        widths=0.75,
    )

    # Cosmetics (axis): no ticks or frame, just the state name as the row label
    state_ax.set_yticks([])
    state_ax.set_ylabel('{}'.format(state), rotation='horizontal')
    state_ax.set_frame_on(False)

# Cosmetics (figure): tall canvas, rows stacked with no vertical gap
fig.set_size_inches(8, 25)
fig.subplots_adjust(hspace=0)

fig.show()
In [ ]: