# Exploratory look at a tab-separated cancer-incidence export: load the file,
# tidy the column names, then draw box plots of the age-adjusted rate by sex
# and by state.
from itertools import islice

import matplotlib.pyplot as plt
import pandas as pd

# Tab-separated data file; the path is relative to the working directory.
DATA_FILE = 'National Program of Cancer Registries, 1999-2010 Incidence.txt'

print(pd.__version__)

# Peek at the first ten lines of the raw file. This stands in for the
# IPython-only shell escape `!head`, which is a syntax error in a plain
# Python file.
with open(DATA_FILE) as handle:
    for line in islice(handle, 10):
        print(line.rstrip('\n'))

# To inspect the trailing lines that `skipfooter` drops below, run in a shell:
#   tail -n 55 'National Program of Cancer Registries, 1999-2010 Incidence.txt'

raw_df = pd.read_csv(
    DATA_FILE,
    sep='\t',
    skipfooter=55,
    # skipfooter is only implemented by the Python parser; naming the engine
    # explicitly avoids the silent fallback and its ParserWarning.
    engine='python',
    na_values=['Missing', 'Not Applicable'],
)

# Check the column names, which also confirms the file was read properly.
print(raw_df.columns)

# Easier column names to deal with.
col_names = [
    'notes',
    'cancer_site',
    'cancer_site_code',
    'year',
    'year_code',
    'state',
    'state_code',
    'sex',
    'sex_code',
    'cancer_count',
    'population',
    'age_adj_rate',
    'age_adj_rate_lower_95_confidence',
    'age_adj_rate_upper_95_confidence',
]

# Make sure nothing was missed: the two lengths must match, otherwise the
# assignment below raises.
print('{} {}'.format(len(col_names), len(raw_df.columns)))
raw_df.columns = col_names
print(raw_df.columns)

# Drop the columns we don't need.
desired_cols = [
    'cancer_site',
    'cancer_site_code',
    'year',
    'state',
    'state_code',
    'sex_code',
    'cancer_count',
    'population',
    'age_adj_rate',
]
df = raw_df[desired_cols]

# Separate the combined data from the data on individual cancer sites.
# NOTE(review): this compares against the string '00'; it matches nothing if
# the code column was parsed as numeric -- confirm against the data file.
combined_df = df[df.cancer_site_code == '00']
df = df[df.cancer_site_code != '00']

# Number of distinct cancer sites left after the split.
print(len(df.cancer_site.unique()))
print(df.age_adj_rate.describe())

# ---------------------------------------------------------------------------
# Plotting cancer occurrences in each cancer site, by sex
# ---------------------------------------------------------------------------
# Excerpt of the columns needed here. Selecting with a list of column labels
# yields a new DataFrame, not a view of `df`.
tt = df[['cancer_site', 'age_adj_rate', 'sex_code']]
for idx, group in tt.groupby(by='cancer_site'):
    # Create the figure ourselves and hand its axes to pandas; calling
    # plt.figure() first and letting boxplot(by=...) open its own figure
    # left an empty stray figure behind for every cancer site.
    site_fig, site_ax = plt.subplots()
    group.boxplot(by='sex_code', ax=site_ax)
    site_ax.set_ylim((0, 100))
    site_ax.set_title(idx)

# ---------------------------------------------------------------------------
# A closer look at the shape of the boxplot for the urinary bladder
# ---------------------------------------------------------------------------
# Temporary frame for convenience plots.
# NOTE(review): fillna(0) turns missing values (including missing rates) into
# zeros, which the box plots then treat as real observations -- confirm that
# this is intended.
tt = df[['age_adj_rate', 'state', 'cancer_site', 'sex_code']].fillna(0)
tt = tt[tt.cancer_site == 'Urinary Bladder']

tt.boxplot(by='sex_code', figsize=(8, 4), vert=False)
plt.title("Urinary Bladder, adjusted rate by sex")

tt.boxplot(by='state', figsize=(8, 12), vert=False)
# The plotted value is age_adj_rate, so the title says "adjusted rate"
# rather than "count".
plt.title("Urinary Bladder, adjusted rate by state")

# ---------------------------------------------------------------------------
# Same per-state boxplots, sorted by each state's summed age-adjusted rate
# ---------------------------------------------------------------------------
# Series.sort() (in place) no longer exists in pandas; sort_values() returns
# the series in the same ascending order.
sorted_df = tt.groupby(by='state').age_adj_rate.sum().sort_values()
sorted_states = list(sorted_df.index)

# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is only a single state; take the one column as a 1-D array.
fig, axarr = plt.subplots(
    nrows=len(sorted_states),
    sharex=True,
    squeeze=False,
)
axarr = axarr[:, 0]
axarr[0].set_title("Urinary Bladder occurrence, age adjusted\nby State")

for ipos, state in enumerate(sorted_states):
    state_ax = axarr[ipos]
    mask = (tt.state == state)
    tt[mask].boxplot(
        ax=state_ax,
        positions=[ipos + 1],
        vert=False,
        rot=90,
        widths=0.75,
    )
    # Cosmetics (axis): no ticks or frame, just the state name as the label.
    state_ax.set_yticks([])
    state_ax.set_ylabel('{}'.format(state), rotation='horizontal')
    state_ax.set_frame_on(False)

# Cosmetics (figure): tall figure with the rows stacked without gaps.
fig.set_size_inches(8, 25)
fig.subplots_adjust(hspace=0)

# pyplot.show() runs the GUI event loop; Figure.show() does not, so it
# returns immediately when this file is run as a script.
plt.show()