import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
# Table of binary clinical covariates; one boolean column is added per
# feature by the cells below.
binary_df = pd.DataFrame()

# Gender flag: True for 'male'.  Patients with no recorded gender are
# dropped before the comparison, so they are absent rather than False.
recorded_gender = clinical.clinical.gender.dropna()
gender = recorded_gender == 'male'
binary_df['gender_male'] = gender

# Run the survival summary on the keepers_o subset only.
gender_in_cohort = gender.ix[keepers_o]
survival_and_stats(gender_in_cohort, surv)
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas-0.14.0.dev-py2.7-linux-x86_64.egg/pandas/computation/expressions.py:184: UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead unsupported[op_str]))
# Self-reported race, first summarized as a multi-level variable.
race = clinical.clinical.race
race_in_cohort = race.ix[keepers_o]
survival_and_stats(race_in_cohort, surv)

# Binary indicators are computed only over keepers_o patients with a
# recorded race; missing values are dropped rather than counted as False.
known_race = race.ix[keepers_o].dropna()
white = known_race == 'white'
black = known_race == 'black or african american'
binary_df['race_white'] = white
binary_df['race_black'] = black

survival_and_stats(black, surv)
def _normalize_stage(label):
    """Collapse sub-stages and restore the 'Stage' prefix of a stage label.

    Deleting every 'a' and 'b' removes sub-stage suffixes, but it also
    eats the 'a' in 'stage' (leaving 'stge'); the last replace repairs
    that and capitalizes the prefix.
    """
    return label.replace('a', '').replace('b', '').replace('stge', 'Stage')


# Pathologic stage for the keepers_o subset; missing values become 'nx'.
stage = clinical.stage.pathologicstage.ix[keepers_o].fillna('nx')
stage = stage.dropna().map(_normalize_stage)
survival_and_stats(stage, surv)

# For the binary flag the 'nx' placeholder is turned back into a missing
# value and dropped, so unstaged patients are absent rather than False.
staged = stage.replace('nx', nan).dropna()
binary_df['stage_iv'] = staged == 'Stage iv'
Lymph-Node Stage
# Pathologic N stage, truncated to its first two characters so that
# sub-categories share one label; missing entries are labelled 'nx'.
raw_n_stage = clinical.stage.pathologicn
lymph_stage = raw_n_stage.map(lambda code: code[:2], na_action='ignore')
lymph_stage = lymph_stage.fillna('nx')
survival_and_stats(lymph_stage.ix[keepers_o], surv)

# Binary flags are defined only where the N stage is known: every 'nx'
# (placeholder or recorded) is converted to a missing value and dropped.
known_n_stage = lymph_stage.replace('nx', nan).dropna()
binary_df['lymph_n0'] = known_n_stage == 'n0'
binary_df['lymph_n2+'] = known_n_stage.isin(['n2', 'n3'])
Extra-Capsular Spread
# Lower-cased annotation -> "any extracapsular spread present".
# Annotations not listed here map to a missing value and are dropped.
extension_present = {
    'no extranodal extension': False,
    'microscopic extension': True,
    'gross extension': True,
}

spread = clinical.clinical.presenceofpathologicalnodalextracapsularspread
spread = spread.map(str.lower, na_action='ignore')
spread = spread.map(extension_present).dropna()

survival_and_stats(spread.ix[keepers_o], surv)
binary_df['spread'] = spread
Combining lymph-node stage with extra-capsular spread
# Node-positive flag for the keepers_o subset.
# NOTE(review): lymph_stage has its missing values filled with 'nx', and
# 'nx' != 'n0' is True, so patients without an N stage are counted as
# node-positive here -- confirm that this is intended.
lymph = lymph_stage.ix[keepers_o] != 'n0'

# combine() is a project helper; the relabelling below relies on it
# producing the labels 'neither', 'both' and the name of `lymph`.  Any
# other label it returns is turned into a missing value by Series.map.
lymph_status = combine(lymph, spread.ix[keepers_o])
status_labels = {
    'neither': 'n0',
    lymph.name: 'lymph_node',
    'both': 'extra_capsular_spread',
}
lymph_status = lymph_status.map(status_labels)

survival_and_stats(lymph_status, surv)
# Perineural invasion: the literal string 'nan' is treated as missing and
# the remaining answers are lower-cased.
raw_invasion = clinical.clinical.perineuralinvasionpresent
invasion = raw_invasion.replace('nan', nan).str.lower()
survival_and_stats(invasion.ix[keepers_o], surv)

# Binary flag over patients with a recorded answer only.
reported_invasion = invasion.dropna()
binary_df['invasion'] = reported_invasion == 'yes'
# Anatomic subdivisions collapsed into three coarse tumor sites: group[i]
# lists the lower-case subdivisions that make up the site named groups[i].
# Hypopharynx is not assigned to any site (its entry is disabled), and any
# subdivision not listed here gets no entry in tumor_subdivision.
group = [['oral tongue', 'oral cavity', 'floor of mouth', 'buccal mucosa',
          'alveolar ridge', 'hard palate', 'lip'],
         ['oropharynx', 'tonsil', 'base of tongue'],
         ['larynx']]
groups = ['oral cavity', 'oropharynx', 'larynx']

# Subdivision -> site lookup, built once so each patient needs a single
# dictionary lookup instead of a scan over every group.
site_of = {name: groups[i] for i, names in enumerate(group) for name in names}

# Patient -> coarse site.  Non-string entries (e.g. missing values) have no
# .lower() and are skipped rather than raising.
tumor_subdivision = pd.Series({
    idx: site_of[sub.lower()]
    for idx, sub in clinical.clinical.anatomicneoplasmsubdivision.iteritems()
    if hasattr(sub, 'lower') and sub.lower() in site_of
})
survival_and_stats(tumor_subdivision.ix[keepers_o], surv)

# One indicator per site.  The compared strings must match the labels in
# `groups` exactly: the larynx label is 'larynx' ('oral larynx' never
# occurs and would leave the column all False).
located = tumor_subdivision.dropna()
binary_df['oral_cavity'] = located == 'oral cavity'
binary_df['larynx'] = located == 'larynx'
binary_df['oropharynx'] = located == 'oropharynx'
age = clinical.clinical.age.astype(float)
old_age = age >= 75

# --- Supplementary panel d: age distribution -------------------------------
fig, ax = subplots(figsize=(3, 3))
age.hist(color='grey')  # draws on the current axes, i.e. the ones just made
prettify_ax(ax)
ax.set_xlabel('Age in Years')
ax.set_ylabel('# of Patients')
fig.tight_layout()
fig.savefig(FIGDIR + 'hpv_sup_d.pdf', transparent=True)

# --- Supplementary panel c: survival by age tier ---------------------------
# Summing the two indicators gives 0 for age < 75, 1 for 75 <= age < 85 and
# 2 for age >= 85.  Comparisons against a missing age are False, so such
# patients fall in tier 0.
age_tier = 1. * (age >= 85) + 1. * (age >= 75)
tier_colors = [colors[2], colors[4], colors[0]]

fig, ax = subplots(figsize=(5, 3))
draw_survival_curve(age_tier, surv, ax=ax, colors=tier_colors)
ax.legend(title=False, frameon=False, loc='lower right')
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'hpv_sup_c.pdf', transparent=True)

survival_stat_plot(get_surv_fit(surv, age_tier))
# Shorter labels for the two "reformed smoker" categories; every other
# value is left as recorded.
reformed_labels = {
    'current reformed smoker for < or = 15 years': 'reformed (recent)',
    'current reformed smoker for > 15 years': 'reformed (distant)',
}

# Smoking history for the keepers_o subset.
smoking_history = clinical.clinical.tobaccosmokinghistory.ix[keepers_o]
smoking = smoking_history.replace(reformed_labels)
smoking.value_counts()
current smoker 84 reformed (recent) 72 lifelong non-smoker 44 reformed (distant) 40 dtype: int64
# Survival summary by smoking category (missing values left as missing).
survival_and_stats(smoking, surv)

# Same variable with missing values kept as their own category 'M', passed
# to get_surv_fit_lr.
smoking_with_missing = smoking.fillna('M')
get_surv_fit_lr(surv, smoking_with_missing)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
22.7 | 0.000148 | |||||||||
current smoker | 84 | 38 | 1.6 | 1.35 | NaN | 0.284 | 0.146 | 0.553 | ||
reformed (recent) | 72 | 35 | 2.36 | 1.49 | NaN | 0.368 | 0.251 | 0.54 | ||
lifelong non-smoker | 44 | 14 | 4.71 | 2.96 | NaN | 0.393 | 0.19 | 0.813 | ||
reformed (distant) | 40 | 7 | NaN | NaN | NaN | 0.803 | 0.681 | 0.947 | ||
M | 10 | 8 | 1.48 | 0.805 | NaN | 0.12 | 0.0194 | 0.744 |
# Pack-years smoked as floats, for patients where it is recorded; `py` is a
# second name for the same Series.
pack_years = clinical.clinical.numberpackyearssmoked.dropna().astype(float)
py = pack_years

# Smoking indicators, defined only for patients with a recorded history.
known_smoking = smoking.dropna()
binary_df['current_smoker'] = known_smoking == 'current smoker'
binary_df['non_smoker'] = known_smoking == 'lifelong non-smoker'
# "Recent" covers current smokers as well as recently reformed ones.
recent_categories = ['reformed (recent)', 'current smoker']
binary_df['recent_smoker'] = known_smoking.isin(recent_categories)
# Alcohol consumption: frequency times amount per day.
# NOTE(review): presumably frequency is in days per week, which would make
# the product drinks per week -- confirm against the clinical data dictionary.
freq = clinical.clinical.frequencyofalcoholconsumption.astype(float)
# A missing amount is treated as 0; a missing frequency stays missing, so
# only those patients are removed by the dropna() below.
count = clinical.clinical.amountofalcoholconsumptionperday.fillna(0).astype(float)

weekly_all_patients = freq * count
drinks_per_week = weekly_all_patients.ix[keepers_o].dropna()
drinks_per_week.hist()
<matplotlib.axes.AxesSubplot at 0x9234550>
# Heavy drinking is defined as more than 10 on the drinks_per_week scale.
heavy_drinker = drinks_per_week > 10
survival_and_stats(heavy_drinker, surv)
binary_df['drinking_status'] = heavy_drinker

# Repeat with patients lacking a drinking status kept as their own
# category 'M' instead of being left out.
drinking_with_missing = binary_df['drinking_status'].ix[keepers_o].fillna('M')
survival_and_stats(drinking_with_missing, surv)
get_surv_fit_lr(surv, drinking_with_missing)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
3.2 | 0.202 | |||||||||
M | 148 | 67 | 2.84 | 1.81 | NaN | 0.401 | 0.305 | 0.528 | ||
True | 57 | 24 | 2.5 | 1.79 | NaN | 0.319 | 0.176 | 0.577 | ||
False | 45 | 11 | NaN | NaN | NaN | 0.694 | 0.552 | 0.871 |
# Tally of the "alcohol history documented" answers in the keepers_o subset.
alcohol_documented = clinical.clinical.alcoholhistorydocumented.ix[keepers_o]
alcohol_documented.value_counts()
yes 166 no 79 dtype: int64
# Year of initial pathologic diagnosis for the keepers_o subset; the
# '[Discrepancy]' marker is treated as missing so the column can be
# converted to float.
diagnosis_year = clinical.clinical.yearofinitialpathologicdiagnosis.ix[keepers_o]
year = diagnosis_year.replace('[Discrepancy]', nan).astype(float)

# Diagnosed before 2000.  A missing year compares as False, so those
# patients are counted as not pre-2000.
pre_2000 = (year < 2000)
pre_2000.name = 'pre_2000'

year.hist()
<matplotlib.axes.AxesSubplot at 0xa4abd50>
survival_and_stats(pre_2000, surv)
binary_df['pre_2000'] = pre_2000

# Restrict the covariate table to the keepers_o patients and transpose it,
# so that features become rows and patients become columns.
cohort_features = binary_df.ix[keepers_o]
binary_df = cohort_features.T

# Attach the table to the clinical object and call its save() method.
clinical.binary_df = binary_df
clinical.save()