import NotebookImport
from Imports import *
importing IPython notebook from Imports.ipynb Populating the interactive namespace from numpy and matplotlib changing to source dirctory populating namespace with data
# --- Stage, nodal stage, age group and pack-years ---------------------------
# Overall pathologic stage for the patients in keepers_o; gaps are coded 'nx'.
stage = clinical.stage.pathologicstage.ix[keepers_o]
stage = stage.fillna('nx')
# Delete every 'a'/'b' character (presumably to merge sub-stages), then repair
# the 'stge' this leaves behind by turning it into 'Stage', all in one pass.
stage = stage.dropna().map(
    lambda label: label.replace('a', '').replace('b', '').replace('stge', 'Stage'))

# Pathologic N stage with missing values removed, cut to its first two characters.
lymph_stage = clinical.stage.pathologicn.ix[keepers_o].dropna()
lymph_stage = lymph_stage.map(lambda label: label[:2])

# Two-level age label split at 75.
# NOTE(review): the cut is >= 75 although the label reads '> 75', and a NaN
# age compares False, so it would be labelled 'Age < 75' -- confirm intended.
old_age = (age >= 75).map({False: 'Age < 75', True: 'Age > 75'})

# Pack-years smoked as floats with missing entries removed; py is an alias.
pack_years = clinical.clinical.numberpackyearssmoked.dropna().astype(float)
py = pack_years
# Anatomic sites that make up each coarse tumor location: the i-th site list in
# `group` is named by the i-th entry of `groups`. 'hypopharynx' is intentionally
# not mapped to any location.
group = [
    ['oral tongue', 'oral cavity', 'floor of mouth', 'buccal mucosa',
     'alveolar ridge', 'hard palate', 'lip'],
    ['oropharynx', 'tonsil', 'base of tongue'],
    ['larynx'],
]
groups = ['oral cavity', 'oropharynx', 'larynx']
# Label every entry whose lower-cased subdivision appears in one of the site
# lists; entries matching none of the lists are absent from the result.
tumor_subdivision = pd.Series({
    row_id: label
    for label, sites in zip(groups, group)
    for row_id, site in clinical.clinical.anatomicneoplasmsubdivision.iteritems()
    if site.lower() in sites
})
# Perineural invasion: literal 'nan' strings become real NaN, text is lower-cased.
invasion = clinical.clinical.perineuralinvasionpresent.replace('nan', nan).str.lower()

# Extracapsular nodal spread: lower-case the text (leaving NaN alone), collapse
# both extension grades to 'yes', and drop anything that is not in the mapping.
spread = clinical.clinical.presenceofpathologicalnodalextracapsularspread.map(
    str.lower, na_action='ignore')
spread = spread.map({
    'gross extension': 'yes',
    'microscopic extension': 'yes',
    'no extranodal extension': 'no',
})
spread = spread.dropna()

# Year of initial diagnosis as a float; '[Discrepancy]' entries become NaN.
year = clinical.clinical.yearofinitialpathologicdiagnosis.replace('[Discrepancy]', nan)
year = year.astype(float)
# Node-positive flag: True wherever the N stage is anything other than 'n0'.
# NOTE(review): lymph_stage had its missing values dropped earlier, so
# re-indexing by keepers_o can bring NaN back, and NaN != 'n0' evaluates True.
# Confirm that patients without an N stage should count as node-positive.
lymph = lymph_stage.ix[keepers_o] != 'n0'
# combine() presumably merges the two boolean series into one labelled series;
# the map below relies on it emitting 'neither', 'both' and the name of `lymph`.
# NOTE(review): any other label (e.g. spread without a positive node) has no
# entry in the map and would become NaN -- verify that this cannot occur.
lymph_status = combine(lymph, spread.ix[keepers_o]=='yes')
lymph_status = lymph_status.map({'neither': 'n0', lymph.name: 'lymph_node', 'both': 'extra_capsular_spread'})
#lymph_status = (lymph_status == 'extra_capsular_spread').astype(float)
from Stats.Classification import SVC_fill
# Smoking history, lower-cased, for the keepers_o patients that have a value.
smoker = clinical.clinical.tobaccosmokinghistory.str.lower()
smoker = smoker.ix[keepers_o].dropna()
# Binary label over two categories only: True for 'current smoker', False for
# 'lifelong non-smoker'; patients in any other category are left out.
smoker_binary = smoker[smoker.isin(['current smoker', 'lifelong non-smoker'])]
smoker_binary = smoker_binary == 'current smoker'
# Displayed as the cell result: number of patients per smoking category.
smoker.value_counts()
current smoker 84 current reformed smoker for < or = 15 years 72 lifelong non-smoker 44 current reformed smoker for > 15 years 40 dtype: int64
# Run SVC_fill on the binary smoking label against the 'real' rows of
# rna.features. NOTE(review): presumably a support-vector classifier that also
# predicts labels for unlabelled patients -- confirm in Stats.Classification.
ret = SVC_fill(smoker_binary, rna.features.ix['real'])
# Displayed as the cell result: the 'auc' entry of the returned mapping.
ret['auc']
0.953125
# Plot the classifier's decision function against the full smoking history,
# with the categories drawn in the order listed in `o`.
figsize(6,4)
fun = ret['decision_function']
o = ['current smoker','current reformed smoker for < or = 15 years',
'current reformed smoker for > 15 years', 'lifelong non-smoker']
violin_plot_pandas(smoker, fun, order = o)
ax = plt.gca()
# Rotate the category labels; `t` only holds the return value and is not used
# again in the visible code.
t = ax.set_xticklabels(o, rotation=20)
prettify_ax(ax)
# Survival table by observed binary smoking label; keepers_o patients without
# a binary label are kept as their own 'Missing' group.
get_surv_fit_lr(surv, smoker_binary.ix[keepers_o].fillna('Missing'))
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
7.54 | 0.0231 | |||||||||
Missing | 122 | 50 | 4 | 2.3 | NaN | 0.484 | 0.387 | 0.606 | ||
True | 84 | 38 | 1.6 | 1.35 | NaN | 0.284 | 0.146 | 0.553 | ||
False | 44 | 14 | 4.71 | 2.96 | NaN | 0.393 | 0.19 | 0.813 |
4 rows × 10 columns
# Observed binary label where one exists, the 'filled_feature' value from
# SVC_fill otherwise; multiplying by 1. turns the result into 0.0/1.0 values.
smoker_inferred = 1.*smoker_binary.combine_first(ret['filled_feature'])
smoker_inferred.name = 'smoker_inferred'
# Cross-tabulate the inferred label against the full smoking history; patients
# with no recorded history are shown under 'M'. Transposed for display.
pd.crosstab(smoker_inferred, smoker.ix[smoker_inferred.index].fillna('M')).T
smoker_inferred | 0.0 | 1.0 |
---|---|---|
tobaccosmokinghistory | ||
M | 8 | 10 |
current reformed smoker for < or = 15 years | 18 | 54 |
current reformed smoker for > 15 years | 23 | 17 |
current smoker | 0 | 84 |
lifelong non-smoker | 44 | 0 |
5 rows × 2 columns
# Survival table for the inferred smoking label.
# NOTE(review): this passes clinical.survival.survival where the other cells
# pass `surv` -- confirm both refer to the same survival data.
get_surv_fit_lr(clinical.survival.survival, smoker_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
13 | 0.000304 | |||||||||
1 | 165 | 89 | 1.83 | 1.5 | 2.84 | 0.335 | 0.25 | 0.447 | ||
0 | 93 | 32 | 5.48 | 4.49 | NaN | 0.533 | 0.392 | 0.724 |
3 rows × 10 columns
# Survival analysis on the observed binary label, re-indexed to the patients
# that have an inferred label.
survival_and_stats(smoker_binary.ix[smoker_inferred.index], surv)
# Four-way split: patients with an observed label keep 'smoker'/'non-smoker',
# the rest fall back to the '_inf' label derived from smoker_inferred.
si = smoker_inferred.map({1:'smoker_inf', 0:'non-smoker_inf'})
s = smoker_binary.map({True:'smoker', False:'non-smoker'})
survival_and_stats(s.combine_first(si), surv)
# Histogram of the recorded amount of alcohol consumed per day.
figsize(6,4)
clinical.clinical.amountofalcoholconsumptionperday.astype(float).hist()
<matplotlib.axes.AxesSubplot at 0xaf76950>
# Displayed as the cell result: counts of each alcoholhistorydocumented value.
clinical.clinical.alcoholhistorydocumented.value_counts()
yes 254 no 118 dtype: int64
# Alcohol exposure score: consumption frequency times amount per day.
# NOTE(review): presumably days per week times drinks per day, i.e. drinks per
# week -- confirm the units against the clinical data dictionary.
freq = clinical.clinical.frequencyofalcoholconsumption.astype(float)
count = clinical.clinical.amountofalcoholconsumptionperday.astype(float)
# Keep only keepers_o patients for whom the product is defined.
drinker = (freq * count).ix[keepers_o].dropna()
# Parenthesised form prints the same value under Python 2 and also parses
# under Python 3.
print(len(drinker))
# Discard the intermediate range 8..14 so that only clearly light (< 8) and
# clearly heavy (> 14) scores remain. `|` is the element-wise OR for boolean
# masks; the original used arithmetic `+` for the same effect.
drinker = drinker[(drinker < 8) | (drinker > 14)]
# After the filter above every remaining score is < 8 or > 14, so this
# threshold marks the heavy group as True and the light group as False.
drinker = drinker > 10
95
# Displayed as the cell result: number of patients in each drinker class.
drinker.value_counts()
True 45 False 35 dtype: int64
# Run SVC_fill on the binary drinker label against the 'real' rows of
# rna.features; this overwrites the `ret` produced for the smoking label.
ret = SVC_fill(drinker, rna.features.ix['real'])
# Displayed as the cell result: the 'auc' entry of the returned mapping.
ret['auc']
0.83750000000000002
# Plot the decision function from the drinker fit against the drinker label.
fun = ret['decision_function']
violin_plot_pandas(drinker, fun)
# Scatter the raw frequency * amount product (all patients with a value, not
# only keepers_o) against the decision function.
series_scatter((freq * count).dropna(), fun)
# Restrict the x axis to the range -1..20.
xlim(-1,20)
(-1, 20)
# Survival table by observed drinker label, cast from bool to float.
get_surv_fit_lr(surv, drinker*1.)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
1.73 | 0.189 | |||||||||
1 | 45 | 16 | 4 | 1.79 | NaN | 0.393 | 0.221 | 0.696 | ||
0 | 35 | 7 | NaN | NaN | NaN | 0.739 | 0.583 | 0.935 |
3 rows × 10 columns
# Observed drinker label where one exists, the 'filled_feature' value from
# SVC_fill otherwise; multiplying by 1. converts the result to numeric form.
drinker_inferred = 1.*drinker.combine_first(ret['filled_feature'])
# Survival table for the inferred drinker label.
get_surv_fit_lr(surv, drinker_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
6.27 | 0.0122 | |||||||||
1 | 183 | 87 | 2.16 | 1.6 | 3.53 | 0.358 | 0.276 | 0.464 | ||
0 | 75 | 22 | NaN | 4.49 | NaN | 0.56 | 0.405 | 0.774 |
3 rows × 10 columns
# Four-way split: patients with an observed label keep 'drinker'/'non-drinker',
# the rest fall back to the '_inf' label derived from drinker_inferred.
si = drinker_inferred.map({1:'drinker_inf', 0:'non-drinker_inf'})
s = drinker.map({True:'drinker', False:'non-drinker'})
survival_and_stats(s.combine_first(si), surv)
survival_and_stats(drinker_inferred, surv)
# Restrict perineural invasion to keepers_o patients that have a value.
invasion = invasion.ix[keepers_o].dropna()
# Displayed as the cell result: number of patients per invasion value.
invasion.value_counts()
yes 95 no 89 dtype: int64
# Run SVC_fill on invasion == 'yes', using only the 'yes'/'no' entries, against
# the 'real' rows of rna.features; this overwrites the previous `ret`.
ret = SVC_fill(invasion[invasion.isin(['yes','no'])]=='yes',
rna.features.ix['real'])
# Displayed as the cell result: the 'auc' entry of the returned mapping.
ret['auc']
0.88586956521739135
# Plot the decision function from the invasion fit against the invasion label.
fun = ret['decision_function']
violin_plot_pandas(invasion, fun)
# Observed yes/no (as a boolean) where available, the 'filled_feature' value
# otherwise; the leading 1.* applies to the combined series and makes it numeric.
invasion_inferred = 1.*(invasion.dropna()=='yes').combine_first(ret['filled_feature'])
# Survival table by observed invasion status.
get_surv_fit_lr(surv, invasion)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
8.73 | 0.00314 | |||||||||
yes | 95 | 45 | 2.5 | 1.38 | NaN | 0.398 | 0.289 | 0.548 | ||
no | 89 | 23 | NaN | 4 | NaN | 0.598 | 0.455 | 0.786 |
3 rows × 10 columns
# Survival table for the inferred invasion status.
get_surv_fit_lr(surv, invasion_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
3.76 | 0.0524 | |||||||||
0 | 141 | 52 | 4 | 2.25 | NaN | 0.455 | 0.348 | 0.596 | ||
1 | 117 | 57 | 1.71 | 1.42 | 4.49 | 0.364 | 0.265 | 0.5 |
3 rows × 10 columns
# Survival analysis on the inferred invasion status.
survival_and_stats(invasion_inferred, surv)
# Same analysis with observed values taking priority: patients with a recorded
# status keep the 'yes'/'no' string, the rest get the numeric inferred value,
# so the combined series mixes string and numeric labels.
survival_and_stats(invasion.ix[invasion_inferred.index].combine_first(invasion_inferred), surv)
# Restrict extracapsular spread to keepers_o patients that have a value.
spread = spread.ix[keepers_o].dropna()
# Displayed as the cell result: number of patients per spread value.
spread.value_counts()
no 118 yes 51 dtype: int64
# Run SVC_fill on spread == 'yes' against the 'real' rows of rna.features;
# this overwrites the previous `ret`.
ret = SVC_fill(spread=='yes', rna.features.ix['real'])
# Displayed as the cell result: the 'auc' entry of the returned mapping.
ret['auc']
0.89349112426035504
# Plot the decision function from the spread fit against the spread label.
fun = ret['decision_function']
violin_plot_pandas(spread, fun)
# Observed yes/no (as a boolean) where available, the 'filled_feature' value
# otherwise; the leading 1.* applies to the combined series and makes it numeric.
spread_inferred = 1.*(spread.dropna()=='yes').combine_first(ret['filled_feature'])
# Survival table by observed extracapsular spread.
get_surv_fit_lr(surv, spread)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
13.1 | 0.000299 | |||||||||
no | 118 | 40 | NaN | 2.96 | NaN | 0.523 | 0.409 | 0.668 | ||
yes | 51 | 29 | 1.42 | 1.24 | 2.08 | 0.258 | 0.145 | 0.46 |
3 rows × 10 columns
# Survival table for the inferred extracapsular spread.
get_surv_fit_lr(surv, spread_inferred)
Stats | Median Survival | 5y Survival | Log-Rank | |||||||
---|---|---|---|---|---|---|---|---|---|---|
# Patients | # Events | Median | Lower | Upper | Surv | Lower | Upper | chi2 | p | |
12.8 | 0.000341 | |||||||||
0 | 197 | 74 | 4.36 | 2.58 | NaN | 0.457 | 0.365 | 0.572 | ||
1 | 61 | 35 | 1.42 | 1.25 | 1.9 | 0.251 | 0.146 | 0.43 |
3 rows × 10 columns
# Survival analysis on observed, then inferred, extracapsular spread.
survival_and_stats(spread, surv)
survival_and_stats(spread_inferred, surv)
# Assemble every derived covariate into one table, one column per dict key
# (axis=1), aligned on the series' indexes.
clinical_processed = pd.concat({'spread': spread,
'spread_inferred': spread_inferred,
'invasion': invasion,
'invasion_inferred': invasion_inferred,
'hpv': clinical.hpv,
'smoker': smoker,
'smoker_inferred': smoker_inferred,
'drinker': drinker,
'drinker_inferred': drinker_inferred,
'stage': stage,
'lymph_stage': lymph_stage,
'age': age,
'old_age': old_age,
'pack_years': pack_years,
# NOTE(review): a NaN year compares False here and is therefore labelled
# 'post_2000', as is the year 2000 itself -- confirm this is intended.
'year': (year < 2000).map({True: 'pre_2000', False: 'post_2000'}),
'lymph_status': lymph_status,
'tumor_subdivision': tumor_subdivision}, axis=1)
# Attach the table to the clinical object and call its save() method
# (presumably persisting it for later notebooks -- confirm).
clinical.processed = clinical_processed
clinical.save()