# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku
from IPython.display import Image
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from sklearn import cross_validation
from sklearn import metrics
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Send bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

# Pandas display options: keep wide frames on a single line and allow long
# cell contents (up to 800 characters) to be shown.
#pd.set_option('display.width', pd.util.terminal.get_terminal_size()[0])
for option_name, option_value in (
    ('display.expand_frame_repr', False),
    ('max_colwidth', 800),
):
    pd.set_option(option_name, option_value)
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# Data set from https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/
# (the UCI machine learning repository).
from pprint import pprint

# Raw records, one text line per case; parsed further down.
with open('./data/audiology.data', 'r') as data_file:
    data = data_file.readlines()

# Dump the accompanying description file so the notebook shows what the
# data set contains.
with open('./data/audiology.names', 'r') as names_file:
    names_lines = names_file.readlines()
pprint(names_lines)
['WARNING: This database should be credited to the original owner whenever\n', ' used for any publication whatsoever.\n', '\n', '1. Title: Audiology Database\n', '\n', '2. Sources:\n', ' (a) Original Owner: Professor Jergen at Baylor College of Medicine\n', ' (b) Donor: Bruce Porter (porter@fall.cs.utexas.EDU)\n', ' (c) Date Received: 12/3/1987\n', '\n', '3. Past Usage: \n', ' -- See: Bareiss, E. Ray, & Porter, Bruce (1987). Protos: An ' 'Exemplar-Based\n', ' Learning Apprentice. In the Proceedings of the 4th International\n', ' Workshop on Machine Learning, 12-23, Irvine, CA: Morgan Kaufmann.\n', '\n', '4. Relevant Information:\n', ' -- Contact Ray Bareiss (rbareiss@uunet.uucp ??), now at Vanderbilt \n', ' University, for more information.\n', ' -- Domain expert: Professor Craig Wier of the University of Texas, ' 'Austin.\n', '\n', '5. Number of instances: 200 training cases, 26 test cases\n', '\n', '6. Number of attributes: ???\n', '\n', '7. Attribute information: (all attributes are nominally valued)\n', ' 1. case identifier.\n', ' 2. classification (24 classes)\n', ' 3. List of case features\n', ' -- format: form f(v) should be read as "feature f has value v"\n', '\n', '8. Missing attribute values:\n', ' -- This database does NOT use a standard set of attributes per ' 'instance.\n', '\n', '9. Class Distribution: (in the training set)\n', ' 1. acoustic_neuroma: 1\n', ' 2. bells_palsy: 1\n', ' 3. cochlear_age: 46\n', ' 4. cochlear_age_and_noise: 18\n', ' 5. cochlear_age_plus_poss_menieres: 1\n', ' 6. cochlear_noise_and_heredity: 2\n', ' 7. cochlear_poss_noise: 16\n', ' 8. cochlear_unknown: 48\n', ' 9. conductive_discontinuity: 2\n', ' 10. conductive_fixation: 6\n', ' 11. mixed_cochlear_age_fixation: 1\n', ' 12. mixed_cochlear_age_otitis_media: 4\n', ' 13. mixed_cochlear_age_s_om: 2\n', ' 14. mixed_cochlear_unk_discontinuity: 2\n', ' 15. mixed_cochlear_unk_fixation: 5\n', ' 16. mixed_cochlear_unk_ser_om: 3\n', ' 17. mixed_poss_central_om: 1\n', ' 18. 
mixed_poss_noise_om: 2\n', ' 19. normal_ear: 20\n', ' 20. otitis_media: 4\n', ' 21. poss_central: 1\n', ' 22. possible_brainstem_disorder: 4\n', ' 23. possible_menieres: 8\n', ' 24. retrocochlear_unknown: 2\n', ' --------------------Total: 200\n']
# Every distinct feature token seen by parse_line(); later used to create one
# indicator column per feature.
all_obs = set()


def parse_line(line):
    """Split one raw audiology record into its three parts.

    A record looks like ``p1,cochlear_unknown,[boneAbnormal,air(mild)]``:
    a case id, a classification, then a bracketed comma-separated feature
    list.

    Side effect: every feature found is added to the module-level
    ``all_obs`` set.

    Args:
        line: One line of the data file, with or without its line ending.

    Returns:
        ``[case_id, classification, features]`` where ``features`` is the
        feature list re-joined with commas (``''`` when the record has no
        features).

    Raises:
        IndexError: If the line has fewer than two comma-separated fields.
    """
    # strip() rather than strip('\n'): a trailing '\r' or blank would
    # otherwise shield the closing ']' from the next strip.
    line = line.strip()
    line = line.strip(']')
    line = line.strip('[')
    fields = line.split(',')
    case_id = fields[0]
    classification = fields[1]

    features = fields[2:]
    if features:
        # The opening bracket of the list is still glued to the first feature.
        features[0] = features[0].strip('[')
    # An empty list ("[]") would otherwise yield a bogus '' feature.
    features = [name for name in features if name]

    all_obs.update(features)
    return [case_id, classification, ','.join(features)]
# Parse every non-blank record and build the frame in one go.
# - `each.strip()` is the blank-line test: lines from readlines() keep their
#   '\n', so a plain truthiness check never filters anything.
# - Collecting rows first avoids growing the frame one `.loc` row at a time.
rows = [parse_line(each) for each in data if each.strip()]
audiology_df = pd.DataFrame(
    rows,
    columns=['case_id', 'classification', 'case_features'],
)
audiology_df.head()
case_id | classification | case_features | |
---|---|---|---|
0 | p1 | cochlear_unknown | boneAbnormal,air(mild),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a) |
1 | p2 | cochlear_unknown | boneAbnormal,air(moderate),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a) |
2 | p3 | mixed_cochlear_age_fixation | age_gt_60,airBoneGap,boneAbnormal,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(as) |
3 | p4 | mixed_cochlear_age_otitis_media | age_gt_60,airBoneGap,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(b) |
4 | p5 | cochlear_age | age_gt_60,boneAbnormal,air(mild),ar_c(normal),ar_u(normal),bone(mild),o_ar_c(normal),o_ar_u(normal),speech(good),static(normal),tymp(a) |
# Number of rows per classification label (count of non-null values in each
# remaining column).
class_counts = audiology_df.groupby('classification').count()
print(class_counts)
case_id case_features classification acoustic_neuroma 1 1 bells_palsy 1 1 cochlear_age 46 46 cochlear_age_and_noise 18 18 cochlear_age_plus_poss_menieres 1 1 cochlear_noise_and_heredity 2 2 cochlear_poss_noise 16 16 cochlear_unknown 48 48 conductive_discontinuity 2 2 conductive_fixation 6 6 mixed_cochlear_age_fixation 1 1 mixed_cochlear_age_otitis_media 4 4 mixed_cochlear_age_s_om 2 2 mixed_cochlear_unk_discontinuity 2 2 mixed_cochlear_unk_fixation 5 5 mixed_cochlear_unk_ser_om 3 3 mixed_poss_central_om 1 1 mixed_poss_noise_om 2 2 normal_ear 20 20 otitis_media 4 4 poss_central 1 1 possible_brainstem_disorder 4 4 possible_menieres 8 8 retrocochlear_unknown 2 2
# Expand the comma-joined `case_features` string into one boolean indicator
# column per known feature.
#
# Membership is tested against the *set of tokens* of each row, not against
# the raw string: a substring test would wrongly mark 'ar_u(normal)' as
# present in every row that only contains 'o_ar_u(normal)'.
feature_sets = audiology_df['case_features'].apply(
    lambda joined: set(joined.split(',')))

# sorted() gives a reproducible column order; iterating the set directly does not.
for feature_name in sorted(all_obs):
    audiology_df[feature_name] = feature_sets.apply(
        lambda present, name=feature_name: name in present)

# `axis` must be passed by keyword on current pandas.
audiology_df.drop('case_features', axis=1, inplace=True)
audiology_df.head()
case_id | classification | bone(normal) | history(fullness) | o_ar_c(elevated) | mod_s_sn_gt_500 | mod_sn_gt_4k | notch_4k | late_wave(poor) | s_sn_gt_2k | ... | air(normal) | tymp(b) | m_s_sn_gt_2k | o_ar_u(elevated) | age_gt_60 | tymp(ad) | history(recruitment) | m_m_sn | m_sn_gt_1k | o_ar_u(normal) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p1 | cochlear_unknown | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | True |
1 | p2 | cochlear_unknown | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | True |
2 | p3 | mixed_cochlear_age_fixation | False | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | False |
3 | p4 | mixed_cochlear_age_otitis_media | False | False | False | False | False | False | False | False | ... | False | True | False | False | True | False | False | False | False | False |
4 | p5 | cochlear_age | False | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | True |
5 rows × 89 columns
* A tree-based model should work best here, since every feature is boolean.
* XGBoost is also a good candidate, since most feature values are False/empty (i.e. the features are sparse).
# Preview the first five rows of the expanded frame.
audiology_df.head(n=5)
case_id | classification | bone(normal) | history(fullness) | o_ar_c(elevated) | mod_s_sn_gt_500 | mod_sn_gt_4k | notch_4k | late_wave(poor) | s_sn_gt_2k | ... | air(normal) | tymp(b) | m_s_sn_gt_2k | o_ar_u(elevated) | age_gt_60 | tymp(ad) | history(recruitment) | m_m_sn | m_sn_gt_1k | o_ar_u(normal) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p1 | cochlear_unknown | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | True |
1 | p2 | cochlear_unknown | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | True |
2 | p3 | mixed_cochlear_age_fixation | False | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | False |
3 | p4 | mixed_cochlear_age_otitis_media | False | False | False | False | False | False | False | False | ... | False | True | False | False | True | False | False | False | False | False |
4 | p5 | cochlear_age | False | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | True |
5 rows × 89 columns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Replace the textual class names with integer ids; `le` is kept so the ids
# can be mapped back with le.inverse_transform().
le = LabelEncoder()
audiology_df['classification'] = le.fit_transform(audiology_df['classification'])

# Separate the target from the feature matrix. `case_id` is only an
# identifier, so it is dropped together with the label column.
target = audiology_df.classification
# `axis` must be passed by keyword on current pandas.
audiology_df.drop(['case_id', 'classification'], axis=1, inplace=True)
audiology_df.head()
bone(normal) | history(fullness) | o_ar_c(elevated) | mod_s_sn_gt_500 | mod_sn_gt_4k | notch_4k | late_wave(poor) | s_sn_gt_2k | bone(moderate) | mod_gt_4k | ... | air(normal) | tymp(b) | m_s_sn_gt_2k | o_ar_u(elevated) | age_gt_60 | tymp(ad) | history(recruitment) | m_m_sn | m_sn_gt_1k | o_ar_u(normal) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | True |
1 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | True |
2 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False | ... | False | True | False | False | True | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False | ... | False | False | False | False | True | False | False | False | False | True |
5 rows × 87 columns
# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    audiology_df, target, test_size=0.3)

# Build the tree model through the project helper, then fit it on the
# training split.
tree_model = pm.train(X_train, y_train, 'tree')
tree_model.fit(X_train, y_train)

# Mean of the squared differences between predicted and true targets.
# NOTE(review): `target` holds label-encoded class ids, so a squared distance
# between ids has no natural meaning — confirm this is the intended metric.
tree_predictions = tree_model.predict(X_test)
tree_mse = np.mean((tree_predictions - y_test) ** 2)
print("Mean squared error: %.2f" % tree_mse)

# NOTE(review): what `score` reports depends on the estimator that
# `pm.train` returns — TODO confirm it matches the "Variance score" label.
tree_score = tree_model.score(X_test, y_test)
print('Variance score: %.2f' % tree_score)
Mean squared error: 24.52 Variance score: 0.72
# Visualise the fitted tree model.
plotter.show_tree_model(tree_model, model_type='tree')

# Same procedure with the xgboost variant of the project helper: build, fit
# on the training split, then evaluate on the held-out rows.
xgb_model = pm.train(X_train, y_train, 'xgboost')
xgb_model.fit(X_train, y_train)

# Mean of the squared differences between predicted and true targets.
xgb_predictions = xgb_model.predict(X_test)
xgb_mse = np.mean((xgb_predictions - y_test) ** 2)
print("Mean squared error: %.2f" % xgb_mse)

plotter.show_tree_model(xgb_model, model_type='xgboost')