In [1]:

# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku

from IPython.display import Image
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random

from sklearn import cross_validation
from sklearn import metrics

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook()

# Set pandas display options
#pd.set_option('display.width', pd.util.terminal.get_terminal_size()[0])
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Loading BokehJS ...

In [2]:

# Data set from https://archive.ics.uci.edu/ml/machine-learning-databases/audiology/ i.e: famous uci ml data set repository

with open('./data/audiology.data', 'r') as fd:
    data = fd.readlines()

In [3]:

from pprint import pprint
with open('./data/audiology.names', 'r') as fd:
    pprint(fd.readlines())

['WARNING: This database should be credited to the original owner whenever\n',
 '         used for any publication whatsoever.\n',
 '\n',
 '1. Title: Audiology Database\n',
 '\n',
 '2. Sources:\n',
 '    (a) Original Owner: Professor Jergen at Baylor College of Medicine\n',
 '    (b) Donor: Bruce Porter (porter@fall.cs.utexas.EDU)\n',
 '    (c) Date Received: 12/3/1987\n',
 '\n',
 '3. Past Usage: \n',
 '   -- See: Bareiss, E. Ray, & Porter, Bruce (1987).  Protos: An '
 'Exemplar-Based\n',
 '      Learning Apprentice.  In the Proceedings of the 4th International\n',
 '      Workshop on Machine Learning, 12-23, Irvine, CA: Morgan Kaufmann.\n',
 '\n',
 '4. Relevant Information:\n',
 '   -- Contact Ray Bareiss (rbareiss@uunet.uucp ??), now at Vanderbilt \n',
 '      University, for more information.\n',
 '   -- Domain expert: Professor Craig Wier of the University of Texas, '
 'Austin.\n',
 '\n',
 '5. Number of instances: 200 training cases, 26 test cases\n',
 '\n',
 '6. Number of attributes: ???\n',
 '\n',
 '7. Attribute information: (all attributes are nominally valued)\n',
 '   1. case identifier.\n',
 '   2. classification (24 classes)\n',
 '   3. List of case features\n',
 '      -- format: form f(v) should be read as "feature f has value v"\n',
 '\n',
 '8. Missing attribute values:\n',
 '   -- This database does NOT use a standard set of attributes per '
 'instance.\n',
 '\n',
 '9. Class Distribution: (in the training set)\n',
 '    1. acoustic_neuroma: 1\n',
 '    2. bells_palsy: 1\n',
 '    3. cochlear_age: 46\n',
 '    4. cochlear_age_and_noise: 18\n',
 '    5. cochlear_age_plus_poss_menieres: 1\n',
 '    6. cochlear_noise_and_heredity: 2\n',
 '    7. cochlear_poss_noise: 16\n',
 '    8. cochlear_unknown: 48\n',
 '    9. conductive_discontinuity: 2\n',
 '   10. conductive_fixation: 6\n',
 '   11. mixed_cochlear_age_fixation: 1\n',
 '   12. mixed_cochlear_age_otitis_media: 4\n',
 '   13. mixed_cochlear_age_s_om: 2\n',
 '   14. mixed_cochlear_unk_discontinuity: 2\n',
 '   15. mixed_cochlear_unk_fixation: 5\n',
 '   16. mixed_cochlear_unk_ser_om: 3\n',
 '   17. mixed_poss_central_om: 1\n',
 '   18. mixed_poss_noise_om: 2\n',
 '   19. normal_ear: 20\n',
 '   20. otitis_media: 4\n',
 '   21. poss_central: 1\n',
 '   22. possible_brainstem_disorder: 4\n',
 '   23. possible_menieres: 8\n',
 '   24. retrocochlear_unknown: 2\n',
 '   --------------------Total: 200\n']

In [4]:

all_obs = set()

def parse_line(line):
    global all_obs
    line = line.strip('\n')
    line = line.strip(']')
    line = line.strip('[')
    all_f = line.split(',')
    caseid = all_f[0]
    classif = all_f[1]
    descs = all_f[2:]
    descs[0] = descs[0].strip('[')
    features = list()
    for ea in descs:
        all_obs.add(ea)
    descs = ','.join(descs)
    return [caseid, classif, descs]

In [5]:

audiology_df = pd.DataFrame(columns=['case_id', 'classification', 'case_features']) #'age_gt_60', 'boneAbnormal','airBoneGap', 'ar_c(normal)'])
for idx, each in enumerate(data):
    if bool(each):
        line = parse_line(each)
        audiology_df.loc[idx] = line
    

In [6]:

audiology_df.head()

Out[6]:

	case_id	classification	case_features
0	p1	cochlear_unknown	boneAbnormal,air(mild),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a)
1	p2	cochlear_unknown	boneAbnormal,air(moderate),ar_c(normal),ar_u(normal),o_ar_c(normal),o_ar_u(normal),speech(normal),static(normal),tymp(a)
2	p3	mixed_cochlear_age_fixation	age_gt_60,airBoneGap,boneAbnormal,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(as)
3	p4	mixed_cochlear_age_otitis_media	age_gt_60,airBoneGap,air(mild),ar_u(absent),bone(mild),o_ar_u(absent),speech(normal),static(normal),tymp(b)
4	p5	cochlear_age	age_gt_60,boneAbnormal,air(mild),ar_c(normal),ar_u(normal),bone(mild),o_ar_c(normal),o_ar_u(normal),speech(good),static(normal),tymp(a)

Looks like the case_features are all text labels/observations by doctors. Let's split them into features and make them boolean.¶

In [7]:

print(audiology_df.groupby('classification').count())

                                  case_id  case_features
classification                                          
acoustic_neuroma                        1              1
bells_palsy                             1              1
cochlear_age                           46             46
cochlear_age_and_noise                 18             18
cochlear_age_plus_poss_menieres         1              1
cochlear_noise_and_heredity             2              2
cochlear_poss_noise                    16             16
cochlear_unknown                       48             48
conductive_discontinuity                2              2
conductive_fixation                     6              6
mixed_cochlear_age_fixation             1              1
mixed_cochlear_age_otitis_media         4              4
mixed_cochlear_age_s_om                 2              2
mixed_cochlear_unk_discontinuity        2              2
mixed_cochlear_unk_fixation             5              5
mixed_cochlear_unk_ser_om               3              3
mixed_poss_central_om                   1              1
mixed_poss_noise_om                     2              2
normal_ear                             20             20
otitis_media                            4              4
poss_central                            1              1
possible_brainstem_disorder             4              4
possible_menieres                       8              8
retrocochlear_unknown                   2              2

In [8]:

#def check_defect_presence():
#    if ea in all_obs:
#       pass
for ea in all_obs:
    audiology_df[ea] = audiology_df['case_features'].apply( lambda x: True if ea in x else False)
audiology_df.drop('case_features', 1, inplace=True)

In [9]:

audiology_df.head()

Out[9]:

	case_id	classification	bone(normal)	history(fullness)	o_ar_c(elevated)	mod_s_sn_gt_500	mod_sn_gt_4k	notch_4k	late_wave(poor)	s_sn_gt_2k	...	air(normal)	tymp(b)	m_s_sn_gt_2k	o_ar_u(elevated)	age_gt_60	tymp(ad)	history(recruitment)	m_m_sn	m_sn_gt_1k	o_ar_u(normal)
0	p1	cochlear_unknown	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	True
1	p2	cochlear_unknown	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	True
2	p3	mixed_cochlear_age_fixation	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	False
3	p4	mixed_cochlear_age_otitis_media	False	False	False	False	False	False	False	False	...	False	True	False	False	True	False	False	False	False	False
4	p5	cochlear_age	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	True

5 rows × 89 columns

OKay, based on the above data set sample, the only meaningful thing we can try is to see if we can predict the case classification based on any of the observed features.¶

We have 87 features,(I'm assuming these are labels that came out of human judgment) and most of it is false.. aka this is a sparsely populated dataset in these dimensions, and most likely the dimensions are not orthogonal(aka independent) to(of) each other.¶

Due to these reasons,¶

* a tree based prediction is best(since it is all boolean features)
* Xgboost since it is mostly False/empty features.(aka sparse features)

In [10]:

audiology_df.head()

Out[10]:

	case_id	classification	bone(normal)	history(fullness)	o_ar_c(elevated)	mod_s_sn_gt_500	mod_sn_gt_4k	notch_4k	late_wave(poor)	s_sn_gt_2k	...	air(normal)	tymp(b)	m_s_sn_gt_2k	o_ar_u(elevated)	age_gt_60	tymp(ad)	history(recruitment)	m_m_sn	m_sn_gt_1k	o_ar_u(normal)
0	p1	cochlear_unknown	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	True
1	p2	cochlear_unknown	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	True
2	p3	mixed_cochlear_age_fixation	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	False
3	p4	mixed_cochlear_age_otitis_media	False	False	False	False	False	False	False	False	...	False	True	False	False	True	False	False	False	False	False
4	p5	cochlear_age	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	True

5 rows × 89 columns

In [11]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(audiology_df['classification'].unique())
audiology_df['classification'] = le.transform(audiology_df['classification'])
target = audiology_df.classification

audiology_df.drop(['case_id', 'classification'], 1, inplace=True)

In [12]:

audiology_df.head()

Out[12]:

	bone(normal)	history(fullness)	o_ar_c(elevated)	mod_s_sn_gt_500	mod_sn_gt_4k	notch_4k	late_wave(poor)	s_sn_gt_2k	bone(moderate)	mod_gt_4k	...	air(normal)	tymp(b)	m_s_sn_gt_2k	o_ar_u(elevated)	age_gt_60	tymp(ad)	history(recruitment)	m_m_sn	m_sn_gt_1k	o_ar_u(normal)
0	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	True
1	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	True
2	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	False
3	False	False	False	False	False	False	False	False	False	False	...	False	True	False	False	True	False	False	False	False	False
4	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	True

5 rows × 87 columns

In [ ]:

X_train, X_test, y_train, y_test = train_test_split(audiology_df, target, test_size=0.3)
tree_model = pm.train(X_train, y_train, 'tree')
tree_model.fit(X_train, y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((tree_model.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % tree_model.score(X_test, y_test))

Mean squared error: 24.52
Variance score: 0.72

In [ ]:

plotter.show_tree_model(tree_model, model_type='tree')

In [ ]:

# Train the model using the training sets
xgb_model = pm.train(X_train, y_train, 'xgboost')
xgb_model.fit(X_train, y_train)
# The mean squared error
print("Mean squared error: %.2f"
      % np.mean((xgb_model.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction

In [ ]:

plotter.show_tree_model(xgb_model, model_type='xgboost')