Notebook

In [3]:

# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku

from IPython.display import Image
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random

from sklearn import cross_validation
from sklearn import metrics

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook()

# Set pandas display options
#pd.set_option('display.width', pd.util.terminal.get_terminal_size()[0])
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

BokehJS successfully loaded.

In [6]:

# Data set from https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/
columns = ['class', 'age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia', 
           'big_liver', 'firm_liver', 'palpable_spleen', 'spiders', 'ascites', 'varices', 'bilirubin',
          'alk_phosphate', 'sgot', 'albumin', 'protime', 'histology']

hepatitis_df = pd.read_csv('~/DataScientist/data/Hepatitis/hepatitis.data', names=columns, na_values=['?'])
    

In [9]:

from pprint import pprint
import os
with open(os.path.expanduser('~/DataScientist/data/Hepatitis/hepatitis.names'), 'r') as fd:
    pprint(fd.readlines())

['1. Title: Hepatitis Domain\n',
 '\n',
 '2. Sources:\n',
 '     (a) unknown\n',
 '     (b) Donor: G.Gong  (Carnegie-Mellon University) via \n',
 '                   Bojan Cestnik\n',
 '                   Jozef Stefan Institute\n',
 '                   Jamova 39\n',
 '                   61000 Ljubljana\n',
 '                   Yugoslavia (tel.: (38)(+61) 214-399 ext.287) }\n',
 '     (c) Date: November, 1988\n',
 '\n',
 '3. Past Usage:\n',
 '    1. Diaconis,P. & Efron,B. (1983).  Computer-Intensive Methods in \n',
 '       Statistics.  Scientific American, Volume 248.\n',
 '       -- Gail Gong reported a 80% classfication accuracy\n',
 '    2. Cestnik,G., Konenenko,I, & Bratko,I. (1987). Assistant-86: A\n',
 '       Knowledge-Elicitation Tool for Sophisticated Users.  In I.Bratko\n',
 '       & N.Lavrac (Eds.) Progress in Machine Learning, 31-45, Sigma Press.\n',
 '       -- Assistant-86: 83% accuracy\n',
 '\n',
 '4. Relevant Information:\n',
 '    Please ask Gail Gong for further information on this database.\n',
 '\n',
 '5. Number of Instances: 155\n',
 '\n',
 '6. Number of Attributes: 20 (including the class attribute)\n',
 '\n',
 '7. Attribute information: \n',
 '     1. Class: DIE, LIVE\n',
 '     2. AGE: 10, 20, 30, 40, 50, 60, 70, 80\n',
 '     3. SEX: male, female\n',
 '     4. STEROID: no, yes\n',
 '     5. ANTIVIRALS: no, yes\n',
 '     6. FATIGUE: no, yes\n',
 '     7. MALAISE: no, yes\n',
 '     8. ANOREXIA: no, yes\n',
 '     9. LIVER BIG: no, yes\n',
 '    10. LIVER FIRM: no, yes\n',
 '    11. SPLEEN PALPABLE: no, yes\n',
 '    12. SPIDERS: no, yes\n',
 '    13. ASCITES: no, yes\n',
 '    14. VARICES: no, yes\n',
 '    15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00\n',
 '        -- see the note below\n',
 '    16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250\n',
 '    17. SGOT: 13, 100, 200, 300, 400, 500, \n',
 '    18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0\n',
 '    19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90\n',
 '    20. HISTOLOGY: no, yes\n',
 '\n',
 '    The BILIRUBIN attribute appears to be continuously-valued.  I checked\n',
 '    this with the donater, Bojan Cestnik, who replied:\n',
 '\n',
 '      About the hepatitis database and BILIRUBIN problem I would like to '
 'say\n',
 '      the following: BILIRUBIN is continuous attribute (= the number of '
 "it's\n",
 '      "values" in the ASDOHEPA.DAT file is negative!!!); "values" are '
 'quoted\n',
 '      because when speaking about the continuous attribute there is no '
 'such \n',
 '      thing as all possible values. However, they represent so called\n',
 '      "boundary" values; according to these "boundary" values the '
 'attribute\n',
 '      can be discretized. At the same time, because of the continious\n',
 '      attribute, one can perform some other test since the continuous\n',
 '      information is preserved. I hope that these lines have at least '
 'roughly \n',
 '      answered your question. \n',
 '\n',
 '8. Missing Attribute Values: (indicated by "?")\n',
 '     Attribute Number:    Number of Missing Values:\n',
 '                    1:    0\n',
 '                    2:    0\n',
 '                    3:    0\n',
 '                    4:    1\n',
 '                    5:    0\n',
 '                    6:    1\n',
 '                    7:    1\n',
 '                    8:    1\n',
 '                    9:    10\n',
 '\t\t   10:    11\n',
 '\t\t   11:    5\n',
 '\t\t   12:    5\n',
 '\t\t   13:    5\n',
 '\t\t   14:    5\n',
 '\t\t   15:    6\n',
 '\t\t   16:    29\n',
 '\t\t   17:    4\n',
 '\t\t   18:    16\n',
 '\t\t   19:    67\n',
 '\t\t   20:    0\n',
 '\n',
 '9. Class Distribution:\n',
 '     DIE: 32\n',
 '    LIVE: 123\n']

In [10]:

hepatitis_df.head()

Out[10]:

	class	age	sex	steroid	antivirals	fatigue	malaise	anorexia	big_liver	firm_liver	palpable_spleen	spiders	ascites	varices	bilirubin	alk_phosphate	sgot	albumin	protime	histology
0	2	30	2	1.0	2	2.0	2.0	2.0	1.0	2.0	2.0	2.0	2.0	2.0	1.0	85.0	18.0	4.0	NaN	1
1	2	50	1	1.0	2	1.0	2.0	2.0	1.0	2.0	2.0	2.0	2.0	2.0	0.9	135.0	42.0	3.5	NaN	1
2	2	78	1	2.0	2	1.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	0.7	96.0	32.0	4.0	NaN	1
3	2	31	1	NaN	1	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	0.7	46.0	52.0	4.0	80.0	1
4	2	34	1	2.0	2	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	1.0	NaN	200.0	4.0	NaN	1

In [12]:

analyze.correlation_analyze(hepatitis_df, 'firm_liver', 'big_liver')

/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/nonparametric/kernels.py:128: RuntimeWarning: divide by zero encountered in true_divide
  return (1. / np.sqrt(2 * np.pi)) * np.exp(-(Xi - x)**2 / (h**2 * 2.))
/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/nonparametric/kernels.py:128: RuntimeWarning: invalid value encountered in true_divide
  return (1. / np.sqrt(2 * np.pi)) * np.exp(-(Xi - x)**2 / (h**2 * 2.))
/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/nonparametric/_kernel_base.py:514: RuntimeWarning: invalid value encountered in true_divide
  dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])

# Correlation btw Numerical Columns

/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/matplotlib/contour.py:1533: UserWarning: Warning: converting a masked element to nan.
  self.zmax = float(z.max())
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/matplotlib/contour.py:1534: UserWarning: Warning: converting a masked element to nan.
  self.zmin = float(z.min())

In [19]:

hepatitis_df.big_liver.replace(np.inf, 9999999, inplace=True)
hepatitis_df.big_liver.replace(-np.inf, -9999999, inplace=True)

analyze.dist_analyze(hepatitis_df, 'big_liver')

Variance of big_liver
0.14367816092
Skewness of big_liver
-1.7526378546
Kolmogrov - Smirnov test with distribution norm
KstestResult(statistic=nan, pvalue=nan)
Anderson-Darling normality test on big_liver 
Statistic: nan 
 p-value: 0.000000

/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1735: RuntimeWarning: invalid value encountered in greater_equal
  cond2 = (x >= self.b) & cond0
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py:185: RuntimeWarning: divide by zero encountered in double_scalars
  self.bin_count = np.ceil((self.values.max() - self.values.min())/self.bin_width)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda3/envs/analytics/lib/python3.6/site-packages/numpy/core/function_base.py in _index_deprecate(i, stacklevel)
     14     try:
---> 15         i = operator.index(i)
     16     except TypeError:

TypeError: 'numpy.float64' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

OverflowError                             Traceback (most recent call last)
<ipython-input-19-809fd4e45a6f> in <module>()
      2 hepatitis_df.big_liver.replace(-np.inf, -9999999, inplace=True)
      3 
----> 4 analyze.dist_analyze(hepatitis_df, 'big_liver')

~/playspace/data-science-utils/datascienceutils/analyze.py in dist_analyze(df, column, category, is_normal, bayesian_hist, kdeplot, violinplot)
     53         if violinplot:
     54             plots.append(plotter.sb_violinplot(df[column], inner='box'))
---> 55         plots.append(plotter.histogram(df, column, bayesian_bins=bayesian_hist))
     56     else:
     57         if df[column].nunique() < 7:

~/playspace/data-science-utils/datascienceutils/plotter.py in histogram(histDF, values, bayesian_bins, **kwargs)
    336     if not bayesian_bins:
    337         from bokeh.charts import Histogram
--> 338         return Histogram(histDF[values], **kwargs)
    339     else:
    340         import numpy as np

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/builder/histogram_builder.py in Histogram(data, values, label, color, agg, bins, yscale, xgrid, ygrid, continuous_range, **kw)
     51     kw['bins'] = bins
     52 
---> 53     return create_and_build(HistogramBuilder, data, **kw)
     54 
     55 

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_builder.py in create_and_build(builder_class, *data, **kws)
     62     chart_kws = { k:v for k,v in kws.items() if k not in builder_props}
     63     chart = Chart(**chart_kws)
---> 64     chart.add_builder(builder)
     65     chart.start_plot()
     66 

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_chart.py in add_builder(self, builder)
    132     def add_builder(self, builder):
    133         self._builders.append(builder)
--> 134         builder.create(self)
    135 
    136     def add_ranges(self, dim, range):

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_builder.py in create(self, chart)
    301         if chart is None:
    302             chart = Chart()
--> 303         chart.add_renderers(self, renderers)
    304 
    305         # handle ranges after renders, since ranges depend on aggregations

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_chart.py in add_renderers(self, builder, renderers)
    127 
    128     def add_renderers(self, builder, renderers):
--> 129         self.renderers += renderers
    130         self._renderer_map.extend({ r._id : builder for r in renderers })
    131 

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/builder/bar_builder.py in _yield_renderers(self)
    226                             stack_label=self.get_label(group['stack']),
    227                             dodge_label=self.get_label(group['group']),
--> 228                             **group_kwargs)
    229 
    230             self.add_glyph(group, bg)

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/glyphs.py in __init__(self, values, label, color, bin_count, **kwargs)
    467         kwargs.pop('width', None)
    468 
--> 469         super(HistogramGlyph, self).__init__(**kwargs)
    470 
    471     def _set_sources(self):

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_models.py in __init__(self, **kwargs)
     59 
     60         super(CompositeGlyph, self).__init__(**kwargs)
---> 61         self.setup()
     62 
     63     def setup(self):

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_models.py in setup(self)
     62 
     63     def setup(self):
---> 64         self.renderers = [renderer for renderer in self.build_renderers()]
     65         if self.renderers is not None:
     66             self.refresh()

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_models.py in <listcomp>(.0)
     62 
     63     def setup(self):
---> 64         self.renderers = [renderer for renderer in self.build_renderers()]
     65         if self.renderers is not None:
     66             self.refresh()

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/glyphs.py in build_renderers(self)
    476 
    477     def build_renderers(self):
--> 478         self.bins = Bins(values=self.values, bin_count=self.bin_count)
    479         self.centers = [bin.center for bin in self.bins.bins]
    480         self.bin_width = self.centers[1] - self.centers[0]

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in __init__(self, values, column, bins, **properties)
    162         properties['column'] = column
    163         properties['bins'] = bins
--> 164         super(Bins, self).__init__(**properties)
    165 
    166     def update(self):

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in __init__(self, **properties)
     21     def __init__(self, **properties):
     22         super(Stat, self).__init__(**properties)
---> 23         self._refresh()
     24 
     25     def _refresh(self):

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in _refresh(self)
     27         if self.get_data() is not None:
     28             self.update()
---> 29             self.calculate()
     30 
     31     def set_data(self, data, column=None):

~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in calculate(self)
    172 
    173     def calculate(self):
--> 174         binned, bin_edges = pd.cut(self.get_data(), self.bin_count, retbins=True, precision=0)
    175 
    176         df = pd.DataFrame(dict(values=self.get_data(), bins=binned))

~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/reshape/tile.py in cut(x, bins, right, labels, retbins, precision, include_lowest)
    116             bins = np.linspace(mn, mx, bins + 1, endpoint=True)
    117         else:  # adjust end points after binning
--> 118             bins = np.linspace(mn, mx, bins + 1, endpoint=True)
    119             adj = (mx - mn) * 0.001  # 0.1% of the range
    120             if right:

~/anaconda3/envs/analytics/lib/python3.6/site-packages/numpy/core/function_base.py in linspace(start, stop, num, endpoint, retstep, dtype)
     99     """
    100     # 2016-02-25, 1.12
--> 101     num = _index_deprecate(num)
    102     if num < 0:
    103         raise ValueError("Number of samples, %s, must be non-negative." % num)

~/anaconda3/envs/analytics/lib/python3.6/site-packages/numpy/core/function_base.py in _index_deprecate(i, stacklevel)
     17         msg = ("object of type {} cannot be safely interpreted as "
     18                "an integer.".format(type(i)))
---> 19         i = int(i)
     20         stacklevel += 1
     21         warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel)

OverflowError: cannot convert float infinity to integer

In [20]:

hepatitis_df.big_liver.any(lambda x: x==np.inf)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-90bf37d0c1f3> in <module>()
----> 1 hepatitis_df.big_liver.any(lambda x: x==np.inf)

~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/generic.py in logical_func(self, axis, bool_only, skipna, level, **kwargs)
   6419         return self._reduce(f, axis=axis, skipna=skipna,
   6420                             numeric_only=bool_only, filter_type='bool',
-> 6421                             name=name)
   6422 
   6423     return set_function_name(logical_func, name, cls)

~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   2374         if isinstance(delegate, np.ndarray):
   2375             # Validate that 'axis' is consistent with Series's single axis.
-> 2376             self._get_axis_number(axis)
   2377             if numeric_only:
   2378                 raise NotImplementedError('Series.{0} does not implement '

~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/generic.py in _get_axis_number(self, axis)
    351                 pass
    352         raise ValueError('No axis named {0} for object type {1}'
--> 353                          .format(axis, type(self)))
    354 
    355     def _get_axis_name(self, axis):

ValueError: No axis named <function <lambda> at 0x7f6d4ea48ea0> for object type <class 'pandas.core.series.Series'>

In [ ]: