# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
from datascienceutils import sklearnUtils as sku
from IPython.display import Image
# Standard-library and third-party packages
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from sklearn import cross_validation
from sklearn import metrics
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Configure Bokeh (imported above from bokeh.plotting) to display its plots
# inside the notebook's output cells.
output_notebook()
# Pandas display configuration for this notebook.
#   display.expand_frame_repr=False : do not wrap wide DataFrame reprs
#                                     across several stacked blocks.
#   max_colwidth=800                : show up to 800 characters per cell
#                                     before truncating.
# Disabled alternative that sized the output to the terminal width:
#pd.set_option('display.width', pd.util.terminal.get_terminal_size()[0])
for option_key, option_value in (
    ('display.expand_frame_repr', False),
    ('max_colwidth', 800),
):
    pd.set_option(option_key, option_value)
# Hepatitis data set, downloaded from
# https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/
#
# Column names are supplied explicitly, in the same order as the attribute
# list of the accompanying hepatitis.names file (class first, histology last).
columns = [
    'class', 'age', 'sex', 'steroid', 'antivirals',
    'fatigue', 'malaise', 'anorexia', 'big_liver', 'firm_liver',
    'palpable_spleen', 'spiders', 'ascites', 'varices', 'bilirubin',
    'alk_phosphate', 'sgot', 'albumin', 'protime', 'histology',
]

# '?' is the missing-value marker used by the data file, so it is parsed as NaN.
hepatitis_df = pd.read_csv(
    '~/DataScientist/data/Hepatitis/hepatitis.data',
    names=columns,
    na_values=['?'],
)
from pprint import pprint
import os
# Pretty-print the raw lines of the data set's description file
# (hepatitis.names). '~' is expanded explicitly because the built-in open()
# does not expand it. The body of the ``with`` block must be indented; the file
# is closed automatically when the block exits.
with open(os.path.expanduser('~/DataScientist/data/Hepatitis/hepatitis.names'), 'r') as fd:
    pprint(fd.readlines())
['1. Title: Hepatitis Domain\n', '\n', '2. Sources:\n', ' (a) unknown\n', ' (b) Donor: G.Gong (Carnegie-Mellon University) via \n', ' Bojan Cestnik\n', ' Jozef Stefan Institute\n', ' Jamova 39\n', ' 61000 Ljubljana\n', ' Yugoslavia (tel.: (38)(+61) 214-399 ext.287) }\n', ' (c) Date: November, 1988\n', '\n', '3. Past Usage:\n', ' 1. Diaconis,P. & Efron,B. (1983). Computer-Intensive Methods in \n', ' Statistics. Scientific American, Volume 248.\n', ' -- Gail Gong reported a 80% classfication accuracy\n', ' 2. Cestnik,G., Konenenko,I, & Bratko,I. (1987). Assistant-86: A\n', ' Knowledge-Elicitation Tool for Sophisticated Users. In I.Bratko\n', ' & N.Lavrac (Eds.) Progress in Machine Learning, 31-45, Sigma Press.\n', ' -- Assistant-86: 83% accuracy\n', '\n', '4. Relevant Information:\n', ' Please ask Gail Gong for further information on this database.\n', '\n', '5. Number of Instances: 155\n', '\n', '6. Number of Attributes: 20 (including the class attribute)\n', '\n', '7. Attribute information: \n', ' 1. Class: DIE, LIVE\n', ' 2. AGE: 10, 20, 30, 40, 50, 60, 70, 80\n', ' 3. SEX: male, female\n', ' 4. STEROID: no, yes\n', ' 5. ANTIVIRALS: no, yes\n', ' 6. FATIGUE: no, yes\n', ' 7. MALAISE: no, yes\n', ' 8. ANOREXIA: no, yes\n', ' 9. LIVER BIG: no, yes\n', ' 10. LIVER FIRM: no, yes\n', ' 11. SPLEEN PALPABLE: no, yes\n', ' 12. SPIDERS: no, yes\n', ' 13. ASCITES: no, yes\n', ' 14. VARICES: no, yes\n', ' 15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00\n', ' -- see the note below\n', ' 16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250\n', ' 17. SGOT: 13, 100, 200, 300, 400, 500, \n', ' 18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0\n', ' 19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90\n', ' 20. HISTOLOGY: no, yes\n', '\n', ' The BILIRUBIN attribute appears to be continuously-valued. 
I checked\n', ' this with the donater, Bojan Cestnik, who replied:\n', '\n', ' About the hepatitis database and BILIRUBIN problem I would like to ' 'say\n', ' the following: BILIRUBIN is continuous attribute (= the number of ' "it's\n", ' "values" in the ASDOHEPA.DAT file is negative!!!); "values" are ' 'quoted\n', ' because when speaking about the continuous attribute there is no ' 'such \n', ' thing as all possible values. However, they represent so called\n', ' "boundary" values; according to these "boundary" values the ' 'attribute\n', ' can be discretized. At the same time, because of the continious\n', ' attribute, one can perform some other test since the continuous\n', ' information is preserved. I hope that these lines have at least ' 'roughly \n', ' answered your question. \n', '\n', '8. Missing Attribute Values: (indicated by "?")\n', ' Attribute Number: Number of Missing Values:\n', ' 1: 0\n', ' 2: 0\n', ' 3: 0\n', ' 4: 1\n', ' 5: 0\n', ' 6: 1\n', ' 7: 1\n', ' 8: 1\n', ' 9: 10\n', '\t\t 10: 11\n', '\t\t 11: 5\n', '\t\t 12: 5\n', '\t\t 13: 5\n', '\t\t 14: 5\n', '\t\t 15: 6\n', '\t\t 16: 29\n', '\t\t 17: 4\n', '\t\t 18: 16\n', '\t\t 19: 67\n', '\t\t 20: 0\n', '\n', '9. Class Distribution:\n', ' DIE: 32\n', ' LIVE: 123\n']
# Preview the first five rows to sanity-check the parsed columns.
hepatitis_df.head(n=5)
class | age | sex | steroid | antivirals | fatigue | malaise | anorexia | big_liver | firm_liver | palpable_spleen | spiders | ascites | varices | bilirubin | alk_phosphate | sgot | albumin | protime | histology | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1.0 | 2 | 2.0 | 2.0 | 2.0 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | 85.0 | 18.0 | 4.0 | NaN | 1 |
1 | 2 | 50 | 1 | 1.0 | 2 | 1.0 | 2.0 | 2.0 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 0.9 | 135.0 | 42.0 | 3.5 | NaN | 1 |
2 | 2 | 78 | 1 | 2.0 | 2 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 0.7 | 96.0 | 32.0 | 4.0 | NaN | 1 |
3 | 2 | 31 | 1 | NaN | 1 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 0.7 | 46.0 | 52.0 | 4.0 | 80.0 | 1 |
4 | 2 | 34 | 1 | 2.0 | 2 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | NaN | 200.0 | 4.0 | NaN | 1 |
# Run the project-local correlation analysis helper on the 'firm_liver' and
# 'big_liver' columns of hepatitis_df.
# NOTE(review): the recorded run of this cell emitted divide-by-zero and
# invalid-value RuntimeWarnings from statsmodels' kernel density code. Both
# columns contain missing values according to hepatitis.names (attributes 9
# and 10), which may be the cause -- confirm before trusting the plots.
analyze.correlation_analyze(hepatitis_df, 'firm_liver', 'big_liver')
/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/nonparametric/kernels.py:128: RuntimeWarning: divide by zero encountered in true_divide return (1. / np.sqrt(2 * np.pi)) * np.exp(-(Xi - x)**2 / (h**2 * 2.)) /home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/nonparametric/kernels.py:128: RuntimeWarning: invalid value encountered in true_divide return (1. / np.sqrt(2 * np.pi)) * np.exp(-(Xi - x)**2 / (h**2 * 2.)) /home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/nonparametric/_kernel_base.py:514: RuntimeWarning: invalid value encountered in true_divide dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])
# Correlation btw Numerical Columns
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/matplotlib/contour.py:1533: UserWarning: Warning: converting a masked element to nan. self.zmax = float(z.max()) /home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/matplotlib/contour.py:1534: UserWarning: Warning: converting a masked element to nan. self.zmin = float(z.min())
# Clamp any +/-inf values in 'big_liver' to large finite sentinels before the
# distribution analysis.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on the attribute-accessed Series: that chained
# in-place form may operate on a temporary object and leave the DataFrame
# unchanged, whereas an explicit assignment always updates hepatitis_df.
hepatitis_df['big_liver'] = hepatitis_df['big_liver'].replace(
    [np.inf, -np.inf], [9999999, -9999999]
)

# NOTE(review): in the recorded run this call still failed with
# "OverflowError: cannot convert float infinity to integer". The accompanying
# Bokeh warning reports a divide by zero while computing
# bin_count = ceil((max - min) / bin_width), so the infinity appears to be the
# computed histogram bin count (zero bin width), not a value in the data.
# The KS statistic of nan also suggests the column's missing values (10 per
# hepatitis.names) reach the tests -- confirm and handle NaNs / binning
# accordingly.
analyze.dist_analyze(hepatitis_df, 'big_liver')
Variance of big_liver 0.14367816092 Skewness of big_liver -1.7526378546 Kolmogrov - Smirnov test with distribution norm KstestResult(statistic=nan, pvalue=nan) Anderson-Darling normality test on big_liver Statistic: nan p-value: 0.000000
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) /home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) /home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/scipy/stats/_distn_infrastructure.py:1735: RuntimeWarning: invalid value encountered in greater_equal cond2 = (x >= self.b) & cond0 /home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py:185: RuntimeWarning: divide by zero encountered in double_scalars self.bin_count = np.ceil((self.values.max() - self.values.min())/self.bin_width)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~/anaconda3/envs/analytics/lib/python3.6/site-packages/numpy/core/function_base.py in _index_deprecate(i, stacklevel) 14 try: ---> 15 i = operator.index(i) 16 except TypeError: TypeError: 'numpy.float64' object cannot be interpreted as an integer During handling of the above exception, another exception occurred: OverflowError Traceback (most recent call last) <ipython-input-19-809fd4e45a6f> in <module>() 2 hepatitis_df.big_liver.replace(-np.inf, -9999999, inplace=True) 3 ----> 4 analyze.dist_analyze(hepatitis_df, 'big_liver') ~/playspace/data-science-utils/datascienceutils/analyze.py in dist_analyze(df, column, category, is_normal, bayesian_hist, kdeplot, violinplot) 53 if violinplot: 54 plots.append(plotter.sb_violinplot(df[column], inner='box')) ---> 55 plots.append(plotter.histogram(df, column, bayesian_bins=bayesian_hist)) 56 else: 57 if df[column].nunique() < 7: ~/playspace/data-science-utils/datascienceutils/plotter.py in histogram(histDF, values, bayesian_bins, **kwargs) 336 if not bayesian_bins: 337 from bokeh.charts import Histogram --> 338 return Histogram(histDF[values], **kwargs) 339 else: 340 import numpy as np ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/builder/histogram_builder.py in Histogram(data, values, label, color, agg, bins, yscale, xgrid, ygrid, continuous_range, **kw) 51 kw['bins'] = bins 52 ---> 53 return create_and_build(HistogramBuilder, data, **kw) 54 55 ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_builder.py in create_and_build(builder_class, *data, **kws) 62 chart_kws = { k:v for k,v in kws.items() if k not in builder_props} 63 chart = Chart(**chart_kws) ---> 64 chart.add_builder(builder) 65 chart.start_plot() 66 ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_chart.py in add_builder(self, builder) 132 def add_builder(self, builder): 133 
self._builders.append(builder) --> 134 builder.create(self) 135 136 def add_ranges(self, dim, range): ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_builder.py in create(self, chart) 301 if chart is None: 302 chart = Chart() --> 303 chart.add_renderers(self, renderers) 304 305 # handle ranges after renders, since ranges depend on aggregations ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_chart.py in add_renderers(self, builder, renderers) 127 128 def add_renderers(self, builder, renderers): --> 129 self.renderers += renderers 130 self._renderer_map.extend({ r._id : builder for r in renderers }) 131 ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/builder/bar_builder.py in _yield_renderers(self) 226 stack_label=self.get_label(group['stack']), 227 dodge_label=self.get_label(group['group']), --> 228 **group_kwargs) 229 230 self.add_glyph(group, bg) ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/glyphs.py in __init__(self, values, label, color, bin_count, **kwargs) 467 kwargs.pop('width', None) 468 --> 469 super(HistogramGlyph, self).__init__(**kwargs) 470 471 def _set_sources(self): ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_models.py in __init__(self, **kwargs) 59 60 super(CompositeGlyph, self).__init__(**kwargs) ---> 61 self.setup() 62 63 def setup(self): ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_models.py in setup(self) 62 63 def setup(self): ---> 64 self.renderers = [renderer for renderer in self.build_renderers()] 65 if self.renderers is not None: 66 self.refresh() ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/_models.py in <listcomp>(.0) 62 63 def setup(self): ---> 64 self.renderers = [renderer for renderer in self.build_renderers()] 65 if self.renderers is not None: 66 self.refresh() ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/glyphs.py in build_renderers(self) 476 477 def 
build_renderers(self): --> 478 self.bins = Bins(values=self.values, bin_count=self.bin_count) 479 self.centers = [bin.center for bin in self.bins.bins] 480 self.bin_width = self.centers[1] - self.centers[0] ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in __init__(self, values, column, bins, **properties) 162 properties['column'] = column 163 properties['bins'] = bins --> 164 super(Bins, self).__init__(**properties) 165 166 def update(self): ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in __init__(self, **properties) 21 def __init__(self, **properties): 22 super(Stat, self).__init__(**properties) ---> 23 self._refresh() 24 25 def _refresh(self): ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in _refresh(self) 27 if self.get_data() is not None: 28 self.update() ---> 29 self.calculate() 30 31 def set_data(self, data, column=None): ~/anaconda3/envs/analytics/lib/python3.6/site-packages/bokeh/charts/stats.py in calculate(self) 172 173 def calculate(self): --> 174 binned, bin_edges = pd.cut(self.get_data(), self.bin_count, retbins=True, precision=0) 175 176 df = pd.DataFrame(dict(values=self.get_data(), bins=binned)) ~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/reshape/tile.py in cut(x, bins, right, labels, retbins, precision, include_lowest) 116 bins = np.linspace(mn, mx, bins + 1, endpoint=True) 117 else: # adjust end points after binning --> 118 bins = np.linspace(mn, mx, bins + 1, endpoint=True) 119 adj = (mx - mn) * 0.001 # 0.1% of the range 120 if right: ~/anaconda3/envs/analytics/lib/python3.6/site-packages/numpy/core/function_base.py in linspace(start, stop, num, endpoint, retstep, dtype) 99 """ 100 # 2016-02-25, 1.12 --> 101 num = _index_deprecate(num) 102 if num < 0: 103 raise ValueError("Number of samples, %s, must be non-negative." 
% num) ~/anaconda3/envs/analytics/lib/python3.6/site-packages/numpy/core/function_base.py in _index_deprecate(i, stacklevel) 17 msg = ("object of type {} cannot be safely interpreted as " 18 "an integer.".format(type(i))) ---> 19 i = int(i) 20 stacklevel += 1 21 warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel) OverflowError: cannot convert float infinity to integer
# Check whether 'big_liver' contains any infinite value (+inf or -inf).
# Series.any() does not accept a predicate: its first positional argument is
# ``axis``, so passing a lambda raises "ValueError: No axis named <function
# <lambda>>". Build the boolean mask first, then reduce it with any().
np.isinf(hepatitis_df['big_liver']).any()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-20-90bf37d0c1f3> in <module>() ----> 1 hepatitis_df.big_liver.any(lambda x: x==np.inf) ~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/generic.py in logical_func(self, axis, bool_only, skipna, level, **kwargs) 6419 return self._reduce(f, axis=axis, skipna=skipna, 6420 numeric_only=bool_only, filter_type='bool', -> 6421 name=name) 6422 6423 return set_function_name(logical_func, name, cls) ~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds) 2374 if isinstance(delegate, np.ndarray): 2375 # Validate that 'axis' is consistent with Series's single axis. -> 2376 self._get_axis_number(axis) 2377 if numeric_only: 2378 raise NotImplementedError('Series.{0} does not implement ' ~/anaconda3/envs/analytics/lib/python3.6/site-packages/pandas/core/generic.py in _get_axis_number(self, axis) 351 pass 352 raise ValueError('No axis named {0} for object type {1}' --> 353 .format(axis, type(self))) 354 355 def _get_axis_name(self, axis): ValueError: No axis named <function <lambda> at 0x7f6d4ea48ea0> for object type <class 'pandas.core.series.Series'>