# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Direct Bokeh output to the notebook, passing the INLINE resources setting.
output_notebook(bokeh.resources.INLINE)
# Load the Titanic spreadsheet into a DataFrame and show summary statistics.
# NOTE(review): the frame is named irisDf although the file is titanic3.xls;
# the name is kept because every later cell refers to it.
irisDf = pd.read_excel('./data/titanic3.xls')
irisDf.describe()
pclass | survived | age | sibsp | parch | fare | body | |
---|---|---|---|---|---|---|---|
count | 1309.000000 | 1309.000000 | 1046.000000 | 1309.000000 | 1309.000000 | 1308.000000 | 121.000000 |
mean | 2.294882 | 0.381971 | 29.881135 | 0.498854 | 0.385027 | 33.295479 | 160.809917 |
std | 0.837836 | 0.486055 | 14.413500 | 1.041658 | 0.865560 | 51.758668 | 97.696922 |
min | 1.000000 | 0.000000 | 0.166700 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 2.000000 | 0.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 | 72.000000 |
50% | 3.000000 | 0.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 | 155.000000 |
75% | 3.000000 | 1.000000 | 39.000000 | 1.000000 | 0.000000 | 31.275000 | 256.000000 |
max | 3.000000 | 1.000000 | 80.000000 | 8.000000 | 9.000000 | 512.329200 | 328.000000 |
# Preview the first rows of the loaded frame.
irisDf.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
# Per-column variance of the numeric columns.
# numeric_only=True is passed explicitly: the frame also holds text columns
# (name, sex, cabin, ...), and recent pandas releases raise on them instead of
# silently dropping them as the recorded run did.
irisDf.var(numeric_only=True)
pclass 0.701969 survived 0.236250 age 207.748974 sibsp 1.085052 parch 0.749195 fare 2678.959738 body 9544.688567 dtype: float64
# Per-column skewness of the numeric columns.
# numeric_only=True is passed explicitly: the frame also holds text columns
# (name, sex, cabin, ...), and recent pandas releases raise on them instead of
# silently dropping them as the recorded run did.
irisDf.skew(numeric_only=True)
pclass -0.598647 survived 0.486404 age 0.407672 sibsp 3.844220 parch 3.669078 fare 4.367709 body 0.091739 dtype: float64
# Pairwise correlation matrix of the numeric columns.
# The numeric columns are selected explicitly so the call does not depend on
# pandas dropping the text columns (name, sex, cabin, ...) on its own; recent
# pandas releases raise on them instead. select_dtypes is used rather than a
# numeric_only argument so the cell also runs on older pandas versions.
irisDf.select_dtypes(include=[np.number]).corr()
pclass | survived | age | sibsp | parch | fare | body | |
---|---|---|---|---|---|---|---|
pclass | 1.000000 | -0.312469 | -0.408106 | 0.060832 | 0.018322 | -0.558629 | -0.034642 |
survived | -0.312469 | 1.000000 | -0.055513 | -0.027825 | 0.082660 | 0.244265 | NaN |
age | -0.408106 | -0.055513 | 1.000000 | -0.243699 | -0.150917 | 0.178739 | 0.058809 |
sibsp | 0.060832 | -0.027825 | -0.243699 | 1.000000 | 0.373587 | 0.160238 | -0.099961 |
parch | 0.018322 | 0.082660 | -0.150917 | 0.373587 | 1.000000 | 0.221539 | 0.051099 |
fare | -0.558629 | 0.244265 | 0.178739 | 0.160238 | 0.221539 | 1.000000 | -0.043110 |
body | -0.034642 | NaN | 0.058809 | -0.099961 | 0.051099 | -0.043110 | 1.000000 |
# List the names of the columns with a numeric dtype.
irisDf.select_dtypes(include=[np.number]).columns
Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'body'], dtype='object')
# Forward-fill missing values in place: each NaN takes the last valid value
# above it in the same column. DataFrame.ffill is the direct spelling of
# fillna(method='pad'), whose `method` argument is deprecated in recent pandas.
# NOTE(review): forward-fill depends on row order and leaves a column's leading
# NaNs unfilled; confirm this is the intended imputation for this data.
irisDf.ffill(inplace=True)
# Run correlation_analyze on irisDf with the 'pclass' and 'survived' columns.
# NOTE(review): exclude_columns='Id' and categories=['sex', 'cabin', 'home.dest']
# were previously commented out inside this call; whether correlation_analyze
# accepts them is not visible from this file.
analyze.correlation_analyze(irisDf, 'pclass', 'survived')
# Correlation btw Numerical Columns
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/numpy/ma/core.py:6385: MaskedArrayFutureWarning: In the future the default for ma.minimum.reduce will be axis=0, not the current None, to match np.minimum.reduce. Explicitly pass 0 or None to silence this warning. return self.reduce(a) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/numpy/ma/core.py:6385: MaskedArrayFutureWarning: In the future the default for ma.maximum.reduce will be axis=0, not the current None, to match np.maximum.reduce. Explicitly pass 0 or None to silence this warning. return self.reduce(a)
# Distribution analysis of the 'pclass' column (prints variance, skewness and
# normality-test results, as seen in the recorded output).
analyze.dist_analyze(irisDf, 'pclass')
Variance of pclass 0.701969194684 Skewness of pclass -0.59864711028 Kolmogrov - Smirnov test with distribution norm KstestResult(statistic=0.84134474606854293, pvalue=0.0) Anderson-Darling normality test on pclass Statistic: 157.928244 p-value: 0.000000
# Distribution analysis of the 'age' column.
# 'age' has missing values (1046 non-null of 1309 rows in describe()). Passing
# them through makes the normality-test statistics NaN and the histogram step
# raise "ValueError: range parameter must be finite", so only the rows where
# age is present are analysed.
analyze.dist_analyze(irisDf.dropna(subset=['age']), 'age')
Variance of age 207.7489736 Skewness of age 0.407671886498 Kolmogrov - Smirnov test with distribution norm KstestResult(statistic=nan, pvalue=nan) Anderson-Darling normality test on age Statistic: nan p-value: 0.000000
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater return (self.a < x) & (x < self.b) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less return (self.a < x) & (x < self.b) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:1735: RuntimeWarning: invalid value encountered in greater_equal cond2 = (x >= self.b) & cond0
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-20-64d6b3eb7283> in <module>() ----> 1 analyze.dist_analyze(irisDf, 'age') /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/analyze.py in dist_analyze(df, column, category, is_normal, bayesian_hist, kdeplot, violinplot) 53 if violinplot: 54 plots.append(plotter.sb_violinplot(df[column], inner='box')) ---> 55 plots.append(plotter.histogram(df, column, bayesian_bins=bayesian_hist)) 56 else: 57 if df[column].nunique() < 7: /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/plotter.py in histogram(histDF, values, bayesian_bins, **kwargs) 350 if not bayesian_bins: 351 from bokeh.charts import Histogram --> 352 return Histogram(histDF[values], **kwargs) 353 else: 354 import numpy as np /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/builders/histogram_builder.py in Histogram(data, values, label, color, agg, bins, yscale, xgrid, ygrid, continuous_range, **kw) 104 kw['bins'] = bins 105 --> 106 return create_and_build(HistogramBuilder, data, **kw) 107 108 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/builder.py in create_and_build(builder_class, *data, **kws) 71 chart_kws = {k: v for k, v in kws.items() if k not in builder_props} 72 chart = Chart(**chart_kws) ---> 73 chart.add_builder(builder) 74 chart.start_plot() 75 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/chart.py in add_builder(self, builder) 169 def add_builder(self, builder): 170 self._builders.append(builder) --> 171 builder.create(self) 172 173 def add_ranges(self, dim, range): /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/builder.py in create(self, chart) 535 if chart is None: 536 chart = Chart() --> 537 
chart.add_renderers(self, renderers) 538 539 # handle ranges after renders, since ranges depend on aggregations /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/chart.py in add_renderers(self, builder, renderers) 164 165 def add_renderers(self, builder, renderers): --> 166 self.renderers += renderers 167 self._renderer_map.extend({ r._id : builder for r in renderers }) 168 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/property_containers.py in wrapper(*args, **kwargs) 16 self = args[0] 17 old = self._saved_copy() ---> 18 result = func(*args, **kwargs) 19 self._notify_owners(old) 20 return result /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/property_containers.py in __iadd__(self, y) 75 @notify_owner 76 def __iadd__(self, y): ---> 77 return super(PropertyValueList, self).__iadd__(y) 78 79 # x *= y /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/builders/bar_builder.py in yield_renderers(self) 218 stack_label=self._get_label(group['stack']), 219 dodge_label=self._get_label(group['group']), --> 220 **group_kwargs) 221 222 self.add_glyph(group, bg) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/glyphs.py in __init__(self, values, label, color, bins, **kwargs) 947 self._bins = bins 948 --> 949 super(HistogramGlyph, self).__init__(**kwargs) 950 self.setup() 951 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/glyphs.py in __init__(self, x_label, **kwargs) 487 kwargs['x_label'] = str(label) 488 --> 489 super(AggregateGlyph, self).__init__(**kwargs) 490 491 def get_dodge_label(self, shift=0.0): /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/models.py in __init__(self, **properties) 79 properties['values'] = [vals] 80 super(CompositeGlyph, self).__init__(**properties) ---> 81 self.setup() 82 83 def setup(self): 
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/models.py in setup(self) 83 def setup(self): 84 """Build renderers and data source and set sources on renderers.""" ---> 85 self.renderers = [renderer for renderer in self.build_renderers()] 86 if self.renderers is not None: 87 self.refresh() /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/models.py in <listcomp>(.0) 83 def setup(self): 84 """Build renderers and data source and set sources on renderers.""" ---> 85 self.renderers = [renderer for renderer in self.build_renderers()] 86 if self.renderers is not None: 87 self.refresh() /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/glyphs.py in build_renderers(self) 963 # users specify other bins other the Histogram Stat 964 self.bins = Histogram(values=self.values, bins=self._bins, --> 965 density=self.density) 966 967 bars = [] /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/stats.py in __init__(self, values, column, bins, stat, source, **properties) 309 properties['source'] = source 310 self._bins = bins --> 311 super(BinnedStat, self).__init__(**properties) 312 313 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/stats.py in __init__(self, **properties) 54 55 super(Stat, self).__init__(**properties) ---> 56 self._refresh() 57 58 def _refresh(self): /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/stats.py in _refresh(self) 60 if self.get_data() is not None: 61 self.update() ---> 62 self.calculate() 63 64 def set_data(self, data, column=None): /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/charts/stats.py in calculate(self) 426 427 binned, bin_bounds = np.histogram( --> 428 np.array(data), density=self.density, bins=bins 429 ) 430 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/numpy/lib/function_base.py in histogram(a, bins, range, normed, weights, 
density) 668 if not np.all(np.isfinite([mn, mx])): 669 raise ValueError( --> 670 'range parameter must be finite.') 671 if mn == mx: 672 mn -= 0.5 ValueError: range parameter must be finite.
# Run regression_analyze on the 'age' and 'fare' columns with the VIF and
# heteroskedasticity checks switched off.
analyze.regression_analyze(irisDf, 'age', 'fare', check_vif=False, check_heteroskedasticity=False)
P-value and test statistic for distribution similarity between age and fare (0.98592999999999997, -3.2483728800611154) Regression Score: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 0.0247252618632 Regression Score: Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001) 0.0247252618628 Regression Score: RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None, store_cv_values=False) 0.0247252618263 Regression Score: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.0247233709119 Regression Score: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.0247240899196 Regression Score: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) -0.0730409020908
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.2.19-py3.5.egg/datascienceutils/predictiveModels.py:31: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead source = dataframe[column].reshape((len(target), 1)).tolist()
# Preview the first rows of the frame again after the analysis calls above.
irisDf.head()
pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |