# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook()
# Load the FiveThirtyEight murder datasets: the final 2015 figures and the
# preliminary 2016 figures.
murder_2015 = pd.read_csv('./data/fivethirtyeight_data/murder_2016/murder_2015_final.csv')
murder_2016 = pd.read_csv('./data/fivethirtyeight_data/murder_2016/murder_2016_prelim.csv')

# Partition the 2015 columns by dtype. The non-numeric columns are kept as a
# list in their original column order (rather than an unordered set) so that
# the per-column plots further down are produced in the same order on every run.
numericalCols = murder_2015.select_dtypes(include=[np.number]).columns
catCols = [col for col in murder_2015.columns if col not in numericalCols]

# Summary statistics for the numeric columns of the 2015 data.
murder_2015.describe()
2014_murders | 2015_murders | change | |
---|---|---|---|
count | 83.000000 | 83.000000 | 83.000000 |
mean | 65.746988 | 75.481928 | 9.734940 |
std | 79.011244 | 91.684289 | 21.858795 |
min | 0.000000 | 1.000000 | -19.000000 |
25% | 19.500000 | 22.500000 | -3.000000 |
50% | 32.000000 | 39.000000 | 4.000000 |
75% | 82.000000 | 94.000000 | 14.000000 |
max | 411.000000 | 478.000000 | 133.000000 |
# Summary statistics for the numeric columns of the 2016 preliminary data.
murder_2016.describe()
2015_murders | 2016_murders | change | |
---|---|---|---|
count | 79.000000 | 79.000000 | 79.000000 |
mean | 56.468354 | 62.379747 | 5.911392 |
std | 70.158764 | 81.051472 | 21.962199 |
min | 0.000000 | 1.000000 | -21.000000 |
25% | 13.000000 | 14.500000 | -3.000000 |
50% | 30.000000 | 30.000000 | 2.000000 |
75% | 71.500000 | 81.500000 | 9.000000 |
max | 378.000000 | 536.000000 | 158.000000 |
# Preview the first rows of the 2015 data.
murder_2015.head()
city | state | 2014_murders | 2015_murders | change | |
---|---|---|---|---|---|
0 | Baltimore | Maryland | 211 | 344 | 133 |
1 | Chicago | Illinois | 411 | 478 | 67 |
2 | Houston | Texas | 242 | 303 | 61 |
3 | Cleveland | Ohio | 63 | 120 | 57 |
4 | Washington | D.C. | 105 | 162 | 57 |
# Preview the first rows of the 2016 preliminary data.
murder_2016.head()
city | state | 2015_murders | 2016_murders | change | source | as_of | |
---|---|---|---|---|---|---|---|
0 | Chicago | Illinois | 378 | 536 | 158 | https://portal.chicagopolice.org/portal/page/p... | 10/2/2016 |
1 | Orlando | Florida | 19 | 73 | 54 | OPD | 9/22/2016 |
2 | Memphis | Tennessee | 114 | 158 | 44 | MPD | 9/11/2016 |
3 | Phoenix | Arizona | 72 | 111 | 39 | PPD | 8/31/2016 |
4 | Las Vegas | Nevada | 90 | 125 | 35 | http://www.lvmpd.com/Sections/Homicide/Homicid... | 9/28/2016 |
# Per-column variance of the 2015 data. The numeric columns are selected
# explicitly because the frame also holds string columns (city, state):
# older pandas dropped them silently, newer pandas raises on them instead.
murder_2015.select_dtypes(include=[np.number]).var()
2014_murders 6242.776668 2015_murders 8406.008816 change 477.806935 dtype: float64
# Preview the first rows of the 2016 preliminary data (same call as earlier in the notebook).
murder_2016.head()
city | state | 2015_murders | 2016_murders | change | source | as_of | |
---|---|---|---|---|---|---|---|
0 | Chicago | Illinois | 378 | 536 | 158 | https://portal.chicagopolice.org/portal/page/p... | 10/2/2016 |
1 | Orlando | Florida | 19 | 73 | 54 | OPD | 9/22/2016 |
2 | Memphis | Tennessee | 114 | 158 | 44 | MPD | 9/11/2016 |
3 | Phoenix | Arizona | 72 | 111 | 39 | PPD | 8/31/2016 |
4 | Las Vegas | Nevada | 90 | 125 | 35 | http://www.lvmpd.com/Sections/Homicide/Homicid... | 9/28/2016 |
# Preview the first rows of the 2015 data (same call as earlier in the notebook).
murder_2015.head()
city | state | 2014_murders | 2015_murders | change | |
---|---|---|---|---|---|
0 | Baltimore | Maryland | 211 | 344 | 133 |
1 | Chicago | Illinois | 411 | 478 | 67 |
2 | Houston | Texas | 242 | 303 | 61 |
3 | Cleveland | Ohio | 63 | 120 | 57 |
4 | Washington | D.C. | 105 | 162 | 57 |
# Per-column skewness of the 2016 preliminary data. The numeric columns are
# selected explicitly because the frame also holds string columns (city,
# state, source, as_of): older pandas dropped them silently, newer pandas
# raises on them instead.
murder_2016.select_dtypes(include=[np.number]).skew()
2015_murders 2.338236 2016_murders 3.194842 change 4.564821 dtype: float64
# Pairwise correlation matrix of the numeric columns in the 2015 data.
# Selecting the numeric columns first keeps this working on pandas versions
# that no longer drop string columns automatically, without relying on the
# `numeric_only` keyword that older pandas does not accept here.
murder_2015.select_dtypes(include=[np.number]).corr()
2014_murders | 2015_murders | change | |
---|---|---|---|
2014_murders | 1.000000 | 0.978106 | 0.487938 |
2015_murders | 0.978106 | 1.000000 | 0.658906 |
change | 0.487938 | 0.658906 | 1.000000 |
# Pairwise correlation matrix of the numeric columns in the 2016 preliminary
# data. Selecting the numeric columns first keeps this working on pandas
# versions that no longer drop string columns automatically, without relying
# on the `numeric_only` keyword that older pandas does not accept here.
murder_2016.select_dtypes(include=[np.number]).corr()
2015_murders | 2016_murders | change | |
---|---|---|---|
2015_murders | 1.000000 | 0.968022 | 0.377959 |
2016_murders | 0.968022 | 1.000000 | 0.598131 |
change | 0.377959 | 0.598131 | 1.000000 |
import itertools

# Run the correlation analysis once for every unordered pair of numeric
# columns in the 2015 data.
for first_col, second_col in itertools.combinations(numericalCols, 2):
    analyze.correlation_analyze(murder_2015, first_col, second_col)
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/seaborn/axisgrid.py:2262: UserWarning: The `size` paramter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
# Correlation btw Numerical Columns
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/seaborn/axisgrid.py:2262: UserWarning: The `size` paramter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
# Correlation btw Numerical Columns
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/seaborn/axisgrid.py:2262: UserWarning: The `size` paramter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
# Correlation btw Numerical Columns
# Distribution analysis for each numeric column of the 2015 data, with the
# KDE plot and the Bayesian histogram both enabled.
for col in numericalCols:
    plotter.show(
        analyze.dist_analyze(murder_2015, col, kdeplot=True, bayesian_hist=True)
    )

# Distribution analysis for each non-numeric column, with both of those
# options disabled.
for col in catCols:
    plotter.show(
        analyze.dist_analyze(murder_2015, col, kdeplot=False, bayesian_hist=False)
    )
Variance of 2014_murders 6242.776667646194 Skewness of 2014_murders 2.3420698513083105 Anderson-Darling normality test on 2014_murders Statistic: 7.964539 p-value: 0.000000
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/utils.py:333: RuntimeWarning: divide by zero encountered in log fit_vec = count_vec * (np.log(count_vec) - np.log(width))
Variance of 2015_murders 8406.008815750807 Skewness of 2015_murders 2.2976683796413684 Anderson-Darling normality test on 2015_murders Statistic: 7.968532 p-value: 0.000000
Variance of change 477.80693505730244 Skewness of change 2.8851316617705973 Anderson-Darling normality test on change Statistic: 5.367976 p-value: 0.000000
W-1002 (EMPTY_LAYOUT): Layout has no children: GridPlot, ViewModel:GridPlot, ref _id: 9e09d50d-241c-4097-9396-f76b9976ba4e
Too many categories for col: city can't plot pie-chart
W-1002 (EMPTY_LAYOUT): Layout has no children: GridPlot, ViewModel:GridPlot, ref _id: 8b304036-828d-4475-8c03-2fa6ae4c9009
Too many categories for col: state can't plot pie-chart
#irisDf.drop('filename', inplace=True, axis=1)
# Distribution analysis for each numeric column of the 2015 data with the
# Bayesian histogram enabled; `kdeplot` is not passed, so the library default
# applies.
for col in numericalCols:
    plotter.show(analyze.dist_analyze(murder_2015, col, bayesian_hist=True))
Variance of 2014_murders 6242.776667646194 Skewness of 2014_murders 2.3420698513083105 Anderson-Darling normality test on 2014_murders Statistic: 7.964539 p-value: 0.000000
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/utils.py:333: RuntimeWarning: divide by zero encountered in log fit_vec = count_vec * (np.log(count_vec) - np.log(width))
Variance of 2015_murders 8406.008815750807 Skewness of 2015_murders 2.2976683796413684 Anderson-Darling normality test on 2015_murders Statistic: 7.968532 p-value: 0.000000
Variance of change 477.80693505730244 Skewness of change 2.8851316617705973 Anderson-Darling normality test on change Statistic: 5.367976 p-value: 0.000000
# Regression analysis over the numeric columns of the 2015 data, with the
# VIF, heteroskedasticity and distribution-similarity checks switched off.
analyze.regression_analyze(murder_2015, numericalCols, check_vif=False, check_heteroskedasticity=False, check_dist_similarity=False)
Regression Score: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 0.9566917970915009 Regression Score: Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001)
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
0.9566917970878501 Regression Score: RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None, store_cv_values=False) 0.9566917967264349 Regression Score: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.9566917775678587 Regression Score: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.9566917748470272 Regression Score: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) -0.14780946201574063 Regression Score: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False) 0.23808319065208286 Regression Score: Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001) 0.23808319065117434 Regression Score: RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None, store_cv_values=False) 0.23808319056123206 Regression Score: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.23808284717463082 Regression Score: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.23808308005289935 Regression Score: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) -0.014026808221444975 Regression Score: LinearRegression(copy_X=True, fit_intercept=True, 
n_jobs=1, normalize=False) 0.43415756700467234 Regression Score: Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001) 0.43415756700375835 Regression Score: RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None, store_cv_values=False) 0.4341575669132972 Regression Score: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.43415731191892704 Regression Score: ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False) 0.43415748163375684 Regression Score: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) -0.003979130622483451
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. 
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. 
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. 
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:318: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead. /home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:325: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
# Non-linear regression analysis over the numeric columns of the 2015 data,
# with the VIF and heteroskedasticity checks switched off.
# NOTE(review): in the recorded run this call fails with a ValueError raised
# inside the third-party `ace` package ("truth value of an array with more
# than one element is ambiguous"); confirm the installed ace/numpy versions
# before relying on this cell.
analyze.non_linear_regression_analyze(murder_2015, numericalCols, check_vif=False, check_heteroskedasticity=False)
/home/anand/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py:241: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-18-b3e39473f127> in <module>() ----> 1 analyze.non_linear_regression_analyze(murder_2015, numericalCols, check_vif=False, check_heteroskedasticity=False) ~/.virtualenvs/dsutils/lib/python3.6/site-packages/datascienceutils-1.3.11-py3.6.egg/datascienceutils/analyze.py in non_linear_regression_analyze(df, target_cols, trainsize, **kwargs) 239 import ace 240 model = ace.model.Model() --> 241 model.build_model_from_xy([df[col1].as_matrix()], [df[col2].as_matrix()]) 242 243 print(" # Ace Models btw numerical cols") ~/.virtualenvs/dsutils/lib/python3.6/site-packages/ace/model.py in build_model_from_xy(self, x_values, y_values) 65 """ 66 self.init_ace(x_values, y_values) ---> 67 self.run_ace() 68 self.build_interpolators() 69 ~/.virtualenvs/dsutils/lib/python3.6/site-packages/ace/model.py in run_ace(self) 78 Perform the ACE calculation 79 """ ---> 80 self.ace.solve() 81 82 def build_interpolators(self): ~/.virtualenvs/dsutils/lib/python3.6/site-packages/ace/ace.py in solve(self) 67 """ 68 self._initialize() ---> 69 while self._outer_error_is_decreasing() and self._outer_iters < MAX_OUTERS: 70 print(('* Starting outer iteration {0:03d}. Current err = {1:12.5E}' 71 ''.format(self._outer_iters, self._last_outer_error))) ~/.virtualenvs/dsutils/lib/python3.6/site-packages/ace/ace.py in _outer_error_is_decreasing(self) 105 True if outer iteration error is decreasing 106 """ --> 107 is_decreasing, self._last_outer_error = self._error_is_decreasing(self._last_outer_error) 108 return is_decreasing 109 ~/.virtualenvs/dsutils/lib/python3.6/site-packages/ace/ace.py in _error_is_decreasing(self, last_error) 113 """ 114 current_error = self._compute_error() --> 115 if current_error < last_error: 116 is_decreasing = True 117 else: ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
# Bayesian regression analysis over the numeric columns of the 2015 data.
# The recorded output shows NUTS sampling with 4 chains per fitted model.
analyze.bayesian_regression_analyze(murder_2015, numericalCols)
WARNING (theano.tensor.blas): We did not find a dynamic library in the library_dir of the library we use for blas. If you use ATLAS, make sure to compile it with dynamics library. logp = -374.11, ||grad|| = 0.16063: 100%|██████████| 34/34 [00:00<00:00, 2305.68it/s] Multiprocess sampling (4 chains in 4 jobs) NUTS: [sd, x, Intercept] logp = -374.11, ||grad|| = 0.10594: 100%|██████████| 33/33 [00:00<00:00, 1889.76it/s] Multiprocess sampling (4 chains in 4 jobs) NUTS: [sd, x, Intercept] logp = -361.53, ||grad|| = 0.0022813: 100%|██████████| 32/32 [00:00<00:00, 2206.95it/s] Multiprocess sampling (4 chains in 4 jobs) NUTS: [sd, x, Intercept]