Notebook

In [2]:

# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook()

from datascienceutils import analyze

BokehJS successfully loaded.

/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [3]:

irisDf = pd.read_csv('./data/Iris.csv')

In [4]:

from itertools import combinations
numColumns = irisDf.select_dtypes(include=[np.number]).columns
for combo in combinations(numColumns,2):
    analyze.regression_analyze(irisDf, combo[0], combo[1], check_vif=False, check_heteroskedasticity=False)

P-value and test statistic for distribution similarity between SepalLengthCm and SepalWidthCm

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-4-e858ef51aacb> in <module>()
      2 numColumns = irisDf.select_dtypes(include=[np.number]).columns
      3 for combo in combinations(numColumns,2):
----> 4     analyze.regression_analyze(irisDf, combo[0], combo[1], check_vif=False, check_heteroskedasticity=False)

~/playspace/data-science-utils/datascienceutils/analyze.py in regression_analyze(df, col1, col2, trainsize, non_linear, check_heteroskedasticity, check_vif, check_dist_similarity, **kwargs)
    312     if check_dist_similarity:
    313         print("P-value and test statistic for distribution similarity between %s and %s"%(col1, col2))
--> 314         is_similar_distribution(df[col1], df[col2])
    315 
    316     new_df = df[[col1, col2]].copy(deep=True)

~/playspace/data-science-utils/datascienceutils/analyze.py in is_similar_distribution(original_dist, target_dist, test_type)
    190 def is_similar_distribution(original_dist, target_dist, test_type='permutation'):
    191     if test_type=='permutation':
--> 192         from permute.core import two_sample
    193         kwargs = {'stat':'t','alternative':'two-sided','seed':20}
    194         p_value = two_sample(original_dist, target_dist)

ModuleNotFoundError: No module named 'permute'