# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Configure Bokeh to render plots inline in the notebook output cells.
output_notebook()
from datascienceutils import analyze
/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
from itertools import combinations

# Load the Iris dataset and run a pairwise regression analysis on every
# unordered pair of numeric columns.
irisDf = pd.read_csv('./data/Iris.csv')
numColumns = irisDf.select_dtypes(include=[np.number]).columns

# NOTE: when check_dist_similarity is left at its default, regression_analyze
# runs a permutation-based distribution-similarity test that imports
# `permute.core`. Install the `permute` package, or pass
# check_dist_similarity=False to skip that step.
for first_col, second_col in combinations(numColumns, 2):
    analyze.regression_analyze(
        irisDf,
        first_col,
        second_col,
        check_vif=False,
        check_heteroskedasticity=False,
    )
P-value and test statistic for distribution similarity between SepalLengthCm and SepalWidthCm
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-4-e858ef51aacb> in <module>()
      2 numColumns = irisDf.select_dtypes(include=[np.number]).columns
      3 for combo in combinations(numColumns,2):
----> 4     analyze.regression_analyze(irisDf, combo[0], combo[1], check_vif=False, check_heteroskedasticity=False)

~/playspace/data-science-utils/datascienceutils/analyze.py in regression_analyze(df, col1, col2, trainsize, non_linear, check_heteroskedasticity, check_vif, check_dist_similarity, **kwargs)
    312     if check_dist_similarity:
    313         print("P-value and test statistic for distribution similarity between %s and %s"%(col1, col2))
--> 314         is_similar_distribution(df[col1], df[col2])
    315
    316     new_df = df[[col1, col2]].copy(deep=True)

~/playspace/data-science-utils/datascienceutils/analyze.py in is_similar_distribution(original_dist, target_dist, test_type)
    190 def is_similar_distribution(original_dist, target_dist, test_type='permutation'):
    191     if test_type=='permutation':
--> 192         from permute.core import two_sample
    193         kwargs = {'stat':'t','alternative':'two-sided','seed':20}
    194         p_value = two_sample(original_dist, target_dist)

ModuleNotFoundError: No module named 'permute'