# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from sklearn import cross_validation
from sklearn import metrics
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook()
from sqlalchemy import create_engine
# [notebook stderr output captured with the import cell]
# /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44:
# DeprecationWarning: This module was deprecated in version 0.18 in favor of the
# model_selection module, into which all the refactored classes and functions are moved.
# Also note that the interface of the new CV iterators is different from that of this
# module. This module will be removed in 0.20.
# Read the training frame out of the HDF5 store (opened read-only).
# The name is bound to None first so it exists even if the read fails.
twoSigmaDf = None
with pd.HDFStore("/home/anand/DataScientist/data/train.h5", "r") as train:
    # Note that the "train" dataframe is the only dataframe in the file
    twoSigmaDf = train["train"]
# Exploratory pass over the full training frame. The bare expressions below
# only produce output when run as notebook cells; in a plain script their
# results are computed and discarded.
twoSigmaDf.describe()
twoSigmaDf.columns
# Summary of the target y restricted to the rows with id == 10.
twoSigmaDf[twoSigmaDf.id==10]['y'].describe()
# Per-id count of non-null values in every column.
grouped_ids = twoSigmaDf.groupby('id').count()
# Same target summary for the rows with id == 12.
twoSigmaDf[twoSigmaDf.id==12]['y'].describe()
# Number of distinct timestamp values.
twoSigmaDf['timestamp'].nunique()
twoSigmaDf.head()
# NOTE(review): the correlation matrix is computed twice in a row (once for
# display, once to keep in `a`); the first call could display `a` instead.
twoSigmaDf.corr()
a = twoSigmaDf.corr()
# Random 100,000-row subset used by the first scoring cell below. No
# random_state is passed, so the rows drawn differ from run to run.
sampleDf = twoSigmaDf.sample(n=100000)
# presumably plots/summarises the distribution of the named column -- confirm
# against datascienceutils.analyze.
analyze.dist_analyze(twoSigmaDf, 'timestamp')
analyze.dist_analyze(twoSigmaDf, 'y')
# Partition the rows by target value into a lower tail, an upper tail and the
# central band between them.
# NOTE(review): the bounds are strict on both sides, so rows with y exactly
# equal to -0.08 or 0.09 land in none of the three frames, whereas
# get_prediction/get_category treat those values as tail values -- confirm
# which is intended.
neg_model = twoSigmaDf[twoSigmaDf.y < -0.08]
pos_model = twoSigmaDf[twoSigmaDf.y > 0.09]
# A single combined mask selects the same rows as the former chained
# df[mask1][mask2] form, without building an intermediate frame and without
# pandas having to reindex the second (full-length) mask against it.
central_model = twoSigmaDf[(twoSigmaDf.y > -0.08) & (twoSigmaDf.y < 0.09)]
# Both tails together, i.e. everything outside the central band.
non_central_model = pd.concat([neg_model, pos_model])
# Inspect each partition of the frame. As above, the bare expressions only
# show output when run as notebook cells.
# NOTE(review): dist_analyze is called here without a column argument, unlike
# the earlier calls on the full frame -- presumably it then covers every
# column; confirm against datascienceutils.analyze.
analyze.dist_analyze(non_central_model)
analyze.dist_analyze(neg_model)
analyze.dist_analyze(pos_model)
analyze.dist_analyze(central_model)
len(neg_model.columns)
neg_model.describe()
pos_model.describe()
central_model.describe()
central_model.corr()
# Score a constant prediction against the 100,000-row sample `sampleDf`.
prediction = 2.217474e-04
# Baseline here is the mean of y over the FULL frame, not over the sample.
mean = twoSigmaDf['y'].mean()
# Despite the column names these hold per-row squared errors, not RMSE values:
# squared distance of y from the constant prediction and from the baseline mean.
sampleDf['rmse_pred'] = sampleDf['y'].apply(lambda x: (x-prediction)**2)
sampleDf['rmse_mean'] = sampleDf['y'].apply(lambda x: (x-mean)**2)
# 1 - (sum of squared errors of the prediction) / (sum of squared errors of the baseline).
r_square = 1 - sampleDf['rmse_pred'].sum()/sampleDf['rmse_mean'].sum()
out = np.sqrt(abs(r_square))
# NOTE(review): the printed value is always negated, whatever the sign of
# r_square; the two later scoring cells print (out, r_square) instead.
# Confirm whether the sign was meant to follow r_square.
print(out* -1)
# Repeat the constant-prediction score on a fresh random 300,000-row sample
# (no random_state, so the draw is not reproducible). sample2Df is reused by
# the ANOVA cell further down.
sample2Df = twoSigmaDf.sample(n=300000)
prediction = 2.217474e-04
# Unlike the 100k cell above, the baseline is the mean of the sample itself.
mean = sample2Df['y'].mean()
# Per-row squared errors (not RMSE, despite the names); these two columns are
# added to sample2Df and stay there.
sample2Df['rmse_pred'] = sample2Df['y'].apply(lambda x: (x-prediction)**2)
sample2Df['rmse_mean'] = sample2Df['y'].apply(lambda x: (x-mean)**2)
r_square = 1 - sample2Df['rmse_pred'].sum()/sample2Df['rmse_mean'].sum()
# Unsigned magnitude; the sign has to be read off r_square, printed alongside.
out = np.sqrt(abs(r_square))
print(out, r_square)
# Same score on a random 600,000-row sample, this time predicting a constant 0.
sample6Df = twoSigmaDf.sample(n=600000)
prediction = 0
# Baseline is the mean of y within this sample.
mean = sample6Df['y'].mean()
# Per-row squared errors (not RMSE, despite the names), stored on sample6Df.
sample6Df['rmse_pred'] = sample6Df['y'].apply(lambda x: (x-prediction)**2)
sample6Df['rmse_mean'] = sample6Df['y'].apply(lambda x: (x-mean)**2)
r_square = 1 - sample6Df['rmse_pred'].sum()/sample6Df['rmse_mean'].sum()
# Unsigned magnitude; the sign has to be read off r_square, printed alongside.
out = np.sqrt(abs(r_square))
print(out, r_square)
# Module-level values read by the cells and helpers that follow.
prediction = 0
# Mean of y over the full frame; the full-frame scoring cell below uses it as
# the baseline.
mean = twoSigmaDf['y'].mean()
# Mean of y inside the central band; get_prediction returns it for mid-range values.
central_mean = central_model['y'].mean()
def get_prediction(x):
    """Return the constant predicted for the band that ``x`` falls into.

    Values at or below -0.08 map to -0.085, values at or above 0.09 map to
    0.095, and anything else maps to the module-level ``central_mean``.
    """
    # Lower tail first, then upper tail; whatever is left is the central band.
    if x <= -0.08:
        return -0.085
    if x >= 0.09:
        return 0.095
    return central_mean
def get_r_square(prediction, actual):
    """Return R^2 of ``prediction`` against ``actual``.

    Computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of
    squared differences between ``actual`` and ``prediction`` and ``SS_tot``
    is the sum of squared differences between ``actual`` and its own mean.

    Args:
        prediction: Sequence of predicted values (list, array or Series).
        actual: Sequence of observed values, same length as ``prediction``.

    Returns:
        The R^2 score as a numpy float. It is negative when the predictions
        fit worse than the mean of ``actual``. If ``actual`` has zero
        variance the division yields nan or -inf rather than raising.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; the exception type is unchanged for callers.
    if len(prediction) != len(actual):
        raise AssertionError("Lengths of actuals and predictions should be equal")
    # Convert to plain arrays so values are paired positionally (as zip() did),
    # never aligned on a pandas index, and the arithmetic runs in float64.
    predicted = np.asarray(prediction, dtype=float)
    observed = np.asarray(actual, dtype=float)
    ss_res = np.sum((observed - predicted) ** 2)
    ss_tot = np.sum((observed - observed.mean()) ** 2)
    return 1 - ss_res / ss_tot
# Score the banded constant prediction over the whole frame.
# NOTE(review): get_prediction is fed y itself, so the "prediction" is derived
# from the target it is scored against -- confirm this is only meant as an
# upper bound for a perfect band classifier.
#twoSigmaDf['prediction'] = twoSigmaDf['y'].apply(lambda x: get_prediction(x))
# Per-row squared errors (not RMSE, despite the names). Both columns are added
# to twoSigmaDf itself and therefore show up in every later use of the frame.
twoSigmaDf['rmse_pred'] = twoSigmaDf['y'].apply(lambda x: (x-get_prediction(x))**2)
twoSigmaDf['rmse_mean'] = twoSigmaDf['y'].apply(lambda x: (x-mean)**2)
r_square = 1 - twoSigmaDf['rmse_pred'].sum()/twoSigmaDf['rmse_mean'].sum()
# Unsigned magnitude; the sign has to be read off r_square, printed alongside.
out = np.sqrt(abs(r_square))
print(out, r_square)
# Same score recomputed through the get_r_square helper.
print(get_r_square(twoSigmaDf['y'].apply(lambda x: get_prediction(x)), twoSigmaDf['y']))
def get_category(x):
    """Return the band label for ``x``.

    'Neg' for values at or below -0.08, "Pos" for values at or above 0.09,
    and "Mid" for everything else.
    """
    if x <= -0.08:
        return 'Neg'
    if x >= 0.09:
        return "Pos"
    return "Mid"
# Tag every row with its band label; the column is added to twoSigmaDf itself.
twoSigmaDf['category'] = twoSigmaDf['y'].apply(get_category)
# Per-band summary statistics of every column.
a = twoSigmaDf.groupby('category').describe()
# Transposed copy of the summary with its index moved into ordinary columns.
b = a.T.reset_index()
b.columns
from datascienceutils import utils

# ANOVA of the target y against each column, evaluated on the 300k sample.
# The columns are taken from sample2Df -- the frame actually passed to
# calculate_anova -- rather than from twoSigmaDf: twoSigmaDf gained a
# 'category' column after the sample was drawn, so iterating its columns asks
# for a column the sample does not have.
# NOTE(review): 'y' itself and the derived rmse_pred / rmse_mean columns are
# still included, as before -- confirm whether they should be skipped.
anova_dict = {
    col: utils.calculate_anova(sample2Df, 'y', col)
    for col in sample2Df.columns
}
print(anova_dict)
# Feature matrix for the band classifier: four columns of the training frame.
filteredDf = twoSigmaDf[['id', 'timestamp', 'technical_34', 'technical_20']]
# Replace missing values with the column means. The filled result is rebound
# instead of using fillna(inplace=True): filteredDf is a column selection of
# twoSigmaDf, and filling such a selection in place triggers pandas'
# SettingWithCopy warning and is not guaranteed to behave consistently.
filteredDf = filteredDf.fillna(filteredDf.mean())
# Band label per row ('Neg' / 'Mid' / 'Pos'), as produced by get_category.
target = twoSigmaDf['category']
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier to predict the band from the features.
gnb = GaussianNB()
gnb.fit(filteredDf, target)
def get_pred_by_category(x):
    """Return the numeric prediction for a band label.

    'Mid' gives 0 and 'Neg' gives -0.085; every other value (including
    'Pos') gives 0.095.
    """
    if x == 'Mid':
        return 0
    return -0.085 if x == 'Neg' else 0.095
# Convert the classifier's predicted band labels into numeric predictions,
# then score them against the true target values.
num_predictions = [get_pred_by_category(label) for label in gnb.predict(filteredDf)]
get_r_square(num_predictions, list(twoSigmaDf['y']))