In [1]:

# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import predictiveModels as pm

# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random

from sklearn import cross_validation
from sklearn import metrics

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook()

from sqlalchemy import create_engine

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Loading BokehJS ...

In [4]:

twoSigmaDf = None
# Open the HDF5 store read-only; the with-block closes it again on exit.
# The file holds a single dataframe, stored under the key "train".
with pd.HDFStore("/home/anand/DataScientist/data/train.h5", mode="r") as train:
    twoSigmaDf = train["train"]

In [ ]:

twoSigmaDf.describe()

In [ ]:

twoSigmaDf.columns

In [ ]:

twoSigmaDf[twoSigmaDf.id==10]['y'].describe()

In [ ]:

grouped_ids = twoSigmaDf.groupby('id').count()

In [ ]:

twoSigmaDf[twoSigmaDf.id==12]['y'].describe()

In [ ]:

twoSigmaDf['timestamp'].nunique()

In [ ]:

twoSigmaDf.head()

In [ ]:

twoSigmaDf.corr()

In [ ]:

a = twoSigmaDf.corr()

In [ ]:

# Draw a 100k-row working sample. random_state pins the draw so every cell
# that reads sampleDf produces the same numbers after a kernel restart.
sampleDf = twoSigmaDf.sample(n=100000, random_state=0)

In [ ]:

analyze.dist_analyze(twoSigmaDf, 'timestamp')

In [ ]:

analyze.dist_analyze(twoSigmaDf, 'y')

In [ ]:

# Split the rows into bands by the target value y:
#   neg_model      y < -0.08
#   pos_model      y >  0.09
#   central_model  -0.08 < y < 0.09
# NOTE(review): rows with y exactly -0.08 or 0.09 land in none of the bands,
# while get_prediction/get_category further down use <= / >= -- confirm intent.
neg_model = twoSigmaDf[twoSigmaDf.y < -0.08]
pos_model = twoSigmaDf[twoSigmaDf.y > 0.09]
# Both bounds go into ONE mask. Chaining two [] lookups applies a mask built
# on the full frame to an already-filtered frame, which pandas has to reindex
# (and warns about); the combined mask selects the same rows directly.
central_model = twoSigmaDf[(twoSigmaDf.y > -0.08) & (twoSigmaDf.y < 0.09)]
# Both tails stacked into one frame (negative rows first, then positive).
non_central_model = pd.concat([neg_model, pos_model])

In [ ]:

analyze.dist_analyze(non_central_model)

In [ ]:

analyze.dist_analyze(neg_model)

In [ ]:

analyze.dist_analyze(pos_model)

In [ ]:

analyze.dist_analyze(central_model)

In [ ]:

len(neg_model.columns)

In [ ]:

neg_model.describe()

In [ ]:

pos_model.describe()

In [ ]:

central_model.describe()

In [ ]:

central_model.corr()

In [ ]:

# Score a constant prediction on sampleDf with a hand-rolled R^2:
#   1 - sum((y - prediction)^2) / sum((y - mean)^2)
prediction = 2.217474e-04
# NOTE(review): the baseline mean is taken from the FULL frame while the sums
# below run over sampleDf only; the two sampling cells further down use the
# sample's own mean instead -- confirm which baseline is intended.
mean = twoSigmaDf['y'].mean()
# Despite the 'rmse_' prefix these columns hold per-row squared errors
# (no averaging and no square root is applied here).
sampleDf['rmse_pred'] = sampleDf['y'].apply(lambda x: (x-prediction)**2)
sampleDf['rmse_mean'] = sampleDf['y'].apply(lambda x: (x-mean)**2)

# Ratio of the two column sums; negative when the constant does worse than the mean.
r_square = 1 - sampleDf['rmse_pred'].sum()/sampleDf['rmse_mean'].sum()

In [ ]:

out = np.sqrt(abs(r_square))
print(out* -1)

In [ ]:

# Repeat the constant-prediction R^2 check on a 300k-row sample.
# random_state pins the draw so the printed values are reproducible;
# without it every run scores a different random subset.
sample2Df = twoSigmaDf.sample(n=300000, random_state=0)
prediction = 2.217474e-04
# Baseline is the mean of this sample's own y values.
mean = sample2Df['y'].mean()
# Despite the 'rmse_' prefix these columns hold per-row squared errors.
sample2Df['rmse_pred'] = sample2Df['y'].apply(lambda x: (x-prediction)**2)
sample2Df['rmse_mean'] = sample2Df['y'].apply(lambda x: (x-mean)**2)

r_square = 1 - sample2Df['rmse_pred'].sum()/sample2Df['rmse_mean'].sum()
# abs() keeps the square root real when r_square is negative; the sign is
# therefore lost in `out`, which is why r_square is printed next to it.
out = np.sqrt(abs(r_square))
print(out, r_square)

In [ ]:

# Same R^2 check on a 600k-row sample, this time predicting a constant 0.
# random_state pins the draw so the printed values are reproducible;
# without it every run scores a different random subset.
sample6Df = twoSigmaDf.sample(n=600000, random_state=0)
prediction = 0
# Baseline is the mean of this sample's own y values.
mean = sample6Df['y'].mean()
# Despite the 'rmse_' prefix these columns hold per-row squared errors.
sample6Df['rmse_pred'] = sample6Df['y'].apply(lambda x: (x-prediction)**2)
sample6Df['rmse_mean'] = sample6Df['y'].apply(lambda x: (x-mean)**2)

r_square = 1 - sample6Df['rmse_pred'].sum()/sample6Df['rmse_mean'].sum()
# abs() keeps the square root real when r_square is negative; the sign is
# therefore lost in `out`, which is why r_square is printed next to it.
out = np.sqrt(abs(r_square))
print(out, r_square)

In [ ]:

# NOTE(review): `prediction` is rebound here but none of the statements visible
# in the rest of this cell read the global -- possibly a leftover; confirm.
prediction = 0
# Baseline for the R^2 denominator: mean of y over the full frame.
mean = twoSigmaDf['y'].mean()
# Mean of y over the central band; get_prediction returns it for rows that
# fall in neither tail. Requires central_model to exist from an earlier cell.
central_mean = central_model['y'].mean()
def get_prediction(x):
    """Return the constant prediction for the band that ``x`` falls into.

    * ``x <= -0.08``  -> ``-0.085``
    * ``x >= 0.09``   -> ``0.095``
    * anything else   -> the module-level ``central_mean`` (this includes
      NaN, which fails both comparisons)

    Where this notebook applies it, ``x`` is the target ``y`` itself, so the
    "prediction" is derived from the value it is later scored against.
    """
    in_lower_tail = x <= -0.08
    if in_lower_tail:
        return -0.085
    if x >= 0.09:
        return 0.095
    return central_mean

def get_r_square(prediction, actual):
    """Return the coefficient of determination, ``1 - SS_res / SS_tot``.

    ``SS_res`` is the sum of squared differences between ``actual`` and
    ``prediction``; ``SS_tot`` is the sum of squared deviations of ``actual``
    from its own mean.

    Parameters
    ----------
    prediction : sequence of numbers (list, ndarray, pandas Series, ...)
        Predicted values.
    actual : sequence of numbers of the same length
        Observed values.

    Returns
    -------
    numpy.float64
        1.0 for a perfect fit, 0.0 when the predictions do no better than
        the mean of ``actual``, negative when they do worse. If every value
        in ``actual`` is identical the denominator is zero and numpy yields
        nan or -inf (with a RuntimeWarning).

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    ValueError
        If the inputs are empty.
    """
    assert len(prediction) == len(actual), "Lengths of actuals and predictions should be equal"
    # np.asarray pairs values by POSITION, exactly like the zip-based loop this
    # replaces; subtracting two pandas Series directly would align on index
    # labels instead. dtype=float keeps the accumulation in double precision.
    predicted = np.asarray(prediction, dtype=float)
    observed = np.asarray(actual, dtype=float)
    if observed.size == 0:
        raise ValueError("Cannot compute R^2 for empty inputs")
    residual_ss = np.sum((observed - predicted) ** 2)
    total_ss = np.sum((observed - observed.mean()) ** 2)
    return 1 - residual_ss / total_ss
    
#twoSigmaDf['prediction'] = twoSigmaDf['y'].apply(lambda x: get_prediction(x))
# Per-row squared errors of the banded prediction and of the global mean.
# These assignments add two columns to twoSigmaDf itself, so every later cell
# that reads twoSigmaDf (or its .columns) sees them as well.
# The prediction is computed from y, the very value it is scored against, so
# the resulting R^2 is not an out-of-sample score.
twoSigmaDf['rmse_pred'] = twoSigmaDf['y'].apply(lambda x: (x-get_prediction(x))**2)
twoSigmaDf['rmse_mean'] = twoSigmaDf['y'].apply(lambda x: (x-mean)**2)

r_square = 1 - twoSigmaDf['rmse_pred'].sum()/twoSigmaDf['rmse_mean'].sum()
# abs() keeps the square root real when r_square is negative (the sign is lost in `out`).
out = np.sqrt(abs(r_square))

print(out, r_square)
# Cross-check: the same score via get_r_square; this re-applies get_prediction
# to every row a second time.
print(get_r_square(twoSigmaDf['y'].apply(lambda x: get_prediction(x)), twoSigmaDf['y']))

In [ ]:

def get_category(x):
    """Label a value by band: 'Neg', 'Pos' or 'Mid'.

    ``x <= -0.08`` gives 'Neg', ``x >= 0.09`` gives 'Pos', and everything
    else -- including NaN, which fails both comparisons -- gives 'Mid'.
    """
    if x <= -0.08:
        return 'Neg'
    return "Pos" if x >= 0.09 else "Mid"
twoSigmaDf['category'] = twoSigmaDf['y'].apply(lambda x: get_category(x))

In [ ]:

a = twoSigmaDf.groupby('category').describe()

In [ ]:

b = a.transpose().reset_index()
b.columns

In [ ]:

from datascienceutils import utils
# Run calculate_anova for every column against the target 'y' on the 300k sample.
# The column list comes from the frame that is actually passed in: twoSigmaDf
# gains a 'category' column after sample2Df was drawn, so looping over
# twoSigmaDf.columns would request a column the sample does not contain.
# NOTE(review): 'y' itself and the derived squared-error columns are still
# included -- confirm whether calculate_anova should see them.
anova_dict = {
    col: utils.calculate_anova(sample2Df, 'y', col)
    for col in sample2Df.columns
}
print(anova_dict)

In [ ]:

# Feature matrix: four columns of the full frame.
filteredDf = twoSigmaDf[['id', 'timestamp', 'technical_34', 'technical_20']]
# Fill missing values with each column's mean. Rebinding the result instead of
# using inplace=True avoids mutating a frame that was sliced out of twoSigmaDf
# (the SettingWithCopy hazard) and keeps the cell safe to re-run.
filteredDf = filteredDf.fillna(filteredDf.mean())
# Labels are the 'category' column, which an earlier cell derives from y.
target = twoSigmaDf['category']
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# NOTE(review): the model is fit on every row and the next cell predicts on
# the same filteredDf, so the score there is an in-sample figure.
gnb.fit(filteredDf, target)

In [ ]:

def get_pred_by_category(x):
    """Translate a band label into its constant numeric prediction.

    'Mid' maps to 0, 'Neg' to -0.085, and any other label to 0.095.
    """
    return 0 if x == 'Mid' else (-0.085 if x == 'Neg' else 0.095)
# Turn each label the classifier predicts into its numeric stand-in.
num_predictions = [get_pred_by_category(label) for label in gnb.predict(filteredDf)]

# R^2 of those numeric predictions against the true targets (cell output).
get_r_square(num_predictions, list(twoSigmaDf['y']))