#!/usr/bin/env python
# coding: utf-8

# In[4]:


# Load the `watermark` IPython extension, then run its line magic with the
# author string and the package list numpy, pandas, matplotlib, sklearn.
# The extension must be loaded before the magic can be invoked.
# NOTE(review): presumably this prints author, date and package versions for
# provenance - confirm the flag meanings against the watermark documentation.
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-a "Romell D.Z." -u -d -p numpy,pandas,matplotlib,sklearn')


# # 3. Machine Learning Tuning

# In[5]:


# Make `/` perform true division (only relevant when run under Python 2).
from __future__ import division
import warnings
# Silence every warning for the rest of the session; note that this also hides
# warnings emitted by the libraries used below.
warnings.simplefilter('ignore' )
# Render matplotlib figures inline in the notebook output.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Inline figures are emitted in 'retina' format; default figure size is 18x6.
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
plt.rcParams['figure.figsize'] = (18,6) 


# In[6]:


# Read the dataset from the split-oriented JSON file and preview its first rows.
dataset = pd.read_json('dataset.json', orient='split')
dataset.head()


# In[7]:


# Column dtypes and non-null counts.
dataset.info()


# In[8]:


# Number of rows whose 'text' column is the empty string.
empty_text_rows = dataset.loc[dataset['text'] == '']
len(empty_text_rows)


# In[9]:


# Summary statistics, transposed so that each described column becomes a row.
dataset.describe().transpose()


# In[10]:


# First entries of the target column.
dataset.loc[:, 'label'].head()


# In[11]:


# Preview of the indicator (one-hot) encoding of the labels; the same encoding
# is used as the training target in the modelling cells below.
label_indicators = pd.get_dummies(dataset['label'])
label_indicators.head()


# In[12]:


from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline: predict the one-hot encoded label from the single 'numeric' column.
# A fixed random_state makes the train/test partition - and therefore the
# accuracy printed below - reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(dataset[['numeric']],
                                                    pd.get_dummies(dataset['label']),
                                                    random_state=42)

# Single-step pipeline: a one-vs-rest wrapper around logistic regression.
pl = Pipeline([ ('model', OneVsRestClassifier(LogisticRegression())) ])

pl.fit(X_train, y_train)

# Score on the held-out part of the split.
accuracy = pl.score(X_test, y_test)
print("Accuracy on dataframe with Numerical & Label: ", accuracy)


# In[13]:


from sklearn.impute import SimpleImputer

# Baseline on both numeric columns. A fixed random_state keeps the split (and
# the printed accuracy) reproducible.
X_train, X_test, y_train, y_test = train_test_split(dataset[['numeric', 'with_missing']],
                                                    pd.get_dummies(dataset['label']),
                                                    random_state=42)

# Impute with SimpleImputer's default settings before the one-vs-rest
# logistic regression.
pl = Pipeline([
        ('imputer', SimpleImputer()),
        ('model', OneVsRestClassifier(LogisticRegression()))
    ])

pl.fit(X_train,y_train)

# Compute the accuracy once and reuse it, instead of scoring the test set a
# second time inside the print call.
accuracy = pl.score(X_test,y_test)
print("Accuracy on dataframe with All Numerical: ", accuracy)


# In[14]:


from sklearn.feature_extraction.text import CountVectorizer

# Baseline on the 'text' column only. A fixed random_state keeps the split
# (and the printed accuracy) reproducible.
X_train, X_test, y_train, y_test = train_test_split(dataset['text'],
                                                    pd.get_dummies(dataset['label']),
                                                    random_state=42)

# Token counts from the raw strings feed the one-vs-rest logistic regression.
pl = Pipeline([
        ('vec', CountVectorizer()),
        ('model', OneVsRestClassifier(LogisticRegression())) ])

pl.fit(X_train,y_train)
print("Accuracy on dataframe with only text data: ", pl.score(X_test,y_test))


# In[15]:


from sklearn.preprocessing import FunctionTransformer


def _select_text(frame):
    """Return the 'text' column of ``frame``."""
    return frame['text']


def _select_numeric(frame):
    """Return the 'numeric' and 'with_missing' columns of ``frame``."""
    return frame[['numeric', 'with_missing']]


# Stateless column selectors; validate=False is passed so the input frame is
# handed to the selector function as-is.
extract_text = FunctionTransformer(_select_text, validate=False)
extract_numeric = FunctionTransformer(_select_numeric, validate=False)

# Apply each selector to the whole dataset to show what it produces.
text_data = extract_text.fit_transform(dataset)
numeric_data = extract_numeric.fit_transform(dataset)

print('Extracting Text Data')
print(text_data.head())
print('Extracting Numeric Data')
print(numeric_data.head())


# In[16]:


from sklearn.pipeline import FeatureUnion

# This split is reused by every model cell that follows and by the ROC, lift
# and calibration plots, so it is seeded: without a fixed random_state every
# run would compare the models on a different partition.
X_train, X_test, y_train, y_test = train_test_split(dataset[['numeric', 'with_missing', 'text']],
                                                    pd.get_dummies(dataset['label']),
                                                    random_state=42)

# Two feature branches are computed from the same input frame and concatenated:
#   * numeric_features: select the numeric columns, then impute;
#   * text_features:    select the text column, then count tokens.
pip = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', extract_numeric),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', extract_text),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

pip.fit(X_train, y_train)
# Keep the predicted probability for the first label column; the ROC cell
# later compares it against y_test.iloc[:,0].
pred_prob_LR = pip.predict_proba(X_test)[:,0]

accuracy = pip.score(X_test, y_test)
print("Accuracy on all dataset: %.4f"% accuracy)


# In[17]:


from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import MaxAbsScaler
chi_k = 2 # 300
# Tokens are runs of ASCII letters/digits that are followed by whitespace.
# Because of the look-ahead, a token at the very end of a string (with no
# trailing whitespace) is not matched.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
pip = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', extract_numeric),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', extract_text),
                    ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                   ngram_range=(1,2))),
                    # `k` must be passed by keyword: recent scikit-learn
                    # releases reject it as a positional argument.
                    ('dim_red', SelectKBest(chi2, k=chi_k))
                ]))
             ]
        )),
        # Rescale the combined features before the classifier.
        ('scale', MaxAbsScaler()), 
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Reuses the train/test split created in the earlier FeatureUnion cell.
pip.fit(X_train, y_train)
# Predicted probability for the first label column (used by the ROC cell).
pred_prob_M_LR = pip.predict_proba(X_test)[:,0]

print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test))


# In[18]:


from sklearn.tree import DecisionTreeClassifier

# Numeric branch: select the numeric columns, then impute.
numeric_branch = Pipeline([
    ('selector', extract_numeric),
    ('imputer', SimpleImputer()),
])

# Text branch: select the text column, then count tokens.
text_branch = Pipeline([
    ('selector', extract_text),
    ('vectorizer', CountVectorizer()),
])

# Both branches read the same input frame; their outputs are concatenated.
combined_features = FeatureUnion(
    transformer_list=[
        ('numeric_features', numeric_branch),
        ('text_features', text_branch),
    ]
)

pip = Pipeline([
    ('union', combined_features),
    ('clf', DecisionTreeClassifier()),
])

pip.fit(X_train, y_train)

# predict_proba is indexed with [0] and then column 1 here; the ROC cell later
# compares the result against y_test.iloc[:,0].
# NOTE(review): presumably [0] is the first label column and column 1 its
# positive class - confirm.
first_label_proba = pip.predict_proba(X_test)[0]
pred_prob_DT = first_label_proba[:,1]

print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test)  )


# In[19]:


from sklearn.preprocessing import StandardScaler

# Numeric branch: select the numeric columns, impute, then standardise.
numeric_branch = Pipeline([
    ('selector', extract_numeric),
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Text branch: select the text column, then count tokens.
text_branch = Pipeline([
    ('selector', extract_text),
    ('vectorizer', CountVectorizer()),
])

# Both branches read the same input frame; their outputs are concatenated.
combined_features = FeatureUnion(
    transformer_list=[
        ('numeric_features', numeric_branch),
        ('text_features', text_branch),
    ]
)

pip = Pipeline([
    ('union', combined_features),
    ('clf', DecisionTreeClassifier()),
])

pip.fit(X_train, y_train)

# predict_proba is indexed with [0] and then column 1 here; the ROC cell later
# compares the result against y_test.iloc[:,0].
# NOTE(review): presumably [0] is the first label column and column 1 its
# positive class - confirm.
first_label_proba = pip.predict_proba(X_test)[0]
pred_prob_SS_DT = first_label_proba[:,1]

print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test)  )


# In[20]:


from sklearn.ensemble import RandomForestClassifier

# Same two-branch feature union as the earlier models (numeric columns ->
# imputer, text column -> token counts), followed by a random forest.
pip = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', extract_numeric),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', extract_text),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )),
        # A random forest is stochastic; fixing random_state makes the reported
        # accuracy and the ROC curve drawn from pred_prob_RFC reproducible.
        ('clf', RandomForestClassifier(random_state=42))
    ])

pip.fit(X_train, y_train)
# predict_proba is indexed with [0] and then column 1 here; the ROC cell later
# compares the result against y_test.iloc[:,0].
# NOTE(review): presumably [0] is the first label column and column 1 its
# positive class - confirm.
pred_prob_RFC = pip.predict_proba(X_test)[0][:,1]
print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test)  )


# In[21]:


from sklearn.ensemble import RandomForestClassifier

# Same two-branch feature union as the earlier models, followed by a random
# forest limited to 15 trees. The `pip` left behind by this cell is also the
# estimator used by the learning-curve, validation-curve and lift-curve cells.
pip = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', extract_numeric),
                    ('imputer', SimpleImputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', extract_text),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )),
        # A random forest is stochastic; fixing random_state makes the reported
        # accuracy and every curve derived from this pipeline reproducible.
        ('clf', RandomForestClassifier(n_estimators=15, random_state=42))
    ])

pip.fit(X_train, y_train)
# predict_proba is indexed with [0] and then column 1 here; the ROC cell later
# compares the result against y_test.iloc[:,0].
# NOTE(review): presumably [0] is the first label column and column 1 its
# positive class - confirm.
pred_prob_RFC_15 = pip.predict_proba(X_test)[0][:,1]

print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test)  )


# In[22]:


import os

from sklearn.metrics import roc_curve,roc_auc_score,auc

# The figure is written below 'snapshot/'; create the directory up front so
# savefig does not fail when it is missing.
os.makedirs('snapshot', exist_ok=True)

# One ROC curve per model, each computed against the first label column of the
# held-out split; the legend entry carries the model name and its AUC.
for label,y_pred_prob in zip(["LR","MLR","DTC","SS_DTC","RFC","RFC_15"],
                    [pred_prob_LR,pred_prob_M_LR,pred_prob_DT,pred_prob_SS_DT,pred_prob_RFC,pred_prob_RFC_15]):
    fpr,tpr, _ = roc_curve(y_test.iloc[:,0], y_pred_prob)
    plt.plot(fpr, tpr,label="%s, AUC: %.4f" % (label, auc(fpr,tpr)))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.title('Benchmarking using ROC and AUC')
plt.savefig('snapshot/roc_curve',bbox_inches='tight',dpi=100);


# In[23]:


from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

plt.figure()
plt.title("RandomForestClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")

# 100 random 80/20 shuffles; `cv` is reused by later cells.
cv = ShuffleSplit(n_splits=100, test_size=0.2)

# Evaluate the current `pip` pipeline at 15 training-set fractions between
# 10% and 100%.
train_sizes, train_scores, test_scores = learning_curve(
    pip,
    dataset[['numeric', 'with_missing', 'text']],
    pd.get_dummies(dataset['label']),
    cv=cv,
    train_sizes=np.linspace(.1, 1.0, 15),
)

# Aggregate over the CV splits (axis 1) for each training size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

plt.grid()
print('train_sizes:',train_sizes)

# (mean, std, colour, legend label) for each of the two curves.
curve_specs = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

# Shaded +/- one standard deviation bands first, then the mean lines.
for curve_mean, curve_std, curve_color, curve_label in curve_specs:
    plt.fill_between(train_sizes, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_color)
for curve_mean, curve_std, curve_color, curve_label in curve_specs:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color,
             label=curve_label)

plt.legend(loc="lower right")
plt.savefig('snapshot/learning_curve',bbox_inches='tight',dpi=100);


# In[24]:


import scikitplot as skplt

# Same inputs as the manual learning curve: all three feature columns and the
# one-hot encoded labels, with the existing `cv` splitter and `pip` pipeline.
learning_curve_X = dataset[['numeric', 'with_missing', 'text']]
learning_curve_y = pd.get_dummies(dataset['label'])

skplt.estimators.plot_learning_curve(
    pip,
    X=learning_curve_X,
    y=learning_curve_y,
    cv=cv,
)
plt.savefig('snapshot/learning_curve_plot',bbox_inches='tight',dpi=100);


# In[25]:


# Integer tree depths 3, 4, ..., 14.
param_range = np.linspace(3,14,12,dtype=int)
from sklearn.model_selection import validation_curve

# Sweep the classifier step's max_depth on the current `pip` pipeline, scoring
# accuracy with the existing `cv` splitter.
train_scores, test_scores = validation_curve(
    pip,
    dataset[['numeric', 'with_missing', 'text']],
    pd.get_dummies(dataset['label']),
    param_name="clf__max_depth",
    param_range=param_range,
    cv=cv,
    scoring="accuracy",
)

# Aggregate over the CV splits (axis 1) for each depth.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

print(train_scores_mean, train_scores_std)

plt.title("Validation Curve")
plt.xlabel("Max Depth")
plt.ylabel("Score")
plt.ylim(.6, 1)
lw = 2

# (mean, std, legend label, colour) for each of the two curves.
series_specs = [
    (train_scores_mean, train_scores_std, "Training score", "darkorange"),
    (test_scores_mean, test_scores_std, "Cross-validation score", "navy"),
]

# For each series: mean line on a log-scaled x axis, then its +/- one
# standard deviation band.
for series_mean, series_std, series_label, series_color in series_specs:
    plt.semilogx(param_range, series_mean, label=series_label,
                 color=series_color, lw=lw)
    plt.fill_between(param_range, series_mean - series_std,
                     series_mean + series_std, alpha=0.2,
                     color=series_color, lw=lw)

# Label every evaluated depth explicitly on the x axis.
plt.xticks(param_range,param_range)
plt.legend(loc="best")
plt.tight_layout()
plt.savefig('snapshot/validation_curve',bbox_inches='tight',dpi=100);


# In[26]:


# Ground truth: first one-hot label column of the held-out split.
lift_y_true = y_test.iloc[:,0].values
# First element of the current pipeline's predict_proba output.
# NOTE(review): presumably the per-class probabilities for that same label
# column - confirm.
lift_y_probas = pip.predict_proba(X_test)[0]

skplt.metrics.plot_lift_curve(y_true=lift_y_true, y_probas=lift_y_probas)
plt.savefig('snapshot/lift_curve',bbox_inches='tight',dpi=100);


# In[27]:


import scikitplot as skplt

# Pair each model's display name with the probabilities it predicted for the
# first label column, in the order they should appear in the plot.
named_probas = [
    ("LR", pred_prob_LR),
    ("MLR", pred_prob_M_LR),
    ("DTC", pred_prob_DT),
    ("SS_DTC", pred_prob_SS_DT),
    ("RFC", pred_prob_RFC),
    ("RFC_15", pred_prob_RFC_15),
]
probas_list = [model_probas for model_name, model_probas in named_probas]
clf_names = [model_name for model_name, model_probas in named_probas]

# Calibration curves for all models against the first label column, 10 bins.
skplt.metrics.plot_calibration_curve(
    y_test.iloc[:,0],
    probas_list=probas_list,
    clf_names=clf_names,
    n_bins=10,
)
plt.savefig('snapshot/calibration_curve',bbox_inches='tight',dpi=100);


# In[ ]: