# Record authorship, last-updated date, and library versions for reproducibility.
%load_ext watermark
%watermark -a "Romell D.Z." -u -d -p numpy,pandas,matplotlib,sklearn
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Romell D.Z. last updated: 2019-01-20 numpy 1.14.6 pandas 0.23.4 matplotlib 2.2.2 sklearn 0.20.0
from __future__ import division
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.simplefilter('ignore' )
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Retina-quality inline figures with a wide default size.
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = (18,6)
# 'split' orient expects {"columns": ..., "index": ..., "data": ...} in dataset.json.
dataset = pd.read_json('dataset.json',orient='split')
dataset.head()
numeric | text | with_missing | label | |
---|---|---|---|---|
0 | -10.856306 | 4.433240 | b | |
1 | 9.973454 | foo | 4.310229 | b |
2 | 2.829785 | foo bar | 2.469828 | a |
3 | -15.062947 | 2.852981 | b | |
4 | -5.786003 | foo bar | 1.826475 | a |
# Dtypes and non-null counts; 'with_missing' contains NaNs (822/1000 non-null).
dataset.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000 entries, 0 to 999 Data columns (total 4 columns): numeric 1000 non-null float64 text 1000 non-null object with_missing 822 non-null float64 label 1000 non-null object dtypes: float64(2), object(2) memory usage: 39.1+ KB
# How many rows have an empty string in the 'text' column.
len(dataset[dataset['text'] == ''])
210
# Summary statistics of the numeric columns (transposed for readability).
dataset.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
numeric | 1000.0 | -0.395641 | 10.012883 | -32.310550 | -6.845566 | -0.411856 | 6.688657 | 35.715792 |
with_missing | 822.0 | 3.025194 | 0.994960 | -0.801378 | 2.386520 | 3.022887 | 3.693381 | 5.850708 |
# Target column: string labels ('a' / 'b').
dataset['label'].head()
0 b 1 b 2 a 3 b 4 a Name: label, dtype: object
# One-hot encode the label into indicator columns 'a' and 'b'.
pd.get_dummies(dataset['label']).head()
a | b | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
2 | 1 | 0 |
3 | 0 | 1 |
4 | 1 | 0 |
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline: logistic regression trained on the single numeric feature.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[['numeric']], pd.get_dummies(dataset['label']))

pl = Pipeline([('model', OneVsRestClassifier(LogisticRegression()))])
pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)
print("Accuracy on dataframe with Numerical & Label: ", accuracy)
Accuracy on dataframe with Numerical & Label: 0.688
from sklearn.impute import SimpleImputer

# Add 'with_missing'; impute its NaNs (mean strategy by default) before the model.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[['numeric', 'with_missing']], pd.get_dummies(dataset['label']))

pl = Pipeline([
    ('imputer', SimpleImputer()),
    ('model', OneVsRestClassifier(LogisticRegression()))
])
pl.fit(X_train, y_train)
# Compute and print accuracy — score once and reuse the stored value
# (the original scored the test set twice).
accuracy = pl.score(X_test, y_test)
print("Accuracy on dataframe with All Numerical: ", accuracy)
Accuracy on dataframe with All Numerical: 0.648
from sklearn.feature_extraction.text import CountVectorizer

# Text-only model: bag-of-words counts feeding the same classifier.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'], pd.get_dummies(dataset['label']))

pl = Pipeline([
    ('vec', CountVectorizer()),
    ('model', OneVsRestClassifier(LogisticRegression())),
])
pl.fit(X_train, y_train)
print("Accuracy on dataframe with only text data: ", pl.score(X_test, y_test))
Accuracy on dataframe with only text data: 0.848
from sklearn.preprocessing import FunctionTransformer

# Column selectors usable inside a FeatureUnion (validate=False keeps DataFrames intact).
extract_text = FunctionTransformer(lambda df: df['text'], validate=False)
extract_numeric = FunctionTransformer(lambda df: df[['numeric', 'with_missing']], validate=False)

text_data = extract_text.fit_transform(dataset)
numeric_data = extract_numeric.fit_transform(dataset)

print('Extracting Text Data')
print(text_data.head())
print('Extracting Numeric Data')
print(numeric_data.head())
Extracting Text Data 0 1 foo 2 foo bar 3 4 foo bar Name: text, dtype: object Extracting Numeric Data numeric with_missing 0 -10.856306 4.433240 1 9.973454 4.310229 2 2.829785 2.469828 3 -15.062947 2.852981 4 -5.786003 1.826475
from sklearn.pipeline import FeatureUnion

# Combine imputed numeric features with bag-of-words text features in one model.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[['numeric', 'with_missing', 'text']], pd.get_dummies(dataset['label']))

numeric_branch = Pipeline([
    ('selector', extract_numeric),
    ('imputer', SimpleImputer()),
])
text_branch = Pipeline([
    ('selector', extract_text),
    ('vectorizer', CountVectorizer()),
])
pip = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', numeric_branch),
        ('text_features', text_branch),
    ])),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])
pip.fit(X_train, y_train)
# Probability for the first label column; kept for the ROC comparison later on.
pred_prob_LR = pip.predict_proba(X_test)[:, 0]
accuracy = pip.score(X_test, y_test)
print("Accuracy on all dataset: %.4f"% accuracy)
Accuracy on all dataset: 0.9280
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import MaxAbsScaler

# Keep only the chi2-best text features, then scale all features to [-1, 1].
chi_k = 2 # 300
# Alphanumeric tokens followed by whitespace (raw string avoids double-escaping).
TOKENS_ALPHANUMERIC = r'[A-Za-z0-9]+(?=\s+)'
pip = Pipeline([
    ('union', FeatureUnion(
        transformer_list = [
            ('numeric_features', Pipeline([
                ('selector', extract_numeric),
                ('imputer', SimpleImputer())
            ])),
            ('text_features', Pipeline([
                ('selector', extract_text),
                ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                               ngram_range=(1,2))),
                # Pass k by keyword: the positional form is deprecated/removed in newer sklearn.
                ('dim_red', SelectKBest(chi2, k=chi_k))
            ]))
        ]
    )),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
pip.fit(X_train, y_train)
pred_prob_M_LR = pip.predict_proba(X_test)[:,0]
print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test))
Accuracy on all dataset: 0.7280
from sklearn.tree import DecisionTreeClassifier

# Same feature union as before, now feeding a decision tree.
union = FeatureUnion(transformer_list=[
    ('numeric_features', Pipeline([
        ('selector', extract_numeric),
        ('imputer', SimpleImputer()),
    ])),
    ('text_features', Pipeline([
        ('selector', extract_text),
        ('vectorizer', CountVectorizer()),
    ])),
])
pip = Pipeline([('union', union), ('clf', DecisionTreeClassifier())])
pip.fit(X_train, y_train)
# Multi-output tree: predict_proba returns one array per output; take P(class=1) of the first.
pred_prob_DT = pip.predict_proba(X_test)[0][:, 1]
print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) )
Accuracy on all dataset: 0.8800
from sklearn.preprocessing import StandardScaler

# As above, but standardize the numeric branch before the tree.
numeric_steps = [
    ('selector', extract_numeric),
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
text_steps = [
    ('selector', extract_text),
    ('vectorizer', CountVectorizer()),
]
pip = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', Pipeline(numeric_steps)),
        ('text_features', Pipeline(text_steps)),
    ])),
    ('clf', DecisionTreeClassifier()),
])
pip.fit(X_train, y_train)
pred_prob_SS_DT = pip.predict_proba(X_test)[0][:, 1]
print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) )
Accuracy on all dataset: 0.8840
from sklearn.ensemble import RandomForestClassifier

# Random forest (library-default number of trees) on the combined features.
pip = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', extract_numeric),
            ('imputer', SimpleImputer()),
        ])),
        ('text_features', Pipeline([
            ('selector', extract_text),
            ('vectorizer', CountVectorizer()),
        ])),
    ])),
    ('clf', RandomForestClassifier()),
])
pip.fit(X_train, y_train)
pred_prob_RFC = pip.predict_proba(X_test)[0][:, 1]
print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) )
Accuracy on all dataset: 0.8760
from sklearn.ensemble import RandomForestClassifier

# Random forest again, with a larger ensemble of 15 trees.
feature_union = FeatureUnion(transformer_list=[
    ('numeric_features', Pipeline([
        ('selector', extract_numeric),
        ('imputer', SimpleImputer()),
    ])),
    ('text_features', Pipeline([
        ('selector', extract_text),
        ('vectorizer', CountVectorizer()),
    ])),
])
pip = Pipeline([
    ('union', feature_union),
    ('clf', RandomForestClassifier(n_estimators=15)),
])
pip.fit(X_train, y_train)
pred_prob_RFC_15 = pip.predict_proba(X_test)[0][:, 1]
print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) )
Accuracy on all dataset: 0.9040
from sklearn.metrics import roc_curve,roc_auc_score,auc

# Overlay the ROC curve of every model, labelled with its AUC.
model_probs = [pred_prob_LR, pred_prob_M_LR, pred_prob_DT,
               pred_prob_SS_DT, pred_prob_RFC, pred_prob_RFC_15]
for label, y_pred_prob in zip(["LR", "MLR", "DTC", "SS_DTC", "RFC", "RFC_15"], model_probs):
    # y_test.iloc[:, 0] is the first one-hot label column.
    fpr, tpr, _ = roc_curve(y_test.iloc[:, 0], y_pred_prob)
    # Format directly — the original wrapped an already-str result in str().
    plt.plot(fpr, tpr, label="%s, AUC: %.4f" % (label, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
# Typo fixed in user-facing title: "BentchMarking" -> "Benchmarking".
plt.title('Benchmarking using ROC and AUC')
plt.savefig('snapshot/roc_curve',bbox_inches='tight',dpi=100);
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

# Learning curve for the last pipeline: 100 shuffled 80/20 splits,
# 15 training-set sizes from 10% to 100% of the available data.
plt.figure()
plt.title("RandomForestClassifier")
plt.xlabel("Training examples")
plt.ylabel("Score")
cv = ShuffleSplit(n_splits=100, test_size=0.2)
features = dataset[['numeric', 'with_missing', 'text']]
targets = pd.get_dummies(dataset['label'])
train_sizes, train_scores, test_scores = learning_curve(
    pip, features, targets, cv=cv, train_sizes=np.linspace(.1, 1.0, 15))

# Mean +/- one standard deviation across the CV splits.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

plt.grid()
print('train_sizes:', train_sizes)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.legend(loc="lower right")
plt.savefig('snapshot/learning_curve',bbox_inches='tight',dpi=100);
train_sizes: [ 80 131 182 234 285 337 388 440 491 542 594 645 697 748 800]
import scikitplot as skplt

# scikit-plot's one-call learning-curve plot for the same pipeline and CV scheme.
skplt.estimators.plot_learning_curve(
    pip,
    X=dataset[['numeric', 'with_missing', 'text']],
    y=pd.get_dummies(dataset['label']),
    cv=cv)
plt.savefig('snapshot/learning_curve_plot',bbox_inches='tight',dpi=100);
from sklearn.model_selection import validation_curve

# Sweep the max depth of the final 'clf' step from 3 to 14 and record
# train/CV accuracy at each depth.
param_range = np.linspace(3,14,12,dtype=int)
train_scores, test_scores = validation_curve(
    pip, dataset[['numeric', 'with_missing', 'text']], pd.get_dummies(dataset['label']),
    param_name="clf__max_depth", param_range=param_range,
    cv=cv, scoring="accuracy")

# Per-depth mean and spread over the CV splits.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)
print(train_scores_mean, train_scores_std)

plt.title("Validation Curve")
plt.xlabel("Max Depth")
plt.ylabel("Score")
plt.ylim(.6, 1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.xticks(param_range, param_range)
plt.legend(loc="best")
plt.tight_layout()
plt.savefig('snapshot/validation_curve',bbox_inches='tight',dpi=100);
[0.919675 0.9359625 0.947075 0.9578625 0.971225 0.9808 0.987475 0.9915125 0.9938875 0.99495 0.9952 0.9955875] [0.01026313 0.00549959 0.00600578 0.00524492 0.00494653 0.00400983 0.00330709 0.00298588 0.00265103 0.00247437 0.00246044 0.00200355]
# Lift curve for the last pipeline, using the probabilities of its first output
# against the first one-hot label column.
skplt.metrics.plot_lift_curve(y_true=y_test.iloc[:,0].values, y_probas=pip.predict_proba(X_test)[0])
plt.savefig('snapshot/lift_curve',bbox_inches='tight',dpi=100);
import scikitplot as skplt

# Calibration (reliability) plot comparing all six models on the first label column.
clf_names = ["LR", "MLR", "DTC", "SS_DTC", "RFC", "RFC_15"]
probas_list = [pred_prob_LR, pred_prob_M_LR, pred_prob_DT,
               pred_prob_SS_DT, pred_prob_RFC, pred_prob_RFC_15]
skplt.metrics.plot_calibration_curve(y_test.iloc[:,0],
                                     probas_list=probas_list,
                                     clf_names=clf_names,
                                     n_bins=10)
plt.savefig('snapshot/calibration_curve',bbox_inches='tight',dpi=100);