#!/usr/bin/env python # coding: utf-8 # In[4]: get_ipython().run_line_magic('load_ext', 'watermark') get_ipython().run_line_magic('watermark', '-a "Romell D.Z." -u -d -p numpy,pandas,matplotlib,sklearn') # # 3. Machine Learning Tuning # In[5]: from __future__ import division import warnings warnings.simplefilter('ignore' ) get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import numpy as np import pandas as pd get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'") plt.rcParams['figure.figsize'] = (18,6) # In[6]: dataset = pd.read_json('dataset.json',orient='split') dataset.head() # In[7]: dataset.info() # In[8]: # Value empty in 'text column' dataset[dataset['text']==''].shape[0] # In[9]: dataset.describe().T # In[10]: dataset['label'].head() # In[11]: pd.get_dummies(dataset['label']).head() # In[12]: from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier X_train, X_test, y_train, y_test = train_test_split(dataset[['numeric']], pd.get_dummies(dataset['label'])) pl = Pipeline([ ('model', OneVsRestClassifier(LogisticRegression())) ]) pl.fit(X_train, y_train) accuracy = pl.score(X_test, y_test) print("Accuracy on dataframe with Numerical & Label: ", accuracy) # In[13]: from sklearn.impute import SimpleImputer X_train, X_test, y_train, y_test = train_test_split(dataset[['numeric', 'with_missing']], pd.get_dummies(dataset['label'])) pl = Pipeline([ ('imputer', SimpleImputer()), ('model', OneVsRestClassifier(LogisticRegression())) ]) pl.fit(X_train,y_train) # Compute and print accuracy accuracy = pl.score(X_test,y_test) print("Accuracy on dataframe with All Numerical: ", pl.score(X_test,y_test)) # In[14]: from sklearn.feature_extraction.text import CountVectorizer X_train, X_test, y_train, y_test = train_test_split(dataset['text'], pd.get_dummies(dataset['label'])) pl = Pipeline([ ('vec', CountVectorizer()), ('model', OneVsRestClassifier(LogisticRegression())) ]) pl.fit(X_train,y_train) print("Accuracy on dataframe with only text data: ", pl.score(X_test,y_test)) # In[15]: from sklearn.preprocessing import FunctionTransformer extract_text = FunctionTransformer(lambda x: x['text'], validate=False) text_data = extract_text.fit_transform(dataset) extract_numeric = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False) numeric_data = extract_numeric.fit_transform(dataset) print('Extracting Text Data') print(text_data.head()) print('Extracting Numeric Data') print(numeric_data.head()) # In[16]: from sklearn.pipeline import FeatureUnion X_train, X_test, y_train, y_test = train_test_split(dataset[['numeric', 'with_missing', 'text']], pd.get_dummies(dataset['label'])) pip = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', extract_numeric), ('imputer', SimpleImputer()) ])), ('text_features', Pipeline([ ('selector', extract_text), ('vectorizer', CountVectorizer()) ])) ] )), ('clf', OneVsRestClassifier(LogisticRegression())) ]) pip.fit(X_train, y_train) pred_prob_LR = pip.predict_proba(X_test)[:,0] accuracy = pip.score(X_test, y_test) print("Accuracy on all dataset: %.4f"% accuracy) # In[17]: from sklearn.feature_selection import chi2, SelectKBest from sklearn.preprocessing import MaxAbsScaler chi_k = 2 # 300 TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' pip = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', extract_numeric), ('imputer', SimpleImputer()) ])), ('text_features', Pipeline([ ('selector', extract_text), ('vectorizer', CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC, ngram_range=(1,2))), ('dim_red', SelectKBest(chi2, chi_k)) ])) ] )), ('scale', MaxAbsScaler()), ('clf', OneVsRestClassifier(LogisticRegression())) ]) pip.fit(X_train, y_train) pred_prob_M_LR = pip.predict_proba(X_test)[:,0] print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test)) # In[18]: from sklearn.tree import DecisionTreeClassifier pip = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', extract_numeric), ('imputer', SimpleImputer()) ])), ('text_features', Pipeline([ ('selector', extract_text), ('vectorizer', CountVectorizer()) ])) ] )), ('clf', DecisionTreeClassifier()) ]) pip.fit(X_train, y_train) pred_prob_DT = pip.predict_proba(X_test)[0][:,1] print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) ) # In[19]: from sklearn.preprocessing import StandardScaler pip = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', extract_numeric), ('imputer', SimpleImputer()), ('scaler', StandardScaler()) ])), ('text_features', Pipeline([ ('selector', extract_text), ('vectorizer', CountVectorizer()) ])) ] )), ('clf', DecisionTreeClassifier()) ]) pip.fit(X_train, y_train) pred_prob_SS_DT = pip.predict_proba(X_test)[0][:,1] print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) ) # In[20]: from sklearn.ensemble import RandomForestClassifier pip = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', extract_numeric), ('imputer', SimpleImputer()) ])), ('text_features', Pipeline([ ('selector', extract_text), ('vectorizer', CountVectorizer()) ])) ] )), ('clf', RandomForestClassifier()) ]) pip.fit(X_train, y_train) pred_prob_RFC = pip.predict_proba(X_test)[0][:,1] print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) ) # In[21]: from sklearn.ensemble import RandomForestClassifier pip = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', extract_numeric), ('imputer', SimpleImputer()) ])), ('text_features', Pipeline([ ('selector', extract_text), ('vectorizer', CountVectorizer()) ])) ] )), ('clf', RandomForestClassifier(n_estimators=15)) ]) pip.fit(X_train, y_train) pred_prob_RFC_15 = pip.predict_proba(X_test)[0][:,1] print("Accuracy on all dataset: %.4f"% pip.score(X_test, y_test) ) # In[22]: from sklearn.metrics import roc_curve,roc_auc_score,auc for label,y_pred_prob in zip(["LR","MLR","DTC","SS_DTC","RFC","RFC_15"], [pred_prob_LR,pred_prob_M_LR,pred_prob_DT,pred_prob_SS_DT,pred_prob_RFC,pred_prob_RFC_15]): fpr,tpr, _ = roc_curve(y_test.iloc[:,0], y_pred_prob) plt.plot(fpr, tpr,label=label+str(", AUC: %.4f"%auc(fpr,tpr))) plt.plot([0, 1], [0, 1], 'k--') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.legend(loc='best') plt.title('BentchMarking using ROC and AUC') plt.savefig('snapshot/roc_curve',bbox_inches='tight',dpi=100); # In[23]: from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit plt.figure() plt.title("RandomForestClassifier") plt.xlabel("Training examples") plt.ylabel("Score") cv = ShuffleSplit(n_splits=100, test_size=0.2) train_sizes, train_scores, test_scores = learning_curve( pip, dataset[['numeric', 'with_missing', 'text']], pd.get_dummies(dataset['label']), cv=cv,train_sizes=np.linspace(.1, 1.0, 15)) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.grid() print('train_sizes:',train_sizes) plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r") plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g") plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score") plt.legend(loc="lower right") plt.savefig('snapshot/learning_curve',bbox_inches='tight',dpi=100); # In[24]: import scikitplot as skplt skplt.estimators.plot_learning_curve(pip, cv=cv, X=dataset[['numeric', 'with_missing', 'text']], y=pd.get_dummies(dataset['label'])) plt.savefig('snapshot/learning_curve_plot',bbox_inches='tight',dpi=100); # In[25]: param_range = np.linspace(3,14,12,dtype=int) from sklearn.model_selection import validation_curve train_scores, test_scores = validation_curve( pip, dataset[['numeric', 'with_missing', 'text']], pd.get_dummies(dataset['label']), param_name="clf__max_depth", param_range=param_range, cv=cv, scoring="accuracy") train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) print(train_scores_mean, train_scores_std) plt.title("Validation Curve") plt.xlabel("Max Depth") plt.ylabel("Score") plt.ylim(.6, 1) lw = 2 plt.semilogx(param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw) plt.fill_between(param_range, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.2, color="darkorange", lw=lw) plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw) plt.fill_between(param_range, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.2, color="navy", lw=lw) plt.xticks(param_range,param_range) plt.legend(loc="best") plt.tight_layout() plt.savefig('snapshot/validation_curve',bbox_inches='tight',dpi=100); # In[26]: skplt.metrics.plot_lift_curve(y_true=y_test.iloc[:,0].values, y_probas=pip.predict_proba(X_test)[0]) plt.savefig('snapshot/lift_curve',bbox_inches='tight',dpi=100); # In[27]: probas_list = [pred_prob_LR,pred_prob_M_LR,pred_prob_DT,pred_prob_SS_DT, pred_prob_RFC,pred_prob_RFC_15] clf_names= ["LR","MLR","DTC","SS_DTC","RFC","RFC_15"] import scikitplot as skplt skplt.metrics.plot_calibration_curve(y_test.iloc[:,0], probas_list=probas_list, clf_names=clf_names, n_bins=10) plt.savefig('snapshot/calibration_curve',bbox_inches='tight',dpi=100); # In[ ]: