#!/usr/bin/env python
# coding: utf-8

# # Machine Learning Using Python (MEAFA Workshop)

# ## Lesson 7: Ensembles and Stacking
#
# - Twitter Airline Sentiment Data
# - Data Preparation
# - Text Classification Methods
# - Voting Classifier
# - Model Stacking
# - Model Evaluation
# This notebook relies on the following imports and settings.

# In[1]:

# Packages
import nltk
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# In[2]:

# Plot settings
sns.set_context('notebook')
sns.set_style('ticks')
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B',
           '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
          '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']
sns.set_palette(colours)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (9, 6)

# In[3]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (accuracy_score, recall_score, precision_score, roc_auc_score,
                             confusion_matrix, log_loss)
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# ## Twitter Airline Sentiment Data
#
# In this lesson we revisit the Twitter airline sentiment dataset. To save time, we directly load
# the processed dataset that we constructed in the earlier lesson.

# In[4]:

data = pd.read_pickle('Datasets/processed_tweets.pickle')
data.head()

# In[5]:

data[['text', 'tokens']].tail(10)

# In[6]:

# Randomly split the indexes
index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=1)

# Write the training and test sets
train = data.loc[index_train, :].copy()
test = data.loc[index_test, :].copy()
y_train = train['positive'].values
y_test = test['positive'].values

# ## Data Preparation
#
# Compute the frequency distribution of the tokens.

# In[7]:

fdist = nltk.FreqDist()
for words in train['tokens']:
    for word in words:
        fdist[word] += 1

# Discard features with too few appearances and retrieve the list of remaining tokens.

# In[8]:

features = pd.Series(dict(fdist))
features = features.sort_values(ascending=False)
features = features[features >= 5]
len(features)

# Rank the features based on univariate performance, in case we want to include screening.

# In[9]:

def univariate_design_matrix(feature, series):
    X = series.apply(lambda tokens: (feature in tokens))
    X = X.astype(int)
    return X.values.reshape((-1, 1))  # convert to a NumPy matrix, as required

def training_error(feature):
    X_train = univariate_design_matrix(feature, train['tokens'])
    nbc = BernoulliNB().fit(X_train, np.ravel(y_train))
    prob = nbc.predict_proba(X_train)
    return log_loss(y_train, prob)

losses = []
for feature in features.index:
    losses.append(training_error(feature))

ranked = pd.Series(losses, index=features.index)
ranked = ranked.sort_values()
ranked_features = list(ranked.index)
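# *Aside (not part of the original lesson).* Because the tweets are already tokenised, the same 0/1
# design matrix can typically be built much faster with scikit-learn's `CountVectorizer`. The sketch
# below is illustrative only: `vectoriser`, `X_train_alt` and `X_test_alt` are hypothetical names,
# and fixing the vocabulary to `ranked_features` keeps the column order identical to the
# `design_matrix` function used in the next cell.

from sklearn.feature_extraction.text import CountVectorizer

vectoriser = CountVectorizer(analyzer=lambda tokens: tokens,  # documents are already lists of tokens
                             vocabulary=ranked_features,      # fix the columns to the ranked features
                             binary=True)                     # 0/1 indicators rather than counts
X_train_alt = vectoriser.fit_transform(train['tokens'])       # sparse CSR matrix
X_test_alt = vectoriser.transform(test['tokens'])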
# Build the design matrix (slow to run).

# In[10]:

from scipy.sparse import lil_matrix

def design_matrix(features, series):
    X = lil_matrix((len(series), len(features)))  # initialise
    for i in range(len(series)):
        tokens = series.iloc[i]
        for j, feature in enumerate(features):  # scan the list of features
            if feature in tokens:  # if the feature is among the tokens,
                X[i, j] = 1.0
    return X

X_train = design_matrix(ranked_features, train['tokens'])
X_test = design_matrix(ranked_features, test['tokens'])

# ## Text Classification Methods
#
# ### Naive Bayes

# In[11]:

nbc = BernoulliNB()
nbc.fit(X_train, y_train)

# ### Regularised Logistic Regression

# In[12]:

logit_l1 = LogisticRegressionCV(Cs=50, penalty='l1', solver='liblinear', scoring='neg_log_loss')
logit_l1.fit(X_train, y_train.ravel())

# In[13]:

np.sum(np.abs(logit_l1.coef_) == 0.0)

# In[14]:

logit_l2 = LogisticRegressionCV(Cs=50, penalty='l2', scoring='neg_log_loss')
logit_l2.fit(X_train, y_train.ravel())

# ### Random Forest

# In[15]:

#%%time
model = RandomForestClassifier(criterion='entropy', n_estimators=100)

tuning_parameters = {
    'min_samples_leaf': [5, 10, 20, 50],
    'max_features': np.arange(50, X_train.shape[1], 50),
}

rf_search = RandomizedSearchCV(model, tuning_parameters, cv=5, n_iter=16, scoring='neg_log_loss',
                               return_train_score=False, n_jobs=4)
rf_search.fit(X_train, y_train)

rf = rf_search.best_estimator_
print('Best parameters found by randomised search:', rf_search.best_params_, '\n')

# ### Gradient Boosting

# In[16]:

#%%time
from xgboost import XGBClassifier

model = XGBClassifier()

alphas = [0] + list(np.logspace(-10, 10, 81, base=2))

tuning_parameters = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'n_estimators': [250, 500, 750, 1000, 1500, 2000, 2500, 3000, 5000],
    'max_depth': [1, 2, 3, 4],
    'reg_alpha': alphas,
}

gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=32, cv=5, scoring='neg_log_loss',
                               return_train_score=False, n_jobs=4, random_state=10)
gb_search.fit(X_train, y_train)

gb = gb_search.best_estimator_
print('Best parameters found by randomised search:', gb_search.best_params_, '\n')

# In[17]:

from statlearning import plot_feature_importance

plot_feature_importance(gb, ranked_features, max_features=30)
plt.show()

# ### AdaBoost

# In[18]:

#%%time
from sklearn.ensemble import AdaBoostClassifier

learner = DecisionTreeClassifier(criterion='gini')
model = AdaBoostClassifier(base_estimator=learner)

tuning_parameters = {
    'base_estimator__max_depth': [1, 2, 3, 4],
    'learning_rate': [0.001, 0.01, 0.02, 0.05, 0.1],
    'n_estimators': [100, 250, 500, 750, 1000, 1500, 2000, 3000],
}

adaboost_search = RandomizedSearchCV(model, tuning_parameters, n_iter=4, cv=5, scoring='neg_log_loss',
                                     return_train_score=False, n_jobs=4, random_state=1)
adaboost_search.fit(X_train, y_train)

adaboost = adaboost_search.best_estimator_
print('Best parameters found by randomised search:', adaboost_search.best_params_, '\n')

# ### Linear Support Vector Classifier

# In[19]:

#%%time
from sklearn.svm import LinearSVC

Cs = np.logspace(-10, 10, 81, base=2)

model = LinearSVC(loss='hinge')

tuning_parameters = {
    'C': Cs,
}

svm_search = GridSearchCV(model, tuning_parameters, cv=5, return_train_score=False, n_jobs=4)
svm_search.fit(X_train, y_train)

svm = svm_search.best_estimator_
print('Best parameters found by grid search:', svm_search.best_params_, '\n')

# ## Voting Classifier

# In[20]:

#%%time
from sklearn.ensemble import VotingClassifier

clfs = [('clf1', nbc), ('clf2', logit_l1), ('clf3', logit_l2), ('clf4', gb), ('clf5', svm)]

vhard = VotingClassifier(clfs)
vhard.fit(X_train, y_train)
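# *Aside (not part of the original lesson).* To make the hard-voting rule concrete, the majority vote
# can be reproduced directly from the already-fitted base classifiers. `base_preds` and `manual_vote`
# are illustrative names; the result should closely match `vhard.predict(X_test)`, up to refitting
# differences in the cross-validated base models.

base_preds = np.column_stack([clf.predict(X_test) for _, clf in clfs])  # one column of 0/1 labels per base model
manual_vote = (base_preds.mean(axis=1) > 0.5).astype(int)               # majority: at least 3 of the 5 votes
np.mean(manual_vote == vhard.predict(X_test))                           # agreement with the fitted VotingClassifier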
# In[21]:

#%%time
# We exclude the SVM since it does not predict probabilities
clfs = [('clf1', nbc), ('clf2', logit_l1), ('clf3', logit_l2), ('clf4', gb)]

vsoft = VotingClassifier(clfs, voting='soft')
vsoft.fit(X_train, y_train)

# ## Model Stacking

# In[22]:

#%%time
from mlxtend.classifier import StackingCVClassifier

stack = StackingCVClassifier([nbc, logit_l1, logit_l2, gb], use_probas=True,
                             meta_classifier=LogisticRegression(C=1e4), cv=5)
stack.fit(X_train.todense(), y_train)

# The stacking class is not compatible with sparse matrices, which considerably slows down training.
# Remove XGBoost, or replace it with LightGBM, for faster results.

# ## Model Evaluation

# In[23]:

columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Naive Bayes', 'Logistic L1', 'Logistic L2', 'Random Forest', 'Gradient Boosting',
        'AdaBoost', 'Linear SVC', 'Hard Voting', 'Soft Voting', 'Stack']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [nbc, logit_l1, logit_l2, rf, gb, adaboost, svm, vhard, vsoft, stack]

y_prob = np.zeros((len(test), len(rows)))
for i, method in enumerate(methods):

    if method != stack:
        y_pred = method.predict(X_test)
    else:
        y_pred = method.predict(X_test.todense())

    if method not in [svm, vhard, stack]:  # the SVM and hard voting do not predict probabilities
        y_prob[:, i] = method.predict_proba(X_test)[:, 1]
        results.iloc[i, 3] = roc_auc_score(y_test, y_prob[:, i])
    elif method == svm:
        y_df = method.decision_function(X_test)
        results.iloc[i, 3] = roc_auc_score(y_test, y_df)
    elif method == stack:
        y_prob[:, i] = method.predict_proba(X_test.todense())[:, 1]
        results.iloc[i, 3] = roc_auc_score(y_test, y_prob[:, i])

    confusion = confusion_matrix(y_test, y_pred)
    results.iloc[i, 0] = 1 - accuracy_score(y_test, y_pred)
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])
    results.iloc[i, 4] = precision_score(y_test, y_pred)

results.round(3)
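# *Aside (not part of the original lesson).* As a complement to the AUC column, we can plot ROC curves
# from the predicted probabilities stored in `y_prob` above. This sketch skips the linear SVC and the
# hard voting classifier, whose columns of `y_prob` were never filled.

from sklearn.metrics import roc_curve

fig, ax = plt.subplots()
for i, label in enumerate(rows):
    if label in ['Linear SVC', 'Hard Voting']:
        continue
    fpr, tpr, _ = roc_curve(y_test, y_prob[:, i])
    ax.plot(fpr, tpr, label=label)
ax.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance line
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend()
sns.despine()
plt.show()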