#!/usr/bin/env python
# coding: utf-8
#
#
# Machine Learning Using Python (MEAFA Workshop)
# Lesson 7: Ensembles and Stacking
#
#
# Twitter Airline Sentiment Data
# Data Preparation
# Text Classification Methods
# Voting Classifier
# Model Stacking
# Model Evaluation
#
# This notebook relies on the following imports and settings.
# In[1]:
# Packages
import nltk
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# In[2]:
# Plot settings
sns.set_context('notebook')
sns.set_style('ticks')
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (9, 6)
# In[3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix, log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
# ## Twitter Airline Sentiment Data
#
# In this lesson we revisit the Twitter airline sentiment dataset. To save time, we directly load the processed dataset that we constructed in the earlier lesson.
# In[4]:
data = pd.read_pickle('Datasets/processed_tweets.pickle')
data.head()
# In[5]:
data[['text','tokens']].tail(10)
# In[6]:
# Randomly split indexes
index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=1)
# Write training and test sets
train = data.loc[index_train,:].copy()
test = data.loc[index_test,:].copy()
y_train = train['positive'].values
y_test = test['positive'].values
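# A variant the lesson does not use: with imbalanced sentiment labels, a stratified split keeps
# the class proportions equal across the training and test sets (illustrative only; the variable
# names below are new and the rest of the lesson keeps the plain random split above).
index_train_s, index_test_s = train_test_split(np.array(data.index), train_size=0.7,
                                               stratify=data['positive'], random_state=1)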
# ## Data Preparation
#
# Compute frequency distribution of tokens.
# In[7]:
fdist = nltk.FreqDist()
for words in train['tokens']:
    for word in words:
        fdist[word] += 1
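# The same counts could be built with the standard library's Counter; nltk's FreqDist is itself
# a Counter subclass, so the two are interchangeable here (illustrative only):
from collections import Counter
fdist_alt = Counter(word for words in train['tokens'] for word in words)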
# Discard features that appear too rarely, then retrieve the list of remaining tokens.
# In[8]:
features = pd.Series(dict(fdist))
features = features.sort_values(ascending=False)
features = features[features>=5]
len(features)
# Rank the features by their univariate predictive performance, in case we want to include a screening step.
# In[9]:
def univariate_design_matrix(feature, series):
    X = series.apply(lambda tokens: (feature in tokens))
    X = X.astype(int)
    return X.values.reshape((-1, 1))  # convert to a NumPy matrix, as required

def training_error(feature):
    # Fit a Bernoulli naive Bayes classifier on the single feature and return its training log-loss
    X_train = univariate_design_matrix(feature, train['tokens'])
    nbc = BernoulliNB().fit(X_train, np.ravel(y_train))
    prob = nbc.predict_proba(X_train)
    return log_loss(y_train, prob)

losses = []
for feature in features.index:
    losses.append(training_error(feature))

ranked = pd.Series(losses, index=features.index)
ranked = ranked.sort_values()
ranked_features = list(ranked.index)
# Build design matrix (slow to run).
# In[10]:
from scipy.sparse import lil_matrix
def design_matrix(features, series):
    X = lil_matrix((len(series), len(features)))  # initialise a sparse matrix
    for i in range(len(series)):
        tokens = series.iloc[i]
        for j, feature in enumerate(features):  # scan the list of features
            if feature in tokens:  # if the feature is among the tokens,
                X[i, j] = 1.0
    return X
X_train = design_matrix(ranked_features, train['tokens'])
X_test = design_matrix(ranked_features, test['tokens'])
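# A faster alternative (a sketch using scikit-learn's CountVectorizer): an identity analyzer with
# a fixed vocabulary builds the same binary matrix in sparse format without the Python-level loops.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary=ranked_features, analyzer=lambda tokens: tokens, binary=True)
X_train_alt = vectorizer.transform(train['tokens'])  # should match X_train up to sparse format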
# ## Text Classification Methods
#
# ### Naive Bayes
# In[11]:
nbc = BernoulliNB()
nbc.fit(X_train, y_train)
# ### Regularised Logistic Regression
# In[12]:
logit_l1 = LogisticRegressionCV(Cs=50, penalty='l1', solver='liblinear', scoring='neg_log_loss')
logit_l1.fit(X_train, y_train.ravel())
# In[13]:
# Number of coefficients shrunk exactly to zero by the L1 penalty
np.sum(np.abs(logit_l1.coef_) == 0.0)
# In[14]:
logit_l2 = LogisticRegressionCV(Cs=50, penalty='l2', scoring='neg_log_loss')
logit_l2.fit(X_train, y_train.ravel())
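# The regularisation strengths selected by cross-validation can be inspected directly:
print('C selected for the L1 model:', logit_l1.C_)
print('C selected for the L2 model:', logit_l2.C_)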
# ### Random Forest
# In[15]:
#%%time
model = RandomForestClassifier(criterion = 'entropy', n_estimators=100)
tuning_parameters = {
'min_samples_leaf': [5, 10, 20, 50],
'max_features': np.arange(50, X_train.shape[1], 50),
}
rf_search = RandomizedSearchCV(model, tuning_parameters, cv = 5, n_iter= 16, scoring='neg_log_loss',
return_train_score=False, n_jobs=4)
rf_search.fit(X_train, y_train)
rf = rf_search.best_estimator_
print('Best parameters found by randomised search:', rf_search.best_params_, '\n')
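# To see how the sampled configurations compare, the cross-validation results can be tabulated:
rf_cv = pd.DataFrame(rf_search.cv_results_)
rf_cv[['params', 'mean_test_score', 'rank_test_score']].sort_values('rank_test_score').head()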
# ### Gradient Boosting
# In[16]:
#%%time
from xgboost import XGBClassifier
model = XGBClassifier()
alphas = [0] + list(np.logspace(-10, 10, 81, base=2))
tuning_parameters = {
'learning_rate': [0.001, 0.01, 0.05, 0.1],
'n_estimators' : [250, 500, 750, 1000, 1500, 2000, 2500, 3000, 5000],
'max_depth' : [1, 2, 3, 4],
'reg_alpha': alphas,
}
gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter = 32, cv = 5, scoring='neg_log_loss',
return_train_score=False, n_jobs=4, random_state = 10)
gb_search.fit(X_train, y_train)
gb = gb_search.best_estimator_
print('Best parameters found by randomised search:', gb_search.best_params_, '\n')
# In[17]:
from statlearning import plot_feature_importance
plot_feature_importance(gb, ranked_features, max_features=30)
plt.show()
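# If the statlearning helper is unavailable, a rough equivalent with pandas and matplotlib
# (XGBClassifier exposes feature_importances_, like the scikit-learn tree ensembles):
importances = pd.Series(gb.feature_importances_, index=ranked_features)
importances.nlargest(30).sort_values().plot(kind='barh')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()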
# ### AdaBoost
# In[18]:
#%%time
from sklearn.ensemble import AdaBoostClassifier
learner = DecisionTreeClassifier(criterion='gini')
model = AdaBoostClassifier(base_estimator = learner)
tuning_parameters = {
'base_estimator__max_depth' : [1,2,3,4],
'learning_rate' : [0.001, 0.01, 0.02, 0.05, 0.1],
'n_estimators' : [100, 250, 500, 750, 1000, 1500, 2000, 3000],
}
adaboost_search = RandomizedSearchCV(model, tuning_parameters, n_iter = 4, cv = 5, scoring='neg_log_loss',
return_train_score=False, n_jobs=4, random_state = 1)
adaboost_search.fit(X_train, y_train)
adaboost = adaboost_search.best_estimator_
print('Best parameters found by randomised search:', adaboost_search.best_params_, '\n')
# ### Linear Support Vector Classifier
# In[19]:
#%%time
from sklearn.svm import LinearSVC
Cs = np.logspace(-10, 10, 81, base=2)
model = LinearSVC(loss='hinge')
tuning_parameters ={
'C': Cs,
}
svm_search = GridSearchCV(model, tuning_parameters, cv=5, return_train_score=False, n_jobs=4)
svm_search.fit(X_train, y_train)
svm = svm_search.best_estimator_
print('Best parameters found by grid search:', svm_search.best_params_, '\n')
# ## Voting Classifier
# In[20]:
#%%time
from sklearn.ensemble import VotingClassifier
clfs = [('clf1', nbc), ('clf2', logit_l1), ('clf3', logit_l2), ('clf4', gb), ('clf5', svm) ]
vhard = VotingClassifier(clfs)
vhard.fit(X_train, y_train)
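# Hard voting predicts the majority label across the base classifiers. A minimal sketch of the
# same rule by hand (up to tie-breaking), using scipy.stats.mode over the stacked predictions:
label_preds = np.stack([clf.predict(X_test) for name, clf in clfs])
manual_hard = stats.mode(label_preds, axis=0)[0].ravel()  # should broadly agree with vhard.predict(X_test)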
# In[21]:
#%%time
clfs = [('clf1', nbc), ('clf2', logit_l1), ('clf3', logit_l2), ('clf4', gb)]
# We exclude the SVM, since LinearSVC does not predict probabilities, which soft voting requires
vsoft = VotingClassifier(clfs, voting='soft')
vsoft.fit(X_train, y_train)
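# Soft voting averages the predicted class probabilities and picks the most probable class; the
# same computation by hand, reusing the already fitted base learners:
avg_prob = np.mean([clf.predict_proba(X_test) for name, clf in clfs], axis=0)
manual_soft = avg_prob.argmax(axis=1)  # should match vsoft.predict(X_test) up to refit randomness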
# ## Model Stacking
#
# In[22]:
#%%time
from mlxtend.classifier import StackingCVClassifier

stack = StackingCVClassifier([nbc, logit_l1, logit_l2, gb], use_probas=True,
                             meta_classifier=LogisticRegression(C=1e4), cv=5)
stack.fit(X_train.todense(), y_train)

# The stacking class is not compatible with sparse matrices, which considerably slows down training.
# Remove XGBoost, or replace it with LightGBM, for faster results.
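# Conceptually, StackingCVClassifier collects out-of-fold class probabilities from each base
# learner and fits the meta-classifier on them. A minimal illustrative sketch of that idea with
# cross_val_predict (gb is omitted here for speed; the CV learners are refit on each fold, so
# this is still not fast):
from sklearn.model_selection import cross_val_predict
meta_features = np.column_stack([
    cross_val_predict(clf, X_train, y_train, cv=5, method='predict_proba')[:, 1]
    for clf in [nbc, logit_l1, logit_l2]
])
meta_model = LogisticRegression(C=1e4).fit(meta_features, y_train)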
# ## Model Evaluation
#
# In[23]:
columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Naive Bayes', 'Logistic L1', 'Logistic L2', 'Random Forest',
        'Gradient Boosting', 'AdaBoost', 'Linear SVC', 'Hard Voting', 'Soft Voting', 'Stack']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [nbc, logit_l1, logit_l2, rf, gb, adaboost, svm, vhard, vsoft, stack]
y_prob = np.zeros((len(test), len(rows)))

for i, method in enumerate(methods):
    if method is not stack:
        y_pred = method.predict(X_test)
    else:  # the stack was fitted on a dense matrix, so it must also predict on one
        y_pred = method.predict(X_test.todense())

    if method not in [svm, vhard, stack]:
        y_prob[:, i] = method.predict_proba(X_test)[:, 1]
        results.iloc[i, 3] = roc_auc_score(y_test, y_prob[:, i])
    elif method is svm:  # LinearSVC and hard voting do not predict probabilities;
        y_df = method.decision_function(X_test)  # the SVM decision values still give an AUC,
        results.iloc[i, 3] = roc_auc_score(y_test, y_df)  # while hard voting's AUC is left at zero
    elif method is stack:
        y_prob[:, i] = method.predict_proba(X_test.todense())[:, 1]
        results.iloc[i, 3] = roc_auc_score(y_test, y_prob[:, i])

    confusion = confusion_matrix(y_test, y_pred)
    results.iloc[i, 0] = 1 - accuracy_score(y_test, y_pred)
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])  # sensitivity (recall)
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])  # specificity
    results.iloc[i, 4] = precision_score(y_test, y_pred)

results.round(3)
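# Sorting by the test error rate gives a quick ranking of the methods:
results.sort_values('Error rate').round(3)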