Задача состоит в том, чтобы построить модель классификации спам-сообщений в SMS на основе имеющихся данных.
Решаться задача будет на датасете, взятом здесь: https://www.kaggle.com/uciml/sms-spam-collection-dataset
Целевым признаком является метка spam/ham — является ли SMS спамом или нет.
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.model_selection import StratifiedKFold, train_test_split, validation_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import warnings
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
df = pd.read_csv('../../data/spam.csv', encoding='latin-1')
df.head()
df.info()
df['Unnamed: 2'].unique()[: 5]
df[df['Unnamed: 2'] == ' PO Box 5249']
df['v2'] = df['v2'] + df['Unnamed: 2'].fillna('') + df['Unnamed: 3'].fillna('') + df['Unnamed: 4'].fillna('')
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)
df.rename(columns = {'v1' : 'label', 'v2' : 'sms'}, inplace = True)
df['label'] = df['label'].map({'spam' : 1, 'ham' : 0})
df.head()
df.info()
df[df['label'] == 0].sample(3)
df[df['label'] == 1].sample(3)
_, ax = plt.subplots()
plt.bar(np.arange(2), df['label'].value_counts(), color = ['green', 'red'])
ax.set_xticks(np.arange(2))
ax.set_xticklabels(['ham', 'spam']);
df['label'].value_counts()[1] / df.shape[0], df['label'].value_counts()[0] / df.shape[0]
df['len'] = df['sms'].apply(lambda x : len(x.strip().split()))
import regex as re
df['punctuation'] = df['sms'].apply(lambda x : len(re.findall("[^\P{P}-]+", x)))
df['punctuation'] = df['sms'].apply(lambda x : len(re.findall("[^\P{P}-]+", x)))
df['sms'] = df['sms'].apply(lambda x : re.sub("[^\P{P}-]+", "", x))
df['capital'] = df['sms'].apply(lambda x : sum(1 for c in x if c.isupper()))
df['sms'] = df['sms'].apply(lambda x : str.lower(x))
symbols = {}
for x in [item for sublist in list(map(list, df['sms'].tolist())) for item in sublist] :
if x in symbols :
symbols[x] += 1
else :
symbols[x] = 1
symbols
volwes = 'aeiou'
consonant = 'bcdfghjklmnpqrstvwxyz'
digits = '0123456789'
alphabet = set(volwes) | set(consonant) | set(digits)
len(alphabet)
bad_symbols = [x for x in symbols if x not in alphabet]
bad_symbols = ''.join(set(bad_symbols) - set(' '))
bad_symbols
df['badsymbol'] = df['sms'].apply(lambda x :1 if len([s for s in x if s in bad_symbols]) > 0 else 0)
df['sms'] = df['sms'].str.replace('å', 'a').str.replace('ä', 'a').str.replace('â', 'a').str.replace('á', 'a')
df['sms'] = df['sms'].str.replace('õ', 'o').str.replace('ò', 'o').str.replace('ð', 'o').str.replace('ö', '0') \
.str.replace('ó', 'o').str.replace('ô', 'o')
df['sms'] = df['sms'].str.replace('û', 'u')
df['sms'] = df['sms'].str.replace('è', 'e')
df['sms'] = df['sms'].str.replace('ì', '1').str.replace('ï', 'l')
df['moneysign'] = df['sms'].apply(lambda x : 1 if ('$' in list(x)) or ('£' in list(x)) else 0 )
symbols = {}
for x in [item for sublist in list(map(list, df['sms'].tolist())) for item in sublist] :
if x in symbols :
symbols[x] += 1
else :
symbols[x] = 1
bad_symbols = [x for x in symbols if x not in alphabet]
bad_symbols = ''.join(set(bad_symbols) - set(' '))
bad_symbols
for symb in bad_symbols :
df['sms'] = df['sms'].str.replace(symb, '')
symbols = {}
for x in [item for sublist in list(map(list, df['sms'].tolist())) for item in sublist] :
if x in symbols :
symbols[x] += 1
else :
symbols[x] = 1
symbols
df.head()
df['num'] = df['sms'].apply(lambda x : 1 if len([s for s in x if s in digits]) > 0 else 0)
df.columns
target = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size = 0.2, stratify = target, random_state = 10)
y_train.sum() / len(y_train), y_test.sum() / len(y_test)
X_train.shape, X_test.shape
for col in X_train.columns[2 :] :
fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 10))
# ax.set_ylabel('% фрагментов', fontsize=12)
# ax.set_xlabel('Имя автора', fontsize=12)
axes[0].set_title(col)
axes[0].hist(X_train[col], bins = 200);
axes[1].set_title(col)
axes[1].hist(X_train[col][X_train['label'] == 0], bins = 200, label = 'ham')
axes[1].hist(X_train[col][X_train['label'] == 1], bins = 200, label = 'spam')
plt.show()
fig, ax = plt.subplots(figsize = (20, 10))
sns.heatmap(X_train[['label', 'len', 'punctuation', 'capital', 'badsymbol',
'moneysign', 'num']].corr())
scaler = StandardScaler()
cols = ['len', 'punctuation', 'capital', 'badsymbol', 'moneysign', 'num']
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[cols]), columns = cols)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[cols]), columns = cols)
def valid(model, n, bayes = False) :
skf = StratifiedKFold(n_splits = n, random_state = 17)
auc_scores = []
for train_index, valid_index in skf.split(X_train_scaled, y_train):
X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]
y_train_part, y_valid = y_train[train_index], y_train[valid_index]
X_train_sms, X_valid_sms = X_train.iloc[train_index]['sms'], X_train.iloc[valid_index]['sms']
cv = TfidfVectorizer(ngram_range = (1, 3))
X_train_bow = cv.fit_transform(X_train_sms)
X_valid_bow = cv.transform(X_valid_sms)
if bayes :
X_train_new = X_train_bow
X_valid_new = X_valid_bow
else :
X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))
X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))
model.fit(X_train_new, y_train_part)
model_pred_for_auc = model.predict_proba(X_valid_new)
auc_scores.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))
return np.mean(auc_scores)
logit = LogisticRegression(random_state = 17)
bayes = MultinomialNB()
scores_logit = valid(logit, 10)
print('Logistic regreession - rocauc : {}'.format(scores_logit))
scores_bayes = valid(bayes, 10, True)
print('Bayessian classfier - rocauc : {}'.format(scores_bayes))
def valid_for_valid_plots(model, n, bayes = False) :
skf = StratifiedKFold(n_splits = n, random_state = 17)
auc_scores_cv = []
auc_scores_valid = []
for train_index, valid_index in skf.split(X_train_scaled, y_train):
X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]
y_train_part, y_valid = y_train[train_index], y_train[valid_index]
X_train_sms, X_valid_sms = X_train.iloc[train_index]['sms'], X_train.iloc[valid_index]['sms']
cv = TfidfVectorizer(ngram_range = (1, 3))
X_train_bow = cv.fit_transform(X_train_sms)
X_valid_bow = cv.transform(X_valid_sms)
if bayes :
X_train_new = X_train_bow
X_valid_new = X_valid_bow
else :
X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))
X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))
model.fit(X_train_new, y_train_part)
auc_scores_cv.append(roc_auc_score(y_train_part, model.predict_proba(X_train_new)[:, 1]))
model_pred_for_auc = model.predict_proba(X_valid_new)
auc_scores_valid.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))
return 1 - np.mean(auc_scores_valid), 1 - np.mean(auc_scores_cv)
Cs = [0.1 * i for i in range(1, 21)]
scores = []
for c in Cs :
logit = LogisticRegression(C = c, random_state = 17)
scores.append(valid_for_valid_plots(logit, 10))
fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (20, 10))
plt.plot(Cs, [i[0] for i in scores], color = 'blue', label='holdout')
plt.plot(Cs, [i[1] for i in scores], color = 'red', label='CV')
plt.ylabel("ROCAUC")
plt.xlabel("C")
plt.title('Validation curve for C in (0.1, 2)');
Cs = np.linspace(0.5, 1.5, 10)
for c in Cs :
logit = LogisticRegression(C = c, random_state = 17)
print(c, valid(logit, 10))
C_opt = 1.5
def valid_for_train_plots(model, n, alpha, bayes = False) :
skf = StratifiedKFold(n_splits = n, random_state = 17)
auc_scores_cv = []
auc_scores_valid = []
for train_index, valid_index in skf.split(X_train_scaled[: int(X_train_scaled.shape[0] * alpha)], y_train[: int(X_train_scaled.shape[0] * alpha)]):
X_train_part, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]
y_train_part, y_valid = y_train[train_index], y_train[valid_index]
X_train_sms, X_valid_sms = X_train.iloc[train_index]['sms'], X_train.iloc[valid_index]['sms']
cv = TfidfVectorizer(ngram_range = (1, 3))
X_train_bow = cv.fit_transform(X_train_sms)
X_valid_bow = cv.transform(X_valid_sms)
if bayes :
X_train_new = X_train_bow
X_valid_new = X_valid_bow
else :
X_train_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_bow, X_train_part]))
X_valid_new = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_valid_bow, X_valid]))
model.fit(X_train_new, y_train_part)
auc_scores_cv.append(roc_auc_score(y_train_part, model.predict_proba(X_train_new)[:, 1]))
model_pred_for_auc = model.predict_proba(X_valid_new)
auc_scores_valid.append(roc_auc_score(y_valid, model_pred_for_auc[:, 1]))
return np.mean(auc_scores_valid), np.mean(auc_scores_cv)
alphas = [0.1 * i for i in range(1, 11)]
scores = []
for alpha in alphas :
logit = LogisticRegression(C = C_opt, random_state = 17)
scores.append(valid_for_train_plots(logit, 10, alpha = alpha))
fig, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (20, 10))
plt.plot(alphas, [i[0] for i in scores], color = 'blue', label='holdout')
plt.plot(alphas, [i[1] for i in scores], color = 'red', label='CV')
plt.ylabel("ROCAUC")
plt.xlabel("C")
plt.title('Learnings curve with optimal C');
cv = TfidfVectorizer(ngram_range = (1, 3))
X_train_sms = cv.fit_transform(X_train['sms'])
X_test_sms = cv.transform(X_test['sms'])
train = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_train_sms, X_train_scaled]))
test = scipy.sparse.csr_matrix(scipy.sparse.hstack([X_test_sms, X_test_scaled]))
logit = LogisticRegression(C = C_opt, random_state = 17)
logit.fit(train, y_train)
for x, y in zip(cols, logit.coef_[0][len(cv.get_feature_names()) :]) :
print(x, y)
logit_pred = logit.predict_proba(test)
roc_auc_score(y_test, logit_pred[:, 1])