import numpy as np
import pandas as pd
import nltk
import spacy
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
Generative model: Naive Bayes models the joint distribution of the features X and the target Y, and then uses it to compute the posterior probability P(y|x).
The Naive Bayes classifier applies Bayes' rule to assign labels to input data: for a given set of input features, the output is the label with the highest posterior probability.
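For reference, Bayes' rule and the resulting Naive Bayes decision rule (under the "naive" assumption that the features are conditionally independent given the class) can be written as:

$$P(y \mid x_1, \dots, x_n) = \frac{P(y)\, P(x_1, \dots, x_n \mid y)}{P(x_1, \dots, x_n)}$$

$$\hat{y} = \underset{y}{\arg\max}\; P(y) \prod_{i=1}^{n} P(x_i \mid y)$$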
df_sms = pd.read_csv('data/spam.csv',encoding='latin-1')
df_sms = df_sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df_sms = df_sms.rename(columns={"v1":"label", "v2":"sms"})
df_sms.head()
 | label | sms |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
df_sms['label'].value_counts()
ham     4825
spam     747
Name: label, dtype: int64
df_sms['length'] = df_sms['sms'].apply(len)
# distplot is deprecated in recent seaborn releases; histplot with a KDE overlay is the replacement
sns.histplot(df_sms['length'], kde=True)
plt.show()
df_sms.hist(column='length', by='label', bins=100, figsize=(10, 4))
plt.show()
df_sms.loc[:,'label'] = df_sms.label.map({'ham':0, 'spam':1})
df_sms.head()
 | label | sms | length |
---|---|---|---|
0 | 0 | Go until jurong point, crazy.. Available only ... | 111 |
1 | 0 | Ok lar... Joking wif u oni... | 29 |
2 | 1 | Free entry in 2 a wkly comp to win FA Cup fina... | 155 |
3 | 0 | U dun say so early hor... U c already then say... | 49 |
4 | 0 | Nah I don't think he goes to usf, he lives aro... | 61 |
CountVectorizer converts a collection of text documents into a matrix of token counts.
The implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
documents = ["Hi Mom, happy Thanksgiving, I know I'm your favorite.",
"However, I never read rubrics when I do peer reviews.",
"Epstein did not kill himself!"]
count_vector = CountVectorizer(stop_words=stopWords)
count_vector.fit(documents)
count_vector.get_feature_names_out()  # get_feature_names() was renamed in newer scikit-learn
['epstein', 'favorite', 'happy', 'hi', 'however', 'kill', 'know', 'mom', 'never', 'peer', 'read', 'reviews', 'rubrics', 'thanksgiving']
# convert the documents into a dense count array using count_vector.transform
doc_array = count_vector.transform(documents).toarray()
# convert doc_array to a DataFrame for visualization
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names_out())
frequency_matrix
 | epstein | favorite | happy | hi | however | kill | know | mom | never | peer | read | reviews | rubrics | thanksgiving |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 |
2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Now let's apply the count vectorizer to our dataset as part of our pre-processing.
# train_test_split is used to split our data into subsets for training and testing
X_train, X_test, y_train, y_test = train_test_split(df_sms['sms'], df_sms['label'], test_size=0.20)
count_vector = CountVectorizer()
# bag-of-words matrix for the training data
training_data = count_vector.fit_transform(X_train)
# bag-of-words matrix for the testing data, using the vocabulary learned from the training data
testing_data = count_vector.transform(X_test)
Note that there are several Naive Bayes variants available in sklearn. Another commonly used one is Gaussian Naive Bayes, which is useful for continuous attributes, as sketched below.
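As a small illustration (separate from the spam pipeline, with made-up synthetic data): GaussianNB fits a normal distribution per class to each feature, so it suits real-valued inputs where MultinomialNB's count assumption would not apply.

# Hypothetical illustration: Gaussian Naive Bayes on continuous features
import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(0)
# synthetic continuous data: two classes with different means on two features
X_cont = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(3, 1, (50, 2))])
y_cont = np.array([0] * 50 + [1] * 50)

gnb = GaussianNB().fit(X_cont, y_cont)
print(gnb.predict([[0.1, -0.2], [2.9, 3.1]]))  # expected: [0 1]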
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
predictions = naive_bayes.predict(testing_data)
Accuracy: the ratio of correctly predicted observations to the total observations

$$\text{Accuracy} = \frac{TP + TN}{TP + FP + FN + TN}$$

Precision: the ratio of correctly predicted positive observations to the total predicted positive observations

$$\text{Precision} = \frac{TP}{TP + FP}$$

Recall: the ratio of correctly predicted positive observations to all observations in the actual positive class

$$\text{Recall} = \frac{TP}{TP + FN}$$

F1: the harmonic mean of Precision and Recall

$$F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: {}'.format(accuracy_score(y_test, predictions)))
print('Precision score: {}'.format(precision_score(y_test, predictions)))
print('Recall score: {}'.format(recall_score(y_test, predictions)))
print('F1 score: {}'.format(f1_score(y_test, predictions)))
Accuracy score: 0.9847533632286996
Precision score: 0.9523809523809523
Recall score: 0.9333333333333333
F1 score: 0.9427609427609427
true positives (TP): These are cases in which we predicted yes (they have the disease), and they do have the disease.
true negatives (TN): We predicted no, and they don't have the disease.
false positives (FP): We predicted yes, but they don't actually have the disease. (Also known as a "Type I error.")
false negatives (FN): We predicted no, but they actually do have the disease. (Also known as a "Type II error.")
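To connect these four counts to the metric formulas above, here is a minimal sketch that recomputes the scores by hand from sklearn's binary confusion matrix, which is laid out as [[TN, FP], [FN, TP]]:

# Sketch: derive accuracy, precision, recall, and F1 directly from the confusion matrix counts
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

manual_accuracy  = (tp + tn) / (tp + tn + fp + fn)
manual_precision = tp / (tp + fp)
manual_recall    = tp / (tp + fn)
manual_f1        = 2 * manual_precision * manual_recall / (manual_precision + manual_recall)
print(manual_accuracy, manual_precision, manual_recall, manual_f1)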
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(conf_mat, annot=True, cmap="Greens", fmt='d',
xticklabels=['ham','spam'],
yticklabels=['ham','spam'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title("Confusion Matrix - Naive Bayes\n", size=16);
Image(filename='data/confusionmatrix.png')
Discriminative model: Logistic regression directly models the posterior probability P(y|x), learning the input-to-output mapping by minimizing the error.

It uses the logistic (sigmoid) function to model binary output variables:

$$y = \frac{e^{b_0 + b_1 x}}{1 + e^{b_0 + b_1 x}}$$
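As a small illustrative sketch (the coefficients b0 and b1 below are made up, not learned from data), the logistic function can be written directly in numpy; note that e^(b0 + b1·x) / (1 + e^(b0 + b1·x)) is algebraically the same as 1 / (1 + e^-(b0 + b1·x)):

import numpy as np

def sigmoid(z):
    """Logistic function: maps any real-valued z to a probability in (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

b0, b1 = -1.0, 0.5          # hypothetical intercept and coefficient
x = 3.0                     # hypothetical input value
p = sigmoid(b0 + b1 * x)    # predicted probability of the positive class
print(p)                    # ~0.62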
df = pd.read_csv('data/financial_product_complaints.csv', index_col = 0)
df.head()
 | Product | Consumer_complaint |
---|---|---|
0 | Credit reporting, repair, or other | it is the repeated fraud attempt from experian... |
1 | Debt collection | We received multiple voice mails from Weltman,... |
2 | Debt collection | Deactivated my car whenever I was a day late m... |
3 | Student loan | My complaint is regarding my Student Loan Serv... |
4 | Debt collection | Syndicated Office Systems is currently reporti... |
pd.DataFrame(df.Product.unique())
 | 0 |
---|---|
0 | Credit reporting, repair, or other |
1 | Debt collection |
2 | Student loan |
3 | Checking or savings account |
4 | Mortgage |
5 | Payday loan, title loan, or personal loan |
6 | Vehicle loan or lease |
7 | Credit card or prepaid card |
8 | Bank account or service |
9 | Consumer Loan |
10 | Money transfer, virtual currency, or money ser... |
11 | Other financial service |
12 | Money transfers |
# Create a new column 'category_id' with encoded categories
df['category_id'] = df['Product'].factorize()[0]
category_id_df = df[['Product', 'category_id']].drop_duplicates()
category_id_df
 | Product | category_id |
---|---|---|
0 | Credit reporting, repair, or other | 0 |
1 | Debt collection | 1 |
3 | Student loan | 2 |
7 | Checking or savings account | 3 |
10 | Mortgage | 4 |
14 | Payday loan, title loan, or personal loan | 5 |
24 | Vehicle loan or lease | 6 |
47 | Credit card or prepaid card | 7 |
67 | Bank account or service | 8 |
70 | Consumer Loan | 9 |
118 | Money transfer, virtual currency, or money ser... | 10 |
128 | Other financial service | 11 |
1283 | Money transfers | 12 |
# Dictionaries for future use
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Product']].values)
# The dataframe with the new category_id column
df.head()
 | Product | Consumer_complaint | category_id |
---|---|---|---|
0 | Credit reporting, repair, or other | it is the repeated fraud attempt from experian... | 0 |
1 | Debt collection | We received multiple voice mails from Weltman,... | 1 |
2 | Debt collection | Deactivated my car whenever I was a day late m... | 1 |
3 | Student loan | My complaint is regarding my Student Loan Serv... | 2 |
4 | Debt collection | Syndicated Office Systems is currently reporti... | 1 |
TF-IDF is the product of the TF and IDF scores of a term:

$$\text{TF-IDF} = \text{TF} \times \text{IDF}$$

Term Frequency (TF): summarizes how often a given word appears within a document.

$$\text{TF} = \frac{\text{Number of times the term appears in the document}}{\text{Total number of words in the document}}$$

Inverse Document Frequency (IDF): downscales words that appear frequently across documents. A term has a high IDF score if it appears in only a few documents; conversely, if the term is very common across documents (e.g., "the", "a", "is"), it has a low IDF score.
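As a toy illustration (separate from the complaint data), fitting a TfidfVectorizer on three tiny documents and inspecting its learned idf_ weights shows the effect: a word that appears in every document gets the lowest IDF, while rarer words get higher IDF.

from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["the cat sat", "the dog sat", "the cat ran"]
toy_tfidf = TfidfVectorizer()
toy_tfidf.fit(toy_docs)

# 'the' appears in all three documents, so it gets the lowest IDF weight;
# 'dog' and 'ran' appear in only one document each, so they get the highest.
for term, idf in zip(toy_tfidf.get_feature_names_out(), toy_tfidf.idf_):
    print(f"{term}: idf = {idf:.3f}")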
from sklearn.feature_extraction.text import TfidfVectorizer
An n-gram is a contiguous sequence of n items from a given sample of text.
Unigrams are single words; bigrams are pairs of consecutive words (see the mini-example below).
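A quick hypothetical example of what ngram_range=(1, 2) produces: both the single words and every pair of adjacent words become features.

from sklearn.feature_extraction.text import CountVectorizer

demo = CountVectorizer(ngram_range=(1, 2))
demo.fit(["the quick brown fox"])
print(demo.get_feature_names_out())
# ['brown' 'brown fox' 'fox' 'quick' 'quick brown' 'the' 'the quick']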
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words='english')
# use tfidf to transform complaints into vectors
features = tfidf.fit_transform(df.Consumer_complaint).toarray()
labels = df.category_id
features.shape
(10000, 28342)
from sklearn.feature_selection import chi2
# Finding the three most correlated terms with each of the product categories
N = 3
for Product, category_id in sorted(category_to_id.items()):
    # calculate chi2 statistics between each feature and the current category
    features_chi2 = chi2(features, labels == category_id)
    # sort feature indices by chi2 score (ascending)
    indices = np.argsort(features_chi2[0])
    # get feature names corresponding to those indices
    feature_names = np.array(tfidf.get_feature_names_out())[indices]
    # separate the ranked features into unigrams and bigrams
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    # print the top N unigrams and bigrams for this category
    print("\n %s:" % (Product), '\n------------')
    print(" -->Most Correlated Unigrams are: %s" % (', '.join(unigrams[-N:])))
    print(" -->Most Correlated Bigrams are: %s" % (', '.join(bigrams[-N:])))
Bank account or service:
------------
 -->Most Correlated Unigrams are: bank, deposit, overdraft
 -->Most Correlated Bigrams are: overdraft fee, checking account, overdraft fees

Checking or savings account:
------------
 -->Most Correlated Unigrams are: deposited, checking, overdraft
 -->Most Correlated Bigrams are: debit card, savings account, checking account

Consumer Loan:
------------
 -->Most Correlated Unigrams are: dealer, vehicle, car
 -->Most Correlated Bigrams are: credit acceptance, leased vehicle, acceptance corp

Credit card or prepaid card:
------------
 -->Most Correlated Unigrams are: express, citi, card
 -->Most Correlated Bigrams are: synchrony bank, american express, credit card

Credit reporting, repair, or other:
------------
 -->Most Correlated Unigrams are: experian, report, equifax
 -->Most Correlated Bigrams are: credit bureaus, credit file, credit report

Debt collection:
------------
 -->Most Correlated Unigrams are: collect, collection, debt
 -->Most Correlated Bigrams are: debt collector, collection agency, collect debt

Money transfer, virtual currency, or money service:
------------
 -->Most Correlated Unigrams are: wire, paypal, coinbase
 -->Most Correlated Bigrams are: xxxx coinbase, coinbase xxxx, coinbase account

Money transfers:
------------
 -->Most Correlated Unigrams are: paypal, gram, moneygram
 -->Most Correlated Bigrams are: contacted paypal, money gram, money soon

Mortgage:
------------
 -->Most Correlated Unigrams are: escrow, modification, mortgage
 -->Most Correlated Bigrams are: mortgage company, mortgage payment, loan modification

Other financial service:
------------
 -->Most Correlated Unigrams are: authorizations, screwed, global
 -->Most Correlated Bigrams are: according usps, rate 11, check services

Payday loan, title loan, or personal loan:
------------
 -->Most Correlated Unigrams are: loan, loaned, payday
 -->Most Correlated Bigrams are: loan lending, payday loans, payday loan

Student loan:
------------
 -->Most Correlated Unigrams are: student, loans, navient
 -->Most Correlated Bigrams are: student loan, loan forgiveness, student loans

Vehicle loan or lease:
------------
 -->Most Correlated Unigrams are: santander, car, vehicle
 -->Most Correlated Bigrams are: consumer usa, gap insurance, santander consumer
Before we can train and test a model, the data needs to be split into train and test subsets. Scikit-learn's train_test_split function is useful for quickly completing this step, and it has parameters you can adjust to suit your data.
X = df['Consumer_complaint'] # complaint documents
y = df['Product'] # Target labels we want to predict (13 different product types)
#split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
from sklearn.linear_model import LogisticRegression
# Re-split, this time on the TF-IDF features and encoded labels (this is the split the model
# is actually trained on); passing df.index keeps track of which complaints end up in each set
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features,
                                                                                 labels,
                                                                                 df.index,
                                                                                 test_size=0.20)
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: {}'.format(accuracy_score(y_test, predictions)))
Accuracy score: 0.7705
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, predictions)
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(conf_mat, annot=True, cmap="Greens", fmt='d',
xticklabels=category_id_df.Product.values,
yticklabels=category_id_df.Product.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title("Confusion Matrix for Logistic Regression\n", size=16);