Install the nltk module from the Anaconda command line with conda install nltk (or with pip install nltk if you are not using Anaconda)
# import module
import nltk
# nltk comes with a bunch of datasets
# now we're going to download the stopwords package
# nltk.download_shell()
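If you only need the stopwords corpus, nltk.download('stopwords') fetches it non-interactively; this is a standard NLTK call, shown here as an alternative to the download shell.
# non-interactive alternative: fetch just the stopwords package
nltk.download('stopwords')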
Import the SMS Spam Collection dataset with the pandas module and assign it to the messages variable
import pandas as pd
messages = pd.read_csv('../SMSSpamCollection', sep='\t', names=['label', 'message'])
messages.head()
|   | label | message |
|---|-------|---------|
| 0 | ham   | Go until jurong point, crazy.. Available only ... |
| 1 | ham   | Ok lar... Joking wif u oni... |
| 2 | spam  | Free entry in 2 a wkly comp to win FA Cup fina... |
| 3 | ham   | U dun say so early hor... U c already then say... |
| 4 | ham   | Nah I don't think he goes to usf, he lives aro... |
Checking statistical information about the dataset
messages.describe()
|        | label | message |
|--------|-------|---------|
| count  | 5572  | 5572 |
| unique | 2     | 5169 |
| top    | ham   | Sorry, I'll call later |
| freq   | 4825  | 30 |
messages.groupby('label').describe()
| label | count | unique | top | freq |
|-------|-------|--------|-----|------|
| ham   | 4825  | 4516   | Sorry, I'll call later | 30 |
| spam  | 747   | 653    | Please call our customer service representativ... | 4 |
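Note the class imbalance: 4825 of the 5572 messages are ham, so a classifier that always predicts ham would already be about 87% accurate. A quick way to make that baseline explicit (a small sketch using pandas):
# proportion of each label; normalize=True returns fractions instead of counts
messages['label'].value_counts(normalize=True)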
To improve accuracy we need to explore the data and engineer additional features; in this case we add a message-length column to the dataset.
Add a new column, length, holding the length of each message
messages['length'] = messages['message'].apply(len)
messages.head()
|   | label | message | length |
|---|-------|---------|--------|
| 0 | ham   | Go until jurong point, crazy.. Available only ... | 111 |
| 1 | ham   | Ok lar... Joking wif u oni... | 29 |
| 2 | spam  | Free entry in 2 a wkly comp to win FA Cup fina... | 155 |
| 3 | ham   | U dun say so early hor... U c already then say... | 49 |
| 4 | ham   | Nah I don't think he goes to usf, he lives aro... | 61 |
Visualising the length column
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Checking the distribution of message lengths
messages['length'].plot.hist(bins=150)
[Output: histogram of message lengths over 150 bins]
Spotting the outlier
messages['length'].describe()
count    5572.000000
mean       80.489950
std        59.942907
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: length, dtype: float64
messages[messages['length'] == 910]
|      | label | message | length |
|------|-------|---------|--------|
| 1085 | ham   | For me the love should start with attraction.i... | 910 |
messages[messages['length'] == 910]['message']
1085    For me the love should start with attraction.i...
Name: message, dtype: object
messages[messages['length'] == 910]['message'].iloc[0]
"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."
Visualising the distribution of each label
messages.hist(column='length', by='label', bins=60, figsize=(12, 4))
[Output: side-by-side histograms of message length for ham and spam]
As the histograms show, spam messages commonly have longer text than ham messages.
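To back up the visual impression with numbers, we can compare the average length per label (a quick sketch using the length column added above):
# mean message length per label; spam should come out noticeably higher
messages.groupby('label')['length'].mean()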
Removing punctuation
import string
mess = 'Sample message! Notice: it has punctuation.'
Exploring the string module
string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
nopunc = [c for c in mess if c not in string.punctuation]
# just a demonstration of removing punctuation from a message
# nopunc
from nltk.corpus import stopwords
# just a demonstration of the stopwords list
# stopwords.words('english')
# join the nopunc list elements back into one string, now with no punctuation in it
nopunc = ''.join(nopunc)
nopunc
'Sample message Notice it has punctuation'
Removing stopwords
nopunc.split()
['Sample', 'message', 'Notice', 'it', 'has', 'punctuation']
clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
clean_mess
['Sample', 'message', 'Notice', 'punctuation']
Create a function to remove punctuation and stopwords from a message
def text_process(mess):
    """
    1. Remove punctuation
    2. Remove stopwords
    3. Return a list of clean text words
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
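One performance note: stopwords.words('english') returns a list, so each membership test above is a linear scan, which adds up over thousands of messages. A minimal variant that caches the stopwords in a set for O(1) lookups (STOPWORDS and text_process_fast are names introduced here for illustration):
# cache the stopword list once as a set for fast membership tests
STOPWORDS = set(stopwords.words('english'))

def text_process_fast(mess):
    """Same three steps as text_process, using the cached stopword set."""
    nopunc = ''.join(char for char in mess if char not in string.punctuation)
    return [word for word in nopunc.split() if word.lower() not in STOPWORDS]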
Removing punctuation and stopwords from the dataset
messages.head()
|   | label | message | length |
|---|-------|---------|--------|
| 0 | ham   | Go until jurong point, crazy.. Available only ... | 111 |
| 1 | ham   | Ok lar... Joking wif u oni... | 29 |
| 2 | spam  | Free entry in 2 a wkly comp to win FA Cup fina... | 155 |
| 3 | ham   | U dun say so early hor... U c already then say... | 49 |
| 4 | ham   | Nah I don't think he goes to usf, he lives aro... | 61 |
messages['message'].head().apply(text_process)
0    [Go, jurong, point, crazy, Available, bugis, n...
1    [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3    [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])
Printing the size of the vocabulary
print(len(bow_transformer.vocabulary_))
11425
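The fitted vectorizer also exposes vocabulary_, a dict mapping each token to its column index, so you can go from a word to its index directly (using a token we already know appears in the corpus):
# column index assigned to the token 'say'
bow_transformer.vocabulary_['say']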
Get one message as an example
mess4 = messages['message'][3]
print(mess4)
U dun say so early hor... U c already then say...
bow4 = bow_transformer.transform([mess4])
print(bow4)
  (0, 4068)    2
  (0, 4629)    1
  (0, 5261)    1
  (0, 6204)    1
  (0, 6222)    1
  (0, 7186)    1
  (0, 9554)    2
print(bow4.shape)
(1, 11425)
Get feature names by index (newer scikit-learn versions replace get_feature_names() with get_feature_names_out())
bow_transformer.get_feature_names()[4068]
'U'
bow_transformer.get_feature_names()[9554]
'say'
messages_bow = bow_transformer.transform(messages['message'])
print('Shape of Sparse Matrix: ', messages_bow.shape)
Shape of Sparse Matrix: (5572, 11425)
messages_bow.nnz
50548
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(sparsity))
sparsity: 0.07940295412668218
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)
  (0, 9554)    0.5385626262927564
  (0, 7186)    0.4389365653379857
  (0, 6222)    0.3187216892949149
  (0, 6204)    0.29953799723697416
  (0, 5261)    0.29729957405868723
  (0, 4629)    0.26619801906087187
  (0, 4068)    0.40832589933384067
messages_tfidf = tfidf_transformer.transform(messages_bow)
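You can also inspect the IDF weight the transformer learned for an individual word through its idf_ array, indexed by the vectorizer's vocabulary (a quick sketch reusing the token 'say' from the example above):
# learned inverse document frequency of the word 'say'
tfidf_transformer.idf_[bow_transformer.vocabulary_['say']]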
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])
spam_detect_model.predict(tfidf4)[0]
'ham'
Checking whether the prediction is accurate
messages['label'][3]
'ham'
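A single message is not a real evaluation, and predicting on the same data the model was trained on is optimistic in any case. As a rough sanity check before the proper train/test split below (a sketch; the exact score will vary):
# predict on all (training) messages; this overestimates real-world performance
from sklearn.metrics import accuracy_score
all_predictions = spam_detect_model.predict(messages_tfidf)
print(accuracy_score(messages['label'], all_predictions))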
from sklearn.model_selection import train_test_split
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.3)
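A quick check that the 70/30 split sizes add up (30% of 5572 messages leaves 1672 for testing, matching the support total in the classification report below):
# verify the train/test split sizes
print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))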
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
('bow', CountVectorizer(analyzer=text_process)),
('tfidf', TfidfTransformer()),
('classifier', MultinomialNB())
])
pipeline.fit(msg_train, label_train)
Pipeline(memory=None, steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x00000220B0439A60>, binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
predictions = pipeline.predict(msg_test)
Classification report
from sklearn.metrics import classification_report
print(classification_report(label_test, predictions))
             precision    recall  f1-score   support

        ham       0.95      1.00      0.97      1425
       spam       1.00      0.68      0.81       247

avg / total       0.95      0.95      0.95      1672
The overall accuracy is 95%. Note, however, that spam recall is only 0.68, meaning roughly a third of the spam messages in the test set were misclassified as ham.
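Given the class imbalance, that spam recall matters more than the headline accuracy; a confusion matrix makes the number of missed spam messages explicit (a minimal sketch using scikit-learn's confusion_matrix):
from sklearn.metrics import confusion_matrix
# rows are true labels (ham, spam), columns are predicted labels
print(confusion_matrix(label_test, predictions))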