# Tiny toy corpus: three raw reviews (with deliberate typos/noise)
# and their sentiment labels.
X_raw = [
    'That priduct is poor product!!...',
    'I loving this product.',
    'That is brilliant!!@#',
]
y = ['negative', 'positive', 'positive']
# Imports, grouped stdlib / third-party; the original imported
# nltk.corpus.stopwords twice (duplicate removed).
import pickle
import re

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
from textblob import TextBlob

# Ensure the stopword corpus is present before the list is built below.
nltk.download('stopwords')

# Shared preprocessing resources: WordNet lemmatizer and the English
# stopword list (used to delete stopwords during preprocessing).
lema = WordNetLemmatizer()
stop = stopwords.words('english')
# Console output from nltk.download (not code):
# [nltk_data] Downloading package stopwords to /Users/uzaycetin/nltk_data...
# [nltk_data] Package stopwords is already up-to-date!
def preprocessing(text):
    """Normalize one raw review.

    Lowercases, replaces non-word characters with spaces, collapses
    whitespace, then spell-corrects (TextBlob), lemmatizes (WordNet),
    and drops English stopwords. Returns the cleaned tokens joined by
    single spaces.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)   # non-alphanumeric -> space
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    cleaned = []
    for token in text.split():
        if token in stop:
            continue  # skip stopwords
        corrected = str(TextBlob(token).correct())
        cleaned.append(lema.lemmatize(corrected))
    return " ".join(cleaned)
# Clean every raw review through the preprocessing pipeline.
X = list(map(preprocessing, X_raw))
X  # notebook echo
# Out: ['product poor product', 'loving product', 'brilliant']
# Corpus vocabulary: every distinct token across the cleaned documents.
words = {token for doc in X for token in doc.split()}
words  # notebook echo
# Out: {'brilliant', 'loving', 'poor', 'product'}
def term_frequency(d, vocab=None):
    """Raw term-frequency vector of document *d* over a fixed vocabulary.

    Parameters
    ----------
    d : str
        Whitespace-tokenized document.
    vocab : iterable of str, optional
        Vocabulary to count over. Defaults to the module-level ``words``
        set (kept for backward compatibility with existing callers).

    Returns
    -------
    pd.Series
        One integer count per vocabulary word; 0 for absent words.
        Out-of-vocabulary tokens in *d* are ignored.
    """
    if vocab is None:
        vocab = words  # fall back to the global corpus vocabulary
    tf = {w: 0 for w in vocab}
    for w in d.split():
        if w in tf:  # skip out-of-vocabulary tokens
            tf[w] += 1
    return pd.Series(tf)
# Assemble the term-frequency matrix: one row per document,
# one column per vocabulary word.
tf = pd.DataFrame(columns=words)
for row, doc in enumerate(X):
    tf.loc[row] = term_frequency(doc)
tf  # notebook echo
# Out (tf table):
#     poor  product  loving  brilliant
# 0      1        2       0          0
# 1      0        1       1          0
# 2      0        0       0          1
# Bag-of-words counts via sklearn, to cross-check the manual tf table.
# (Interleaved notebook outputs below were raw `array(...)` residue that
# raised NameError when the file ran as a script; now comments.)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
cX = vectorizer.fit_transform(X).toarray()
cX
# Out: array([[0, 0, 1, 2], [0, 1, 0, 1], [1, 0, 0, 0]], dtype=int64)
vectorizer.vocabulary_
# Out: {'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}
# Number of documents in the corpus (used by the idf computation below).
# The bare literal echoes from the notebook were dead no-op statements;
# they are kept as comments instead.
N = len(X)
N
# Out: 3
words
# Out: {'brilliant', 'loving', 'poor', 'product'}
def inverse_document_freq(w, X):
    """Smoothed inverse document frequency of word *w* over corpus *X*.

    Uses the sklearn-style smooth idf: ``1 + ln((N + 1) / (df + 1))``,
    where N is the number of documents and df is the number of documents
    whose whitespace tokens contain *w*.
    """
    doc_count = sum(1 for doc in X if w in doc.split())
    return 1 + np.log((len(X) + 1) / (doc_count + 1))
# Compute the idf score for every vocabulary word...
idfs = [(term, inverse_document_freq(term, X)) for term in words]
idfs  # notebook echo
# Out: [('poor', 1.6931471805599454), ('product', 1.2876820724517808),
#       ('loving', 1.6931471805599454), ('brilliant', 1.6931471805599454)]

# ...then scale each term-frequency column by its idf to obtain tf-idf.
for term, idf in idfs:
    tf[term] *= idf
tf  # notebook echo
# Out (tf-idf table):
#        poor  product   loving  brilliant
# 0   1.69315  2.57536  0.00000    0.00000
# 1   0.00000  1.28768  1.69315    0.00000
# 2   0.00000  0.00000  0.00000    1.69315
# L2-normalize each document row so every tf-idf row has unit length.
# (The `array(...)` output residue below raised NameError as a script;
# it is preserved as a comment.)
from sklearn.preprocessing import normalize
normalize(tf)
# Out: array([[0.54935123, 0.83559154, 0.        , 0.        ],
#             [0.        , 0.60534851, 0.79596054, 0.        ],
#             [0.        , 0.        , 0.        , 1.        ]])
# Tf-idf via sklearn with norm=None — should reproduce the manual
# tf * idf table exactly (smooth_idf matches 1 + ln((N+1)/(df+1))).
# (The `array(...)` output residue below raised NameError as a script;
# it is preserved as a comment.)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(norm=None, smooth_idf=True)
tX = vectorizer.fit_transform(X).toarray()
tX
# Out: array([[0.        , 0.        , 1.69314718, 2.57536414],
#             [0.        , 1.69314718, 0.        , 1.28768207],
#             [1.69314718, 0.        , 0.        , 0.        ]])
vectorizer.vocabulary_
# Out: {'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}
# Tf-idf via sklearn with default settings (L2 norm) — should match the
# manual tf-idf table after sklearn.preprocessing.normalize.
# (The `array(...)` output residue below raised NameError as a script;
# it is preserved as a comment.)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tX = vectorizer.fit_transform(X).toarray()
tX
# Out: array([[0.        , 0.        , 0.54935123, 0.83559154],
#             [0.        , 0.79596054, 0.        , 0.60534851],
#             [1.        , 0.        , 0.        , 0.        ]])