# Tiny toy corpus: three raw reviews (with deliberate typos/noise)
# and their sentiment labels.
X_raw = [
    'That priduct is poor product!!...',
    'I loving this product.',
    'That is brilliant!!@#',
]
y = ['negative', 'positive', 'positive']
# Imports, grouped stdlib / third-party; the original imported
# nltk.corpus.stopwords twice (duplicate removed).
import pickle
import re

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
from textblob import TextBlob

# Ensure the stopword corpus is present before the list is built below.
nltk.download('stopwords')

# Shared preprocessing resources: WordNet lemmatizer and the English
# stopword list (used to delete stopwords during preprocessing).
lema = WordNetLemmatizer()
stop = stopwords.words('english')
# Console output from nltk.download (not code):
# [nltk_data] Downloading package stopwords to /Users/uzaycetin/nltk_data...
# [nltk_data] Package stopwords is already up-to-date!
def preprocessing(text):
    """Normalize one raw review.

    Lowercases, replaces non-word characters with spaces, collapses
    whitespace, then spell-corrects (TextBlob), lemmatizes (WordNet),
    and drops English stopwords. Returns the cleaned tokens joined by
    single spaces.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)   # non-alphanumeric -> space
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace
    cleaned = []
    for token in text.split():
        if token in stop:
            continue  # skip stopwords
        corrected = str(TextBlob(token).correct())
        cleaned.append(lema.lemmatize(corrected))
    return " ".join(cleaned)
# Clean every raw review through the preprocessing pipeline.
X = list(map(preprocessing, X_raw))
X  # notebook echo
# Out: ['product poor product', 'loving product', 'brilliant']
# Corpus vocabulary: every distinct token across the cleaned documents.
words = {token for doc in X for token in doc.split()}
words  # notebook echo
# Out: {'brilliant', 'loving', 'poor', 'product'}
def term_frequency(d, vocab=None):
    """Raw term-frequency vector of document *d* over a fixed vocabulary.

    Parameters
    ----------
    d : str
        Whitespace-tokenized document.
    vocab : iterable of str, optional
        Vocabulary to count over. Defaults to the module-level ``words``
        set (kept for backward compatibility with existing callers).

    Returns
    -------
    pd.Series
        One integer count per vocabulary word; 0 for absent words.
        Out-of-vocabulary tokens in *d* are ignored.
    """
    if vocab is None:
        vocab = words  # fall back to the global corpus vocabulary
    tf = {w: 0 for w in vocab}
    for w in d.split():
        if w in tf:  # skip out-of-vocabulary tokens
            tf[w] += 1
    return pd.Series(tf)
# Assemble the term-frequency matrix: one row per document,
# one column per vocabulary word.
tf = pd.DataFrame(columns=words)
for row, doc in enumerate(X):
    tf.loc[row] = term_frequency(doc)
tf  # notebook echo
# Out (tf table):
#     poor  product  loving  brilliant
# 0      1        2       0          0
# 1      0        1       1          0
# 2      0        0       0          1
# Bag-of-words counts via sklearn, to cross-check the manual tf table.
# (Interleaved notebook outputs below were raw `array(...)` residue that
# raised NameError when the file ran as a script; now comments.)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
cX = vectorizer.fit_transform(X).toarray()
cX
# Out: array([[0, 0, 1, 2], [0, 1, 0, 1], [1, 0, 0, 0]], dtype=int64)
vectorizer.vocabulary_
# Out: {'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}
# Number of documents in the corpus (used by the idf computation below).
# The bare literal echoes from the notebook were dead no-op statements;
# they are kept as comments instead.
N = len(X)
N
# Out: 3
words
# Out: {'brilliant', 'loving', 'poor', 'product'}
def inverse_document_freq(w, X):
    """Smoothed inverse document frequency of word *w* over corpus *X*.

    Uses the sklearn-style smooth idf: ``1 + ln((N + 1) / (df + 1))``,
    where N is the number of documents and df is the number of documents
    whose whitespace tokens contain *w*.
    """
    doc_count = sum(1 for doc in X if w in doc.split())
    return 1 + np.log((len(X) + 1) / (doc_count + 1))
# Compute the idf score for every vocabulary word...
idfs = [(term, inverse_document_freq(term, X)) for term in words]
idfs  # notebook echo
# Out: [('poor', 1.6931471805599454), ('product', 1.2876820724517808),
#       ('loving', 1.6931471805599454), ('brilliant', 1.6931471805599454)]

# ...then scale each term-frequency column by its idf to obtain tf-idf.
for term, idf in idfs:
    tf[term] *= idf
tf  # notebook echo
# Out (tf-idf table):
#        poor  product   loving  brilliant
# 0   1.69315  2.57536  0.00000    0.00000
# 1   0.00000  1.28768  1.69315    0.00000
# 2   0.00000  0.00000  0.00000    1.69315
# L2-normalize each document row so every tf-idf row has unit length.
# (The `array(...)` output residue below raised NameError as a script;
# it is preserved as a comment.)
from sklearn.preprocessing import normalize
normalize(tf)
# Out: array([[0.54935123, 0.83559154, 0.        , 0.        ],
#             [0.        , 0.60534851, 0.79596054, 0.        ],
#             [0.        , 0.        , 0.        , 1.        ]])
# Tf-idf via sklearn with norm=None — should reproduce the manual
# tf * idf table exactly (smooth_idf matches 1 + ln((N+1)/(df+1))).
# (The `array(...)` output residue below raised NameError as a script;
# it is preserved as a comment.)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(norm=None, smooth_idf=True)
tX = vectorizer.fit_transform(X).toarray()
tX
# Out: array([[0.        , 0.        , 1.69314718, 2.57536414],
#             [0.        , 1.69314718, 0.        , 1.28768207],
#             [1.69314718, 0.        , 0.        , 0.        ]])
vectorizer.vocabulary_
# Out: {'brilliant': 0, 'loving': 1, 'poor': 2, 'product': 3}
# Tf-idf via sklearn with default settings (L2 norm) — should match the
# manual tf-idf table after sklearn.preprocessing.normalize.
# (The `array(...)` output residue below raised NameError as a script;
# it is preserved as a comment.)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tX = vectorizer.fit_transform(X).toarray()
tX
# Out: array([[0.        , 0.        , 0.54935123, 0.83559154],
#             [0.        , 0.79596054, 0.        , 0.60534851],
#             [1.        , 0.        , 0.        , 0.        ]])