In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline
# Load the "autoreload" extension
%load_ext autoreload 
# always reload modules marked with "%aimport"
%autoreload 1 

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

# import my method from the source code
%aimport data.read_data
from data.read_data import read_data
In [5]:
train, test = read_data(test=True)
In [6]:
print(train.shape)
train.head()
(60000, 6)
Out[6]:
ID review_content review_title review_stars product Target
0 0 En appelant un acheteur pour demander si l'écr... La Police s'inscrit en acheteur privé sur Pric... 5 2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5... 0
1 1 Alors, là, on a affaire au plus grand Navet ja... Chef D'Oeuvre Absolu en vue... 5 7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb... 1
2 2 Effet garanti sur la terrase. Ils donnent immé... Effet garanti sur la terrase. Ils donnent immé... 3 7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c... 0
3 3 tres bon rapport qualite prix tre pratique en ... bon produit 4 77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88... 1
4 4 Ordinateur de bureau trés bien pour quelqu'un ... Apple Power MAC G4 3 f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b... 1

Number of target classes

In [7]:
fig, axr = plt.subplots(1,2, figsize=(14,5))
sns.countplot(train['Target'], ax=axr[0])
train.groupby(['Target', 'review_stars']).size().unstack('Target').plot(kind='bar', stacked=True, ax=axr[1])
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f44f489ea20>
  • Classes are not imbalanced
  • Every review-star level has both positive and negative samples (checked numerically below)
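
A minimal numeric check of both points (same columns as above, a sketch):

train['Target'].value_counts(normalize=True)          # share of each class
pd.crosstab(train['review_stars'], train['Target'])   # per-star counts by class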

Duplicates

In [8]:
sns.countplot(y=train['product'], order=train['product'].value_counts()[:20].index)
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f260fe550>
In [9]:
"Appear only one {}, More than one time {}".format((train.groupby('product').size() == 1).sum(),
                                                   (train.duplicated('product', keep=False).sum()))
Out[9]:
'Appear only one 40068, More than one time 19932'
In [32]:
train[train['Target'] == 0].groupby('product').size().plot(kind='hist', logy=True, bins=100, figsize=(10,5), alpha=0.5,
                                                          label='0. Mean product reviews : {:.2f}'.format(train[train['Target'] == 0].groupby('product').size().mean()))
train[train['Target'] == 1].groupby('product').size().plot(kind='hist', logy=True, bins=100, figsize=(10,5), alpha=0.5,
                                                          label='1. Mean product reviews : {:.2f}'.format(train[train['Target'] == 1].groupby('product').size().mean()))
plt.legend(title='Target')
plt.xlabel('Product reviews per product')
plt.ylabel('Number of products')
Out[32]:
Text(0,0.5,'Number of products')

Size of posts

In [161]:
fig, axr = plt.subplots(1,2, figsize=(14,5))
sns.distplot(np.log(train[train['Target'] == 0]['review_content'].apply(lambda x: len(x.split()))), label='Negative', ax=axr[0])
sns.distplot(np.log(train[train['Target'] == 1]['review_content'].apply(lambda x: len(x.split()))), label='Positive', ax=axr[0])
axr[0].legend()

sns.distplot(np.log(train[train['Target'] == 0]['review_title'].dropna().apply(lambda x: len(x.split()) if x else np.nan)), label='Negative', ax=axr[1])
sns.distplot(np.log(train[train['Target'] == 1]['review_title'].dropna().apply(lambda x: len(x.split()) if x else np.nan)), label='Positive', ax=axr[1])
axr[1].legend()
Out[161]:
<matplotlib.legend.Legend at 0x7f8f090c07b8>
  • Longer review posts tend to be more positive (quantified below)
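
To put a number on this, a minimal check of the mean content length per class:

train['review_content'].str.split().str.len().groupby(train['Target']).mean()  # mean word count for Target 0 vs 1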

Title words

In [20]:
plt.figure(figsize=(14,4))
plt.title('Uncleaned words from reviews')
title_words = train['review_title'].str.split(expand=True).unstack()
sns.countplot(title_words, order=title_words.value_counts()[:20].index)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f26e73240>
In [35]:
with open('../data/external/fr-stopwords.txt') as fp:
    stopwords = fp.read().splitlines()
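If the external file is ever unavailable, NLTK's built-in French list is a possible fallback (a sketch; its coverage differs from the custom file):

import nltk
nltk.download('stopwords')                     # one-time download
from nltk.corpus import stopwords as nltk_sw
stopwords = nltk_sw.words('french')            # list of French stopwords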
In [34]:
positive = train[train['Target'] == 1]['review_title'].dropna().values
negative = train[train['Target'] == 0]['review_title'].dropna().values
In [50]:
# Word clouds of the most frequent title words, positive vs negative
fig, axr = plt.subplots(1, 2, figsize=(16,13))
wcP = WordCloud(background_color="white", max_words=10000,
                stopwords=stopwords, max_font_size=40)
wcN = WordCloud(background_color="black", max_words=10000,
                stopwords=stopwords, max_font_size=40)
wcP.generate(" ".join(positive))
wcN.generate(" ".join(negative))
axr[0].set_title('Positive')
axr[1].set_title('Negative')
axr[0].imshow(wcP.recolor(colormap='viridis', random_state=17), alpha=0.98)
axr[1].imshow(wcN.recolor(colormap='Pastel1_r', random_state=17), alpha=0.98)
axr[0].axis('off'); axr[1].axis('off')
Out[50]:
(-0.5, 399.5, 199.5, -0.5)

Content words

In [33]:
plt.figure(figsize=(14,4))
plt.title('Uncleaned words from rewiews')
sns.countplot(train['review_content'].str.split(expand=True).unstack(),
             order=train['review_content'].str.split(expand=True).unstack().value_counts()[:20].index)
(Cell interrupted with a KeyboardInterrupt: splitting and unstacking the full
review_content column for a raw word count is far too slow at this scale.
The long internal IPython traceback is omitted.)
In [36]:
positive = train[train['Target'] == 1]['review_content'].dropna().values
negative = train[train['Target'] == 0]['review_content'].dropna().values

# Word clouds of the most frequent content words, positive vs negative
fig, axr = plt.subplots(1, 2, figsize=(16,13))
wcP = WordCloud(background_color="white", max_words=10000,
                stopwords=stopwords, max_font_size=40)
wcN = WordCloud(background_color="black", max_words=10000,
                stopwords=stopwords, max_font_size=40)
wcP.generate(" ".join(positive))
wcN.generate(" ".join(negative))
axr[0].set_title('Positive')
axr[1].set_title('Negative')
axr[0].imshow(wcP.recolor(colormap='viridis', random_state=17), alpha=0.98)
axr[1].imshow(wcN.recolor(colormap='Pastel1_r', random_state=17), alpha=0.98)
axr[0].axis('off'); axr[1].axis('off')
Out[36]:
(-0.5, 399.5, 199.5, -0.5)

Lemma

In [37]:
import nltk
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /home/cris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[37]:
True
In [38]:
# NB: WordNetLemmatizer is based on English WordNet, so most French forms pass through unchanged
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # lemmatize each token emitted by the stock CountVectorizer analyzer
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))
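A quick look at what the overridden analyzer yields (a sketch; since the lemmatizer targets English, most French tokens come back as-is):

analyzer = LemmaCountVectorizer().build_analyzer()
list(analyzer("Très bon produit, je recommande"))  # lowercased word tokens, lemmatized one by one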
In [42]:
def get_lemmas(text):
    # count lemmatized tokens over all documents and return them sorted by frequency
    tf_vectorizer = LemmaCountVectorizer(max_df=0.95,
                                         min_df=2,
                                         stop_words=stopwords,
                                         decode_error='ignore')
    tf = tf_vectorizer.fit_transform(text)
    feature_names = tf_vectorizer.get_feature_names()
    count_vec = np.asarray(tf.sum(axis=0)).ravel()
    zipped = list(zip(feature_names, count_vec))
    x, y = (list(x) for x in zip(*sorted(zipped, key=lambda x: x[1], reverse=True)))
    return x, y
In [52]:
# Storing the entire training text in a list
text_pos = list(train[train['Target'] == 1]['review_content'].dropna().values)
text_neg = list(train[train['Target'] == 0]['review_content'].dropna().values)
In [53]:
x_pos, y_pos = get_lemmas(text_pos)
x_neg, y_neg = get_lemmas(text_neg)
In [60]:
fig, axr = plt.subplots(2, figsize=(15,5))
axr[0].set_title('Cleaned words from reviews')
sns.barplot(x=x_pos[0:20], y=y_pos[0:20], ax=axr[0], label='Positive')
axr[0].legend()
sns.barplot(x=x_neg[0:20], y=y_neg[0:20], ax=axr[1], label='Negative')
axr[1].legend()
Out[60]:
<matplotlib.legend.Legend at 0x7f44db9975c0>

TF-IDF

In [61]:
def get_tfidf(text):
    # fit a TF-IDF vectorizer and return the per-term IDF weights as a DataFrame
    vectorizer = TfidfVectorizer(min_df=10, max_features=10000, ngram_range=(1, 2))
    vz = vectorizer.fit_transform(list(text))
    tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
    tfidf = pd.DataFrame.from_dict(tfidf, orient='index')
    tfidf.columns = ['tfidf']
    return tfidf
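Note that idf_ holds inverse document frequencies rather than full TF-IDF scores, so the plots below surface the rarest surviving terms. With sklearn defaults (smooth_idf=True), idf(t) = ln((1 + n) / (1 + df(t))) + 1; a minimal sketch of that relationship, assuming a fitted vectorizer vec with matrix X = vec.fit_transform(docs):

n = X.shape[0]                          # number of documents
df = (X > 0).sum(axis=0).A1             # document frequency of each term
idf = np.log((1 + n) / (1 + df)) + 1    # reproduces vec.idf_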
In [63]:
tfidf_pos = get_tfidf(text_pos)
tfidf_neg = get_tfidf(text_neg)
In [103]:
fig, axr = plt.subplots(2,figsize=(19,5))
axr[0].set_title('TF-IDF from review content')
df = tfidf_pos.sort_values(by='tfidf', ascending=False).head(15)
sns.barplot(x=df.index, y=df.tfidf, ax=axr[0], label='Positive')
axr[0].set_ylim(8,9.5)
axr[0].legend()
df = tfidf_neg.sort_values(by='tfidf', ascending=False).head(15)
sns.barplot(x=df.index, y=df.tfidf, ax=axr[1], label='Negative')
axr[1].set_ylim(8,9.5)
axr[1].legend()
Out[103]:
<matplotlib.legend.Legend at 0x7f44e4909630>
In [110]:
# titles with stopwords removed (result not stored)
train['review_title'].dropna().str.split().apply(lambda x: [item for item in x if item not in stopwords])

Missing values

In [73]:
fig, axr = plt.subplots(1, 2, figsize=(14,5))
train.isnull().sum().plot(kind='barh', ax=axr[0])
test.isnull().sum().plot(kind='barh', ax=axr[1])
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f26bc9240>