import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
import plotly.graph_objs as go
import plotly.plotly as py
import cufflinks
pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
from collections import Counter
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm
nlp = spacy.load('en_core_web_md')
df = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df.head()
Unnamed: 0 | Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comf... | 4 | 1 | 0 | Initmates | Intimate | Intimates |
1 | 1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happene... | 5 | 1 | 4 | General | Dresses | Dresses |
2 | 2 | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0 | General | Dresses | Dresses |
3 | 3 | 1049 | 50 | My favorite buy! | I love, love, love this jumpsuit. it's fun, fl... | 5 | 1 | 0 | General Petite | Bottoms | Pants |
4 | 4 | 847 | 47 | Flattering shirt | This shirt is very flattering to all due to th... | 5 | 1 | 6 | General | Tops | Blouses |
We want to do as much as we can with this data set, which contains a free-text review feature.
# Drop the CSV's redundant positional index column, then preview the first
# few reviews together with their rating and recommendation flag.
df.drop(columns=['Unnamed: 0'], inplace=True)
for idx in range(4):
    print(df['Review Text'][idx])
    print(df['Rating'][idx])
    print(df['Recommended IND'][idx])
    print()
Absolutely wonderful - silky and sexy and comfortable 4 1 Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8". i love the length on me- hits just a little below the knee. would definitely be a true midi on someone who is truly petite. 5 1 I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c 3 0 I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments! 5 1
# The Title column is unused downstream; keep only rows with review text.
df.drop(columns=['Title'], inplace=True)
df = df.dropna(subset=['Review Text'])
def preprocess(ReviewText):
    """
    Strip HTML remnants and normalise whitespace in a Series of review text.

    Removes ``<br/>`` tags, whole ``<a>...</a>`` anchors, stray ``&``, ``>``
    and ``<`` characters, and replaces non-breaking spaces with regular
    spaces.  Returns the cleaned Series.

    Fix: pandas changed the default of ``Series.str.replace(..., regex=...)``
    to False, which would turn the anchor pattern below into a literal match
    that never fires; the regex/literal intent is now explicit on every call.
    """
    ReviewText = ReviewText.str.replace("<br/>", "", regex=False)
    # Greedy regex removes an entire <a ...>...</a> anchor in one pass.
    ReviewText = ReviewText.str.replace(r"(<a).*(>).*(</a>)", "", regex=True)
    ReviewText = ReviewText.str.replace("&", "", regex=False)
    ReviewText = ReviewText.str.replace(">", "", regex=False)
    ReviewText = ReviewText.str.replace("<", "", regex=False)
    # \xa0 is a non-breaking space left over from HTML.
    ReviewText = ReviewText.str.replace("\xa0", " ", regex=False)
    return ReviewText
df['Review Text'] = preprocess(df['Review Text'])
New column for sentiment polarity. Two new columns for lengths of the review and word count.
# Derived features: TextBlob sentiment polarity, review length in characters,
# and whitespace-delimited word count.
df['polarity'] = df['Review Text'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['review_len'] = df['Review Text'].astype(str).map(len)
df['word_count'] = df['Review Text'].map(lambda x: len(str(x).split()))
# Show a handful of maximally positive (polarity == 1) reviews.
print('5 random reviews with the highest positive sentiment polarity: \n')
sampled = df.loc[df.polarity == 1, ['Review Text']].sample(5).values
for row in sampled:
    print(row[0])
5 random reviews with the highest positive sentiment polarity: Great style! Absolutely gorgeous! words can't describe the beauty of this dress! i hope they will restock it cause i would order size s (didn't know how it runs). Beautiful blend of colors and the texture of the shirt is so soft...perfect fall weather garment! Perfect way to add warmth in transition weather This is the perfect holiday dress. my best purchase of the season!
# Show a handful of perfectly neutral (polarity == 0) reviews.
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
sampled = df.loc[df.polarity == 0, ['Review Text']].sample(5).values
for row in sampled:
    print(row[0])
5 random reviews with the most neutral sentiment(zero) polarity: Just received pants and even though they are labeled "petite" they are not. the inseam is 28 for a waist size 26, which is not what the description states. My daughter loves these, she wears them around the house all day when she doesn't have to go out. I agree that this shirt is boxy and has no shape; it doesn't look at all like it does on the model. i returned it. I don't like how short it is in the back. i also prefer a solid vs. slubby material, back it goes. One wash wonder...i'm a s to m and went with a m but could have gone with a s with ease. after washing it was way shorter. it shrunk more than a whole size in cold water! i assumed everything came preshrunk but sadly disappointed. great for one wear.
df.polarity.min()
-0.97500000000000009
df.loc[df.polarity == -0.97500000000000009]
Clothing ID | Age | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | polarity | review_len | word_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
14754 | 868 | 52 | Cut out design, no seems or hems.\r\n very dis... | 1 | 0 | 0 | General | Tops | Knits | -0.975 | 65 | 11 |
20502 | 870 | 61 | Received this product with a gaping hole in it... | 1 | 0 | 1 | General | Tops | Knits | -0.975 | 121 | 21 |
# Show the two most negative reviews (the minimum polarity found above).
print('2 reviews with the most negative polarity: \n')
sampled = df.loc[df.polarity == -0.97500000000000009, ['Review Text']].sample(2).values
for row in sampled:
    print(row[0])
2 reviews with the most negative polarity: Cut out design, no seems or hems. very disappointed in retailer Received this product with a gaping hole in it. very disappointed in the quality and the quality control at the warehouse
# Interactive histogram of the TextBlob polarity scores (cufflinks/plotly).
# Polarity ranges from -1 (most negative) to +1 (most positive).
df['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')
The vast majority of the polarity scores are greater than 0, meaning most of the reviews are positive.
# Histogram of the 1-5 star ratings; compare against the polarity histogram.
df['Rating'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')
The ratings are in line with the polarity scores; that is, most of the ratings are in the 4 or 5 range.
# df['Age'].iplot(
# kind='hist',
# bins=50,
# xTitle='age',
# linecolor='black',
# yTitle='count',
# title='Reviewers Age Distribution')
Most reviewers are in their 30s to 40s.
# Histogram of the character length of each review.
df['review_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Text Length Distribution')
# df['word_count'].iplot(
# kind='hist',
# bins=100,
# xTitle='word count',
# linecolor='black',
# yTitle='count',
# title='Review Text Word Count Distribution')
Quite a number of people like to leave long reviews.
# df.groupby('Division Name').count()['Clothing ID'].iplot(kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
# title='Bar chart of Division Name', xTitle='Division Name')
# df.groupby('Department Name').count()['Clothing ID'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
# title='Bar chart of Department Name', xTitle='Department Name')
# df.groupby('Class Name').count()['Clothing ID'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count', linecolor='black', opacity=0.8,
# title='Bar chart of Class Name', xTitle='Class Name')
# trace1 = go.Scatter(
# x=df['Age'], y=df['Rating'], mode='markers', name='points',
# marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
# )
# trace2 = go.Histogram2dContour(
# x=df['Age'], y=df['Rating'], name='density', ncontours=20,
# colorscale='Hot', reversescale=True, showscale=False
# )
# trace3 = go.Histogram(
# x=df['Age'], name='Age density',
# marker=dict(color='rgb(102,0,0)'),
# yaxis='y2'
# )
# trace4 = go.Histogram(
# y=df['Rating'], name='Rating density', marker=dict(color='rgb(102,0,0)'),
# xaxis='x2'
# )
# data = [trace1, trace2, trace3, trace4]
# layout = go.Layout(
# showlegend=False,
# autosize=False,
# width=600,
# height=550,
# xaxis=dict(
# domain=[0, 0.85],
# showgrid=False,
# zeroline=False
# ),
# yaxis=dict(
# domain=[0, 0.85],
# showgrid=False,
# zeroline=False
# ),
# margin=dict(
# t=50
# ),
# hovermode='closest',
# bargap=0,
# xaxis2=dict(
# domain=[0.85, 1],
# showgrid=False,
# zeroline=False
# ),
# yaxis2=dict(
# domain=[0.85, 1],
# showgrid=False,
# zeroline=False
# )
# )
# fig = go.Figure(data=data, layout=layout)
# iplot(fig, filename='2dhistogram-2d-density-plot-subplots')
People in their 30s are likely to give high ratings.
# trace1 = go.Scatter(
# x=df['Age'], y=df['polarity'], mode='markers', name='points',
# marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
# )
# trace2 = go.Histogram2dContour(
# x=df['Age'], y=df['polarity'], name='density', ncontours=20,
# colorscale='Hot', reversescale=True, showscale=False
# )
# trace3 = go.Histogram(
# x=df['Age'], name='Age density',
# marker=dict(color='rgb(102,0,0)'),
# yaxis='y2'
# )
# trace4 = go.Histogram(
# y=df['polarity'], name='Sentiment Polarity density', marker=dict(color='rgb(102,0,0)'),
# xaxis='x2'
# )
# data = [trace1, trace2, trace3, trace4]
# layout = go.Layout(
# showlegend=False,
# autosize=False,
# width=600,
# height=550,
# xaxis=dict(
# domain=[0, 0.85],
# showgrid=False,
# zeroline=False
# ),
# yaxis=dict(
# domain=[0, 0.85],
# showgrid=False,
# zeroline=False
# ),
# margin=dict(
# t=50
# ),
# hovermode='closest',
# bargap=0,
# xaxis2=dict(
# domain=[0.85, 1],
# showgrid=False,
# zeroline=False
# ),
# yaxis2=dict(
# domain=[0.85, 1],
# showgrid=False,
# zeroline=False
# )
# )
# fig = go.Figure(data=data, layout=layout)
# iplot(fig, filename='2dhistogram-2d-density-plot-subplots')
Few people are very positive or very negative; people who give neutral-to-positive reviews are more likely to be in their 30s. Probably people at this age are more likely to be active reviewers.
# df[['Rating', 'Age']].iplot(secondary_y='Age', secondary_y_title='Age',
# kind='box', yTitle='Rating', title='Box Plot of Age and Rating')
# Joint view of sentiment polarity vs. rating: a scatter plus a 2-D density
# contour in the main axes, with marginal histograms on the top/right strips.
trace1 = go.Scatter(
    x=df['polarity'], y=df['Rating'], mode='markers', name='points',
    marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
)
trace2 = go.Histogram2dContour(
    x=df['polarity'], y=df['Rating'], name='density', ncontours=20,
    colorscale='Hot', reversescale=True, showscale=False
)
trace3 = go.Histogram(
    x=df['polarity'], name='Sentiment polarity density',
    marker=dict(color='rgb(102,0,0)'),
    yaxis='y2'  # marginal histogram rendered on the secondary y-axis
)
trace4 = go.Histogram(
    y=df['Rating'], name='Rating density', marker=dict(color='rgb(102,0,0)'),
    xaxis='x2'  # marginal histogram rendered on the secondary x-axis
)
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    # Main axes occupy the lower-left 85% of the figure...
    xaxis=dict(
        domain=[0, 0.85],
        showgrid=False,
        zeroline=False
    ),
    yaxis=dict(
        domain=[0, 0.85],
        showgrid=False,
        zeroline=False
    ),
    margin=dict(
        t=50
    ),
    hovermode='closest',
    bargap=0,
    # ...and the marginal histograms use the remaining 15% strips.
    xaxis2=dict(
        domain=[0.85, 1],
        showgrid=False,
        zeroline=False
    ),
    yaxis2=dict(
        domain=[0.85, 1],
        showgrid=False,
        zeroline=False
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='2dhistogram-2d-density-plot-subplots')
# Overlaid polarity histograms for recommended vs. not-recommended reviews.
recommended_polarity = df.loc[df['Recommended IND'] == 1, 'polarity']
not_recommended_polarity = df.loc[df['Recommended IND'] == 0, 'polarity']
data = [
    go.Histogram(x=not_recommended_polarity, name='Not recommended', opacity=0.75),
    go.Histogram(x=recommended_polarity, name='Recommended', opacity=0.75),
]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity of reviews based on Recommendation')
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='overlaid histogram')
It is obvious that reviews with a higher polarity score are more likely to be recommended.
Apparently, the polarity score for recommended reviews is higher than the polarity score for not recommended reviews.
# Overlaid RATING histograms for recommended vs. not-recommended reviews.
x1 = df.loc[df['Recommended IND'] == 1, 'Rating']
x0 = df.loc[df['Recommended IND'] == 0, 'Rating']
trace1 = go.Histogram(
    x=x0, name='Not recommended',
    opacity=0.75
)
trace2 = go.Histogram(
    x=x1, name='Recommended',
    opacity=0.75
)
data = [trace1, trace2]
# Bug fix: this chart plots Rating, but the title said "Sentiment polarity"
# (copy-pasted from the previous cell).
layout = go.Layout(barmode='overlay', title='Distribution of Rating of reviews based on Recommendation')
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='overlaid histogram')
Recommended reviews have higher ratings than otherwise.
def get_top_n_words(corpus, n=None, ngram_range=(1, 1), stop_words=None):
    """
    Return the ``n`` most frequent terms in ``corpus`` as (term, count)
    pairs, sorted by descending count.

    Generalized (backward-compatibly) with ``ngram_range`` and
    ``stop_words``, which are passed straight to CountVectorizer, so this
    one helper also covers the bigram/trigram and stop-word variants that
    are otherwise copy-pasted below.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # 1 x vocab matrix of term totals
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
# Top 20 most frequent words in the raw reviews (stop words still included).
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
the 76166 it 49284 and 49009 is 30641 this 25762 to 24583 in 20723 but 16556 on 15328 for 14000 of 13429 with 12803 was 12221 so 12023 my 11028 dress 10567 that 10014 not 9799 love 8951 size 8772
# df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
# kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in review before removing stop words')
def get_top_n_words(corpus, n=None):
    """Return the top-n unigrams (English stop words removed) as
    (term, count) pairs, most frequent first."""
    vec = CountVectorizer(stop_words='english').fit(corpus)
    totals = vec.transform(corpus).sum(axis=0)
    freqs = [(term, totals[0, col]) for term, col in vec.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
# Top 20 words after removing English stop words.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
dress 10567 love 8951 size 8772 fit 7325 like 7149 wear 6439 great 6117 just 5608 fabric 4798 small 4729 color 4605 look 4039 really 3925 ordered 3850 little 3775 perfect 3774 flattering 3519 soft 3343 comfortable 3060 cute 3042
# df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
# kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in review after removing stop words')
def get_top_n_bigram(corpus, n=None):
    """Return the top-n bigrams (stop words kept) as (term, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    counts = vectorizer.transform(corpus).sum(axis=0)
    pairs = ((term, counts[0, col]) for term, col in vectorizer.vocabulary_.items())
    return sorted(pairs, key=lambda p: p[1], reverse=True)[:n]
# Top 20 bigrams (stop words included).
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
in the 7169 it is 6169 and the 5614 this dress 4758 on the 4338 of the 3932 and it 3725 it was 3100 this top 2939 this is 2750 the fabric 2655 love the 2641 love this 2333 is very 2321 to wear 2267 but it 2196 the dress 2180 with the 2057 for me 2026 but the 1974
# Bar chart of the top-20 bigram counts (stop words included).
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in review before removing stop words')
def get_top_n_bigram(corpus, n=None):
    """Top-n bigrams with English stop words removed, as (term, count)
    pairs sorted by descending count."""
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    term_totals = vec.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((word, term_totals[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 bigrams after removing English stop words.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
true size 1347 love dress 766 usually wear 691 looks great 620 fit perfectly 609 size small 540 fits perfectly 489 usual size 479 look like 425 just right 423 ordered size 375 love love 374 looks like 373 runs large 367 highly recommend 365 super cute 363 wear size 361 fabric soft 350 feel like 349 fit great 348
# Bar chart of the top-20 bigram counts (stop words removed).
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in review after removing stop words')
def get_top_n_trigram(corpus, n=None):
    """Return the top-n trigrams (stop words kept) as (term, count) pairs."""
    model = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    totals = model.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, totals[0, col]) for term, col in model.vocabulary_.items()],
        key=lambda entry: entry[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 trigrams (stop words included).
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
true to size 1316 the fabric is 1291 this dress is 1115 the material is 886 in the store 728 on the model 725 the fit is 693 this top is 667 the color is 625 love this dress 605 tried it on 574 it is very 572 and it is 569 and it was 537 in the back 528 up or down 518 the dress is 511 to wear it 490 love this top 480 the colors are 472
# Bar chart of the top-20 trigram counts (stop words included).
df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in review before removing stop words')
def get_top_n_trigram(corpus, n=None):
    """Top-n trigrams with English stop words removed, as (term, count)
    pairs, most frequent first."""
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    counts = vec.transform(corpus).sum(axis=0)
    by_count = lambda pair: pair[1]
    scored = [(word, counts[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(scored, key=by_count, reverse=True)[:n]
# Top 20 trigrams after removing English stop words.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df6 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
fits true size 280 fit true size 229 runs true size 148 love love love 143 usually wear size 136 ordered usual size 107 does run large 95 really wanted love 94 wanted love dress 88 usually wear small 80 small fit perfectly 77 looks great jeans 70 fit like glove 69 usually wear medium 68 normally wear size 68 small fits perfectly 65 size fit perfectly 65 fits like glove 65 usual size small 64 usually wear xs 63
# Bar chart of the top-20 trigram counts (stop words removed).
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in review after removing stop words')
# POS-tag the review corpus and plot the 20 most common tags.
# Bug fix: TextBlob(str(df['Review Text'])) stringified the *Series repr* —
# a truncated, "..."-elided preview plus index labels — so the tags were
# computed on a tiny mangled sample.  Join the actual review texts instead.
blob = TextBlob(' '.join(df['Review Text'].astype(str)))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
pos_df = pos_df.pos.value_counts()[:20]  # Series: POS tag -> frequency
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count',
    title='Top 20 Part-of-speech tagging for review corpus')
# Grouped histograms of review length, split by recommendation flag.
len_recommended = df.loc[df['Recommended IND'] == 1, 'review_len']
len_not_recommended = df.loc[df['Recommended IND'] == 0, 'review_len']
data = [
    go.Histogram(x=len_not_recommended, name='Not recommended', opacity=0.75),
    go.Histogram(x=len_recommended, name='Recommended', opacity=0.75),
]
layout = go.Layout(barmode='group', title='Distribution of Review Lengths Based on Recommendation')
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked histogram')
# x1 = df.loc[df['Recommended IND'] == 1, 'word_count']
# x0 = df.loc[df['Recommended IND'] == 0, 'word_count']
# trace1 = go.Histogram(
# x=x0, name='Not recommended',
# opacity=0.75
# )
# trace2 = go.Histogram(
# x=x1, name = 'Recommended',
# opacity=0.75
# )
# data = [trace1, trace2]
# layout = go.Layout(barmode = 'group', title='Distribution of Word Count Based on Recommendation')
# fig = go.Figure(data=data, layout=layout)
# iplot(fig, filename='grouped histogram')
Recommended reviews tend to be lengthier than not-recommended reviews.
# y0 = df.loc[df['Division Name'] == 'General']['polarity']
# y1 = df.loc[df['Division Name'] == 'General Petite']['polarity']
# y2 = df.loc[df['Division Name'] == 'Initmates']['polarity']
# trace0 = go.Box(
# y=y0,
# name = 'General',
# marker = dict(
# color = 'rgb(214, 12, 140)',
# )
# )
# trace1 = go.Box(
# y=y1,
# name = 'General Petite',
# marker = dict(
# color = 'rgb(0, 128, 128)',
# )
# )
# trace2 = go.Box(
# y=y2,
# name = 'Initmates',
# marker = dict(
# color = 'rgb(10, 140, 208)',
# )
# )
# data = [trace0, trace1, trace2]
# layout = go.Layout(
# title = "Sentiment Polarity Boxplot of Division Name"
# )
# fig = go.Figure(data=data,layout=layout)
# iplot(fig, filename = "Sentiment Polarity Boxplot of Division Name")
The highest sentiment polarity score was achieved by all of three divisions, and the lowest sentiment polarity score was collected by General division.
We don't see any significant difference in terms of sentiment polarity between division names.
# Sentiment-polarity box plot, one trace per department.
# Refactor: the six copy-pasted y0..y5 / trace0..trace5 blocks are replaced
# by a single comprehension over (department, colour) pairs; the rendered
# figure is identical.
departments = ['Tops', 'Dresses', 'Bottoms', 'Intimate', 'Jackets', 'Trend']
box_colors = ['rgb(214, 12, 140)', 'rgb(0, 128, 128)', 'rgb(10, 140, 208)',
              'rgb(12, 102, 14)', 'rgb(10, 0, 100)', 'rgb(100, 0, 10)']
data = [
    go.Box(
        y=df.loc[df['Department Name'] == dept]['polarity'],
        name=dept,
        marker=dict(color=color),
    )
    for dept, color in zip(departments, box_colors)
]
layout = go.Layout(
    title = "Sentiment Polarity Boxplot of Department Name"
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Sentiment Polarity Boxplot of Department Name")
# Review-length box plot, one trace per department.
# Refactor: same loop-over-departments shape as the polarity box plot —
# the six copy-pasted trace blocks collapse to one comprehension with
# identical output.
departments = ['Tops', 'Dresses', 'Bottoms', 'Intimate', 'Jackets', 'Trend']
box_colors = ['rgb(214, 12, 140)', 'rgb(0, 128, 128)', 'rgb(10, 140, 208)',
              'rgb(12, 102, 14)', 'rgb(10, 0, 100)', 'rgb(100, 0, 10)']
data = [
    go.Box(
        y=df.loc[df['Department Name'] == dept]['review_len'],
        name=dept,
        marker=dict(color=color),
    )
    for dept, color in zip(departments, box_colors)
]
layout = go.Layout(
    title = "Review length Boxplot of Department Name"
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename = "Review Length Boxplot of Department Name")
# y0 = df.loc[df['Department Name'] == 'Tops']['Rating']
# y1 = df.loc[df['Department Name'] == 'Dresses']['Rating']
# y2 = df.loc[df['Department Name'] == 'Bottoms']['Rating']
# y3 = df.loc[df['Department Name'] == 'Intimate']['Rating']
# y4 = df.loc[df['Department Name'] == 'Jackets']['Rating']
# y5 = df.loc[df['Department Name'] == 'Trend']['Rating']
# trace0 = go.Box(
# y=y0,
# name = 'Tops',
# marker = dict(
# color = 'rgb(214, 12, 140)',
# )
# )
# trace1 = go.Box(
# y=y1,
# name = 'Dresses',
# marker = dict(
# color = 'rgb(0, 128, 128)',
# )
# )
# trace2 = go.Box(
# y=y2,
# name = 'Bottoms',
# marker = dict(
# color = 'rgb(10, 140, 208)',
# )
# )
# trace3 = go.Box(
# y=y3,
# name = 'Intimate',
# marker = dict(
# color = 'rgb(12, 102, 14)',
# )
# )
# trace4 = go.Box(
# y=y4,
# name = 'Jackets',
# marker = dict(
# color = 'rgb(10, 0, 100)',
# )
# )
# trace5 = go.Box(
# y=y5,
# name = 'Trend',
# marker = dict(
# color = 'rgb(100, 0, 10)',
# )
# )
# data = [trace0, trace1, trace2, trace3, trace4, trace5]
# layout = go.Layout(
# title = "Rating Boxplot of Department Name"
# )
# fig = go.Figure(data=data,layout=layout)
# iplot(fig, filename = "Rating Boxplot of Department Name")
# Build a TF-IDF document-term matrix over the cleaned reviews, then reduce
# it to 6 latent topics with truncated SVD (LSA).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
reindexed_data = df['Review Text'].values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)

n_topics = 6
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
def get_keys(topic_matrix):
    '''
    returns an integer list of predicted topic
    categories for a given topic matrix
    '''
    # Each document's topic is the column with the largest weight.
    return topic_matrix.argmax(axis=1).tolist()
def keys_to_counts(keys):
    '''
    returns a tuple of topic categories and their
    accompanying magnitudes for a given list of keys
    '''
    tallies = Counter(keys)
    # Counter preserves first-seen order, matching the original pair-split.
    categories = list(tallies.keys())
    counts = list(tallies.values())
    return (categories, counts)
# Assign each review to its strongest LSA topic and tally topic sizes.
lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    '''
    returns a list of n_topic strings, where each string contains the n most
    common words in a predicted category, in order

    Fixes: (1) an empty topic used to crash (the running sum stayed an int,
    which has no .toarray()); it now yields an empty string.  (2) the
    pointless .encode('ascii').decode('utf-8') round-trip raised
    UnicodeEncodeError on any non-ASCII token and is removed.
    Note: relies on the notebook-level ``n_topics``.
    '''
    top_word_indices = []
    for topic in range(n_topics):
        # Sum the tf-idf rows of every document assigned to this topic.
        member_rows = [document_term_matrix[i]
                       for i in range(len(keys)) if keys[i] == topic]
        if not member_rows:
            top_word_indices.append(np.array([], dtype=int))
            continue
        temp_vector_sum = member_rows[0]
        for row in member_rows[1:]:
            temp_vector_sum = temp_vector_sum + row
        temp_vector_sum = temp_vector_sum.toarray()
        # Indices of the n largest weights, in descending order.
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:], 0)
        top_word_indices.append(top_n_word_indices)
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            # One-hot vector recovers the vocabulary term for this column.
            temp_word_vector = np.zeros((1, document_term_matrix.shape[1]))
            temp_word_vector[:, index] = 1
            the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word)
        top_words.append(" ".join(topic_words))
    return top_words
# Show the 3 highest-weight words for each LSA topic.
top_n_words_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
for i in range(len(top_n_words_lsa)):
    print("Topic {}: ".format(i+1), top_n_words_lsa[i])
Topic 1: dress size love Topic 2: dress beautiful love Topic 3: great comfortable love Topic 4: size true small Topic 5: shirt love cute Topic 6: sweater love beautiful
# Bar chart of how many reviews fall into each LSA topic, with each topic's
# top-3 words used as the tick label.
top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
labels = ['Topic {}: \n'.format(i) + top_3_words[i] for i in lsa_categories]
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(lsa_categories, lsa_counts);
ax.set_xticks(lsa_categories);
ax.set_xticklabels(labels);
ax.set_ylabel('Number of review text');
ax.set_title('LSA topic counts');
plt.show();
# Project the LSA topic vectors down to 2-D with t-SNE for visualisation.
tsne_lsa_model = TSNE(n_components=2, perplexity=50, learning_rate=100,
                      n_iter=2000, verbose=1, random_state=0, angle=0.75)
tsne_lsa_vectors = tsne_lsa_model.fit_transform(lsa_topic_matrix)
[t-SNE] Computing pairwise distances... [t-SNE] Computing 151 nearest neighbors... [t-SNE] Computed conditional probabilities for sample 1000 / 22641 [t-SNE] Computed conditional probabilities for sample 2000 / 22641 [t-SNE] Computed conditional probabilities for sample 3000 / 22641 [t-SNE] Computed conditional probabilities for sample 4000 / 22641 [t-SNE] Computed conditional probabilities for sample 5000 / 22641 [t-SNE] Computed conditional probabilities for sample 6000 / 22641 [t-SNE] Computed conditional probabilities for sample 7000 / 22641 [t-SNE] Computed conditional probabilities for sample 8000 / 22641 [t-SNE] Computed conditional probabilities for sample 9000 / 22641 [t-SNE] Computed conditional probabilities for sample 10000 / 22641 [t-SNE] Computed conditional probabilities for sample 11000 / 22641 [t-SNE] Computed conditional probabilities for sample 12000 / 22641 [t-SNE] Computed conditional probabilities for sample 13000 / 22641 [t-SNE] Computed conditional probabilities for sample 14000 / 22641 [t-SNE] Computed conditional probabilities for sample 15000 / 22641 [t-SNE] Computed conditional probabilities for sample 16000 / 22641 [t-SNE] Computed conditional probabilities for sample 17000 / 22641 [t-SNE] Computed conditional probabilities for sample 18000 / 22641 [t-SNE] Computed conditional probabilities for sample 19000 / 22641 [t-SNE] Computed conditional probabilities for sample 20000 / 22641 [t-SNE] Computed conditional probabilities for sample 21000 / 22641 [t-SNE] Computed conditional probabilities for sample 22000 / 22641 [t-SNE] Computed conditional probabilities for sample 22641 / 22641 [t-SNE] Mean sigma: 0.026059 [t-SNE] KL divergence after 50 iterations with early exaggeration: 0.772885 [t-SNE] Error after 75 iterations: 0.772885
def get_mean_topic_vectors(keys, two_dim_vectors, num_topics=None):
    '''
    returns a list of centroid vectors from each predicted topic category

    Generalized (backward-compatibly): ``num_topics`` defaults to the
    notebook-level ``n_topics``.  The per-topic mean is computed with a
    boolean mask instead of a manual accumulation loop; an empty topic
    yields a NaN centroid instead of crashing np.vstack.
    '''
    if num_topics is None:
        num_topics = n_topics
    keys = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)
    mean_topic_vectors = []
    for t in range(num_topics):
        members = vectors[keys == t]  # all 2-D points assigned to topic t
        mean_topic_vectors.append(np.mean(members, axis=0))
    return mean_topic_vectors
# One colour per topic: take the first n_topics entries of a 20-colour palette.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" ])
colormap = colormap[:n_topics]
top_3_words_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
lsa_mean_topic_vectors = get_mean_topic_vectors(lsa_keys, tsne_lsa_vectors)
# Bokeh scatter of the t-SNE projection, coloured by predicted topic, with
# each topic's top-3 words rendered at that topic's centroid.
plot = figure(title="t-SNE Clustering of {} LSA Topics".format(n_topics), plot_width=700, plot_height=700)
plot.scatter(x=tsne_lsa_vectors[:,0], y=tsne_lsa_vectors[:,1], color=colormap[lsa_keys])
for t in range(n_topics):
    label = Label(x=lsa_mean_topic_vectors[t][0], y=lsa_mean_topic_vectors[t][1],
                  text=top_3_words_lsa[t], text_color=colormap[t])
    plot.add_layout(label)
show(plot)
Here are the terms that differentiate the review text from a general English corpus.
# Build a scattertext corpus keyed on department, then list the 10 terms
# that most distinguish the reviews from a general English background corpus.
corpus = st.CorpusFromPandas(df, category_col='Department Name', text_col='Review Text', nlp=nlp).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))
['flattering', 'sweater', 'boxy', 'flowy', 'leggings', 'xs', 'waist', 'dress', 'blouse', 'comfy']
Here are the terms in review text that are most associated with Tops department:
# Terms most associated with the Tops department (scaled F-score).
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Tops Score'] = corpus.get_scaled_f_scores('Tops')
pprint(list(term_freq_df.sort_values(by='Tops Score', ascending=False).index[:10]))
['this shirt', 'the shirt', 'this blouse', 'this top', 'shirt is', 'this sweater', 'sweater is', 'blouse', 'the sweater', 'shirt']
Here are the terms that are most associated with Dresses department:
# Terms most associated with the Dresses department (scaled F-score).
term_freq_df['Dresses Score'] = corpus.get_scaled_f_scores('Dresses')
pprint(list(term_freq_df.sort_values(by='Dresses Score', ascending=False).index[:10]))
['this dress', 'the dress', 'dress is', 'dress in', 'dress for', 'dress', 'dress and', 'dress was', 'dress but', 'dress to']