# !pip install PyDrive
# !pip install gensim
# !pip install pyldavis
# !python -m spacy download en
# !pip install nltk
import ast
import os

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
# from google.colab import auth
from oauth2client.client import GoogleCredentials
import nltk
from nltk import FreqDist
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\lijiaxiang\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
True
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)
# download = drive.CreateFile({'id': '1-a2dbUDDHsmkWOXoDYvgvCa8WBkEMzbr'})
# ls
# download.GetContentFile('automotive_rev_json.gz')
import pandas as pd
pd.set_option("display.max_colwidth", 200)
import numpy as np
import json
import re
import gzip
import spacy
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
def parse(path):
    """Yield one review record (a dict) per line of the gzipped dataset.

    Each line of the archive is a Python/JSON-style dict literal.
    ``ast.literal_eval`` parses literals only — unlike the original
    ``eval``, it cannot execute arbitrary code embedded in the data
    file.  The file handle is closed deterministically via ``with``.
    """
    with gzip.open(path, 'rb') as g:
        for line in g:
            yield ast.literal_eval(line.decode('utf-8'))
def getDF(path):
    """Load the gzipped review file at *path* into a pandas DataFrame.

    One row per review, integer-indexed 0..n-1 in file order.
    """
    records = {idx: record for idx, record in enumerate(parse(path))}
    return pd.DataFrame.from_dict(records, orient='index')
# Load the 5-core Automotive review dump (one dict per line, gzipped).
df = getDF('reviews_Automotive_5.json.gz')
df.shape  # (rows, columns) sanity check — expect ~20k reviews, 9 fields
(20473, 9)
df.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | |
---|---|---|---|---|---|---|---|---|---|
0 | A3F73SC1LY51OO | B00002243X | Alan Montgomery | [4, 4] | I needed a set of jumper cables for my new car and these had good reviews and were at a good price. They have been used a few times already and do what they are supposed to - no complaints there.... | 5.0 | Work Well - Should Have Bought Longer Ones | 1313539200 | 08 17, 2011 |
1 | A20S66SKYXULG2 | B00002243X | alphonse | [1, 1] | These long cables work fine for my truck, but the quality seems a little on the shabby side. For the money I was not expecting 200 dollar snap-on jumper cables but these seem more like what you wo... | 4.0 | Okay long cables | 1315094400 | 09 4, 2011 |
2 | A2I8LFSN2IS5EO | B00002243X | Chris | [0, 0] | Can't comment much on these since they have not yet been used (I will come back and update my review is I find any issues after first use) ... but they are built solid, nice tough big hard clamps ... | 5.0 | Looks and feels heavy Duty | 1374710400 | 07 25, 2013 |
3 | A3GT2EWQSO45ZG | B00002243X | DeusEx | [19, 19] | I absolutley love Amazon!!! For the price of a set of cheap Booster/Jumper Cables in a brick and morter store, you can buy extra long and heavy duty jumpers! First off, don't be the person that ... | 5.0 | Excellent choice for Jumper Cables!!! | 1292889600 | 12 21, 2010 |
4 | A3ESWJPAVRPWB4 | B00002243X | E. Hernandez | [0, 0] | I purchased the 12' feet long cable set and they arrived in a retail cardboard box with handle, which could be used as a storage and carrying case for these cables, although I will not need to sto... | 5.0 | Excellent, High Quality Starter Cables | 1341360000 | 07 4, 2012 |
def freq_words(x, terms = 30):
    """Bar-plot the *terms* most frequent words across the texts in *x*.

    x     : iterable of text strings
    terms : how many of the top words to display (default 30)
    """
    fdist = FreqDist(' '.join(x).split())
    words_df = pd.DataFrame({'word': list(fdist.keys()),
                             'count': list(fdist.values())})
    # keep only the most frequent terms for the plot
    top_terms = words_df.nlargest(columns="count", n=terms)
    plt.figure(figsize=(20, 5))
    axis = sns.barplot(data=top_terms, x="word", y="count")
    axis.set(ylabel='Count')
    plt.show()
freq_words(df['reviewText'])
# Expand the "n't" contraction to " not" so negations survive the
# character filter below.  regex=True is passed explicitly: pandas 2.x
# no longer interprets the pattern as a regular expression by default,
# which would silently make both replacements literal no-ops.
df['reviewText'] = df['reviewText'].str.replace(r"n\'t", " not", regex=True)
# Remove everything except letters and '#' (numbers, punctuation, symbols).
df['reviewText'] = df['reviewText'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
freq_words(df['reviewText'])
from nltk.corpus import stopwords

# Use a set: membership is tested once per token over ~20k reviews, and
# set lookup is O(1) versus O(n) on the original list.
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(rev):
    """Return *rev* (a list of tokens) re-joined into one string with
    English stopwords dropped."""
    rev_new = " ".join([i for i in rev if i not in stop_words])
    return rev_new
# Drop tokens shorter than 3 characters.
df['reviewText'] = df['reviewText'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2))
# Strip stopwords from each review, then lowercase the result.
reviews = [remove_stopwords(review.split()).lower()
           for review in df['reviewText']]
freq_words(reviews, 35)
# nlp = spacy.load('en', disable=['parser', 'ner'])
# def lemmatization(texts, tags=['NOUN', 'ADJ']):
# output = []
# for sent in texts:
# doc = nlp(" ".join(sent))
# output.append([token.lemma_ for token in doc if token.pos_ in tags])
# return output
# Split each cleaned review string into a list of whitespace tokens.
tokenized_reviews = pd.Series(reviews).str.split()
print(tokenized_reviews[1])
['these', 'long', 'cables', 'work', 'fine', 'truck', 'quality', 'seems', 'little', 'shabby', 'side', 'for', 'money', 'expecting', 'dollar', 'snap', 'jumper', 'cables', 'seem', 'like', 'would', 'see', 'chinese', 'knock', 'shop', 'like', 'harbor', 'freight', 'bucks']
print(tokenized_reviews[1])  # spot-check the second review's token list
len(tokenized_reviews[1])  # and its token count
['these', 'long', 'cables', 'work', 'fine', 'truck', 'quality', 'seems', 'little', 'shabby', 'side', 'for', 'money', 'expecting', 'dollar', 'snap', 'jumper', 'cables', 'seem', 'like', 'would', 'see', 'chinese', 'knock', 'shop', 'like', 'harbor', 'freight', 'bucks']
29
# reviews_2 = lemmatization(tokenized_reviews)
# Lemmatization is skipped (spaCy cell commented out above); the raw
# token lists are used unchanged.
reviews_2 = tokenized_reviews
print(reviews_2[1])
['these', 'long', 'cables', 'work', 'fine', 'truck', 'quality', 'seems', 'little', 'shabby', 'side', 'for', 'money', 'expecting', 'dollar', 'snap', 'jumper', 'cables', 'seem', 'like', 'would', 'see', 'chinese', 'knock', 'shop', 'like', 'harbor', 'freight', 'bucks']
print(reviews_2[1])  # confirm reviews_2 matches the tokenized input
len(reviews_2[1])  # same token count as tokenized_reviews[1]
['these', 'long', 'cables', 'work', 'fine', 'truck', 'quality', 'seems', 'little', 'shabby', 'side', 'for', 'money', 'expecting', 'dollar', 'snap', 'jumper', 'cables', 'seem', 'like', 'would', 'see', 'chinese', 'knock', 'shop', 'like', 'harbor', 'freight', 'bucks']
29
set(tokenized_reviews[1]) - set(reviews_2[1])
set()
# Re-join each token list into a single space-delimited string so the
# frequency-plot helper can consume it (comprehension replaces the
# index-based range(len(...)) append loop).
reviews_3 = [' '.join(tokens) for tokens in reviews_2]
df['reviews'] = reviews_3
freq_words(df['reviews'], 35)
# Create the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(reviews_2)
# Convert list of reviews (reviews_2) into a Document Term Matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews_2]
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel
# Build LDA model
lda_model = LDA(corpus=doc_term_matrix,
                id2word=dictionary,
                num_topics=7,        # number of latent topics to extract
                random_state=100,    # fixed seed so topics are reproducible
                chunksize=1000,      # documents processed per training chunk
                passes=50)           # full sweeps over the corpus
# NOTE: training takes a long time (50 passes over ~20k reviews).
lda_model.print_topics()
[(0, '0.019*"brush" + 0.019*"wheels" + 0.016*"wheel" + 0.012*"ice" + 0.011*"snow" + 0.008*"carpet" + 0.007*"cold" + 0.006*"rust" + 0.006*"cup" + 0.006*"metal"'), (1, '0.020*"great" + 0.019*"well" + 0.018*"wiper" + 0.016*"blade" + 0.016*"blades" + 0.015*"easy" + 0.013*"good" + 0.012*"they" + 0.012*"fit" + 0.011*"windshield"'), (2, '0.019*"the" + 0.011*"light" + 0.010*"one" + 0.009*"would" + 0.007*"get" + 0.007*"like" + 0.006*"use" + 0.006*"lights" + 0.006*"this" + 0.005*"well"'), (3, '0.022*"car" + 0.013*"product" + 0.013*"use" + 0.011*"the" + 0.010*"like" + 0.009*"used" + 0.008*"wax" + 0.008*"good" + 0.008*"great" + 0.008*"this"'), (4, '0.013*"car" + 0.012*"one" + 0.008*"price" + 0.008*"the" + 0.008*"product" + 0.007*"good" + 0.007*"oil" + 0.007*"time" + 0.007*"amazon" + 0.007*"get"'), (5, '0.031*"battery" + 0.020*"power" + 0.015*"car" + 0.014*"use" + 0.012*"plug" + 0.011*"unit" + 0.011*"charge" + 0.010*"the" + 0.010*"charger" + 0.009*"this"'), (6, '0.039*"hose" + 0.034*"water" + 0.016*"tank" + 0.012*"oil" + 0.011*"pump" + 0.010*"fluid" + 0.009*"sewer" + 0.009*"drain" + 0.008*"valve" + 0.008*"use"')]
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
# vis
D:\install\miniconda\lib\site-packages\pyLDAvis\_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. return pd.concat([default_term_info] + list(topic_dfs))
lda_model.__class__
gensim.models.ldamodel.LdaModel
# Persist the trained model, then reload it to verify the round trip.
# Create the target directory first: gensim's save() raises
# FileNotFoundError if "model/" does not already exist.
os.makedirs("model", exist_ok=True)
lda_model.save("model/lda.model")
lda_model2 = gensim.models.ldamodel.LdaModel.load("model/lda.model")
lda_model2.print_topics()  # should match lda_model.print_topics() exactly
[(0, '0.019*"brush" + 0.019*"wheels" + 0.016*"wheel" + 0.012*"ice" + 0.011*"snow" + 0.008*"carpet" + 0.007*"cold" + 0.006*"rust" + 0.006*"cup" + 0.006*"metal"'), (1, '0.020*"great" + 0.019*"well" + 0.018*"wiper" + 0.016*"blade" + 0.016*"blades" + 0.015*"easy" + 0.013*"good" + 0.012*"they" + 0.012*"fit" + 0.011*"windshield"'), (2, '0.019*"the" + 0.011*"light" + 0.010*"one" + 0.009*"would" + 0.007*"get" + 0.007*"like" + 0.006*"use" + 0.006*"lights" + 0.006*"this" + 0.005*"well"'), (3, '0.022*"car" + 0.013*"product" + 0.013*"use" + 0.011*"the" + 0.010*"like" + 0.009*"used" + 0.008*"wax" + 0.008*"good" + 0.008*"great" + 0.008*"this"'), (4, '0.013*"car" + 0.012*"one" + 0.008*"price" + 0.008*"the" + 0.008*"product" + 0.007*"good" + 0.007*"oil" + 0.007*"time" + 0.007*"amazon" + 0.007*"get"'), (5, '0.031*"battery" + 0.020*"power" + 0.015*"car" + 0.014*"use" + 0.012*"plug" + 0.011*"unit" + 0.011*"charge" + 0.010*"the" + 0.010*"charger" + 0.009*"this"'), (6, '0.039*"hose" + 0.034*"water" + 0.016*"tank" + 0.012*"oil" + 0.011*"pump" + 0.010*"fluid" + 0.009*"sewer" + 0.009*"drain" + 0.008*"valve" + 0.008*"use"')]
Persist the learned topic rules (the trained LDA model) so they can be reloaded and reused later.