#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd

# In[2]:

usecols = ["title", "datetime", "content"]

# In[3]:

ofo = pd.read_csv("data/ofo.csv", usecols=usecols)

# In[4]:

ofo.head()

# In[5]:

ofo.shape

# In[6]:

mobike = pd.read_csv("data/mobike.csv", usecols=usecols)

# In[7]:

mobike.head()

# In[8]:

mobike.shape

# In[9]:

merged = pd.concat([ofo, mobike]).reset_index(drop=True)

# In[10]:

merged.head()

# In[11]:

merged.shape

# In[13]:

merged[merged.title.duplicated()].shape  # duplicated news stories

# In[14]:

merged = merged[~merged.title.duplicated()]

# In[15]:

merged.shape

# In[16]:

def count_ofo(mystr):
    # Case-insensitive count, so "OFO", "ofo", and "Ofo" all match.
    return mystr.lower().count('ofo')

# In[17]:

merged["count_ofo"] = merged.content.apply(count_ofo)

# In[18]:

def count_mobike(mystr):
    # Count the Chinese brand name plus the (case-insensitive) English one.
    return mystr.count('摩拜') + mystr.lower().count('mobike')

# In[19]:

merged["count_mobike"] = merged.content.apply(count_mobike)

# In[20]:

merged.head()

# In[21]:

merged[merged.count_mobike > merged.count_ofo].shape

# In[22]:

mobike = merged[merged.count_mobike > merged.count_ofo]

# In[23]:

mobike = mobike.reset_index(drop=True).drop(['count_ofo', 'count_mobike'], axis=1)

# In[24]:

mobike.head()

# In[25]:

mobike.shape

# In[26]:

merged[merged.count_mobike <= merged.count_ofo].shape

# In[27]:

ofo = merged[merged.count_mobike <= merged.count_ofo]

# In[28]:

ofo = ofo.reset_index(drop=True).drop(['count_ofo', 'count_mobike'], axis=1)

# In[29]:

ofo.head()

# In[30]:

ofo.shape

# # Sentiment Analysis

# ## snownlp

# In[31]:

from snownlp import SnowNLP

def get_sentiment(text):
    # SnowNLP returns a sentiment score in [0, 1]; higher means more positive.
    return SnowNLP(text).sentiments

# In[32]:

ofo["sentiment"] = ofo.content.apply(get_sentiment)
mobike["sentiment"] = mobike.content.apply(get_sentiment)

# In[33]:

import numpy as np

# In[34]:

# Count positive (> 0.5) vs. non-positive articles for each brand.
pd.Series(np.where(ofo['sentiment'] > 0.5, True, False)).value_counts()

# In[35]:

pd.Series(np.where(mobike['sentiment'] > 0.5, True, False)).value_counts()

# ## bosonnlp

# In[36]:

# from bosonnlp import BosonNLP
# import json

# with open("boson_api_key.json") as f:
#     secret = json.load(f)

# nlp = BosonNLP(secret["boson_api"])

# def get_sentiment_bosonnlp(text):
#     return nlp.sentiment(text, model="news")[0][0]

# ofo["sentiment"] = ofo.content.apply(get_sentiment_bosonnlp)
# mobike["sentiment"] = mobike.content.apply(get_sentiment_bosonnlp)

# In[37]:

import pickle

# In[38]:

# Cache the BosonNLP results so the API does not have to be called again:

# with open('sentiment_bosonnlp.pickle', 'wb') as f:
#     pickle.dump([ofo, mobike], f)

# In[39]:

# Load the cached BosonNLP scores (this overwrites the SnowNLP scores above).
with open('data/sentiment_bosonnlp.pickle', 'rb') as f:
    [ofo, mobike] = pickle.load(f)

# In[40]:

pd.Series(np.where(ofo['sentiment'] > 0.5, True, False)).value_counts()

# In[41]:

pd.Series(np.where(mobike['sentiment'] > 0.5, True, False)).value_counts()

# # Visualization

# In[42]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[43]:

ofo.set_index('datetime', inplace=True)

# In[44]:

mobike.set_index('datetime', inplace=True)

# In[45]:

# Center scores at 0 so positive articles point up and negative ones down.
(ofo[['sentiment']] - 0.5).plot(kind='bar', figsize=(24, 6))

# In[46]:

(mobike[['sentiment']] - 0.5).plot(kind='bar', figsize=(24, 6))

# In[47]:

import seaborn as sns

# In[48]:

ofo["brand"] = "ofo"

# In[49]:

mobike["brand"] = "mobike"

# In[50]:

ofo.head()

# In[51]:

df = pd.concat([ofo, mobike])

# In[52]:

sns.boxplot(x='brand', y='sentiment', data=df)

# In[53]:

ofo_positive = ofo[ofo.sentiment > 0.5]
ofo_negative = ofo[ofo.sentiment <= 0.5]
mobike_positive = mobike[mobike.sentiment > 0.5]
mobike_negative = mobike[mobike.sentiment <= 0.5]

# In[54]:

ofo_positive.shape

# In[55]:

ofo_negative.shape

# In[56]:

mobike_positive.shape

# In[57]:

mobike_negative.shape
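# Note (not in the original analysis): the bar charts above draw one bar per
# article, labeled with the raw datetime strings. As a rough sketch, assuming
# the datetime column parses with pandas' defaults, converting the index to
# real timestamps and resampling to a daily mean gives a more readable trend.
# The names below are hypothetical.

ofo_daily = ofo.copy()
ofo_daily.index = pd.to_datetime(ofo_daily.index)
ofo_daily['sentiment'].resample('D').mean().plot(figsize=(24, 6))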
# # LDA

# In[58]:

import pyLDAvis
import pyLDAvis.sklearn

# In[59]:

pyLDAvis.enable_notebook()

# In[60]:

from helper import *

# In[61]:

stopwords = get_custom_stopwords("stopwordsHIT.txt")  # HIT (Harbin Institute of Technology) stopword list
max_df = 0.7       # drop terms that appear in more than this fraction of documents (too common)
min_df = 2         # drop terms that appear in fewer than this many documents (too rare)
n_features = 1000  # maximum number of features (terms) to extract
n_top_words = 20   # how many keywords to display per topic
col_content = "content"  # name of the column that holds the article text

# In[62]:

def lda_on_chinese_articles(df, n_topics):
    return lda_on_chinese_articles_with_param(df, n_topics,
                                              col_content=col_content,
                                              stopwords=stopwords,
                                              n_features=n_features,
                                              max_df=max_df,
                                              min_df=min_df,
                                              n_top_words=n_top_words)

# In[63]:

df = ofo_positive
n_topics = 4
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

# In[64]:

df = ofo_negative
n_topics = 3
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

# In[65]:

df = mobike_positive
n_topics = 4
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

# In[66]:

df = mobike_negative
n_topics = 3
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)
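# The helper module used above is not shown. As a rough sketch of what
# lda_on_chinese_articles_with_param plausibly does (an assumption, not the
# actual helper): tokenize the Chinese text with jieba, vectorize it with
# sklearn's CountVectorizer, fit a LatentDirichletAllocation model, and return
# the (model, document-term matrix, vectorizer) triple that
# pyLDAvis.sklearn.prepare expects. All names below are hypothetical.

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def lda_on_chinese_articles_with_param_sketch(df, n_topics, col_content, stopwords,
                                              n_features, max_df, min_df, n_top_words):
    # Segment each article into space-separated tokens so that
    # CountVectorizer can split on whitespace.
    texts = df[col_content].apply(lambda s: " ".join(jieba.cut(s)))
    vect = CountVectorizer(max_df=max_df, min_df=min_df,
                           max_features=n_features, stop_words=list(stopwords))
    tf = vect.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(tf)
    # Show the top keywords per topic (what n_top_words controls).
    words = vect.get_feature_names()  # get_feature_names_out() in newer sklearn
    for idx, topic in enumerate(lda.components_):
        top = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic %d: %s" % (idx, " ".join(top)))
    return lda, tf, vect

# Note: recent pyLDAvis releases renamed the pyLDAvis.sklearn module to
# pyLDAvis.lda_model; the code above targets the older API.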