#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd

# In[2]:

usecols = ["title", "datetime", "content"]

# In[3]:

ofo = pd.read_csv("data/ofo.csv", usecols=usecols)

# In[4]:

ofo.head()

# In[5]:

ofo.shape

# In[6]:

mobike = pd.read_csv("data/mobike.csv", usecols=usecols)

# In[7]:

mobike.head()

# In[8]:

mobike.shape

# In[9]:

merged = pd.concat([ofo, mobike]).reset_index(drop=True)

# In[10]:

merged.head()

# In[11]:

merged.shape

# In[13]:

merged[merged.title.duplicated()].shape  # duplicated news stories

# In[14]:

merged = merged[~merged.title.duplicated()]

# In[15]:

merged.shape

# In[16]:

def count_ofo(mystr):
    # Case-insensitive count, so "OFO", "ofo", and "Ofo" all match.
    return mystr.lower().count('ofo')

# In[17]:

merged["count_ofo"] = merged.content.apply(count_ofo)

# In[18]:

def count_mobike(mystr):
    # Count the Chinese brand name plus the (case-insensitive) English one.
    return mystr.count('摩拜') + mystr.lower().count('mobike')

# In[19]:

merged["count_mobike"] = merged.content.apply(count_mobike)

# In[20]:

merged.head()

# In[21]:

merged[merged.count_mobike > merged.count_ofo].shape

# In[22]:

mobike = merged[merged.count_mobike > merged.count_ofo]

# In[23]:

mobike = mobike.reset_index(drop=True).drop(['count_ofo', 'count_mobike'], axis=1)

# In[24]:

mobike.head()

# In[25]:

mobike.shape

# In[26]:

merged[merged.count_mobike <= merged.count_ofo].shape

# In[27]:

ofo = merged[merged.count_mobike <= merged.count_ofo]

# In[28]:

ofo = ofo.reset_index(drop=True).drop(['count_ofo', 'count_mobike'], axis=1)

# In[29]:

ofo.head()

# In[30]:

ofo.shape

# # Sentiment Analysis

# ## snownlp

# In[31]:

from snownlp import SnowNLP

def get_sentiment(text):
    # SnowNLP returns a sentiment score in [0, 1]; higher means more positive.
    return SnowNLP(text).sentiments

# In[32]:

ofo["sentiment"] = ofo.content.apply(get_sentiment)
mobike["sentiment"] = mobike.content.apply(get_sentiment)

# In[33]:

import numpy as np

# In[34]:

# Count positive (> 0.5) vs. non-positive articles for each brand.
pd.Series(np.where(ofo['sentiment'] > 0.5, True, False)).value_counts()

# In[35]:

pd.Series(np.where(mobike['sentiment'] > 0.5, True, False)).value_counts()

# ## bosonnlp

# In[36]:

# from bosonnlp import BosonNLP
# import json

# with open("boson_api_key.json") as f:
#     secret = json.load(f)

# nlp = BosonNLP(secret["boson_api"])

# def get_sentiment_bosonnlp(text):
#     return nlp.sentiment(text, model="news")[0][0]

# ofo["sentiment"] = ofo.content.apply(get_sentiment_bosonnlp)
# mobike["sentiment"] = mobike.content.apply(get_sentiment_bosonnlp)

# In[37]:

import pickle

# In[38]:

# Cache the BosonNLP results so the API does not have to be called again:

# with open('sentiment_bosonnlp.pickle', 'wb') as f:
#     pickle.dump([ofo, mobike], f)

# In[39]:

# Load the cached BosonNLP scores (this overwrites the SnowNLP scores above).
with open('data/sentiment_bosonnlp.pickle', 'rb') as f:
    [ofo, mobike] = pickle.load(f)

# In[40]:

pd.Series(np.where(ofo['sentiment'] > 0.5, True, False)).value_counts()

# In[41]:

pd.Series(np.where(mobike['sentiment'] > 0.5, True, False)).value_counts()

# # Visualization

# In[42]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[43]:

ofo.set_index('datetime', inplace=True)

# In[44]:

mobike.set_index('datetime', inplace=True)

# In[45]:

# Center scores at 0 so positive articles point up and negative ones down.
(ofo[['sentiment']] - 0.5).plot(kind='bar', figsize=(24, 6))

# In[46]:

(mobike[['sentiment']] - 0.5).plot(kind='bar', figsize=(24, 6))

# In[47]:

import seaborn as sns

# In[48]:

ofo["brand"] = "ofo"

# In[49]:

mobike["brand"] = "mobike"

# In[50]:

ofo.head()

# In[51]:

df = pd.concat([ofo, mobike])

# In[52]:

sns.boxplot(x='brand', y='sentiment', data=df)

# In[53]:

ofo_positive = ofo[ofo.sentiment > 0.5]
ofo_negative = ofo[ofo.sentiment <= 0.5]
mobike_positive = mobike[mobike.sentiment > 0.5]
mobike_negative = mobike[mobike.sentiment <= 0.5]

# In[54]:

ofo_positive.shape

# In[55]:

ofo_negative.shape

# In[56]:

mobike_positive.shape

# In[57]:

mobike_negative.shape
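# Note (not in the original analysis): the bar charts above draw one bar per
# article, labeled with the raw datetime strings. As a rough sketch, assuming
# the datetime column parses with pandas' defaults, converting the index to
# real timestamps and resampling to a daily mean gives a more readable trend.
# The names below are hypothetical.

ofo_daily = ofo.copy()
ofo_daily.index = pd.to_datetime(ofo_daily.index)
ofo_daily['sentiment'].resample('D').mean().plot(figsize=(24, 6))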
# # LDA

# In[58]:

import pyLDAvis
import pyLDAvis.sklearn

# In[59]:

pyLDAvis.enable_notebook()

# In[60]:

from helper import *

# In[61]:

stopwords = get_custom_stopwords("stopwordsHIT.txt")  # HIT (Harbin Institute of Technology) stopword list
max_df = 0.7       # drop terms that appear in more than this fraction of documents (too common)
min_df = 2         # drop terms that appear in fewer than this many documents (too rare)
n_features = 1000  # maximum number of features (terms) to extract
n_top_words = 20   # how many keywords to display per topic
col_content = "content"  # name of the column that holds the article text

# In[62]:

def lda_on_chinese_articles(df, n_topics):
    return lda_on_chinese_articles_with_param(df, n_topics,
                                              col_content=col_content,
                                              stopwords=stopwords,
                                              n_features=n_features,
                                              max_df=max_df,
                                              min_df=min_df,
                                              n_top_words=n_top_words)

# In[63]:

df = ofo_positive
n_topics = 4
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

# In[64]:

df = ofo_negative
n_topics = 3
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

# In[65]:

df = mobike_positive
n_topics = 4
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)

# In[66]:

df = mobike_negative
n_topics = 3
lda, tf, vect = lda_on_chinese_articles(df=df, n_topics=n_topics)
pyLDAvis.sklearn.prepare(lda, tf, vect)
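# The helper module used above is not shown. As a rough sketch of what
# lda_on_chinese_articles_with_param plausibly does (an assumption, not the
# actual helper): tokenize the Chinese text with jieba, vectorize it with
# sklearn's CountVectorizer, fit a LatentDirichletAllocation model, and return
# the (model, document-term matrix, vectorizer) triple that
# pyLDAvis.sklearn.prepare expects. All names below are hypothetical.

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def lda_on_chinese_articles_with_param_sketch(df, n_topics, col_content, stopwords,
                                              n_features, max_df, min_df, n_top_words):
    # Segment each article into space-separated tokens so that
    # CountVectorizer can split on whitespace.
    texts = df[col_content].apply(lambda s: " ".join(jieba.cut(s)))
    vect = CountVectorizer(max_df=max_df, min_df=min_df,
                           max_features=n_features, stop_words=list(stopwords))
    tf = vect.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(tf)
    # Show the top keywords per topic (what n_top_words controls).
    words = vect.get_feature_names()  # get_feature_names_out() in newer sklearn
    for idx, topic in enumerate(lda.components_):
        top = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic %d: %s" % (idx, " ".join(top)))
    return lda, tf, vect

# Note: recent pyLDAvis releases renamed the pyLDAvis.sklearn module to
# pyLDAvis.lda_model; the code above targets the older API.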