# Sentiment Analysis with Python

Sentiment analysis (also known as opinion mining) uses natural language processing, text mining, and computational linguistics to identify and extract subjective information from source material. (Wikipedia)


Simple sentiment analysis can be done with existing toolkits, operated as black boxes.

```python
from textblob import TextBlob

text = "I am happy today. I feel sad today."
blob = TextBlob(text)
for sentence in blob.sentences:
    print(sentence, sentence.sentiment)
```

TextBlob's polarity score lies in [-1, 1], where -1 means completely negative and 1 completely positive.

The training corpus is movie reviews.
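As an aside, a blob also exposes one aggregate `sentiment` (polarity plus subjectivity) for the whole text, which can be convenient for short documents. A minimal sketch:

```python
from textblob import TextBlob

blob = TextBlob("I am happy today. I feel sad today.")

# Aggregate score for the whole text: polarity in [-1, 1], subjectivity in [0, 1]
print(blob.sentiment)
print(blob.sentiment.polarity)
```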

```python
from snownlp import SnowNLP

text = "我今天很快乐。我今天很愤怒。"
s = SnowNLP(text)
for sentence in s.sentences:
    print(sentence, SnowNLP(sentence).sentiments)
```

SnowNLP's sentiment score lies in [0, 1] and expresses the probability that the sentence carries positive sentiment.


The training corpus is online shopping reviews.
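Because the score is a probability, one simple decision rule is to threshold it; the 0.5 cut-off below is an assumption for illustration, not something SnowNLP prescribes:

```python
from snownlp import SnowNLP

def label(sentence, threshold=0.5):
    # Label a sentence by thresholding SnowNLP's positive-sentiment
    # probability; the 0.5 cut-off is an assumed convention.
    return 'pos' if SnowNLP(sentence).sentiments >= threshold else 'neg'

print(label('我今天很快乐。'))  # expected: pos
print(label('我今天很愤怒。'))  # expected: neg
```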

### 0 Preprocessing

#### 0.1 Word segmentation

```python
import jieba

sentence = '这样的酒店配这样的价格还算不错'
wordList = jieba.cut(sentence)
for word in wordList:
    print(word)
```

#### 0.2 Removing stop words

The [Chinese stop word list](http://www.datatang.com/data/43894) comes from the open Chinese NLP platform of the Institute of Computing Technology, Chinese Academy of Sciences.

```python
import jieba

sentence = '这样的酒店配这样的价格还算不错'
wordList = jieba.cut(sentence)

# Read in the stop word file
file = 'ChineseStopWord.txt'
f = open(file, 'r', encoding='utf-8')
stopList = []
for line in f:
    line = line.strip()
    stopList.append(line)
f.close()

# Remove stop words from the word list
newWordList = []
for word in wordList:
    if word not in stopList:
        newWordList.append(word)

# Examine the results
for word in newWordList:
    print(word)
```

### 1 Dictionary-based method

Sentiment dictionary: [BosonNLP data](http://bosonnlp.com/dev/resource)

#### Algorithm design

"Assume sentiment values obey linear superposition. Segment the sentence into words; for every word that appears in the sentiment dictionary, add the corresponding weight. Negation words and degree adverbs follow special rules: a negation word flips the sign of the weight, and a degree adverb scales the weight by its own factor. Finally, the sign of the total weight determines the sentiment of the sentence." (Source: http://spaces.ac.cn/archives/3360/)

For example, in "不太好" the sentiment word "好" contributes a positive weight, the degree adverb "太" scales that weight, and the negation "不" flips its sign, so the sentence scores negative overall.
```python
from collections import defaultdict


def readLines(filename):
    # Helper assumed by the source code (not shown there): read a UTF-8
    # file into a list of stripped lines
    with open(filename, encoding='utf-8') as f:
        return [line.strip() for line in f]


def classifyWords(wordDict):
    # wordDict maps each word to its position in the segmented sentence
    # (1) sentiment words
    senList = readLines('BosonNLP_sentiment_score.txt')
    senDict = defaultdict()
    for s in senList:
        senDict[s.split(' ')[0]] = s.split(' ')[1]
    # (2) negation words
    notList = readLines('notDict.txt')
    # (3) degree adverbs
    degreeList = readLines('degreeDict.txt')
    degreeDict = defaultdict()
    for d in degreeList:
        degreeDict[d.split(',')[0]] = d.split(',')[1]

    senWord = defaultdict()
    notWord = defaultdict()
    degreeWord = defaultdict()
    for word in wordDict.keys():
        if word in senDict.keys() and word not in notList and word not in degreeDict.keys():
            senWord[wordDict[word]] = senDict[word]
        elif word in notList and word not in degreeDict.keys():
            notWord[wordDict[word]] = -1
        elif word in degreeDict.keys():
            degreeWord[wordDict[word]] = degreeDict[word]
    return senWord, notWord, degreeWord


def scoreSent(senWord, notWord, degreeWord, segResult):
    W = 1
    score = 0
    # positions of all sentiment words, negation words, and degree adverbs;
    # sorted() yields an indexable list (dict views cannot be indexed in Python 3)
    senLoc = sorted(senWord.keys())
    notLoc = notWord.keys()
    degreeLoc = degreeWord.keys()
    senloc = -1
    # iterate over all words in segResult; i is the absolute position
    for i in range(0, len(segResult)):
        # if the word at position i is a sentiment word
        if i in senLoc:
            # senloc indexes into the list of sentiment-word positions
            senloc += 1
            # add this sentiment word's score, scaled by the running weight
            score += W * float(senWord[i])
            if senloc < len(senLoc) - 1:
                # look for negation words and degree adverbs between this
                # sentiment word and the next one; j is an absolute position
                for j in range(senLoc[senloc], senLoc[senloc + 1]):
                    # a negation word flips the sign of the weight
                    if j in notLoc:
                        W *= -1
                    # a degree adverb scales the weight
                    elif j in degreeLoc:
                        W *= float(degreeWord[j])
            # (the original reassigned i here to jump to the next sentiment
            # word, but rebinding the loop variable of a Python for loop
            # has no effect, so that line is dropped)
    return score
```

(Source: http://www.jianshu.com/p/4cfcf1610a73)
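A sketch of how these functions might be wired together with jieba. The word-to-position mapping is an assumption about the expected input (the source does not show the calling code), and it requires the three dictionary files above to be present; note that duplicated words collapse to a single position under this representation:

```python
import jieba

# Assumes classifyWords and scoreSent from above are in scope, along
# with the BosonNLP, negation, and degree-adverb dictionary files.
sentence = '这样的酒店配这样的价格还算不错'
segResult = list(jieba.cut(sentence))
wordDict = {word: i for i, word in enumerate(segResult)}  # word -> position

senWord, notWord, degreeWord = classifyWords(wordDict)
print(scoreSent(senWord, notWord, degreeWord, segResult))  # > 0 suggests positive
```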

### 2 Naive Bayes classification

1. Prepare a training set: texts whose classes are already known.
2. From the training set, compute each word's contribution to each class (sketched below).
3. For a text whose class is unknown, decide the class from the words the text contains.
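To make step 2 concrete, here is a minimal from-scratch sketch of the word-contribution idea, using Laplace-smoothed word likelihoods on an invented two-sentence corpus:

```python
from collections import Counter

train = [('I love this car', 'pos'), ('I do not like this car', 'neg')]

# Step 2: count how often each word occurs under each class
counts = {'pos': Counter(), 'neg': Counter()}
for text, label in train:
    counts[label].update(text.lower().split())

vocab = set(counts['pos']) | set(counts['neg'])

def word_contribution(word, label):
    # Laplace-smoothed P(word | label): the word's "vote" for the class
    return (counts[label][word] + 1) / (sum(counts[label].values()) + len(vocab))

# Step 3: multiply the contributions (equal class priors assumed)
def classify(text):
    scores = {label: 1.0 for label in counts}
    for label in counts:
        for word in text.lower().split():
            scores[label] *= word_contribution(word, label)
    return max(scores, key=scores.get)

print(classify('love that car'))  # expected: pos
```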

The same workflow with TextBlob's built-in classifier:

```python
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this car', 'pos'),
    ('This view is amazing', 'pos'),
    ('I feel great', 'pos'),
    ('I am so excited about the concert', 'pos'),
    ("He is my best friend", 'pos'),
    ('I do not like this car', 'neg'),
    ('This view is horrible', 'neg'),
    ("I feel tired this morning", 'neg'),
    ('I am not looking forward to the concert', 'neg'),
    ('He is an annoying enemy', 'neg')
]
test = [
    ('feel happy this morning', 'pos'),
    ('Oh I love my friend', 'pos'),
    ('not like that man', 'neg'),
    ("this house not great", 'neg'),
    ('your song annoying', 'neg')
]

cl = NaiveBayesClassifier(train)
for sentence in test:
    print(sentence[0], ':', cl.classify(sentence[0]))
print('accuracy is:', cl.accuracy(test))
```

### 3 Collecting a sentiment corpus

#### 3.1 Fetching a web page

```python
from urllib.request import urlopen

html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())
```

#### 3.2 Parsing a web page

```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://pythonscraping.com/pages/page1.html")
bsobj = BeautifulSoup(html.read(), "html.parser")
print(bsobj.html.body.h1)
print(bsobj.body.h1)
print(bsobj.html.h1)
```

#### 3.3 Saving a web page

```python
fileName = "D://example.txt"

p = open(fileName, "w")  # open for writing, truncating the file first
print("hello", file=p)
print("world", file=p)
p.close()

p = open(fileName, "a")  # open for appending; keeps existing contents
print("hello world again", file=p)
p.close()
```
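Putting the pieces together, an end-to-end sketch: fetch a page, strip the markup, and score the text. The target URL is the demo page used above, and TextBlob stands in for whichever analyzer fits the corpus language; a real corpus would need its own cleaning:

```python
from urllib.request import urlopen
from bs4 import BeautifulSoup
from textblob import TextBlob

html = urlopen("http://pythonscraping.com/pages/page1.html")
bsobj = BeautifulSoup(html.read(), "html.parser")

text = bsobj.get_text()  # plain text with the HTML tags stripped
for sentence in TextBlob(text).sentences:
    print(sentence, sentence.sentiment.polarity)

# Save the text for later use, mirroring section 3.3
with open("corpus.txt", "w", encoding="utf-8") as f:
    print(text, file=f)
```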