#!/usr/bin/env python
# coding: utf-8

# # Document clustering and topic modelling in Python
# 易红发 yihongfa@yeah.net

# Document clustering and text classification are basic text-mining tasks.
# This notebook concentrates on unsupervised methods: K-means clustering,
# hierarchical clustering and LDA topic modelling.
# Text processing in Python mainly relies on nltk, pandas, sklearn, gensim, etc.
# It should be a useful reference for anyone who wants to mine text with Python.
# The task is to cluster films by their synopses. The data can be downloaded
# [here](https://github.com/yihongfa/pythondata/tree/master/data) and comes in
# three parts: title, synopses and genres.

# In[1]:

# Import the modules used below
import numpy as np
import pandas as pd
import nltk
import re
from bs4 import BeautifulSoup
from sklearn import feature_extraction


# # Data preprocessing

# In[2]:

def _read_records(path, separator, limit=100):
    """Return at most *limit* records read from *path*, split on *separator*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read.
    """
    # NOTE(review): the platform default encoding is used, as before. Confirm
    # the encoding of the data files and pass ``encoding=`` explicitly.
    with open(path) as handle:
        return handle.read().split(separator)[:limit]


# Load the film titles and synopses; only the first 100 films are kept.
titles = _read_records('title_list.txt', '\n')
synopses = _read_records('synopses_list_wiki.txt', '\n BREAKS HERE')


# In[3]:

# Clean the synopses: strip the HTML markup and keep only the plain text.
synopses_clean = [
    BeautifulSoup(raw_synopsis, 'html.parser').get_text()
    for raw_synopsis in synopses
]
synopses = synopses_clean


# In[4]:

titles[:5]  # titles of the first 5 films


# In[5]:

synopses[0][:200]  # first 200 characters of the first synopsis


# In[6]:

# Load the film genres (first 100 entries, one per line)
genres = _read_records('genres_list.txt', '\n')


# In[8]:

# Overview of how much data was loaded
print('{} titles'.format(len(titles)))
print('{} synopses'.format(len(synopses)))
print('{} genres'.format(len(genres)))


# In[9]:

# Build the index: one zero-based position per title
ranks = list(range(len(titles)))


# Cleaning the data with nltk

# In[10]:

# Load the English stop words shipped with nltk
stopwords = nltk.corpus.stopwords.words('english')


# In[11]:

# Import the SnowballStemmer
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')


# The next section defines two helper functions:
#

# - tokenize_and_stem: 分词并且将词“词干化”。所谓“词干化”是去掉单词的词尾变化、把它归并到同一词干，比如 running 和 runs 经过“词干化”均表示为 run。
# - tokenize_only: 只分词，不做词干化。

# In[12]:

# Compiled once; matches any token that contains at least one ASCII letter.
_HAS_LETTER = re.compile(r'[a-zA-Z]')


def tokenize_only(text):
    """Split *text* into lower-cased word tokens.

    The text is split into sentences and then into words with nltk. Tokens
    that contain no ASCII letter (numbers, bare punctuation, ...) are dropped.

    Returns a list of strings.
    """
    lowered = (
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    )
    return [token for token in lowered if _HAS_LETTER.search(token)]


def tokenize_and_stem(text):
    """Tokenize *text* like ``tokenize_only`` and stem every token.

    Stemming uses the module-level ``stemmer``. Returns a list of stems in
    the same order as the tokens.
    """
    return [stemmer.stem(token) for token in tokenize_only(text)]


# Build a DataFrame that maps every stem (index) back to the token it came
# from, so that stems can later be shown as readable words.

# In[13]:

totalvocab_stemmed = []
totalvocab_tokenized = []
for synopsis in synopses:
    # Tokenize once and derive the stems from the same tokens, which keeps
    # both lists aligned element by element.
    synopsis_tokens = tokenize_only(synopsis)
    totalvocab_tokenized.extend(synopsis_tokens)
    totalvocab_stemmed.extend(stemmer.stem(token) for token in synopsis_tokens)

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized},
                           index=totalvocab_stemmed)


# # TF-IDF model and document similarity

# This part maps the raw documents into a term vector space (TF-IDF) and
# computes the similarity / distance between documents.

# In[14]:

# Vectorise the documents with scikit-learn's TF-IDF model.
# max_df=0.8 and min_df=0.2 drop the terms whose document frequency is above
# 80% or below 20%.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.2, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)


# In[15]:

# Feature (term) names. ``get_feature_names`` was removed from recent
# scikit-learn releases in favour of ``get_feature_names_out``; use the new
# method when it exists and fall back to the old one otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()


# In[16]:

terms[:5]


# In[17]:

from sklearn.metrics.pairwise import cosine_similarity

dist = 1 - cosine_similarity(tfidf_matrix)


# In[18]:

dist[1, 9]  # distance between two particular documents


# # K-means clustering

# Uses the TF-IDF vector space and the document distances

# In[19]:

# Cluster with scikit-learn's KMeans model, using 5 clusters.
# NOTE(review): no random_state is set, so cluster labels may differ from run
# to run - confirm whether reproducibility is wanted.
from sklearn.cluster import KMeans

num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()


# In[20]:

clusters[:10]  # cluster labels of the first 10 documents


# In[21]:

# Assemble the results in a data frame
import pandas as pd

films = {
    'title': titles,
    'rank': ranks,
    'synopses': synopses,
    'cluster': clusters,
    'genre': genres,
}
frame = pd.DataFrame(films, index=[clusters],
                     columns=['rank', 'title', 'cluster', 'genre'])


# In[22]:

frame['rank'] += 1  # make the rank 1-based


# In[23]:

frame.to_excel('cluster.xlsx')  # write the result to a file


# In[24]:

frame  # clustering result


# # Hierarchical clustering

# In[25]:

from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter: there is no shell to configure and
    # the figure is simply saved to disk below.
    pass

# Build the linkage matrix with Ward's method.
# NOTE(review): ``dist`` is a square matrix; scipy documents ``ward`` as
# taking a condensed distance matrix or an array of observation vectors, so
# each row of ``dist`` is presumably treated as an observation - confirm that
# this is intended.
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(15, 20))
dendrogram_data = dendrogram(linkage_matrix, orientation='right',
                             labels=titles, ax=ax)

# Hide the ticks and tick labels on the x axis. The switches must be booleans:
# the strings 'off' used previously are truthy and leave the ticks visible.
ax.tick_params(axis='x', which='both', bottom=False, top=False,
               labelbottom=False)

plt.tight_layout()  # compact layout

# Save the figure
plt.savefig('ward_clusters.png', dpi=200)


# # LDA topic modelling

# LDA topic modelling does not rely on the TF-IDF model, so the data has to be
# preprocessed again.

# In[26]:

# Tokenization
import string


def strip_proppers(text):
    """Return *text* rebuilt from its all-lowercase tokens only.

    The text is tokenized by sentence and then by word so that punctuation
    ends up as its own token. Only tokens for which ``str.islower()`` is true
    are kept, which removes capitalised words such as proper names. The kept
    tokens are joined with single spaces, except that a token starting with an
    apostrophe (or found in ``string.punctuation``) is glued to the previous
    token.
    """
    tokens = [
        word
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
        if word.islower()
    ]
    pieces = []
    for token in tokens:
        attach = token.startswith("'") or token in string.punctuation
        pieces.append(token if attach else " " + token)
    return "".join(pieces).strip()


# In[27]:

# Text preparation
from gensim import corpora, models, similarities

# Remove proper names
preprocess = [strip_proppers(doc) for doc in synopses]

tokenized_text = [tokenize_and_stem(text) for text in preprocess]

# A set makes each stop-word membership test O(1) instead of a list scan.
_stopword_set = set(stopwords)
texts = [
    [word for word in text if word not in _stopword_set]
    for text in tokenized_text
]


# In[31]:

# Prepare the dictionary and the corpus
dictionary = corpora.Dictionary(texts)  # build the dictionary
dictionary.filter_extremes(no_below=1, no_above=0.8)  # drop high-frequency words
corpus = [dictionary.doc2bow(text) for text in texts]  # build the corpus


# In[32]:

len(corpus)


# In[33]:

# Train an LDA model. The model is trained with a direct call (instead of the
# IPython ``%time`` magic) so the script also runs outside IPython; the
# elapsed wall-clock time is still reported.
import time

_lda_started = time.time()
lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary,
                      update_every=5, chunksize=10000, passes=100)
print('LDA training took {:.1f} s'.format(time.time() - _lda_started))


# In[34]:

print(lda[corpus[0]])  # topic distribution of the first document


# In[35]:

topics = lda.print_topics(5, num_words=20)  # topics found by the model


# In[36]:

topics  # topics


# In[37]:

topics_matrix = lda.show_topics(formatted=False, num_words=20)


# In[38]:

# dtype=object keeps the nested per-topic structure as-is; without it NumPy
# tries to build a regular numeric/string array and can reject ragged input.
topics_matrix = np.array(topics_matrix, dtype=object)


# In[39]:

topics_matrix  # topic matrix


# In[ ]: