# The documents: nine book titles
from gensim import corpora, models, similarities

documents = ['The Neatest Little Guide to Stock Market Investing',
             'Investing For Dummies, 4th Edition',
             'The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns',
             'The Little Book of Value Investing',
             'Value Investing: From Graham to Buffett and Beyond',
             'Rich Dad\'s Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!',
             'Investing in Real Estate, 5th Edition',
             'Stock Investing For Dummies',
             'Rich Dad\'s Advisors: The ABC\'s of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss']

# Remove stop words (requires the NLTK stopword list: nltk.download('stopwords'))
from nltk.corpus import stopwords
texts = [[word for word in document.lower().split() if word not in stopwords.words("english")]
         for document in documents]
# texts = [[word for word in document.lower().split() if word not in stoplist]
#          for document in documents]
print(texts)

# Build a dictionary mapping each word in the token lists to an integer id
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

# A raw document is converted to (word id, count) pairs over the existing dictionary;
# here the ids for "investing" and "book" each appear once
new_doc = "Investing book"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

# Convert every document into this bag-of-words form to obtain the corpus
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

# A gensim corpus and a numpy matrix convert in both directions;
# the number of features must be given explicitly for the conversion
from gensim import matutils
numpy_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
# corpus = matutils.Dense2Corpus(numpy_matrix)  (see the round-trip sketch at the end)
# The classic term-document matrix; the entries are term frequencies
numpy_matrix[:10, :]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# Inspect the tf-idf weights
for doc in corpus_tfidf:
    print(doc)
# This corpus can also be turned into a matrix:
# numpy_matrix = matutils.corpus2dense(corpus_tfidf, num_terms=len(dictionary))

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialize an LSA model with a 2-D semantic space
corpus_lsi = lsi[corpus_tfidf]  # fold the tf-idf corpus into the latent 2-D semantic space
for doc in corpus_lsi:  # each document's coordinates in the 2-D topic space
    print(doc)

import pandas as pd
x1 = [doc[0][1] for doc in corpus_lsi]
x2 = [doc[1][1] for doc in corpus_lsi]
names = range(9)
df = pd.DataFrame({'x1': x1, 'x2': x2, 'doc': names})
df
# Documents 0, 2 and 3 sit close together, i.e. they are quite similar
%matplotlib inline
from ggplot import *
p = ggplot(df, aes(x='x1', y='x2', label='doc')) + geom_point() + geom_text(size=20)
print(p)

# The word loadings of the two latent semantic dimensions
lsi.print_topics(2)

# Project a query document into the LSI space
new_doc = "Investing book"
vec_bow = dictionary.doc2bow(new_doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)

# Build a similarity index over the corpus transformed into LSI space
index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)
sims = index[vec_lsi]  # similarity query against the corpus (cosine similarity)
print(list(enumerate(sims)))
# The query is most similar to documents 0, 7, 2 and 3
# Sorted output, most similar first
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)

# Model persistence
# lsi.save('/tmp/model.lsi')  # tfidf, lda, ... work the same way
# lsi = models.LsiModel.load('/tmp/model.lsi')

# (Note: LDA normally takes plain bag-of-words counts; tf-idf weights are used here)
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
lda.print_topics(2)  # each topic is a probability distribution over words
corpus_lda = lda[corpus_tfidf]  # fold the tf-idf corpus into the 2-D LDA space
for doc in corpus_lda:  # each document's coordinates: a probability distribution over topics
    print(doc)

# model = models.HdpModel(corpus_tfidf, id2word=dictionary)  # HDP, an extension of LDA that needs no preset topic count
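# A minimal sketch of the dense-matrix round trip mentioned above, assuming the
# `corpus` and `dictionary` objects built earlier: corpus2dense lays documents
# out as columns, and Dense2Corpus streams those columns back as a gensim corpus.
dense = matutils.corpus2dense(corpus, num_terms=len(dictionary))  # shape: (num_terms, num_docs)
corpus_back = matutils.Dense2Corpus(dense)  # documents as (term id, weight) lists again
print(next(iter(corpus_back)))  # first document, back in sparse form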
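# A minimal sketch of the commented-out HdpModel above: HDP is a nonparametric
# extension of LDA that infers the number of topics from the data, so no
# num_topics argument is passed. Training on the plain bag-of-words `corpus`
# rather than tf-idf is an assumption here, since HDP models word counts.
hdp = models.HdpModel(corpus, id2word=dictionary)
hdp.print_topics()  # top words of the inferred topics
for doc in hdp[corpus]:  # each document as a sparse distribution over inferred topics
    print(doc)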