用gemsim包做LSA和LDA

  • 建立语料
  • 转成tfidf
  • 建立LSA(浅层语义分析)
  • 查询相似文档
  • 建立LDA (主题模型)
  • 建立语料
In [72]:
# Build the toy corpus: the titles of nine investing books.
from gensim import corpora, models, similarities

documents = [
    'The Neatest Little Guide to Stock Market Investing',
    'Investing For Dummies, 4th Edition',
    'The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns',
    'The Little Book of Value Investing',
    'Value Investing: From Graham to Buffett and Beyond',
    'Rich Dad\'s Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!',
    'Investing in Real Estate, 5th Edition',
    'Stock Investing For Dummies',
    'Rich Dad\'s Advisors: The ABC\'s of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss',
]
In [75]:
# Tokenize each title (lowercase, whitespace split) and drop English stopwords.
from nltk.corpus import stopwords

# Build the stopword set ONCE. The original called stopwords.words("english")
# inside the comprehension, re-reading the corpus file and doing an O(n) list
# membership test for every single token; a set gives O(1) lookups.
stop_set = set(stopwords.words("english"))
texts = [[word for word in document.lower().split() if word not in stop_set]
         for document in documents]
print(texts)
[['neatest', 'little', 'guide', 'stock', 'market', 'investing'], ['investing', 'dummies,', '4th', 'edition'], ['little', 'book', 'common', 'sense', 'investing:', 'way', 'guarantee', 'fair', 'share', 'stock', 'market', 'returns'], ['little', 'book', 'value', 'investing'], ['value', 'investing:', 'graham', 'buffett', 'beyond'], ['rich', "dad's", 'guide', 'investing:', 'rich', 'invest', 'in,', 'poor', 'middle', 'class', 'not!'], ['investing', 'real', 'estate,', '5th', 'edition'], ['stock', 'investing', 'dummies'], ['rich', "dad's", 'advisors:', "abc's", 'real', 'estate', 'investing:', 'secrets', 'finding', 'hidden', 'profits', 'investors', 'miss']]
In [78]:
# Map every distinct token in the tokenized corpus to an integer id
# (gensim's word <-> id dictionary).
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)
{u'real': 32, u'estate,': 31, u'fair': 11, u'share': 16, u'edition': 8, u'investing:': 13, u'investors': 39, u'sense': 15, u'beyond': 19, u'graham': 21, u'market': 3, u'guarantee': 12, u'little': 2, u'estate': 36, u'investing': 1, u'miss': 40, u'5th': 30, u'buffett': 20, u'returns': 14, u'book': 9, u'way': 17, u'finding': 37, u'hidden': 38, u'dummies': 33, u'stock': 5, u'poor': 28, u'rich': 29, u'in,': 24, u'4th': 6, u'class': 22, u"abc's": 34, u'middle': 26, u'secrets': 42, u'invest': 25, u'dummies,': 7, u'value': 18, u'not!': 27, u'common': 10, u'neatest': 4, u'advisors:': 35, u"dad's": 23, u'guide': 0, u'profits': 41}
In [80]:
# Convert an unseen document to bag-of-words form over the existing dictionary:
# a sparse list of (token_id, count) pairs. Here token 1 ('investing') and
# token 9 ('book') each occur once.
new_doc = "Investing book"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)
[(1, 1), (9, 1)]
In [81]:
# Convert every tokenized document into the same sparse (token_id, count)
# representation, giving the full bag-of-words corpus.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]
print(corpus)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(1, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (3, 1), (5, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(1, 1), (2, 1), (9, 1), (18, 1)], [(13, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(0, 1), (13, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2)], [(1, 1), (8, 1), (30, 1), (31, 1), (32, 1)], [(1, 1), (5, 1), (33, 1)], [(13, 1), (23, 1), (29, 1), (32, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]]
In [87]:
# A gensim sparse corpus and a dense numpy matrix are interconvertible; the
# dense conversion needs the feature count (vocabulary size) up front.
from gensim import matutils

# len(dictionary) is 43 for this corpus; deriving it instead of hard-coding 43
# keeps the conversion correct if the vocabulary ever changes.
numpy_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
# Round trip back to sparse form if needed:
# corpus = matutils.Dense2Corpus(numpy_matrix)
In [89]:
# The classic term-document matrix: rows are term ids, columns are documents,
# and the values are raw term frequencies. Display the first 10 term rows.
numpy_matrix[:10,:]
Out[89]:
array([[ 1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)
  • 转成tfidf
In [92]:
# Fit a tf-idf model on the bag-of-words corpus, then re-weight the corpus:
# each (token_id, count) pair becomes (token_id, tf-idf weight).
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
In [93]:
# Inspect the tf-idf weighted vectors.
for weighted_doc in corpus_tfidf:
    print(weighted_doc)
# A dense matrix view is also available:
# numpy_matrix = gensim.matutils.corpus2dense(corpus_tfidf,num_terms=43)
[(0, 0.432183228105567), (1, 0.16889525686595316), (2, 0.315676444823041), (3, 0.432183228105567), (4, 0.631352889646082), (5, 0.315676444823041)]
[(1, 0.1678477239832813), (6, 0.6274370726791256), (7, 0.6274370726791256), (8, 0.42950271385560695)]
[(2, 0.17076298056699674), (3, 0.23378651588573063), (5, 0.17076298056699674), (9, 0.23378651588573063), (10, 0.34152596113399347), (11, 0.34152596113399347), (12, 0.34152596113399347), (13, 0.1260470706374678), (14, 0.34152596113399347), (15, 0.34152596113399347), (16, 0.34152596113399347), (17, 0.34152596113399347)]
[(1, 0.2384389845229864), (2, 0.44565828749810027), (9, 0.6101375014879464), (18, 0.6101375014879464)]
[(13, 0.1943875188252588), (18, 0.3605413479900455), (19, 0.5266951771548322), (20, 0.5266951771548322), (21, 0.5266951771548322)]
[(0, 0.22884371488266889), (13, 0.12338213684169494), (22, 0.3343052929236428), (23, 0.22884371488266889), (24, 0.3343052929236428), (25, 0.3343052929236428), (26, 0.3343052929236428), (27, 0.3343052929236428), (28, 0.3343052929236428), (29, 0.45768742976533777)]
[(1, 0.15422435074989552), (8, 0.39464209354603486), (30, 0.5765110951399715), (31, 0.5765110951399715), (32, 0.39464209354603486)]
[(1, 0.2327026293256009), (5, 0.43493665890677735), (33, 0.8698733178135547)]
[(13, 0.11367055621369232), (23, 0.21083110588444598), (29, 0.21083110588444598), (32, 0.21083110588444598), (34, 0.30799165555519964), (35, 0.30799165555519964), (36, 0.30799165555519964), (37, 0.30799165555519964), (38, 0.30799165555519964), (39, 0.30799165555519964), (40, 0.30799165555519964), (41, 0.30799165555519964), (42, 0.30799165555519964)]
  • LSA 潜在语义分析
In [94]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSA model with a 2-D semantic space
corpus_lsi = lsi[corpus_tfidf] # project the tf-idf corpus into the latent 2-D semantic space
In [95]:
# Coordinates of each document in the 2-D latent topic space.
for projected in corpus_lsi:
    print(projected)
[(0, 0.63347241325167303), (1, -0.061896725129924259)]
[(0, 0.17868461025638538), (1, 0.59723455114094137)]
[(0, 0.59206256115968325), (1, -0.22058707251542273)]
[(0, 0.65195370541457509), (1, -0.19985332890364949)]
[(0, 0.30911487606210863), (1, -0.15735262094227312)]
[(0, 0.18203461585802136), (1, 0.26529717841569633)]
[(0, 0.18486893062294102), (1, 0.68829562498621732)]
[(0, 0.38579940702314952), (1, 0.045443408403883161)]
[(0, 0.1118111644377808), (1, 0.43788078154125371)]
In [162]:
import pandas as pd

# Collect each document's 2-D LSI coordinates in a single pass over the
# (lazily re-evaluated) transformed corpus instead of iterating it twice.
# NOTE(review): assumes both topic weights are present for every document;
# gensim can omit near-zero entries, which would make doc[1] raise — confirm.
coords = [(doc[0][1], doc[1][1]) for doc in corpus_lsi]
x1 = [c[0] for c in coords]
x2 = [c[1] for c in coords]
names = range(len(x1))  # document ids; derived from the data, not hard-coded 9
df = pd.DataFrame({'x1': x1, 'x2': x2, 'doc': names})
df
Out[162]:
doc x1 x2
0 0 0.633472 -0.061897
1 1 0.178685 0.597235
2 2 0.592063 -0.220587
3 3 0.651954 -0.199853
4 4 0.309115 -0.157353
5 5 0.182035 0.265297
6 6 0.184869 0.688296
7 7 0.385799 0.045443
8 8 0.111811 0.437881
  • 9个文档在二维语义空间中的位置
In [170]:
# Scatter plot of the documents in the 2-D semantic space.
# Documents 0, 2 and 3 sit close together (relatively high similarity).
%matplotlib inline
from ggplot import *
p = ggplot(df, aes(x = 'x1', y = 'x2', label='doc')) + geom_point() + geom_text(size=20)
print(p)
<ggplot: (275962833)>
In [171]:
# The two latent semantic dimensions, each shown as its top weighted terms.
lsi.print_topics(2)
Out[171]:
[u'0.386*"little" + 0.350*"book" + 0.332*"value" + 0.306*"stock" + 0.269*"market" + 0.268*"investing" + 0.261*"neatest" + 0.219*"dummies" + 0.206*"guide" + 0.132*"common"',
 u'0.436*"edition" + 0.327*"estate," + 0.327*"5th" + 0.309*"4th" + 0.309*"dummies," + 0.300*"real" + 0.176*"rich" + -0.147*"value" + -0.143*"book" + 0.131*"investing"']
  • 相似性查询
In [176]:
# Project the query document into the LSI topic space:
# text -> bag-of-words over the existing dictionary -> 2-D LSI vector.
new_doc = "Investing book"
vec_bow = dictionary.doc2bow(new_doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)
[(0, 0.61768800065123997), (1, -0.012084299454548031)]
In [178]:
# Build a cosine-similarity index over the LSI-projected corpus.
# These vectors live in the 2-D LSI topic space, so the feature count is the
# number of topics (lsi.num_topics), not the vocabulary size 43; the extra
# all-zero dimensions never change cosine similarity, they only waste memory.
index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)
In [179]:
sims = index[vec_lsi]  # cosine similarity of the query against every document
# The query scores highest against documents 0, 7, 2 and 3.
print(list(enumerate(sims)))
[(0, 0.99697202), (1, 0.26783881), (2, 0.94372416), (3, 0.96163654), (4, 0.89988339), (5, 0.54953808), (6, 0.24045581), (7, 0.9906559), (8, 0.22840852)]
In [182]:
# Sort the (doc_id, similarity) pairs by descending similarity.
# reverse=True is stable, so ties keep their original order, exactly like
# sorting on the negated key.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(sims)
[(0, 0.99697202), (7, 0.9906559), (3, 0.96163654), (2, 0.94372416), (4, 0.89988339), (5, 0.54953808), (1, 0.26783881), (6, 0.24045581), (8, 0.22840852)]
In [ ]:
# Saving and loading models
#lsi.save('/tmp/model.lsi') # the same works for tfidf, lda, ...
#lsi = models.LsiModel.load('/tmp/model.lsi')
  • LDA 主题模型
In [184]:
# Fit a 2-topic LDA model.
# NOTE(review): this trains on the tf-idf corpus; LDA is usually fit on raw
# bag-of-words counts — confirm tf-idf input is intended here.
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
lda.print_topics(2) #  each topic as a probability distribution over words
WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
Out[184]:
[u'0.035*dummies + 0.035*value + 0.033*investing + 0.031*stock + 0.031*little + 0.031*book + 0.030*edition + 0.028*estate, + 0.028*5th + 0.026*rich',
 u'0.029*little + 0.029*stock + 0.028*market + 0.028*investing + 0.028*neatest + 0.027*guide + 0.026*book + 0.026*edition + 0.026*value + 0.026*dummies,']
In [173]:
corpus_lda = lda[corpus_tfidf]  # project the tf-idf corpus into the 2-topic LDA space
# Each document becomes a probability distribution over the two topics.
for topic_dist in corpus_lda:
    print(topic_dist)
[(0, 0.61454672021557166), (1, 0.38545327978442823)]
[(0, 0.22675513712573009), (1, 0.77324486287426986)]
[(0, 0.81528087845709996), (1, 0.18471912154290002)]
[(0, 0.30096891139721366), (1, 0.69903108860278629)]
[(0, 0.20729131202399478), (1, 0.79270868797600513)]
[(0, 0.78866099283751523), (1, 0.21133900716248485)]
[(0, 0.73842987801214077), (1, 0.26157012198785928)]
[(0, 0.40103949433052083), (1, 0.59896050566947912)]
[(0, 0.78546668424881949), (1, 0.21453331575118056)]
In [175]:
#model = models.HdpModel(corpus_tfidf, id2word=dictionary) # HDP, an extension of LDA that infers the number of topics automatically