用gemsim包做LSA和LDA

  • 建立语料
  • 转成tfidf
  • 建立LSA(浅层语义分析)
  • 查询相似文档
  • 建立LDA (主题模型)
  • 建立语料
In [72]:
# Build the toy corpus: the titles of nine investing books.
from gensim import corpora, models, similarities

documents = [
    'The Neatest Little Guide to Stock Market Investing',
    'Investing For Dummies, 4th Edition',
    'The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns',
    'The Little Book of Value Investing',
    'Value Investing: From Graham to Buffett and Beyond',
    'Rich Dad\'s Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!',
    'Investing in Real Estate, 5th Edition',
    'Stock Investing For Dummies',
    'Rich Dad\'s Advisors: The ABC\'s of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss',
]
In [75]:
# Tokenize each title (lowercase, whitespace split) and drop English stopwords.
from nltk.corpus import stopwords

# Build the stopword set ONCE. The original called stopwords.words("english")
# inside the comprehension, re-reading the corpus file and doing an O(n) list
# membership test for every single token; a set gives O(1) lookups.
stop_set = set(stopwords.words("english"))
texts = [[word for word in document.lower().split() if word not in stop_set]
         for document in documents]
print(texts)
[['neatest', 'little', 'guide', 'stock', 'market', 'investing'], ['investing', 'dummies,', '4th', 'edition'], ['little', 'book', 'common', 'sense', 'investing:', 'way', 'guarantee', 'fair', 'share', 'stock', 'market', 'returns'], ['little', 'book', 'value', 'investing'], ['value', 'investing:', 'graham', 'buffett', 'beyond'], ['rich', "dad's", 'guide', 'investing:', 'rich', 'invest', 'in,', 'poor', 'middle', 'class', 'not!'], ['investing', 'real', 'estate,', '5th', 'edition'], ['stock', 'investing', 'dummies'], ['rich', "dad's", 'advisors:', "abc's", 'real', 'estate', 'investing:', 'secrets', 'finding', 'hidden', 'profits', 'investors', 'miss']]
In [78]:
# Map every distinct token in the tokenized corpus to an integer id
# (gensim's word <-> id dictionary).
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)
{u'real': 32, u'estate,': 31, u'fair': 11, u'share': 16, u'edition': 8, u'investing:': 13, u'investors': 39, u'sense': 15, u'beyond': 19, u'graham': 21, u'market': 3, u'guarantee': 12, u'little': 2, u'estate': 36, u'investing': 1, u'miss': 40, u'5th': 30, u'buffett': 20, u'returns': 14, u'book': 9, u'way': 17, u'finding': 37, u'hidden': 38, u'dummies': 33, u'stock': 5, u'poor': 28, u'rich': 29, u'in,': 24, u'4th': 6, u'class': 22, u"abc's": 34, u'middle': 26, u'secrets': 42, u'invest': 25, u'dummies,': 7, u'value': 18, u'not!': 27, u'common': 10, u'neatest': 4, u'advisors:': 35, u"dad's": 23, u'guide': 0, u'profits': 41}
In [80]:
# Convert an unseen document to bag-of-words form over the existing dictionary:
# a sparse list of (token_id, count) pairs. Here token 1 ('investing') and
# token 9 ('book') each occur once.
new_doc = "Investing book"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)
[(1, 1), (9, 1)]
In [81]:
# Convert every tokenized document into the same sparse (token_id, count)
# representation, giving the full bag-of-words corpus.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]
print(corpus)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(1, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (3, 1), (5, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(1, 1), (2, 1), (9, 1), (18, 1)], [(13, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(0, 1), (13, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2)], [(1, 1), (8, 1), (30, 1), (31, 1), (32, 1)], [(1, 1), (5, 1), (33, 1)], [(13, 1), (23, 1), (29, 1), (32, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)]]
In [87]:
# A gensim sparse corpus and a dense numpy matrix are interconvertible; the
# dense conversion needs the feature count (vocabulary size) up front.
from gensim import matutils

# len(dictionary) is 43 for this corpus; deriving it instead of hard-coding 43
# keeps the conversion correct if the vocabulary ever changes.
numpy_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
# Round trip back to sparse form if needed:
# corpus = matutils.Dense2Corpus(numpy_matrix)
In [89]:
# The classic term-document matrix: rows are term ids, columns are documents,
# and the values are raw term frequencies. Display the first 10 term rows.
numpy_matrix[:10,:]
Out[89]:
array([[ 1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)
  • 转成tfidf
In [92]:
# Fit a tf-idf model on the bag-of-words corpus, then re-weight the corpus:
# each (token_id, count) pair becomes (token_id, tf-idf weight).
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
In [93]:
# Inspect the tf-idf weighted vectors.
for weighted_doc in corpus_tfidf:
    print(weighted_doc)
# A dense matrix view is also available:
# numpy_matrix = gensim.matutils.corpus2dense(corpus_tfidf,num_terms=43)
[(0, 0.432183228105567), (1, 0.16889525686595316), (2, 0.315676444823041), (3, 0.432183228105567), (4, 0.631352889646082), (5, 0.315676444823041)]
[(1, 0.1678477239832813), (6, 0.6274370726791256), (7, 0.6274370726791256), (8, 0.42950271385560695)]
[(2, 0.17076298056699674), (3, 0.23378651588573063), (5, 0.17076298056699674), (9, 0.23378651588573063), (10, 0.34152596113399347), (11, 0.34152596113399347), (12, 0.34152596113399347), (13, 0.1260470706374678), (14, 0.34152596113399347), (15, 0.34152596113399347), (16, 0.34152596113399347), (17, 0.34152596113399347)]
[(1, 0.2384389845229864), (2, 0.44565828749810027), (9, 0.6101375014879464), (18, 0.6101375014879464)]
[(13, 0.1943875188252588), (18, 0.3605413479900455), (19, 0.5266951771548322), (20, 0.5266951771548322), (21, 0.5266951771548322)]
[(0, 0.22884371488266889), (13, 0.12338213684169494), (22, 0.3343052929236428), (23, 0.22884371488266889), (24, 0.3343052929236428), (25, 0.3343052929236428), (26, 0.3343052929236428), (27, 0.3343052929236428), (28, 0.3343052929236428), (29, 0.45768742976533777)]
[(1, 0.15422435074989552), (8, 0.39464209354603486), (30, 0.5765110951399715), (31, 0.5765110951399715), (32, 0.39464209354603486)]
[(1, 0.2327026293256009), (5, 0.43493665890677735), (33, 0.8698733178135547)]
[(13, 0.11367055621369232), (23, 0.21083110588444598), (29, 0.21083110588444598), (32, 0.21083110588444598), (34, 0.30799165555519964), (35, 0.30799165555519964), (36, 0.30799165555519964), (37, 0.30799165555519964), (38, 0.30799165555519964), (39, 0.30799165555519964), (40, 0.30799165555519964), (41, 0.30799165555519964), (42, 0.30799165555519964)]
  • LSA 潜在语义分析
In [94]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSA model with a 2-D semantic space
corpus_lsi = lsi[corpus_tfidf] # project the tf-idf corpus into the latent 2-D semantic space
In [95]:
# Coordinates of each document in the 2-D latent topic space.
for projected in corpus_lsi:
    print(projected)
[(0, 0.63347241325167303), (1, -0.061896725129924259)]
[(0, 0.17868461025638538), (1, 0.59723455114094137)]
[(0, 0.59206256115968325), (1, -0.22058707251542273)]
[(0, 0.65195370541457509), (1, -0.19985332890364949)]
[(0, 0.30911487606210863), (1, -0.15735262094227312)]
[(0, 0.18203461585802136), (1, 0.26529717841569633)]
[(0, 0.18486893062294102), (1, 0.68829562498621732)]
[(0, 0.38579940702314952), (1, 0.045443408403883161)]
[(0, 0.1118111644377808), (1, 0.43788078154125371)]
In [162]:
import pandas as pd

# Collect each document's 2-D LSI coordinates in a single pass over the
# (lazily re-evaluated) transformed corpus instead of iterating it twice.
# NOTE(review): assumes both topic weights are present for every document;
# gensim can omit near-zero entries, which would make doc[1] raise — confirm.
coords = [(doc[0][1], doc[1][1]) for doc in corpus_lsi]
x1 = [c[0] for c in coords]
x2 = [c[1] for c in coords]
names = range(len(x1))  # document ids; derived from the data, not hard-coded 9
df = pd.DataFrame({'x1': x1, 'x2': x2, 'doc': names})
df
Out[162]:
doc x1 x2
0 0 0.633472 -0.061897
1 1 0.178685 0.597235
2 2 0.592063 -0.220587
3 3 0.651954 -0.199853
4 4 0.309115 -0.157353
5 5 0.182035 0.265297
6 6 0.184869 0.688296
7 7 0.385799 0.045443
8 8 0.111811 0.437881
  • 9个文档在二维语义空间中的位置
In [170]:
# Scatter plot of the documents in the 2-D semantic space.
# Documents 0, 2 and 3 sit close together (relatively high similarity).
%matplotlib inline
from ggplot import *
p = ggplot(df, aes(x = 'x1', y = 'x2', label='doc')) + geom_point() + geom_text(size=20)
print(p)
<ggplot: (275962833)>
In [171]:
# The two latent semantic dimensions, each shown as its top weighted terms.
lsi.print_topics(2)
Out[171]:
[u'0.386*"little" + 0.350*"book" + 0.332*"value" + 0.306*"stock" + 0.269*"market" + 0.268*"investing" + 0.261*"neatest" + 0.219*"dummies" + 0.206*"guide" + 0.132*"common"',
 u'0.436*"edition" + 0.327*"estate," + 0.327*"5th" + 0.309*"4th" + 0.309*"dummies," + 0.300*"real" + 0.176*"rich" + -0.147*"value" + -0.143*"book" + 0.131*"investing"']
  • 相似性查询
In [176]:
# Project the query document into the LSI topic space:
# text -> bag-of-words over the existing dictionary -> 2-D LSI vector.
new_doc = "Investing book"
vec_bow = dictionary.doc2bow(new_doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)
[(0, 0.61768800065123997), (1, -0.012084299454548031)]
In [178]:
# Build a cosine-similarity index over the LSI-projected corpus.
# These vectors live in the 2-D LSI topic space, so the feature count is the
# number of topics (lsi.num_topics), not the vocabulary size 43; the extra
# all-zero dimensions never change cosine similarity, they only waste memory.
index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)
In [179]:
sims = index[vec_lsi]  # cosine similarity of the query against every document
# The query scores highest against documents 0, 7, 2 and 3.
print(list(enumerate(sims)))
[(0, 0.99697202), (1, 0.26783881), (2, 0.94372416), (3, 0.96163654), (4, 0.89988339), (5, 0.54953808), (6, 0.24045581), (7, 0.9906559), (8, 0.22840852)]
In [182]:
# Sort the (doc_id, similarity) pairs by descending similarity.
# reverse=True is stable, so ties keep their original order, exactly like
# sorting on the negated key.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(sims)
[(0, 0.99697202), (7, 0.9906559), (3, 0.96163654), (2, 0.94372416), (4, 0.89988339), (5, 0.54953808), (1, 0.26783881), (6, 0.24045581), (8, 0.22840852)]
In [ ]:
# Saving and loading models
#lsi.save('/tmp/model.lsi') # the same works for tfidf, lda, ...
#lsi = models.LsiModel.load('/tmp/model.lsi')
  • LDA 主题模型
In [184]:
# Fit a 2-topic LDA model.
# NOTE(review): this trains on the tf-idf corpus; LDA is usually fit on raw
# bag-of-words counts — confirm tf-idf input is intended here.
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
lda.print_topics(2) #  each topic as a probability distribution over words
WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
Out[184]:
[u'0.035*dummies + 0.035*value + 0.033*investing + 0.031*stock + 0.031*little + 0.031*book + 0.030*edition + 0.028*estate, + 0.028*5th + 0.026*rich',
 u'0.029*little + 0.029*stock + 0.028*market + 0.028*investing + 0.028*neatest + 0.027*guide + 0.026*book + 0.026*edition + 0.026*value + 0.026*dummies,']
In [173]:
corpus_lda = lda[corpus_tfidf]  # project the tf-idf corpus into the 2-topic LDA space
# Each document becomes a probability distribution over the two topics.
for topic_dist in corpus_lda:
    print(topic_dist)
[(0, 0.61454672021557166), (1, 0.38545327978442823)]
[(0, 0.22675513712573009), (1, 0.77324486287426986)]
[(0, 0.81528087845709996), (1, 0.18471912154290002)]
[(0, 0.30096891139721366), (1, 0.69903108860278629)]
[(0, 0.20729131202399478), (1, 0.79270868797600513)]
[(0, 0.78866099283751523), (1, 0.21133900716248485)]
[(0, 0.73842987801214077), (1, 0.26157012198785928)]
[(0, 0.40103949433052083), (1, 0.59896050566947912)]
[(0, 0.78546668424881949), (1, 0.21453331575118056)]
In [175]:
#model = models.HdpModel(corpus_tfidf, id2word=dictionary) # HDP, an extension of LDA that infers the number of topics automatically