import pandas as pd
from gensim import models,corpora
import pyLDAvis.gensim
from gensim.models.coherencemodel import CoherenceModel
import warnings
# Notebook setup: display options, then load the previously saved LDA artifacts.

# Let pandas print up to 400 characters per cell (presumably so the long
# word sets shown later are not truncated).
pd.set_option('max_colwidth',400)
# Switch pyLDAvis to notebook mode so prepared visualisations display inline.
pyLDAvis.enable_notebook()
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Short alias for the model class used to load the saved model.
Lda = models.LdaMulticore
# The three paths below are relative to the current working directory.
lda_final =Lda.load('lda_final')
dictionary = corpora.Dictionary.load('dictionary')
doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm')
One annoyance with LDA in gensim is that when it lists the topics ranked by coherence, it shows each topic's word representation and its coherence score, but it doesn't map them to the topic id. The next cell works around that issue and correctly maps each coherence score to the right topic id.
# top_topics() ranks topics by coherence but returns only each topic's words
# and score, not its topic id. Recover the id by matching every topic's
# top-word set against the id-tagged output of show_topics().
num_topics = lda_final.num_topics

# (topic_id, [(word, probability), ...]) pairs. Request every topic the model
# has instead of a hard-coded 12, so the matching below never looks up a
# topic id that show_topics() did not return.
a = lda_final.show_topics(num_topics=num_topics, formatted=False, num_words=10)
# ([(weight, word), ...], coherence) pairs, in decreasing order of coherence.
b = lda_final.top_topics(doc_term_matrix, dictionary=dictionary, topn=10)

# Word sets and coherence scores keyed by 1-based coherence rank.
topic2skillb = {}
topic2csb = {}
for cnt, ws in enumerate(b, start=1):
    topic2skillb[cnt] = set(w[1] for w in ws[0])
    topic2csb[cnt] = ws[1]

# Word sets keyed by 1-based topic id (show_topics() ids start at 0).
topic2skilla = {ws[0] + 1: set(w[0] for w in ws[1]) for ws in a}

# A topic id takes the coherence of the ranked entry whose words contain all
# of its own top words. If several entries qualify, the last one wins.
topic2csa = {}
for i, words in topic2skilla.items():
    for j, ranked_words in topic2skillb.items():
        if words.issubset(ranked_words):
            topic2csa[i] = topic2csb[j]

# Build every column from the same list of topic ids so the rows stay aligned
# regardless of dict ordering. A topic with no matching entry gets NaN rather
# than shifting the remaining scores onto the wrong rows.
topic_ids = list(topic2skilla)
finalData = pd.DataFrame(
    {
        'Topic': ['Topic' + str(i) for i in topic_ids],
        'words': [topic2skilla[i] for i in topic_ids],
        'cs': [topic2csa.get(i, float('nan')) for i in topic_ids],
    },
    columns=['Topic', 'words', 'cs'],
)
finalData.sort_values(by='cs', ascending=False, inplace=True)
finalData.to_csv('CoherenceScore.csv')
finalData
| | Topic | words | cs |
---|---|---|---|
8 | Topic9 | {spark, python, r, tableau, technical, hive, pig, java, hadoop, sql} | -0.937897 |
9 | Topic10 | {python, css, mysql, c, html, java, javascript, sql, c++, php} | -1.086996 |
11 | Topic12 | {sas, python, r, excel, tableau, technical, matlab, java, sql, c++} | -1.159532 |
6 | Topic7 | {spark, hadoop, python, r, tableau, technical, pandas, scikit, sql, numpy} | -1.333507 |
1 | Topic2 | {ms, python, r, technical, c, data, java, sql, oracle, windows} | -1.502323 |
0 | Topic1 | {xml, technical, html, java, uml, sql, pl, windows, oracle, agile} | -1.657700 |
4 | Topic5 | {skills, computer, python, r, excel, technical, matlab, data, sql, windows} | -1.827899 |
5 | Topic6 | {means, key, excel, technical, k, access, teradata, sql, sql_server, oracle} | -2.382989 |
2 | Topic3 | {project, core, computer, analytics, analysis, r, team, data, areasof, c++} | -2.893935 |
3 | Topic4 | {project, skills, linkedin, powershell, salesforce, unix, technical, bullhorn, linux, taleo} | -6.654834 |
10 | Topic11 | {relevant, research, illustrator, french, data, spanish, native, english, mandarin, indesign} | -10.106876 |
7 | Topic8 | {s., d., core, software, m., skill, jobvite, j., r., taleo} | -11.682204 |
# Build and save the interactive pyLDAvis view of the model.

# Take the topic count from the loaded model rather than a hard-coded 12, so
# the saved file name always matches the model being visualised.
num_topics = lda_final.num_topics
# sort_topics=False is passed so pyLDAvis does not reorder the topics
# (presumably keeping "TopicN" aligned with gensim topic id N-1).
vis = pyLDAvis.gensim.prepare(lda_final, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.save_html(vis, f'pyLDAvis_{num_topics}.html')
vis
# The 'topics' and 'Freq' columns of the topic coordinates, ordered by topic number.
token_percent = vis.topic_coordinates.sort_values(by='topics').loc[:, ['topics', 'Freq']]
def get_relevant_words(vis, lam=0.3, topn=10):
    """Return the ``topn`` highest-scoring terms for each topic.

    Each row of ``vis.topic_info`` is scored as
    ``lam * logprob + (1 - lam) * loglift``, and the ``topn`` best terms are
    kept per ``Category``. Rows whose ``Category`` is ``'Default'`` are
    excluded.

    Args:
        vis: Object exposing a ``topic_info`` DataFrame with ``Category``,
            ``Term``, ``logprob`` and ``loglift`` columns. It is not modified.
        lam: Weight given to ``logprob``; ``loglift`` gets ``1 - lam``.
        topn: Maximum number of terms kept per category.

    Returns:
        A DataFrame with one row per category, ordered by category label,
        with columns ``'Topic'`` (the category label) and
        ``'words with Relevance'`` (a set of terms).
    """
    # Score a private copy: adding the column to vis.topic_info itself would
    # leak a 'finalscore' column into the caller's object.
    info = vis.topic_info.loc[:, ['Category', 'Term', 'logprob', 'loglift']].copy()
    info['finalscore'] = info['logprob'] * lam + (1 - lam) * info['loglift']
    info = info[info['Category'] != 'Default']

    d = {}
    # observed=True skips categories with no rows left (only relevant when
    # 'Category' is a categorical column).
    for category, group in info.groupby('Category', sort=True, observed=True):
        best = group.sort_values(by='finalscore', ascending=False).head(topn)
        d[category] = set(best['Term'])

    return pd.DataFrame(
        {
            'Topic': list(d.keys()),
            'words with Relevance': list(d.values()),
        },
        columns=['Topic', 'words with Relevance'],
    )
# Attach the coherence scores to each topic's relevance-ranked words, order the
# topics from most to least coherent, and show only the first two columns.
(
    get_relevant_words(vis, 0.3)
    .merge(finalData, how='left', on='Topic')
    .sort_values(by='cs', ascending=False)
    .iloc[:, [0, 1]]
)
| | Topic | words with Relevance |
---|---|---|
11 | Topic9 | {sqoop, kafka, cassandra, hdfs, hbase, hive, pig, impala, flume, oozie} |
1 | Topic10 | {jquery, xml, css, eclipse, html, c, ajax, django, javascript, php} |
3 | Topic12 | {sas, powerpoint, python, r, excel, matlab, spss, sql, word, stata} |
9 | Topic7 | {classification, svm, learn, k, scikit, pandas, regression, matplotlib, scipy, numpy} |
4 | Topic2 | {mssuite2012, tmux, spark2.0, databaseand, tableau_8, hive2.8, windows7/8/10, hadoop2, electronic, python2.7/3.3} |
0 | Topic1 | {jboss, weblogic, ant, rmi, struts, soap, jsf, uml, jms, cvs} |
7 | Topic5 | {linearandnon, hplc, mexico, gc, community, volunteer, tika, excelandword, pune, ontology} |
8 | Topic6 | {ggplot2and, gridsearchand, oncology, modeltuning/, ddl, stepwise, filter_methods, hiv, pigand, dml} |
5 | Topic3 | {magento, public_health, copy, campaign, hebrew, lucid, oracle_rdbms, ubuntuand, spatialdata, linearalgebra} |
6 | Topic4 | {dataquality, erecruit, d.c., brassring, google_earth, october, scorecards, bullhorn, icims, taleo} |
2 | Topic11 | {french, spanish, native, testing/, hootsuite, english, chinese, mandarin, cantonese, indesign} |
10 | Topic8 | {s., d., l., n., m., y., jobvite, g., j., p.} |