Final Model¶

In [3]:

import pandas as pd
from gensim import models,corpora
import pyLDAvis.gensim
from gensim.models.coherencemodel import CoherenceModel
import warnings

In [4]:

# Notebook-wide display settings.
# Widen text columns so the per-topic word sets are shown without truncation.
pd.options.display.max_colwidth = 400
# Render pyLDAvis visualisations inline in the notebook.
pyLDAvis.enable_notebook()
# Silence library warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [5]:

# Load the persisted artefacts from the working directory:
# the trained LDA model, its token dictionary and the bag-of-words corpus.
Lda = models.LdaMulticore
lda_final = models.LdaMulticore.load('lda_final')
dictionary = corpora.Dictionary.load('dictionary')
doc_term_matrix = corpora.MmCorpus('doc_term_matrix.mm')

One limitation of gensim's LDA implementation: `top_topics` returns each topic's word representation together with its coherence score, ordered by coherence, but it does not report the topic id. The next cell works around this by matching every scored topic back to its topic id through its top words, so that each coherence score is attached to the correct topic.

Topic Evaluation¶

In [70]:

# Map every topic id to its coherence score.
#
# show_topics yields (topic_id, [(word, prob), ...]) while top_topics yields
# ([(prob, word), ...], coherence) sorted by decreasing coherence and without
# the topic id, so the two are joined on their top-word sets.

# Ask for every topic of the model rather than a hard-coded count, so the
# lookup below never misses a topic id.
num_topics = lda_final.num_topics
a = lda_final.show_topics(num_topics=num_topics, formatted=False, num_words=10)
b = lda_final.top_topics(doc_term_matrix, dictionary=dictionary, topn=10)

# Coherence-ranked view: rank (1 = most coherent) -> word set / score.
topic2skillb = {}
topic2csb = {}
for rank, (weighted_words, score) in enumerate(b, start=1):
    topic2skillb[rank] = {word for _, word in weighted_words}
    topic2csb[rank] = score

# Topic-id view: 1-based topic number -> word set.
topic2skilla = {topic_id + 1: {word for word, _ in word_probs} for topic_id, word_probs in a}

# Attach a score to each topic number; fail loudly when the word sets cannot
# identify a topic unambiguously instead of silently mis-assigning a score.
topic2csa = {}
for topic_no, words in topic2skilla.items():
    matching_ranks = [rank for rank, ranked_words in topic2skillb.items() if words <= ranked_words]
    if len(matching_ranks) != 1:
        raise ValueError(
            f'Topic{topic_no} matches {len(matching_ranks)} entries of top_topics; '
            'expected exactly one. Use more top words to disambiguate the topics.'
        )
    topic2csa[topic_no] = topic2csb[matching_ranks[0]]

# Build the table row by row from the topic number so that 'words' and 'cs'
# are aligned explicitly, then order it by decreasing coherence.
finalData = pd.DataFrame({
    'Topic': [f'Topic{topic_no}' for topic_no in topic2skilla],
    'words': list(topic2skilla.values()),
    'cs': [topic2csa[topic_no] for topic_no in topic2skilla],
}).sort_values(by='cs', ascending=False)
finalData.to_csv('CoherenceScore.csv')
finalData

Out[70]:

	Topic	words	cs
8	Topic9	{spark, python, r, tableau, technical, hive, pig, java, hadoop, sql}	-0.937897
9	Topic10	{python, css, mysql, c, html, java, javascript, sql, c++, php}	-1.086996
11	Topic12	{sas, python, r, excel, tableau, technical, matlab, java, sql, c++}	-1.159532
6	Topic7	{spark, hadoop, python, r, tableau, technical, pandas, scikit, sql, numpy}	-1.333507
1	Topic2	{ms, python, r, technical, c, data, java, sql, oracle, windows}	-1.502323
0	Topic1	{xml, technical, html, java, uml, sql, pl, windows, oracle, agile}	-1.657700
4	Topic5	{skills, computer, python, r, excel, technical, matlab, data, sql, windows}	-1.827899
5	Topic6	{means, key, excel, technical, k, access, teradata, sql, sql_server, oracle}	-2.382989
2	Topic3	{project, core, computer, analytics, analysis, r, team, data, areasof, c++}	-2.893935
3	Topic4	{project, skills, linkedin, powershell, salesforce, unix, technical, bullhorn, linux, taleo}	-6.654834
10	Topic11	{relevant, research, illustrator, french, data, spanish, native, english, mandarin, indesign}	-10.106876
7	Topic8	{s., d., core, software, m., skill, jobvite, j., r., taleo}	-11.682204

Visualizing with pyLDAvis¶

In [66]:

# Take the topic count from the model itself so the exported file name always
# reflects the model being visualised.
num_topics = lda_final.num_topics
# sort_topics=False keeps pyLDAvis' topic numbering in the model's own topic
# order instead of re-ordering topics by their share of tokens.
vis = pyLDAvis.gensim.prepare(lda_final, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.save_html(vis, f'pyLDAvis_{num_topics}.html')
vis

Out[66]:

How to pick relevant words in each topic?¶

In [25]:

# Per-topic 'Freq' value from the pyLDAvis coordinates table, ordered by topic number.
topic_coordinates_by_topic = vis.topic_coordinates.sort_values(by='topics')
token_percent = topic_coordinates_by_topic[['topics', 'Freq']]

In [77]:

def get_relevant_words(vis,lam=0.3,topn=10):
    """Return the `topn` highest-scoring terms of every topic.

    Each row of ``vis.topic_info`` is scored as
    ``lam * logprob + (1 - lam) * loglift``; the rows of the ``'Default'``
    category are ignored.

    Parameters
    ----------
    vis : object exposing a ``topic_info`` DataFrame with the columns
        ``Category``, ``Term``, ``logprob`` and ``loglift``.
    lam : float, weight given to ``logprob`` (``1 - lam`` goes to ``loglift``).
    topn : int, number of terms kept per topic.

    Returns
    -------
    pandas.DataFrame with one row per topic, ordered by category, and the
    columns ``'Topic'`` (category label) and ``'words with Relevance'``
    (set of the selected terms).
    """
    # .assign returns a new frame, so the caller's vis.topic_info is not modified.
    scored = vis.topic_info.assign(
        finalscore=lambda info: info['logprob'] * lam + (1 - lam) * info['loglift']
    )
    scored = scored[scored['Category'] != 'Default']

    # Best terms first inside each category; head() then keeps the top `topn`.
    ranked = scored.sort_values(by=['Category', 'finalscore'], ascending=[True, False])
    top_terms = ranked.groupby('Category', sort=False).head(topn)

    words_by_topic = {}
    for category, term in zip(top_terms['Category'], top_terms['Term']):
        words_by_topic.setdefault(category, set()).add(term)

    return pd.DataFrame({
        'Topic': list(words_by_topic.keys()),
        'words with Relevance': list(words_by_topic.values()),
    })

In [91]:

# Relevant words per topic, listed from the most to the least coherent topic.
relevant_with_scores = get_relevant_words(vis, 0.3).merge(finalData, how='left', on='Topic')
(
    relevant_with_scores
    .sort_values(by='cs', ascending=False)
    [['Topic', 'words with Relevance']]
)

Out[91]:

	Topic	words with Relevance
11	Topic9	{sqoop, kafka, cassandra, hdfs, hbase, hive, pig, impala, flume, oozie}
1	Topic10	{jquery, xml, css, eclipse, html, c, ajax, django, javascript, php}
3	Topic12	{sas, powerpoint, python, r, excel, matlab, spss, sql, word, stata}
9	Topic7	{classification, svm, learn, k, scikit, pandas, regression, matplotlib, scipy, numpy}
4	Topic2	{mssuite2012, tmux, spark2.0, databaseand, tableau_8, hive2.8, windows7/8/10, hadoop2, electronic, python2.7/3.3}
0	Topic1	{jboss, weblogic, ant, rmi, struts, soap, jsf, uml, jms, cvs}
7	Topic5	{linearandnon, hplc, mexico, gc, community, volunteer, tika, excelandword, pune, ontology}
8	Topic6	{ggplot2and, gridsearchand, oncology, modeltuning/, ddl, stepwise, filter_methods, hiv, pigand, dml}
5	Topic3	{magento, public_health, copy, campaign, hebrew, lucid, oracle_rdbms, ubuntuand, spatialdata, linearalgebra}
6	Topic4	{dataquality, erecruit, d.c., brassring, google_earth, october, scorecards, bullhorn, icims, taleo}
2	Topic11	{french, spanish, native, testing/, hootsuite, english, chinese, mandarin, cantonese, indesign}
10	Topic8	{s., d., l., n., m., y., jobvite, g., j., p.}