import os
import pandas as pd
from util import read_crowdtangle_files, create_corpus
import time
from datetime import timedelta
from pprint import pprint
from gensim.models.wrappers import LdaMallet  # note: the wrappers module requires gensim < 4.0
import pickle
from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus
# Specify paths to the input and output directories
input_dir = '/Users/dankoban/Documents/EM6575/LDAInput'
output_dir = '/Users/dankoban/Documents/EM6575/LDAOutput'
# Extract file names from input directory
files = [file for file in os.listdir(input_dir) if file.endswith(".csv")]
file_paths = [input_dir + "/" + file for file in files]
# Select only the first file for a quick test run
file_paths = file_paths[0:1]
start_time = time.time()
df = read_crowdtangle_files(file_paths)
print("--- %s time elapsed ---" % str(timedelta(seconds=time.time() - start_time)))
100%|██████████| 1/1 [00:03<00:00, 3.09s/it]
--- 0:00:03.095897 time elapsed ---
# Extract subset of total data for testing the workflow
pd.set_option('display.max_colwidth', None)
df = pd.concat(df)
print(len(df))
df.head()
130193
| | Facebook Id | Text |
|---|---|---|
| 0 | 624614494274945 | Nika Vetsko, excerpts: ...Many researchers believe that Russia is trying to increase this traffic in Georgia, having already been active in fuelling anti-vaccination conspiracy theories. Some link this directly to the country's measles outbreak last year. ...Russia has also revived conspiracy theories around the Lugar Laboratory, a US-financed high-tech research centre in Tbilisi. Over the years, Russian authorities and media have worked to discredit the lab and US-Georgia relations more widely. Is Russia Exploiting Coronavirus Fears In Georgia? By Nika Vetsko* Experts warn that Russia is exploiting the recent appearance of coronavirus in Georgia to spread a new wave of disinformation and conspiracy theories. Georgia has registered only 15 |
| 1 | 26781952138 | The capital's first Covid-19 patient, a 45-year-old man from Mayur Vihar Phase II, has recovered fully from the viral infection. He was discharged from Ram Manohar Lohia Hospital on Saturday, said a source. Delhi's first coronavirus patient recovers fully The capital's first Covid-19 patient, a 45-year-old man from Mayur Vihar Phase II, has recovered fully from the viral infection. He was discharged fro |
| 2 | 251907774312 | The coronavirus pandemic is yet to force widespread school shutdowns but many families are voluntarily withdrawing their children. 'I'm happy to be a small drop': Families withdrawing children from school to fight coronavirus The coronavirus pandemic is yet to force widespread school shutdowns but across Sydney, many families are voluntarily withdrawing their children. |
| 3 | 138280549589759 | The safety and well-being of our community and the Brothers Fish&chips family is always the top priority. In challenging times like this, we are faced with many uncertainties. However, one thing that is certain is that together as a community we will overcome this situation and we'd like to reassure that we are following CDC recommended guidelines regarding coronavirus, COVID-19 to keep you and our family safe as much as we can! #ossining #croton #briarcliff #westchester #lohudfood We are temporarily offering prepaid delivery and curb side pick-up. Call (914) 488-5141 to place your order and before arrival. Timeline Photos |
| 4 | 32204506174 | With the coronavirus spreading across the globe @carynceolin with how the White House is trying to prevent it from spreading around the West Wing. Trump tested negative for COVID-19 - CityNews Toronto As the coronavirus inches closer to President Trump, Caryn Ceolin with how the White House is trying to prevent it from spreading around the West Wing. |
# Write the merged data to a text file so Mallet can be run from the terminal
df.to_csv('/Users/dankoban/Documents/EM6575/coherence_test2/input.txt',sep=' ',header=False)
#~/mallet-2.0.8/bin/mallet import-file --input mallet_terminal_input_crowdtangle.txt --output ct.mallet --remove-stopwords TRUE --extra-stopwords new_stopwords.txt --keep-sequence TRUE
#~/mallet-2.0.8/bin/mallet train-topics --input ct.mallet --output-topic-keys ct.keys --topic-word-weights-file ct.topicwordweights --word-topic-counts-file ct.wordtopiccounts --output-doc-topics ct.doctopics_sparse --num-topics 20 --num-threads 48 --optimize-interval 10 --doc-topics-threshold 0.3 --diagnostics-file diagnostics.xml
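The two Mallet commands above can also be launched from Python instead of the terminal. A minimal sketch using subprocess, mirroring the import-file flags from the comments (paths and file names are taken from those comments; the train-topics call would follow the same pattern):
import subprocess
mallet_bin = os.path.expanduser('~/mallet-2.0.8/bin/mallet')
# Import the raw text into Mallet's binary format, removing default and extra stop words
subprocess.run([mallet_bin, 'import-file',
                '--input', 'mallet_terminal_input_crowdtangle.txt',
                '--output', 'ct.mallet',
                '--remove-stopwords', 'TRUE',
                '--extra-stopwords', 'new_stopwords.txt',
                '--keep-sequence', 'TRUE'], check=True)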
# Import custom list of stop words
stop_words = pd.read_csv("/Users/dankoban/Documents/lda_evaluation/data/new_stopwords.csv")
stop_words = stop_words['stop_word'].tolist()
stop_words[0:5]
TopicTermFreq = pd.read_csv('/Users/dankoban/Documents/EM6575/mallet_command_line/tidy_topics.csv')
def top_n_terms(k, n=20):
    """Return the n highest-count terms for topic k."""
    result = (TopicTermFreq[TopicTermFreq['topic'] == k].
              sort_values('count', ascending=False).head(n))
    return result
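For example, top_n_terms(0, n=5) returns the five highest-count rows of TopicTermFreq for topic 0.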
# Collect the cleaned top-20 terms for each of the 20 command-line topics
topics = []
for k in range(0, 20):
    terms = top_n_terms(k, n=20)['term'].tolist()
    terms = [term.replace('.', '') for term in terms]
    terms = [term.replace("'", '') for term in terms]
    topics.append(terms)
#topics
dictionary, corpus = create_corpus(text=df.Text, stop_words=stop_words)
# Save dictionary and corpus to disc
dictionary.save(output_dir + "/dictionary.pkl")
MmCorpus.serialize(output_dir + "/corpus.pkl", corpus)
Fit a new model
os.environ.update({'MALLET_HOME':r'/Users/dankoban/mallet-2.0.8/'})
start_time = time.time()
lda = LdaMallet(mallet_path = '/Users/dankoban/mallet-2.0.8/bin/mallet',
corpus=corpus, num_topics=50, id2word=dictionary,
workers = 20, iterations = 500, random_seed = 1)
print("--- %s time elapsed ---" % str(timedelta(seconds=time.time() - start_time)))
# Save model to disk
pickle.dump(lda, open(output_dir + "/mallet.pkl", "wb"))
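Pickling works here; as an alternative, gensim models inherit their own save/load helpers, which would look like this (the mallet_model file name is hypothetical and not used below):
# lda.save(output_dir + "/mallet_model")
# lda = LdaMallet.load(output_dir + "/mallet_model")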
Load an existing model. If no saved model exists, run the "Fit a new model" code above first.
lda = pickle.load(open(output_dir + "/mallet.pkl", "rb"))
dictionary = Dictionary.load(output_dir + "/dictionary.pkl")
corpus = MmCorpus(output_dir + "/corpus.pkl")
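The heading above describes a load-or-fit pattern; a minimal sketch of that check, assuming the file names used in this notebook:
model_path = output_dir + "/mallet.pkl"
if os.path.exists(model_path):
    lda = pickle.load(open(model_path, "rb"))
else:
    # No saved model found; fall back to fitting a new one as above
    lda = LdaMallet(mallet_path='/Users/dankoban/mallet-2.0.8/bin/mallet',
                    corpus=corpus, num_topics=50, id2word=dictionary,
                    workers=20, iterations=500, random_seed=1)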
# Show Topics
pprint(lda.show_topic(1))
tm_results = lda[corpus]
# Keep the dominant (highest-weight) topic for each document
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0]
                 for topics in tm_results]
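corpus_topics holds one (topic_id, weight) pair per document. To place the dominant topic alongside the original posts, one option is the line below; note this assumes create_corpus preserves the row order and length of df.Text, which is worth verifying:
# df['dominant_topic'] = [topic_id for topic_id, wt in corpus_topics]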
topics = [[(term, round(wt, 3)) for term, wt in lda.show_topic(n, topn=20)]
for n in range(0, lda.num_topics)]
topics_df = pd.DataFrame([[term for term, wt in topic] for topic in topics],
columns = ['Term'+str(i) for i in range(1, 21)],
index=['Topic '+str(t) for t in range(1, lda.num_topics+1)]).T
topics_df
# set column width
pd.set_option('display.max_colwidth', None)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics],
columns = ['Terms per Topic'],
index=['Topic'+str(t) for t in range(1, lda.num_topics+1)] )
topics_df
from collections import OrderedDict
# Build a topic-by-term weight matrix from the top 5 terms of each topic
data_lda = {i: OrderedDict(lda.show_topic(i, 5)) for i in range(lda.num_topics)}
df_lda = pd.DataFrame(data_lda)
df_lda = df_lda.fillna(0).T
print(df_lda.shape)
import seaborn as sns
import matplotlib.pyplot as plt
g=sns.clustermap(df_lda.corr(), center=0, standard_scale=1,
cmap="RdBu", metric='cosine', linewidths=.75, figsize=(15, 15))
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.show()
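To persist the plot, seaborn's clustermap returns a ClusterGrid with its own savefig method; when running as a script, call it before plt.show() (the output file name here is hypothetical):
# g.savefig(output_dir + "/topic_clustermap.png", dpi=150)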