import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.WARNING)
logging.root.level = logging.WARNING  # force the level even if logging was already configured

from sklearn import datasets

news_dataset = datasets.fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# A list of text documents is contained in the data attribute
documents = news_dataset.data

print "In the dataset there are", len(documents), "textual documents"
print "And this is the first one:\n", documents[0]

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    # Lowercase and tokenize the text, dropping English stopwords
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

print "After the tokenizer, the previous document becomes:\n", tokenize(documents[0])

processed_docs = [tokenize(doc) for doc in documents]
word_count_dict = gensim.corpora.Dictionary(processed_docs)
print "In the corpus there are", len(word_count_dict), "unique tokens"

# Keep only tokens that appear in at least 20 documents and in no more than 10% of the documents
word_count_dict.filter_extremes(no_below=20, no_above=0.1)
print "After filtering, in the corpus there are only", len(word_count_dict), "unique tokens"

bag_of_words_corpus = [word_count_dict.doc2bow(pdoc) for pdoc in processed_docs]

bow_doc1 = bag_of_words_corpus[0]
print "Bag of words representation of the first document (tuples are composed of token_id and multiplicity):\n", bow_doc1
print
for i in range(5):
    print "In the document, token_id {} (word \"{}\") appears {} time[s]".format(bow_doc1[i][0], word_count_dict[bow_doc1[i][0]], bow_doc1[i][1])
print "..."

# LDA, mono-core
lda_model = gensim.models.LdaModel(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5)
# LDA, multicore (by default it uses n_cores - 1 worker processes)
# lda_model = gensim.models.LdaMulticore(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5)

# Log the 10 most relevant words of every topic (visible if the logging level allows it)
_ = lda_model.print_topics(-1)

# Topic distribution of the first document, most relevant topics first
for index, score in sorted(lda_model[bag_of_words_corpus[0]], key=lambda tup: -1*tup[1]):
    print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10))

# For comparison, the newsgroup label the first document actually belongs to
print news_dataset.target_names[news_dataset.target[0]]

unseen_document = "In my spare time I either play badminton or drive my car"
print "The unseen document is composed of the following text:", unseen_document
print

# Infer the topic distribution of a document the model was not trained on
bow_vector = word_count_dict.doc2bow(tokenize(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5))

print "Log perplexity of the model is", lda_model.log_perplexity(bag_of_words_corpus)
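# The slice syntax lda_model[bow_vector] returns every topic with non-negligible
# probability. As a sketch (assuming a gensim release that provides
# LdaModel.get_document_topics), the same query can be made with an explicit
# probability cutoff; the 0.05 threshold below is only an illustration,
# not a recommended value.
for index, score in sorted(lda_model.get_document_topics(bow_vector, minimum_probability=0.05), key=lambda tup: -1*tup[1]):
    print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5))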
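# Log perplexity is hard to interpret in isolation; topic coherence usually
# tracks human judgement of topic quality better. A minimal sketch, assuming
# a gensim release that ships CoherenceModel (added around gensim 0.13);
# 'c_v' is one of several coherence measures it supports.
from gensim.models.coherencemodel import CoherenceModel
coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=word_count_dict, coherence='c_v')
print "Coherence (c_v) of the model is", coherence_model.get_coherence()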
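# Retraining on every run is wasteful: both the dictionary and the trained
# model can be persisted with gensim's native save/load. The file names
# below ('news20.dict', 'news20.lda') are placeholders for this sketch.
word_count_dict.save('news20.dict')
lda_model.save('news20.lda')
# ...and restored later without re-reading the raw corpus:
loaded_dict = gensim.corpora.Dictionary.load('news20.dict')
loaded_lda = gensim.models.LdaModel.load('news20.lda')
print "Reloaded model has", loaded_lda.num_topics, "topics over", len(loaded_dict), "tokens"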