import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.WARNING)
logging.root.level = logging.WARNING  # force the level even if logging was already configured

from sklearn import datasets

news_dataset = datasets.fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# A list of text documents is contained in the data attribute
documents = news_dataset.data

print "In the dataset there are", len(documents), "textual documents"
print "And this is the first one:\n", documents[0]

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    # Lowercase and tokenize the text, dropping English stopwords
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

print "After the tokenizer, the previous document becomes:\n", tokenize(documents[0])

processed_docs = [tokenize(doc) for doc in documents]
word_count_dict = gensim.corpora.Dictionary(processed_docs)
print "In the corpus there are", len(word_count_dict), "unique tokens"

# Keep only tokens that appear in at least 20 documents and in no more than 10% of the documents
word_count_dict.filter_extremes(no_below=20, no_above=0.1)
print "After filtering, in the corpus there are only", len(word_count_dict), "unique tokens"

bag_of_words_corpus = [word_count_dict.doc2bow(pdoc) for pdoc in processed_docs]

bow_doc1 = bag_of_words_corpus[0]
print "Bag of words representation of the first document (tuples are composed of token_id and multiplicity):\n", bow_doc1
print
for i in range(5):
    print "In the document, token_id {} (word \"{}\") appears {} time[s]".format(bow_doc1[i][0], word_count_dict[bow_doc1[i][0]], bow_doc1[i][1])
print "..."

# LDA, mono-core
lda_model = gensim.models.LdaModel(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5)
# LDA, multicore (by default it uses n_cores - 1 worker processes)
# lda_model = gensim.models.LdaMulticore(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5)

# Log the 10 most relevant words of every topic (visible if the logging level allows it)
_ = lda_model.print_topics(-1)

# Topic distribution of the first document, most relevant topics first
for index, score in sorted(lda_model[bag_of_words_corpus[0]], key=lambda tup: -1*tup[1]):
    print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10))

# For comparison, the newsgroup label the first document actually belongs to
print news_dataset.target_names[news_dataset.target[0]]

unseen_document = "In my spare time I either play badminton or drive my car"
print "The unseen document is composed of the following text:", unseen_document
print

# Infer the topic distribution of a document the model was not trained on
bow_vector = word_count_dict.doc2bow(tokenize(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5))

print "Log perplexity of the model is", lda_model.log_perplexity(bag_of_words_corpus)
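# The slice syntax lda_model[bow_vector] returns every topic with non-negligible
# probability. As a sketch (assuming a gensim release that provides
# LdaModel.get_document_topics), the same query can be made with an explicit
# probability cutoff; the 0.05 threshold below is only an illustration,
# not a recommended value.
for index, score in sorted(lda_model.get_document_topics(bow_vector, minimum_probability=0.05), key=lambda tup: -1*tup[1]):
    print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5))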
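# Log perplexity is hard to interpret in isolation; topic coherence usually
# tracks human judgement of topic quality better. A minimal sketch, assuming
# a gensim release that ships CoherenceModel (added around gensim 0.13);
# 'c_v' is one of several coherence measures it supports.
from gensim.models.coherencemodel import CoherenceModel
coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=word_count_dict, coherence='c_v')
print "Coherence (c_v) of the model is", coherence_model.get_coherence()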
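# Retraining on every run is wasteful: both the dictionary and the trained
# model can be persisted with gensim's native save/load. The file names
# below ('news20.dict', 'news20.lda') are placeholders for this sketch.
word_count_dict.save('news20.dict')
lda_model.save('news20.lda')
# ...and restored later without re-reading the raw corpus:
loaded_dict = gensim.corpora.Dictionary.load('news20.dict')
loaded_lda = gensim.models.LdaModel.load('news20.lda')
print "Reloaded model has", loaded_lda.num_topics, "topics over", len(loaded_dict), "tokens"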