import os
import pickle

import gensim
import nltk
import plospy
from nltk.stem import WordNetLemmatizer

# Collect the PLOS Biology data files (downloaded beforehand).
all_names = [name for name in os.listdir('../plos/plos_biology/plos_biology_data')
             if name.endswith('.dat')]

# Extract the full-text body of every article.
article_bodies = []
for name in all_names:
    docs = plospy.PlosXml('../plos/plos_biology/plos_biology_data/' + name)
    for article in docs.docs:
        article_bodies.append(article['body'])

print(len(article_bodies))  # number of article bodies retrieved

# Sentence-split with Punkt, teaching it abbreviations common in scientific
# prose so that e.g. "et al." does not end a sentence. (Punkt compares
# lowercased token types, so one lowercase entry per abbreviation suffices.)
punkt_param = nltk.tokenize.punkt.PunktParameters()
punkt_param.abbrev_types = set(['et al', 'i.e', 'e.g', 'ref', 'c.f',
                                'fig', 'eq', 'eqn', 'dr'])
sentence_splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(punkt_param)

sentences = [sentence_splitter.tokenize(body) for body in article_bodies]

# Tokenize each sentence into words, keeping the per-article structure.
articles = []
for body in sentences:
    articles.append([nltk.tokenize.word_tokenize(sentence) for sentence in body])

# Checkpoint to disk (pickle needs binary file modes).
with open('plos_biology_articles_tokenized.list', 'wb') as f:
    pickle.dump(articles, f)
with open('plos_biology_articles_tokenized.list', 'rb') as f:
    articles = pickle.load(f)

# Treat very short words and English stop words as noise. Precomputing the
# stop-word set avoids re-reading the corpus list on every token, and
# lowercasing before the lookup also catches capitalized forms like "Which".
stop_words = set(nltk.corpus.stopwords.words('english'))
is_stopword = lambda w: len(w) < 4 or w.lower() in stop_words

# "Unfurl" each article: flatten its sentences into one lowercased token list.
articles_unfurled = []
for article in articles:
    this_article = []
    for sentence in article:
        this_article += [token.lower() for token in sentence
                         if not is_stopword(token)]
    articles_unfurled.append(this_article)

with open('plos_biology_articles_unfurled.list', 'wb') as f:
    pickle.dump(articles_unfurled, f)
with open('plos_biology_articles_unfurled.list', 'rb') as f:
    articles_unfurled = pickle.load(f)

# Map tokens to integer ids, then drop words that are too rare or too common.
dictionary = gensim.corpora.Dictionary(articles_unfurled)
dictionary.save('plos_biology.dict')
dictionary = gensim.corpora.Dictionary.load('plos_biology.dict')
dictionary.filter_extremes()

# Bag-of-words corpus, serialized in Matrix Market format.
corpus = [dictionary.doc2bow(article) for article in articles_unfurled]
gensim.corpora.MmCorpus.serialize('plos_biology_corpus.mm', corpus)
corpus = gensim.corpora.MmCorpus('plos_biology_corpus.mm')

# Online LDA with 20 topics.
model = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary,
                                        update_every=1, chunksize=100,
                                        passes=2, num_topics=20)
model.save('plos_biology.lda_model')
model = gensim.models.ldamodel.LdaModel.load('plos_biology.lda_model')

# print_topics returns (topic_id, word-distribution string) pairs.
for topic_i, topic in model.print_topics(num_topics=20):
    print('topic #%d: %s\n' % (topic_i + 1, topic))

# Repeat dictionary, corpus, and model construction on lemmatized tokens,
# so that singular and plural forms collapse onto the same id.
wnl = WordNetLemmatizer()
articles_lemmatized = [[wnl.lemmatize(token) for token in article]
                       for article in articles_unfurled]

with open('plos_biology_articles_lemmatized.list', 'wb') as f:
    pickle.dump(articles_lemmatized, f)

dictionary_lemmatized = gensim.corpora.Dictionary(articles_lemmatized)
dictionary_lemmatized.save('plos_biology_lemmatized.dict')
dictionary_lemmatized.filter_extremes()

corpus_lemmatized = [dictionary_lemmatized.doc2bow(article)
                     for article in articles_lemmatized]
gensim.corpora.MmCorpus.serialize('plos_biology_corpus_lemmatized.mm',
                                  corpus_lemmatized)

model_lemmatized = gensim.models.ldamodel.LdaModel(
    corpus_lemmatized, id2word=dictionary_lemmatized,
    update_every=1, chunksize=100, passes=2, num_topics=20)

for topic_i, topic in model_lemmatized.print_topics(num_topics=20):
    print('topic #%d: %s\n' % (topic_i + 1, topic))
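
# --- Example: inferring topics for an unseen document ---
# A minimal sketch, not part of the original pipeline: the sample abstract
# below is made up, and the snippet simply reuses the tokenizer, stop-word
# filter, lemmatizer, dictionary, and model built above. get_document_topics
# returns the inferred (topic_id, probability) pairs for a bag-of-words
# vector, which we sort by probability to see the dominant topics first.
sample_abstract = ('Mitochondrial membrane dynamics regulate apoptosis '
                   'in mammalian cells under oxidative stress.')
sample_tokens = [wnl.lemmatize(token.lower())
                 for token in nltk.tokenize.word_tokenize(sample_abstract)
                 if not is_stopword(token)]
sample_bow = dictionary_lemmatized.doc2bow(sample_tokens)
for topic_id, prob in sorted(model_lemmatized.get_document_topics(sample_bow),
                             key=lambda pair: pair[1], reverse=True):
    print('topic #%d: %.3f' % (topic_id + 1, prob))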