#!/usr/bin/env python
# coding: utf-8

# ## Topic Modeling with MALLET
# We'd like to test how [Taylor Salo](https://www.github.com/tsalo) integrated MALLET into NeuroSynth, and whether that integration works in a docker container.
#
# First, let's import some dependencies and text to work with.
#
# For testing, we'll use an XML file separately downloaded from PubMed. In the spirit of NeuroSynth, we downloaded [Tal Yarkoni's](https://www.ncbi.nlm.nih.gov/pubmed/?term=tal+yarkoni) bibliography. Thanks, Tal!

# In[1]:

from bs4 import BeautifulSoup
import pandas as pd

# Read the whole PubMed export into memory; the file handle is closed as
# soon as the `with` block exits.
with open('../neurosynth/tests/data/yarkoni_pubmed.xml') as infile:
    xml_file = infile.read()

# Parse with the 'lxml' parser. All tag and attribute lookups below use
# lowercase names ('articletitle', 'abstracttext', idtype=...).
soup = BeautifulSoup(xml_file, 'lxml')

# Explicit check instead of `assert`, so the warning still fires when
# Python runs with assertions disabled (-O).
if not isinstance(soup, BeautifulSoup):
    print('Check file type! Must be HTML or XML.')

# Compare the number of titles with the number of abstracts to detect
# records that are missing an abstract.
titles = soup.find_all('articletitle')
abstracts = soup.find_all('abstract')
if len(titles) != len(abstracts):
    print('Warning: Some articles do not have abstracts on PubMed!')
    print('Only articles with complete data will be included.')

# Three articles do not have abstracts:
# 1. Pain in the ACC?
# 2. Introduction to the special issue on reliability and
#    replication in cognitive and affective neuroscience research.
# 3. Establishing homology between monkey and human brains.
#
# Maybe because they're commentaries? We'll need to filter the results to only consider articles with abstracts. Then, import any matching articles into a pandas dataframe.

# In[2]:

# NOTE: `abstracts` is deliberately rebound here, from the list of
# <abstract> tags found above to a list of plain-text strings.
abstracts = []
pmids = []
articles = soup.find_all('pubmedarticle')
for a in articles:
    abstract = a.find('abstract')
    if abstract is None:
        # No abstract on PubMed: leave this article out entirely.
        continue

    # Search inside the <abstract> element only, and keep every
    # <abstracttext> section rather than just the first one, so that
    # multi-section (structured) abstracts are not truncated.
    sections = abstract.find_all('abstracttext')
    if not sections:
        # An <abstract> with no text sections carries no usable data.
        continue

    abstracts.append(' '.join(s.get_text() for s in sections))
    # The first element carrying idtype="pubmed" is used as the PMID.
    pmids.append(a.find(idtype='pubmed').get_text())

# One row per article that has an abstract.
df = pd.DataFrame({'pmid': pmids, 'abstract': abstracts})
df.head()

# We have a test dataset! Let's see how it plays with MALLET.
# In[3]:

import os
import subprocess
import shutil
import sys

# Append the absolute path of the parent directory to the module search
# path before importing from `neurosynth` (presumably so the local
# checkout is importable -- confirm against the repository layout).
parent_dir = os.path.abspath(os.pardir)
sys.path.append(parent_dir)

from neurosynth.analysis.reduce import topic_models

# Run the topic model on the abstracts dataframe and unpack its two
# return values.
# NOTE(review): presumably per-document topic weights and per-topic key
# terms -- confirm against `topic_models`.
weights_df, keys_df = topic_models(df)
keys_df.head()