#!/usr/bin/env python
# coding: utf-8

# ## Topic Modeling with MALLET

# We'd like to test how [Taylor Salo](https://www.github.com/tsalo) integrated MALLET into NeuroSynth, and whether that integration works inside a Docker container.
# 
# First, let's import some dependencies and text to work with. 
# 
# For testing, we'll use an XML file separately downloaded from PubMed. In the spirit of NeuroSynth, we downloaded [Tal Yarkoni's](https://www.ncbi.nlm.nih.gov/pubmed/?term=tal+yarkoni) bibliography. Thanks, Tal! 

# In[1]:


import io

from bs4 import BeautifulSoup
import pandas as pd

# Read the PubMed export with an explicit encoding so that non-ASCII
# characters decode the same way on every platform instead of depending on
# the locale's default. (Assumes the export is UTF-8 -- confirm if the test
# file is ever regenerated from another source.)
with io.open('../neurosynth/tests/data/yarkoni_pubmed.xml',
             encoding='utf-8') as infile:
    xml_file = infile.read()

# The 'lxml' (HTML) parser lowercases tag and attribute names, which is why
# every lookup below uses lowercase names such as 'articletitle'.
soup = BeautifulSoup(xml_file, 'lxml')

# Sanity check on the parse result. A plain isinstance() test is used rather
# than `assert`, because assertions are stripped when Python runs with -O.
if not isinstance(soup, BeautifulSoup):
    print('Check file type! Must be HTML or XML.')

# Document-wide lists of title and abstract elements; these are only used to
# detect articles that are missing an abstract.
titles = soup.find_all('articletitle')
abstracts = soup.find_all('abstract')

if len(titles) != len(abstracts):
    print('Warning: Some articles do not have abstracts on PubMed!')
    print('Only articles with complete data will be included.')


# Three articles do not have abstracts:
# 1. Pain in the ACC?
# 2. Introduction to the special issue on reliability and 
#     replication in cognitive and affective neuroscience research.
# 3. Establishing homology between monkey and human brains.
# 
# This may be because they're commentaries. We'll need to filter the results so that only articles with abstracts are considered, and then import the matching articles into a pandas DataFrame.

# In[2]:


# Collect one (pmid, abstract) pair per article that actually has abstract
# text, then assemble the pairs into a DataFrame for topic modeling.
abstracts = []
pmids = []

articles = soup.find_all('pubmedarticle')
for article in articles:
    abstract = article.find('abstract')
    if abstract is None:
        # No abstract on PubMed for this article; leave it out.
        continue

    # Structured abstracts are split into several <AbstractText> sections.
    # Keep all of them rather than only the first, so the abstract is not
    # truncated.
    sections = abstract.find_all('abstracttext')
    if not sections:
        # An <Abstract> element with no text sections has nothing to model.
        continue

    abstracts.append(' '.join(section.get_text() for section in sections))
    # The first element carrying idtype="pubmed" is taken as the article's
    # own PubMed ID.
    pmids.append(article.find_all(idtype='pubmed')[0].get_text())

df = pd.DataFrame({'pmid': pmids,
                   'abstract': abstracts})

# Notebook preview of the first rows (no visible effect when run as a script).
df.head()


# We have a test dataset! Let's see how it plays with MALLET. 

# In[3]:


import os
import subprocess
import shutil
import sys
sys.path.append(os.path.abspath('..'))
from neurosynth.analysis.reduce import topic_models

# Run the NeuroSynth topic-modeling wrapper on the pmid/abstract DataFrame
# built above. The call is unpacked into two results; presumably these are
# per-document topic weights and per-topic keys -- confirm against
# neurosynth.analysis.reduce.topic_models.
weights_df, keys_df = topic_models(df)
# Notebook preview of the second result (no visible effect when run as a
# script).
keys_df.head()