#!/usr/bin/env python
# coding: utf-8

# This notebook replicates part of the [E-vident](https://github.com/biocore/evident) analysis platform, allowing you to explore a series of different distance metrics and rarefaction levels by leveraging the Jupyter interface available in **Emperor**.
# 
# Before you execute this example, you need to make sure you install a few additional dependencies:
# 
# ```
# pip install scikit-learn ipywidgets h5py biom-format qiime_default_reference
# ```
# 
# Once you've done this, you will need to enable the `ipywidgets` interface. To do so, run:
# 
# ```
# jupyter nbextension enable --py widgetsnbextension
# ```
# 

# In[ ]:


# A `from __future__` import must come before any other statement in the
# module; placed after the magic call below, the exported script fails to
# compile with a SyntaxError.
from __future__ import division
# Render matplotlib figures inline; `get_ipython` is only available when the
# code runs inside IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

# biocore
from emperor.qiime_backports.parse import parse_mapping_file
from emperor.qiime_backports.format import format_mapping_file
from emperor import Emperor, nbinstall

# NOTE(review): presumably makes Emperor's notebook resources available to the
# local Jupyter installation so plots can be rendered with `remote=False` --
# confirm against the Emperor documentation.
nbinstall()

from skbio.stats.ordination import pcoa
from skbio.diversity import beta_diversity
from skbio import TreeNode
from skbio.io.util import open_file

from biom import load_table
from biom.util import biom_open

import qiime_default_reference

# pydata/scipy
import pandas as pd
import numpy as np

from scipy.spatial.distance import braycurtis, canberra
from ipywidgets import interact
from sklearn.metrics import pairwise_distances
from functools import partial

import warnings

# Silence every warning from here on: `Warning` is the base class of all
# warning categories, so this filter matches all of them.
warnings.filterwarnings(action='ignore', category=Warning)

# Pairwise-distance function with parallelism pre-configured;
# n_jobs=-1 means all the processors available.
pw_dists = partial(pairwise_distances, n_jobs=-1)

def load_mf(fn):
    """Read a mapping file into a DataFrame indexed by ``SampleID``.

    Parameters
    ----------
    fn : str or file-like
        Anything accepted by ``skbio.io.util.open_file``.

    Returns
    -------
    pd.DataFrame
        One row per parsed mapping-file line, with the parsed header as
        columns and the ``SampleID`` column used as the index.
    """
    with open_file(fn) as handle:
        # The third element returned by the parser is not needed here.
        rows, columns, _ = parse_mapping_file(handle)
        frame = pd.DataFrame(rows, columns=columns)
    return frame.set_index('SampleID')

def write_mf(f, _df):
    """Write the DataFrame ``_df`` to the path ``f`` as a mapping file.

    Parameters
    ----------
    f : str
        Destination path; it is opened in ``'w'`` mode, so an existing file
        is overwritten.
    _df : pd.DataFrame
        Table to serialize. Its index is written as the ``SampleID`` column,
        followed by the remaining columns in order.
    """
    with open(f, 'w') as out:
        header = ['SampleID'] + _df.columns.tolist()
        # itertuples() yields the index value first, matching the
        # 'SampleID' entry prepended to the header above.
        rows = list(_df.itertuples())
        contents = format_mapping_file(header, rows)
        out.write(contents + '\n')


# We are going to load data from [Fierer et al. 2010](http://www.pnas.org/content/107/14/6477.full) (the data was retrieved from study [232](https://qiita.ucsd.edu/study/description/232) in [Qiita](https://qiita.ucsd.edu), remember you need to be logged in to access the study).
# 
# We will load this as a [QIIME](http://qiime.org) mapping file and as a [BIOM](http://biom-format.org) OTU table.

# In[ ]:


# Sample metadata; load_mf indexes the resulting table by the SampleID column.
mf = load_mf('keyboard/mapping-file.txt')
# OTU table in BIOM format. Both paths are relative to the working directory.
bt = load_table('keyboard/otu-table.biom')


# Now we will load a reference database using [scikit-bio](http://scikit-bio.org)'s TreeNode object. The reference itself is as provided by [Greengenes](http://greengenes.secondgenome.com/downloads).

# In[ ]:


reference_tree_path = qiime_default_reference.get_reference_tree()
tree = TreeNode.read(reference_tree_path)

# Give every node without a branch length an explicit length of zero.
# NOTE(review): presumably required by the UniFrac metrics used below --
# confirm against the scikit-bio documentation.
for node in (each for each in tree.traverse() if each.length is None):
    node.length = 0


# The function `evident` uses the OTU table (`bt`), the mapping file (`mf`), and the phylogenetic tree (`tree`), to construct a distance matrix and ordinate it using principal coordinates analysis.
# 
# To exercise this function, we build a small ipywidgets function that will let us experiment with a variety of rarefaction levels and distance metrics.

# In[ ]:


def evident(n, metric):
    """Rarefy the OTU table, compute distances and plot a PCoA with Emperor.

    Uses the module-level OTU table (``bt``), mapping file (``mf``) and, for
    the UniFrac metrics, the phylogenetic tree (``tree``).

    Parameters
    ----------
    n : int
        Rarefaction level passed to ``bt.subsample``.
    metric : str
        Name of the distance metric handed to ``beta_diversity``.

    Returns
    -------
    Emperor
        Plot built from the principal coordinates and the mapping file.
    """
    rarefied = bt.subsample(n)
    sample_ids = rarefied.ids()
    counts = np.array([rarefied.data(sample_id) for sample_id in sample_ids],
                      dtype='int64')

    # Only the phylogenetic metrics receive the observation ids and the tree.
    phylogenetic = {}
    if metric in ('unweighted_unifrac', 'weighted_unifrac'):
        phylogenetic = {'otu_ids': rarefied.ids('observation'), 'tree': tree}

    distances = beta_diversity(metric, counts, sample_ids,
                               pairwise_func=pw_dists, **phylogenetic)
    res = pcoa(distances)

    # If you want to share your notebook via GitHub use `remote=True` and
    # make sure you share your notebook using nbviewer.
    return Emperor(res, mf, remote=False)


# **Note** that the ipywidgets themselves will not be visible unless you are executing this notebook, e.g. by running your own Jupyter server.

# In[ ]:


# Run `evident` only when the user explicitly asks for it, since every
# evaluation rarefies the table and recomputes the ordination.
# ipywidgets >= 7 exposes manual mode through `interact.options(manual=True)`;
# older releases used the `__manual` keyword, which newer releases no longer
# honour (the widget would then re-run on every change).
if hasattr(interact, 'options'):
    manual_interact = interact.options(manual=True)
else:
    manual_interact = partial(interact, __manual=True)

manual_interact(evident, n=(200, 2000, 50),
                metric=['unweighted_unifrac', 'weighted_unifrac', 'braycurtis',
                        'euclidean'])