#!/usr/bin/env python # coding: utf-8 # This notebook replicates part of the [E-vident](https://github.com/biocore/evident) analysis platform, allowing you to explor a series of different distance metrics, and rarefaction levels by leveraging the Jupyter interface available in **Emperor**. # # Before you execute this example, you need to make sure you install a few additional dependencies: # # ``` # pip install scikit-learn ipywidgets h5py biom-format qiime_default_reference # ``` # # Once you've done this, you will need to enable the `ipywidgets` interface, to do so, you will need to run: # # ``` # jupyter nbextension enable --py widgetsnbextension # ``` # # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') from __future__ import division # biocore from emperor.qiime_backports.parse import parse_mapping_file from emperor.qiime_backports.format import format_mapping_file from emperor import Emperor, nbinstall nbinstall() from skbio.stats.ordination import pcoa from skbio.diversity import beta_diversity from skbio import TreeNode from skbio.io.util import open_file from biom import load_table from biom.util import biom_open import qiime_default_reference # pydata/scipy import pandas as pd import numpy as np from scipy.spatial.distance import braycurtis, canberra from ipywidgets import interact from sklearn.metrics import pairwise_distances from functools import partial import warnings warnings.filterwarnings(action='ignore', category=Warning) # -1 means all the processors available pw_dists = partial(pairwise_distances, n_jobs=-1) def load_mf(fn): with open_file(fn) as f: mapping_data, header, _ = parse_mapping_file(f) _mapping_file = pd.DataFrame(mapping_data, columns=header) _mapping_file.set_index('SampleID', inplace=True) return _mapping_file def write_mf(f, _df): with open(f, 'w') as fp: lines = format_mapping_file(['SampleID'] + _df.columns.tolist(), list(_df.itertuples())) fp.write(lines+'\n') # We are going to load data from [Fierer et al. 2010](http://www.pnas.org/content/107/14/6477.full) (the data was retrieved from study [232](https://qiita.ucsd.edu/study/description/232) in [Qiita](https://qiita.ucsd.edu), remember you need to be logged in to access the study). # # We will load this as a [QIIME](http://qiime.org) mapping file and as a [BIOM](http://biom-format.org) OTU table. # In[ ]: mf = load_mf('keyboard/mapping-file.txt') bt = load_table('keyboard/otu-table.biom') # Now we will load a reference database using [scikit-bio](http://scikit-bio.org)'s TreeNode object. The reference itself is as provided by [Greengenes](http://greengenes.secondgenome.com/downloads). # In[ ]: tree = TreeNode.read(qiime_default_reference.get_reference_tree()) for n in tree.traverse(): if n.length is None: n.length = 0 # The function `evident` uses the OTU table (`bt`), the mapping file (`mf`), and the phylogenetic tree (`tree`), to construct a distance matrix and ordinate it using principal coordinates analysis. # # To exercise this function, we build a small ipywidgets function that will let us experiment with a variety of rarefaction levels and distance metrics. # In[ ]: def evident(n, metric): rarefied = bt.subsample(n) data = np.array([rarefied.data(i) for i in rarefied.ids()], dtype='int64') if metric in ['unweighted_unifrac', 'weighted_unifrac']: res = pcoa(beta_diversity(metric, data, rarefied.ids(), otu_ids=rarefied.ids('observation'), tree=tree, pairwise_func=pw_dists)) else: res = pcoa(beta_diversity(metric, data, rarefied.ids(), pairwise_func=pw_dists)) # If you want to share your notebook via GitHub use `remote=True` and # make sure you share your notebook using nbviewer. return Emperor(res, mf, remote=False) # **Note** that the ipywidgets themselves, will not be visible unless you are executing this notebook i.e. by running your own Jupyter server. # In[ ]: interact(evident, n=(200, 2000, 50), metric=['unweighted_unifrac', 'weighted_unifrac', 'braycurtis', 'euclidean'], __manual=True)