Notebook

In [ ]:

from biom.table import Table
from biom.util import biom_open
import pandas as pd
import numpy as np
from skbio import TreeNode 
import math
from qiime2 import Artifact
from qiime2.plugins import empress
from qiime2 import Visualization
import scipy

In [ ]:

def assign_counts(tree,old_table, young_table, tip_names):
    """
    Counts how many old/young samples for each node in the tree. 
    """
    for node in tree.postorder(include_self=False):
        node.old_count = 0
        node.young_count = 0
        if node.is_tip():
            node.old_count = old_table[node.name].sum()
            node.young_count = young_table[node.name].sum()
        else:
            tips = [tip.name for tip in node.tips()]
            node.old_count = old_table[tips].max(axis=1).sum()
            node.young_count = young_table[tips].max(axis=1).sum()
            
def calc_old_young_log(tree, num_old_samples, num_young_samples):
    """
    Find log ratio of old / young samples for each node in the tree
    """
    young_min = 0
    for node in tree.postorder(include_self=False):
        if node.old_count == 0 and node.young_count == 0:
            # node was not found in a single old or young sample
            node.old_young_log = 0
        elif node.old_count == 0:
            # node was only found in young samples
            node.old_young_log = -np.inf
        elif node.young_count == 0:
            # node was only found in old samples
            node.old_young_log = np.inf
        else:
            # calculate the log ratio of old / young samples
            node.old_young_log = math.log(node.old_count / node.young_count,2)
            # normiliztion term
            node.old_young_log -= math.log(num_old_samples/ num_young_samples,2)
            if node.is_tip():
                young_min = min(young_min, node.old_young_log)
    return young_min

def assign_old_young_status(tree, young_min):
    """
    Assign old or young status for each node based on its log ratio.
    """
    for node in tree.postorder(include_self=False):
        if node.old_young_log > 0:
            node.age = "old"
        elif node.old_young_log < 0:
            node.age = "young"
        else:
            node.age = "no difference"
        if node.old_young_log > young_min:
            node.old_young_log = young_min
        if node.old_young_log < -1 * young_min:
            node.old_young_log = -1 * young_min
            
def assign_name(tree):
    """
    Assign unique ids for each node in the tree.
    """
    i = 0
    for node in tree.postorder(include_self=True):
        if not type(node.name) == type("str"):
            node.name = "blank_" + str(i)
            i += 1

In [ ]:

table_MG_path = 'data/finrisk/anonymized-finrisk-MG-BL_AGE-filtered_rarefied_table.biom'
finrisk_metadata_path = 'data/finrisk/anonymized.age-only.finrisk.metadata.txt'
wol_tree_path = 'data/wol/wol-tree.nwk'

Load BIOM table and sample metadata¶

In [ ]:

# with biom_open("finrisk-MG-BL_AGE-filtered_rarefied_table.biom") as f:
with biom_open(table_MG_path) as f:
    table = Table.from_hdf5(f)
table.pa()
pd_table = (table.to_dataframe(dense=True))
tip_names = set(pd_table.index.to_list())

# metadata = pd.read_csv("gotu.shared.metadata.txt", sep="\t", index_col=0)
metadata = pd.read_csv(finrisk_metadata_path, sep="\t", index_col=0)

Filter/split sample metadata into group >= 60 and <= 35¶

In [ ]:

old = metadata.loc[metadata["BL_AGE"] >= 60]
young = metadata.loc[metadata["BL_AGE"] <= 35]

old_samples = [s for s in old.index.tolist() if table.exists(s)]
young_sample = [s for s in young.index.tolist() if table.exists(s)]

old_table = (pd_table[old_samples]).T
young_table = (pd_table[young_sample]).T

Load tree and assign log ratios¶

In [ ]:

tree = TreeNode.read(wol_tree_path,format="newick")
assign_name(tree)
tree = tree.shear(tip_names)
assign_counts(tree, old_table, young_table, tip_names)
young_min = calc_old_young_log(tree, len(old_samples), len(young_sample))
young_min = abs(young_min)
assign_old_young_status(tree, young_min)

Format taxonomy¶

In [ ]:

tax = pd.read_csv("lineages.txt", sep="\t", index_col=0)
tax = tax.loc[tip_names]
tax[["Level 1", "Level 2","Level 3", "Level 4", "Level 5", "Level 6", "Level 7"]] = \
    tax.Taxonmy.str.split(";", expand=True)
tax.loc[["G001765415",
"G001899365",
"G001899425",
"G001917115",
"G001917235",
"G001917285",
"G000980455",
"G000431115",
"G000431315",
"G000431555",
"G000433095",
"G000433235",
"G000437435",
"G000437655",
"G000980375"], "Level 2"] = "p__Melainabacteria"
tax.loc[[
"G000432575",
"G000433255",
"G000433455",
"G000433475",
"G000433875",
"G000434835",
"G000436255",
"G000437335",
"G000438295",
"G000438415"
], "Level 2"] = "p__Firmicutes"
tax = tax.applymap(lambda x: x.split("__")[1].split("_")[0])
tax = tax.to_dict()

Create feature metadata and sample metadata¶

In [ ]:

with open("old_young_log.tsv", 'w') as f:
    f.write("feature id\told_young_log_ratio\tage\tLevel 1\tLevel 2\tLevel 3\tLevel 4\tLevel 5\tLevel 6\tLevel 7\n")
    for node in tree.postorder(include_self=False):
        if node.is_tip():
            f.write("" + node.name + \
                    "\t" + str(node.old_young_log) + \
                    "\t" + node.age  +
                    "\t" + tax["Level 1"][node.name] +
                    "\t" + tax["Level 2"][node.name] +
                    "\t" + tax["Level 3"][node.name] +
                    "\t" + tax["Level 4"][node.name] +
                    "\t" + tax["Level 5"][node.name] +
                    "\t" + tax["Level 6"][node.name] +
                    "\t" + tax["Level 7"][node.name] +
                    "\n")
        else:
            if type(node.name) == type("str"):
                f.write("" + node.name + \
                    "\t" + str(node.old_young_log) + \
                    "\t" + node.age  + \
                    "\t" + "" + \
                    "\t" + "" +
                    "\t" + "" +
                    "\t" + "" +
                    "\t" + "" +
                    "\t" + "" +
                    "\t" + "" +
                    "\n")
        
metadata.loc[young_sample, "age_status"] = "young"
metadata.loc[old_samples, "age_status"] = "old"
metadata = metadata.loc[old_samples + young_sample]
metadata.to_csv("s-meta.tsv", sep="\t", na_rep="NA")

Create qiime2 Artifacts¶

In [ ]:

table = table.filter(old_samples + young_sample, axis="sample", inplace=False)
Artifact.import_data("FeatureTable[Frequency]", table).save("table.qza")
Artifact.import_data("Phylogeny[Rooted]", tree).save("dec_shear_tree.qza")

Create EMPress plot¶

In [ ]:

!qiime empress community-plot \
    --i-tree dec_shear_tree.qza \
    --i-feature-table table.qza \
    --m-sample-metadata-file s-meta.tsv \
    --m-feature-metadata-file old_young_log.tsv \
    --o-visualization fig2c.qzv

Load EMPress visualization¶

In [ ]:

Visualization.load("fig2c.qzv")