from biom.table import Table
from biom.util import biom_open
import pandas as pd
import numpy as np
from skbio import TreeNode
import math
from qiime2 import Artifact
from qiime2.plugins import empress
from qiime2 import Visualization
import scipy
def assign_counts(tree,old_table, young_table, tip_names):
"""
Counts how many old/young samples for each node in the tree.
"""
for node in tree.postorder(include_self=False):
node.old_count = 0
node.young_count = 0
if node.is_tip():
node.old_count = old_table[node.name].sum()
node.young_count = young_table[node.name].sum()
else:
tips = [tip.name for tip in node.tips()]
node.old_count = old_table[tips].max(axis=1).sum()
node.young_count = young_table[tips].max(axis=1).sum()
def calc_old_young_log(tree, num_old_samples, num_young_samples):
"""
Find log ratio of old / young samples for each node in the tree
"""
young_min = 0
for node in tree.postorder(include_self=False):
if node.old_count == 0 and node.young_count == 0:
# node was not found in a single old or young sample
node.old_young_log = 0
elif node.old_count == 0:
# node was only found in young samples
node.old_young_log = -np.inf
elif node.young_count == 0:
# node was only found in old samples
node.old_young_log = np.inf
else:
# calculate the log ratio of old / young samples
node.old_young_log = math.log(node.old_count / node.young_count,2)
# normiliztion term
node.old_young_log -= math.log(num_old_samples/ num_young_samples,2)
if node.is_tip():
young_min = min(young_min, node.old_young_log)
return young_min
def assign_old_young_status(tree, young_min):
"""
Assign old or young status for each node based on its log ratio.
"""
for node in tree.postorder(include_self=False):
if node.old_young_log > 0:
node.age = "old"
elif node.old_young_log < 0:
node.age = "young"
else:
node.age = "no difference"
if node.old_young_log > young_min:
node.old_young_log = young_min
if node.old_young_log < -1 * young_min:
node.old_young_log = -1 * young_min
def assign_name(tree):
"""
Assign unique ids for each node in the tree.
"""
i = 0
for node in tree.postorder(include_self=True):
if not type(node.name) == type("str"):
node.name = "blank_" + str(i)
i += 1
table_MG_path = 'data/finrisk/anonymized-finrisk-MG-BL_AGE-filtered_rarefied_table.biom'
finrisk_metadata_path = 'data/finrisk/anonymized.age-only.finrisk.metadata.txt'
wol_tree_path = 'data/wol/wol-tree.nwk'
# with biom_open("finrisk-MG-BL_AGE-filtered_rarefied_table.biom") as f:
with biom_open(table_MG_path) as f:
table = Table.from_hdf5(f)
table.pa()
pd_table = (table.to_dataframe(dense=True))
tip_names = set(pd_table.index.to_list())
# metadata = pd.read_csv("gotu.shared.metadata.txt", sep="\t", index_col=0)
metadata = pd.read_csv(finrisk_metadata_path, sep="\t", index_col=0)
old = metadata.loc[metadata["BL_AGE"] >= 60]
young = metadata.loc[metadata["BL_AGE"] <= 35]
old_samples = [s for s in old.index.tolist() if table.exists(s)]
young_sample = [s for s in young.index.tolist() if table.exists(s)]
old_table = (pd_table[old_samples]).T
young_table = (pd_table[young_sample]).T
tree = TreeNode.read(wol_tree_path,format="newick")
assign_name(tree)
tree = tree.shear(tip_names)
assign_counts(tree, old_table, young_table, tip_names)
young_min = calc_old_young_log(tree, len(old_samples), len(young_sample))
young_min = abs(young_min)
assign_old_young_status(tree, young_min)
tax = pd.read_csv("lineages.txt", sep="\t", index_col=0)
tax = tax.loc[tip_names]
tax[["Level 1", "Level 2","Level 3", "Level 4", "Level 5", "Level 6", "Level 7"]] = \
tax.Taxonmy.str.split(";", expand=True)
tax.loc[["G001765415",
"G001899365",
"G001899425",
"G001917115",
"G001917235",
"G001917285",
"G000980455",
"G000431115",
"G000431315",
"G000431555",
"G000433095",
"G000433235",
"G000437435",
"G000437655",
"G000980375"], "Level 2"] = "p__Melainabacteria"
tax.loc[[
"G000432575",
"G000433255",
"G000433455",
"G000433475",
"G000433875",
"G000434835",
"G000436255",
"G000437335",
"G000438295",
"G000438415"
], "Level 2"] = "p__Firmicutes"
tax = tax.applymap(lambda x: x.split("__")[1].split("_")[0])
tax = tax.to_dict()
with open("old_young_log.tsv", 'w') as f:
f.write("feature id\told_young_log_ratio\tage\tLevel 1\tLevel 2\tLevel 3\tLevel 4\tLevel 5\tLevel 6\tLevel 7\n")
for node in tree.postorder(include_self=False):
if node.is_tip():
f.write("" + node.name + \
"\t" + str(node.old_young_log) + \
"\t" + node.age +
"\t" + tax["Level 1"][node.name] +
"\t" + tax["Level 2"][node.name] +
"\t" + tax["Level 3"][node.name] +
"\t" + tax["Level 4"][node.name] +
"\t" + tax["Level 5"][node.name] +
"\t" + tax["Level 6"][node.name] +
"\t" + tax["Level 7"][node.name] +
"\n")
else:
if type(node.name) == type("str"):
f.write("" + node.name + \
"\t" + str(node.old_young_log) + \
"\t" + node.age + \
"\t" + "" + \
"\t" + "" +
"\t" + "" +
"\t" + "" +
"\t" + "" +
"\t" + "" +
"\t" + "" +
"\n")
metadata.loc[young_sample, "age_status"] = "young"
metadata.loc[old_samples, "age_status"] = "old"
metadata = metadata.loc[old_samples + young_sample]
metadata.to_csv("s-meta.tsv", sep="\t", na_rep="NA")
table = table.filter(old_samples + young_sample, axis="sample", inplace=False)
Artifact.import_data("FeatureTable[Frequency]", table).save("table.qza")
Artifact.import_data("Phylogeny[Rooted]", tree).save("dec_shear_tree.qza")
!qiime empress community-plot \
--i-tree dec_shear_tree.qza \
--i-feature-table table.qza \
--m-sample-metadata-file s-meta.tsv \
--m-feature-metadata-file old_young_log.tsv \
--o-visualization fig2c.qzv
Visualization.load("fig2c.qzv")