--track-abundance
stuff¶You'll need to install the latest sourmash branch feature/abundance
for this -- something like
pip install git+https://github.com/dib-lab/sourmash.git@feature/abundance
should work if you're using a virtualenv.
%matplotlib inline
import numpy
from sourmash_lib import fig
# compare signatures, IGNORING abundance (default behavior of sourmash)
!../sourmash compare *.1.fq.sig -o noabund --ignore-abundance > out2
# running sourmash subcommand: compare loading 0Hour_ATCACG_L002001.1.fq.sig loading 0Hour_ATCACG_L002002.1.fq.sig loading 0Hour_ATCACG_L002003.1.fq.sig loading 0Hour_ATCACG_L002004.1.fq.sig loading 0Hour_ATCACG_L002005.1.fq.sig loading 12Hour_TTAGGC_L002001.1.fq.sig loading 12Hour_TTAGGC_L002002.1.fq.sig loading 12Hour_TTAGGC_L002003.1.fq.sig loading 12Hour_TTAGGC_L002004.1.fq.sig loading 18Hour_TGACCA_L002001.1.fq.sig loading 18Hour_TGACCA_L002002.1.fq.sig loading 18Hour_TGACCA_L002003.1.fq.sig loading 18Hour_TGACCA_L002004.1.fq.sig loading 18Hour_TGACCA_L002005.1.fq.sig loading 18Hour_TGACCA_L002006.1.fq.sig loading 18Hour_TGACCA_L002007.1.fq.sig loading 18Hour_TGACCA_L002008.1.fq.sig loading 24HourA_ACAGTG_L002001.1.fq.sig loading 24HourA_ACAGTG_L002002.1.fq.sig loading 24HourA_ACAGTG_L002003.1.fq.sig loading 24HourA_ACAGTG_L002004.1.fq.sig loading 24HourA_ACAGTG_L002005.1.fq.sig loading 24HourA_ACAGTG_L002006.1.fq.sig loading 24HourA_ACAGTG_L002007.1.fq.sig loading 24HourA_ACAGTG_L002008.1.fq.sig loading 24HourA_ACAGTG_L002009.1.fq.sig loading 24HourA_ACAGTG_L002010.1.fq.sig loading 24HourB_GCCAAT_L002001.1.fq.sig loading 6Hour_CGATGT_L002001.1.fq.sig loading 6Hour_CGATGT_L002002.1.fq.sig loading 6Hour_CGATGT_L002003.1.fq.sig loading 6Hour_CGATGT_L002004.1.fq.sig loading 6Hour_CGATGT_L002005.1.fq.sig min similarity in matrix: 0.119999997318 saving labels to: noabund.labels.txt saving distance matrix to: noabund
# compare signatures, now WITH abundance (signatures computed with `sourmash compute --track-abundance`)
!../sourmash compare *.1.fq.sig -o abund > out
# running sourmash subcommand: compare loading 0Hour_ATCACG_L002001.1.fq.sig loading 0Hour_ATCACG_L002002.1.fq.sig loading 0Hour_ATCACG_L002003.1.fq.sig loading 0Hour_ATCACG_L002004.1.fq.sig loading 0Hour_ATCACG_L002005.1.fq.sig loading 12Hour_TTAGGC_L002001.1.fq.sig loading 12Hour_TTAGGC_L002002.1.fq.sig loading 12Hour_TTAGGC_L002003.1.fq.sig loading 12Hour_TTAGGC_L002004.1.fq.sig loading 18Hour_TGACCA_L002001.1.fq.sig loading 18Hour_TGACCA_L002002.1.fq.sig loading 18Hour_TGACCA_L002003.1.fq.sig loading 18Hour_TGACCA_L002004.1.fq.sig loading 18Hour_TGACCA_L002005.1.fq.sig loading 18Hour_TGACCA_L002006.1.fq.sig loading 18Hour_TGACCA_L002007.1.fq.sig loading 18Hour_TGACCA_L002008.1.fq.sig loading 24HourA_ACAGTG_L002001.1.fq.sig loading 24HourA_ACAGTG_L002002.1.fq.sig loading 24HourA_ACAGTG_L002003.1.fq.sig loading 24HourA_ACAGTG_L002004.1.fq.sig loading 24HourA_ACAGTG_L002005.1.fq.sig loading 24HourA_ACAGTG_L002006.1.fq.sig loading 24HourA_ACAGTG_L002007.1.fq.sig loading 24HourA_ACAGTG_L002008.1.fq.sig loading 24HourA_ACAGTG_L002009.1.fq.sig loading 24HourA_ACAGTG_L002010.1.fq.sig loading 24HourB_GCCAAT_L002001.1.fq.sig loading 6Hour_CGATGT_L002001.1.fq.sig loading 6Hour_CGATGT_L002002.1.fq.sig loading 6Hour_CGATGT_L002003.1.fq.sig loading 6Hour_CGATGT_L002004.1.fq.sig loading 6Hour_CGATGT_L002005.1.fq.sig min similarity in matrix: 0.141103185708 saving labels to: abund.labels.txt saving distance matrix to: abund
# compute similarities from salmon output
!./distance.py *.counts.gz -o salmon
reading: 0Hour_ATCACG_L002001.quant.counts.gz reading: 0Hour_ATCACG_L002002.quant.counts.gz reading: 0Hour_ATCACG_L002003.quant.counts.gz reading: 0Hour_ATCACG_L002004.quant.counts.gz reading: 0Hour_ATCACG_L002005.quant.counts.gz reading: 12Hour_TTAGGC_L002001.quant.counts.gz reading: 12Hour_TTAGGC_L002002.quant.counts.gz reading: 12Hour_TTAGGC_L002003.quant.counts.gz reading: 12Hour_TTAGGC_L002004.quant.counts.gz reading: 18Hour_TGACCA_L002001.quant.counts.gz reading: 18Hour_TGACCA_L002002.quant.counts.gz reading: 18Hour_TGACCA_L002003.quant.counts.gz reading: 18Hour_TGACCA_L002004.quant.counts.gz reading: 18Hour_TGACCA_L002005.quant.counts.gz reading: 18Hour_TGACCA_L002006.quant.counts.gz reading: 18Hour_TGACCA_L002007.quant.counts.gz reading: 18Hour_TGACCA_L002008.quant.counts.gz reading: 24HourA_ACAGTG_L002001.quant.counts.gz reading: 24HourA_ACAGTG_L002002.quant.counts.gz reading: 24HourA_ACAGTG_L002003.quant.counts.gz reading: 24HourA_ACAGTG_L002004.quant.counts.gz reading: 24HourA_ACAGTG_L002005.quant.counts.gz reading: 24HourA_ACAGTG_L002006.quant.counts.gz reading: 24HourA_ACAGTG_L002007.quant.counts.gz reading: 24HourA_ACAGTG_L002008.quant.counts.gz reading: 24HourA_ACAGTG_L002009.quant.counts.gz reading: 24HourA_ACAGTG_L002010.quant.counts.gz reading: 24HourB_GCCAAT_L002001.quant.counts.gz reading: 6Hour_CGATGT_L002001.quant.counts.gz reading: 6Hour_CGATGT_L002002.quant.counts.gz reading: 6Hour_CGATGT_L002003.quant.counts.gz reading: 6Hour_CGATGT_L002004.quant.counts.gz reading: 6Hour_CGATGT_L002005.quant.counts.gz saving labels to: salmon.labels.txt saving distance matrix to: salmon
D2, labels2 = fig.load_matrix_and_labels('noabund')
_ = fig.plot_composite_matrix(D2, labels2)
D, labels = fig.load_matrix_and_labels('abund')
_ = fig.plot_composite_matrix(D, labels)
Here we are using dot products between the vectors of salmon output to quantify distance. (Note changed min scale.)
salmon, salmon_labels = fig.load_matrix_and_labels('salmon')
print(numpy.min(salmon), numpy.max(salmon))
_ = fig.plot_composite_matrix(salmon, salmon_labels, vmin=0.80)
0.796021488193 1.0
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa
# make a symmetric distance matrix
Dsym = 1 - (D+D.T)/2.0
print(numpy.trace(Dsym))
dm = DistanceMatrix(Dsym, labels)
# do a PCoA
pcoa_results = pcoa(dm)
1.06532935962e-06
--------------------------------------------------------------------------- DistanceMatrixError Traceback (most recent call last) <ipython-input-18-10e5a1bfc8a3> in <module>() 4 print(numpy.trace(Dsym)) 5 ----> 6 dm = DistanceMatrix(Dsym, labels) 7 8 /Users/t/dev/jup/lib/python3.5/site-packages/skbio/stats/distance/_base.py in __init__(self, data, ids) 105 ids = tuple(ids) 106 --> 107 self._validate(data, ids) 108 109 self._data = data /Users/t/dev/jup/lib/python3.5/site-packages/skbio/stats/distance/_base.py in _validate(self, data, ids) 873 874 if np.trace(data) != 0: --> 875 raise DistanceMatrixError("Data must be hollow (i.e., the diagonal" 876 " can only contain zeros).") 877 DistanceMatrixError: Data must be hollow (i.e., the diagonal can only contain zeros).
# plot, colored by timepoint.
import pandas as pd
d = {}
for l in labels:
d[l] = { 'timepoint': l.split('_')[0] }
df = pd.DataFrame.from_dict(d, orient='index')
_ = pcoa_results.plot(df=df, column='timepoint')