In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.stats as stats
In [23]:
# Truncate values in colname < val to 0
def filterExpression(df, colname="TPM", val=0.01):
    df[df[colname] < val] = 0
In [3]:
# Join a list of DataFrames together, merging on the index and renaming conflicting columns
def joinFrames(frames):
    current = frames[0]
    for i, frame in enumerate(frames[1:], 2):
        current = current.join(frame, rsuffix='{}'.format(i))
    return current
In [4]:
salmon_files = ["sim_test_salmon_{}/quant.sf".format(i) for i in xrange(1,11)]
salmonDataFrames = []
for fn in salmon_files:
    # Rename #Name => Name
    df = pd.read_csv(fn, sep='\t', skiprows=11)
    df = df.rename(columns={'# Name': 'Name'})
    df = df.set_index('Name')
    salmonDataFrames.append(df)
In [5]:
kallisto_files = ["sim_test_kallisto_bs/bs_abundance_{}.txt".format(i) for i in xrange(10)]
kallistoDataFrames = []
for fn in kallisto_files:
    # Rename #Name => Name
    df = pd.read_csv(fn, sep='\t')
    df = df.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'})
    df = df.set_index('Name')
    kallistoDataFrames.append(df)
In [6]:
mergedSalmon = joinFrames(salmonDataFrames)
mergedSalmon["NumReadsAvg"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedSalmon["TPMAvg"] = mergedSalmon[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedSalmon["NumReadsStdDev"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1)
In [7]:
mergedKallisto = joinFrames(kallistoDataFrames)
mergedKallisto["NumReadsAvg"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedKallisto["TPMAvg"] = mergedKallisto[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedKallisto["NumReadsStdDev"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1)
In [8]:
truth = pd.read_csv('data/quant_bias_corrected.sf', sep='\t', skiprows=12)
# Rename #Name => Name
truth = truth.rename(columns={'# Name':'Name'})
truth = truth.set_index('Name')
filterExpression(truth)
In [9]:
salmon = pd.read_csv('sim_test_salmon_1/quant.sf', sep='\t', skiprows=11)
salmon = salmon.rename(columns={'# Name': 'Name'})
salmon = salmon.set_index('Name')
filterExpression(salmon)
In [10]:
kallisto = pd.read_csv('sim_test_kallisto/abundance.txt', sep='\t')
kallisto = kallisto.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'})
kallisto = kallisto.set_index('Name')
filterExpression(kallisto)
In [55]:
# The original TPMs were on a larger dataset, so re-normalize them
truth = truth.loc[kallisto.index]
truth = truth.fillna(0)
x = 1000000.0 * (truth["TPM"] / truth["TPM"].sum())
truth["TPM"] = x
In [56]:
# Apply some reasonable filters (Salmon does better with no filtering i.e. on the very low end)
filterExpression(mergedSalmon, colname="NumReads", val=1)
filterExpression(mergedSalmon, colname="TPM", val=0.01)
truthSalmon = truth.join(mergedSalmon, lsuffix='_truth', rsuffix='_salmon')
In [57]:
# Apply some reasonable filters 
filterExpression(mergedKallisto, colname="NumReads", val=1)
filterExpression(mergedKallisto, colname="TPM", val=0.01)

truthKallisto = truth.join(kallisto, lsuffix='_truth', rsuffix='_kallisto')
In [58]:
# correlation with the average salmon TPM 
truthSalmon.corr(method='spearman')["TPM_truth"]["TPMAvg"]
Out[58]:
0.84378228965579938
In [78]:
# and with the first replicate
truthSalmon.corr(method='spearman')["TPM_truth"]["TPM_salmon"]
Out[78]:
0.84375208053402606
In [79]:
# correlation with Kallisto
truthKallisto.corr(method='spearman')["TPM_truth"]["TPM_kallisto"]
Out[79]:
0.83658299923652046
In [80]:
p = sns.regplot(mergedSalmon["NumReadsAvg"], mergedSalmon["NumReadsStdDev"] / mergedSalmon["NumReadsAvg"], fit_reg=False)
p.set_ylabel("CV")
p.set_title("Variability Salmon")
Out[80]:
<matplotlib.text.Text at 0x1012bce90>
In [81]:
p = sns.regplot(mergedKallisto["NumReadsAvg"], mergedKallisto["NumReadsStdDev"] / mergedKallisto["NumReadsAvg"], fit_reg=False)
p.set_ylabel("CV")
p.set_title("Variability Kallisto")
Out[81]:
<matplotlib.text.Text at 0x107dbad90>
In [82]:
p = sns.regplot(np.log(truthSalmon["TPM_truth"].values+1), np.log(truthSalmon["TPMAvg"]+1), truthSalmon)
p.set_xlabel("true TPM")
p.set_ylabel("Salmon TPM")
p.set_title("Salmon TPM correlation")
stats.spearmanr(truthSalmon["TPM_truth"], truthSalmon["TPMAvg"])
Out[82]:
(0.84378228965579938, 6.9804526878296429e-307)
In [83]:
p = sns.regplot(np.log(truthKallisto["TPM_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto)
p.set_xlabel("true TPM")
p.set_ylabel("Kallisto TPM")
p.set_title("Kallisto TPM correlation")
stats.spearmanr(truthKallisto["TPM_truth"], truthKallisto["TPM_kallisto"])
Out[83]:
(0.83658299923652069, 8.2442397318752635e-297)
In [ ]: