%matplotlib inline import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import numpy as np import scipy as sp import scipy.stats as stats # Truncate values in colname < val to 0 def filterExpression(df, colname="TPM", val=0.01): df[df[colname] < val] = 0 # Join a list of DataFrames together, merging on the index and renaming conflicting columns def joinFrames(frames): current = frames[0] for i, frame in enumerate(frames[1:], 2): current = current.join(frame, rsuffix='{}'.format(i)) return current salmon_files = ["sim_test_salmon_{}/quant.sf".format(i) for i in xrange(1,11)] salmonDataFrames = [] for fn in salmon_files: # Rename #Name => Name df = pd.read_csv(fn, sep='\t', skiprows=11) df = df.rename(columns={'# Name': 'Name'}) df = df.set_index('Name') salmonDataFrames.append(df) kallisto_files = ["sim_test_kallisto_bs/bs_abundance_{}.txt".format(i) for i in xrange(10)] kallistoDataFrames = [] for fn in kallisto_files: # Rename #Name => Name df = pd.read_csv(fn, sep='\t') df = df.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'}) df = df.set_index('Name') kallistoDataFrames.append(df) mergedSalmon = joinFrames(salmonDataFrames) mergedSalmon["NumReadsAvg"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedSalmon["TPMAvg"] = mergedSalmon[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedSalmon["NumReadsStdDev"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1) mergedKallisto = joinFrames(kallistoDataFrames) mergedKallisto["NumReadsAvg"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedKallisto["TPMAvg"] = mergedKallisto[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedKallisto["NumReadsStdDev"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1) truth = pd.read_csv('data/quant_bias_corrected.sf', sep='\t', skiprows=12) # Rename #Name => Name truth = truth.rename(columns={'# Name':'Name'}) truth = truth.set_index('Name') filterExpression(truth) salmon = pd.read_csv('sim_test_salmon_1/quant.sf', sep='\t', skiprows=11) salmon = salmon.rename(columns={'# Name': 'Name'}) salmon = salmon.set_index('Name') filterExpression(salmon) kallisto = pd.read_csv('sim_test_kallisto/abundance.txt', sep='\t') kallisto = kallisto.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'}) kallisto = kallisto.set_index('Name') filterExpression(kallisto) # The original TPMs were on a larger dataset, so re-normalize them truth = truth.loc[kallisto.index] truth = truth.fillna(0) x = 1000000.0 * (truth["TPM"] / truth["TPM"].sum()) truth["TPM"] = x # Apply some reasonable filters (Salmon does better with no filtering i.e. on the very low end) filterExpression(mergedSalmon, colname="NumReads", val=1) filterExpression(mergedSalmon, colname="TPM", val=0.01) truthSalmon = truth.join(mergedSalmon, lsuffix='_truth', rsuffix='_salmon') # Apply some reasonable filters filterExpression(mergedKallisto, colname="NumReads", val=1) filterExpression(mergedKallisto, colname="TPM", val=0.01) truthKallisto = truth.join(kallisto, lsuffix='_truth', rsuffix='_kallisto') # correlation with the average salmon TPM truthSalmon.corr(method='spearman')["TPM_truth"]["TPMAvg"] # and with the first replicate truthSalmon.corr(method='spearman')["TPM_truth"]["TPM_salmon"] # correlation with Kallisto truthKallisto.corr(method='spearman')["TPM_truth"]["TPM_kallisto"] p = sns.regplot(mergedSalmon["NumReadsAvg"], mergedSalmon["NumReadsStdDev"] / mergedSalmon["NumReadsAvg"], fit_reg=False) p.set_ylabel("CV") p.set_title("Variability Salmon") p = sns.regplot(mergedKallisto["NumReadsAvg"], mergedKallisto["NumReadsStdDev"] / mergedKallisto["NumReadsAvg"], fit_reg=False) p.set_ylabel("CV") p.set_title("Variability Kallisto") p = sns.regplot(np.log(truthSalmon["TPM_truth"].values+1), np.log(truthSalmon["TPMAvg"]+1), truthSalmon) p.set_xlabel("true TPM") p.set_ylabel("Salmon TPM") p.set_title("Salmon TPM correlation") stats.spearmanr(truthSalmon["TPM_truth"], truthSalmon["TPMAvg"]) p = sns.regplot(np.log(truthKallisto["TPM_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto) p.set_xlabel("true TPM") p.set_ylabel("Kallisto TPM") p.set_title("Kallisto TPM correlation") stats.spearmanr(truthKallisto["TPM_truth"], truthKallisto["TPM_kallisto"])