%matplotlib inline import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import numpy as np MEAN_FL = 250 small = 1e-2 # for log things (use same value as Harold) # Truncate values in colname < val to 0 def filterExpression(df, colname="TPM", val=0.01): df[colname][df[colname] < val] = 0 # Join a list of DataFrames together, merging on the index and renaming conflicting columns def joinFrames(frames): current = frames[0] for i, frame in enumerate(frames[1:], 2): current = current.join(frame, rsuffix='{}'.format(i)) return current salmon_files = ["sim_test_salmon_{}/quant.sf".format(i) for i in xrange(1,11)] salmonDataFrames = [] for fn in salmon_files: # Rename #Name => Name df = pd.read_csv(fn, sep='\t', skiprows=11) df = df.rename(columns={'# Name': 'Name'}) df = df.set_index('Name') salmonDataFrames.append(df) kallisto_files = ["sim_test_kallisto_bs/bs_abundance_{}.txt".format(i) for i in xrange(10)] kallistoDataFrames = [] for fn in kallisto_files: # Rename #Name => Name df = pd.read_csv(fn, sep='\t') df = df.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'}) df = df.set_index('Name') kallistoDataFrames.append(df) mergedSalmon = joinFrames(salmonDataFrames) mergedSalmon["NumReadsAvg"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedSalmon["TPMAvg"] = mergedSalmon[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedSalmon["NumReadsStdDev"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1) mergedKallisto = joinFrames(kallistoDataFrames) mergedKallisto["NumReadsAvg"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedKallisto["TPMAvg"] = mergedKallisto[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1) mergedKallisto["NumReadsStdDev"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1) salmon = pd.read_csv('sim_test_salmon_1/quant.sf', sep='\t', skiprows=11) salmon = salmon.rename(columns={'# Name': 'Name'}) salmon = salmon.set_index('Name') kallisto = pd.read_csv('sim_test_kallisto/abundance.txt', sep='\t') kallisto = kallisto.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'}) kallisto = kallisto.set_index('Name') # Load the ground truth files truth = pd.read_csv('data/quant_bias_corrected.sf', sep='\t', skiprows=12) # Rename #Name => Name truth = truth.rename(columns={'# Name':'Name'}) truth = truth.set_index('Name') # There were transcripts that *were* selected for the simulation, but they *were not* in the truth = truth.loc[kallisto.index] truth.fillna(0, inplace=True) # Since we filled all NaN's with 0, replace the length here truth["Length"] = kallisto["Length"] # Round the counts roundedCounts = truth["NumReads"].round() # The transcript fraction truth["EffectiveLen"] = truth["Length"] - MEAN_FL # Don't allow a negative effective length truth["EffectiveLen"][truth["EffectiveLen"] < 0] = 0 # Any reads from transcripts with no effective length strangeReads = roundedCounts[truth["EffectiveLen"] <= 0] print("Number of 'Strange' reads = {}".format(len(strangeReads[strangeReads > 0]))) # From the true transcript fractions, compute the true TPM with effective lengths transcriptFracs = (roundedCounts / truth["EffectiveLen"]) transcriptFracs[transcriptFracs == np.inf] = 0 trueTPMEffectiveLen = 1000000.0 * (transcriptFracs / sum(transcriptFracs.values)) truth["TPM_EL_truth"] = trueTPMEffectiveLen # From the true transcript fractions, compute the true TPM with actual lengths transcriptFracs = (roundedCounts / truth["Length"]) transcriptFracs[transcriptFracs == np.inf] = 0 trueTPMRealLen = 1000000.0 * (transcriptFracs / sum(transcriptFracs.values)) truth["TPM_RL_truth"] = trueTPMRealLen # Now, let the true counts be our rounded counts truth["NumReads"] = roundedCounts truthSalmon = truth.join(mergedSalmon, lsuffix='_truth', rsuffix='_salmon') truthKallisto = truth.join(kallisto, lsuffix='_truth', rsuffix='_kallisto') # correlation with the average salmon TPM print("TPM correlation Salmon (Avg) vs. Truth (RealLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPMAvg"])) print("TPM correlation Salmon (Avg) vs. Truth (EffLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPMAvg"])) # and with the first replicate print("TPM correlation Salmon (Sim1) vs. Truth (RealLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPM_salmon"])) print("TPM correlation Salmon (Sim1) vs. Truth (EffLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPM_salmon"])) # correlation with Kallisto print("TPM correlation Kallisto vs. Truth (RealLen) = {}".format(truthKallisto.corr(method='spearman')["TPM_RL_truth"]["TPM_kallisto"])) print("TPM correlation Kallisto vs. Truth (EffLen) = {}".format(truthKallisto.corr(method='spearman')["TPM_EL_truth"]["TPM_kallisto"])) p = sns.regplot(mergedSalmon["NumReadsAvg"], mergedSalmon["NumReadsStdDev"] / mergedSalmon["NumReadsAvg"], fit_reg=False) _ = p.set_ylabel("CV") _ = p.set_title("Variability Salmon") p = sns.regplot(mergedKallisto["NumReadsAvg"], mergedKallisto["NumReadsStdDev"] / mergedKallisto["NumReadsAvg"], fit_reg=False) _ = p.set_ylabel("CV") _ = p.set_title("Variability Kallisto") p = sns.regplot(np.log(truthSalmon["TPM_RL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon) p.set_xlabel("true TPM (RealLen)") p.set_ylabel("Salmon (Sim1) TPM") p.set_title("Salmon TPM (Sim1) correlation") sr = truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPM_salmon"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthSalmon["TPM_EL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon) p.set_xlabel("true TPM (EffLen)") p.set_ylabel("Salmon (Sim1) TPM") p.set_title("Salmon TPM (Sim1) correlation") sr = truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPM_salmon"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthKallisto["TPM_RL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto) p.set_xlabel("true TPM (RealLen)") p.set_ylabel("Kallisto (FullData) TPM") p.set_title("Kallisto TPM correlation") sr = truthKallisto.corr(method='spearman')["TPM_RL_truth"]["TPM_kallisto"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthKallisto["TPM_EL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto) p.set_xlabel("true TPM (EffLen)") p.set_ylabel("Kallisto (FullData) TPM") p.set_title("Kallisto TPM correlation") sr = truthKallisto.corr(method='spearman')["TPM_EL_truth"]["TPM_kallisto"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthSalmon["NumReads_salmon"]+1), truthSalmon) p.set_xlabel("true NumReads") p.set_ylabel("Salmon (Sim1) NumReads") p.set_title("Salmon NumReads correlation") sr = truthSalmon.corr(method='spearman')["NumReads_truth"]["NumReads_salmon"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthKallisto["NumReads_kallisto"]+1), truthKallisto) p.set_xlabel("true NumReads") p.set_ylabel("Kallisto (Sim1) NumReads") p.set_title("Kallisto NumReads correlation") sr = truthKallisto.corr(method="spearman")["NumReads_truth"]["NumReads_kallisto"] print("Spearman r = {}".format(sr)) # Maintain an un-filtered copy truthSalmonUnfiltered = truthSalmon.copy() truthKallistoUnfiltered = truthKallisto.copy() # No big changes, but let's see how everything looks if we apply some "reasonable" cutoffs filterExpression(truthSalmon, colname="NumReads_salmon", val=1) filterExpression(truthSalmon, colname="TPM_salmon", val=0.01) filterExpression(truthSalmon, colname="TPMAvg", val=0.01) filterExpression(truthSalmon, colname="NumReads_truth", val=1) filterExpression(truthSalmon, colname="TPM_RL_truth", val=0.01) filterExpression(truthSalmon, colname="TPM_EL_truth", val=0.01) filterExpression(truthKallisto, colname="NumReads_kallisto", val=1) filterExpression(truthKallisto, colname="TPM_kallisto", val=0.01) filterExpression(truthKallisto, colname="NumReads_truth", val=1) filterExpression(truthKallisto, colname="TPM_RL_truth", val=0.01) filterExpression(truthKallisto, colname="TPM_EL_truth", val=0.01) p = sns.regplot(np.log(truthSalmon["TPM_RL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon) p.set_xlabel("true TPM (RealLen)") p.set_ylabel("Salmon (Sim1) TPM") p.set_title("Salmon TPM (Sim1) correlation") sr = truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPM_salmon"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthSalmon["TPM_EL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon) p.set_xlabel("true TPM (EffLen)") p.set_ylabel("Salmon (Sim1) TPM") p.set_title("Salmon TPM (Sim1) correlation") sr = truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPM_salmon"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthKallisto["TPM_RL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto) p.set_xlabel("true TPM (RealLen)") p.set_ylabel("Kallisto (FullData) TPM") p.set_title("Kallisto TPM correlation") sr = truthKallisto.corr(method='spearman')["TPM_RL_truth"]["TPM_kallisto"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthKallisto["TPM_EL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto) p.set_xlabel("true TPM (EffLen)") p.set_ylabel("Kallisto (FullData) TPM") p.set_title("Kallisto TPM correlation") sr = truthKallisto.corr(method='spearman')["TPM_EL_truth"]["TPM_kallisto"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthSalmon["NumReads_salmon"]+1), truthSalmon) p.set_xlabel("true NumReads") p.set_ylabel("Salmon (Sim1) NumReads") p.set_title("Salmon NumReads correlation") sr = truthSalmon.corr(method='spearman')["NumReads_truth"]["NumReads_salmon"] print("Spearman r = {}".format(sr)) p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthKallisto["NumReads_kallisto"]+1), truthKallisto) p.set_xlabel("true NumReads") p.set_ylabel("Kallisto (Sim1) NumReads") p.set_title("Kallisto NumReads correlation") sr = truthKallisto.corr(method="spearman")["NumReads_truth"]["NumReads_kallisto"] print("Spearman r = {}".format(sr)) def relDiff(c1, c2, DF, verbose=False): rd = pd.DataFrame(data = {"Name" : DF.index, "relDiff" : np.zeros(len(DF.index))*np.nan}) rd.set_index("Name", inplace=True) bothZero = DF[(DF[c1] == 0) & (DF[c2] == 0)].index nonZero = DF.index - bothZero if (verbose): print("Zero occurs in both columns {} times".format(len(rd.loc[bothZero]))) print("Nonzero occurs in at least 1 column {} times".format(len(rd.loc[nonZero]))) allRD = 2.0 * ((DF[c1] - DF[c2]) / (DF[c1] + DF[c2]).abs()) assert(len(rd.loc[nonZero]["relDiff"]) == len(allRD[nonZero])) rd["relDiff"][nonZero] = allRD[nonZero] rd["relDiff"][bothZero] = 0.0 return rd, nonZero rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", truthSalmonUnfiltered) medRelDiffSalmonUnfilt = rdS["relDiff"][nzS].median() X = truthSalmonUnfiltered.copy() filterExpression(X, colname='TPM_salmon', val=1e-30) rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", X) medRelDiffSalmonFilt = rdS["relDiff"][nzS].median() filterExpression(X, colname='TPM_RL_truth', val=1e-30) rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", X) medRelDiffSalmonFiltBoth = rdS["relDiff"][nzS].median() print("Median (non-zero) relative differences\n========================================") print("Unfiltered = {}".format(medRelDiffSalmonUnfilt)) print("Filtered (salmon TPM < 1e-30 => 0.0) = {}".format(medRelDiffSalmonFilt)) print("Filtered (salmon TPM < 1e-30 => 0.0 and true TPM < 1e-30 => 0.0) = {}".format(medRelDiffSalmonFiltBoth)) rdK, nzK = relDiff("TPM_kallisto", "TPM_RL_truth", truthKallisto) medRelDiffKallisto = rdK["relDiff"].median() rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", truthSalmon) medRelDiffSalmon = rdS["relDiff"].median() relDiffTPM = pd.DataFrame(rdK).join(pd.DataFrame(rdS), lsuffix='_kallisto', rsuffix='_salmon') print("Kallisto") print("===============================") print("Median relative difference = {}".format(medRelDiffKallisto)) print("Median relative difference (non-zero) = {}".format(rdK["relDiff"][nzK].median())) print("Mean relative difference = {}".format(rdK["relDiff"].mean())) print("\n\nSalmon") print("===============================") print("Median relative difference = {}".format(medRelDiffSalmon)) print("Median relative difference (non-zero) = {}".format(rdS["relDiff"][nzS].median())) print("Mean relative difference = {}".format(rdS["relDiff"].mean())) p = plt.hist(relDiffTPM["relDiff_salmon"].values, bins=100) _ = plt.axes().set_ylim((0,700)) _ = plt.axes().set_title("Salmon TPM Relative Difference") p = plt.hist(relDiffTPM["relDiff_kallisto"].values, bins=100) _ = plt.axes().set_ylim((0,700)) _ = plt.axes().set_title("Kallisto TPM Relative Difference") f, ax = plt.subplots(2, 1, sharex=True) p = sns.regplot(truthSalmon["TPM_RL_truth"].values, rdS["relDiff"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[0]) p2 = sns.regplot(truthKallisto["TPM_RL_truth"].values, rdK["relDiff"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[1]) _ = ax[0].semilogx() _ = ax[1].semilogx() _ = ax[0].set_xlim(1, 2e5) _ = ax[0].set_ylabel("Rel. diff Salmon") _ = ax[1].set_ylabel("Rel. diff Kallisto") _ = ax[1].set_xlabel("True TPM") _ = ax[0].set_title("Rel. diff vs. TPM") rdK, nzK = relDiff("NumReads_kallisto", "NumReads_truth", truthKallisto) medRelDiffKallisto = rdK["relDiff"].median() rdS, nzS = relDiff("NumReads_salmon", "NumReads_truth", truthSalmon) medRelDiffSalmon = rdS["relDiff"].median() relDiffReads = pd.DataFrame(rdK).join(pd.DataFrame(rdS), lsuffix='_kallisto', rsuffix='_salmon') print("Kallisto") print("===============================") print("Median relative difference = {}".format(medRelDiffKallisto)) print("Median relative difference (non-zero) = {}".format(rdK["relDiff"][nzK].median())) print("Mean relative difference = {}\n".format(rdK["relDiff"].mean())) print("\n\nSalmon") print("===============================") print("Median relative difference = {}".format(medRelDiffSalmon)) print("Median relative difference (non-zero) = {}".format(rdS["relDiff"][nzS].median())) print("Mean relative difference = {}\n".format(rdS["relDiff"].mean())) p = plt.hist(np.clip(relDiffReads["relDiff_salmon"], -2.0, 2.0), bins=100) _ = plt.axes().set_ylim((0,800)) _ = plt.axes().set_title("Salmon Reads Relative Difference") p = plt.hist(np.clip(relDiffReads["relDiff_kallisto"], -2.0, 2.0), bins=100) _ = plt.axes().set_ylim((0,800)) _ = plt.axes().set_title("Kallisto Reads Relative Difference") f, ax = plt.subplots(2, 1, sharex=True) p = sns.regplot(truthSalmon["NumReads_truth"].values, relDiffReads["relDiff_salmon"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[0]) p2 = sns.regplot(truthKallisto["NumReads_truth"].values, relDiffReads["relDiff_kallisto"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[1]) _ = ax[0].semilogx() _ = ax[1].semilogx() _ = ax[0].set_xlim(1, 2e5) _ = ax[0].set_ylabel("Rel. diff Salmon") _ = ax[1].set_ylabel("Rel. diff Kallisto") _ = ax[1].set_xlabel("True Counts") _ = ax[0].set_title("Rel. diff vs. Read Count")