%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

MEAN_FL = 250
small = 1e-2 # for log things (use same value as Harold)

# Truncate values in colname < val to 0
def filterExpression(df, colname="TPM", val=0.01):
    df[colname][df[colname] < val] = 0

# Join a list of DataFrames together, merging on the index and renaming conflicting columns
def joinFrames(frames):
    current = frames[0]
    for i, frame in enumerate(frames[1:], 2):
        current = current.join(frame, rsuffix='{}'.format(i))
    return current

salmon_files = ["sim_test_salmon_{}/quant.sf".format(i) for i in xrange(1,11)]
salmonDataFrames = []
for fn in salmon_files:
    # Rename #Name => Name
    df = pd.read_csv(fn, sep='\t', skiprows=11)
    df = df.rename(columns={'# Name': 'Name'})
    df = df.set_index('Name')
    salmonDataFrames.append(df)

kallisto_files = ["sim_test_kallisto_bs/bs_abundance_{}.txt".format(i) for i in xrange(10)]
kallistoDataFrames = []
for fn in kallisto_files:
    # Rename #Name => Name
    df = pd.read_csv(fn, sep='\t')
    df = df.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'})
    df = df.set_index('Name')
    kallistoDataFrames.append(df)

mergedSalmon = joinFrames(salmonDataFrames)
mergedSalmon["NumReadsAvg"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedSalmon["TPMAvg"] = mergedSalmon[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedSalmon["NumReadsStdDev"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1)

mergedKallisto = joinFrames(kallistoDataFrames)
mergedKallisto["NumReadsAvg"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedKallisto["TPMAvg"] = mergedKallisto[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedKallisto["NumReadsStdDev"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1)

salmon = pd.read_csv('sim_test_salmon_1/quant.sf', sep='\t', skiprows=11)
salmon = salmon.rename(columns={'# Name': 'Name'})
salmon = salmon.set_index('Name')

kallisto = pd.read_csv('sim_test_kallisto/abundance.txt', sep='\t')
kallisto = kallisto.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'})
kallisto = kallisto.set_index('Name')

# Load the ground truth files
truth = pd.read_csv('data/quant_bias_corrected.sf', sep='\t', skiprows=12)
# Rename #Name => Name
truth = truth.rename(columns={'# Name':'Name'})
truth = truth.set_index('Name')

# There were transcripts that *were* selected for the simulation, but they *were not* in the
truth = truth.loc[kallisto.index]
truth.fillna(0, inplace=True)
# Since we filled all NaN's with 0, replace the length here
truth["Length"] = kallisto["Length"]
# Round the counts
roundedCounts = truth["NumReads"].round()

# The transcript fraction 
truth["EffectiveLen"] = truth["Length"] - MEAN_FL
# Don't allow a negative effective length
truth["EffectiveLen"][truth["EffectiveLen"] < 0] = 0

# Any reads from transcripts with no effective length
strangeReads = roundedCounts[truth["EffectiveLen"] <= 0]
print("Number of 'Strange' reads = {}".format(len(strangeReads[strangeReads > 0])))

# From the true transcript fractions, compute the true TPM with effective lengths
transcriptFracs = (roundedCounts / truth["EffectiveLen"])
transcriptFracs[transcriptFracs == np.inf] = 0
trueTPMEffectiveLen = 1000000.0 * (transcriptFracs / sum(transcriptFracs.values))
truth["TPM_EL_truth"] = trueTPMEffectiveLen

# From the true transcript fractions, compute the true TPM with actual lengths
transcriptFracs = (roundedCounts / truth["Length"])
transcriptFracs[transcriptFracs == np.inf] = 0
trueTPMRealLen = 1000000.0 * (transcriptFracs / sum(transcriptFracs.values))
truth["TPM_RL_truth"] = trueTPMRealLen

# Now, let the true counts be our rounded counts
truth["NumReads"] = roundedCounts

truthSalmon = truth.join(mergedSalmon, lsuffix='_truth', rsuffix='_salmon')

truthKallisto = truth.join(kallisto, lsuffix='_truth', rsuffix='_kallisto')

# correlation with the average salmon TPM 
print("TPM correlation Salmon (Avg) vs. Truth (RealLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPMAvg"]))
print("TPM correlation Salmon (Avg) vs. Truth (EffLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPMAvg"]))

# and with the first replicate
print("TPM correlation Salmon (Sim1) vs. Truth (RealLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPM_salmon"]))
print("TPM correlation Salmon (Sim1) vs. Truth (EffLen) = {}".format(truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPM_salmon"]))

# correlation with Kallisto
print("TPM correlation Kallisto vs. Truth (RealLen) = {}".format(truthKallisto.corr(method='spearman')["TPM_RL_truth"]["TPM_kallisto"]))
print("TPM correlation Kallisto vs. Truth (EffLen) = {}".format(truthKallisto.corr(method='spearman')["TPM_EL_truth"]["TPM_kallisto"]))

p = sns.regplot(mergedSalmon["NumReadsAvg"], mergedSalmon["NumReadsStdDev"] / mergedSalmon["NumReadsAvg"], fit_reg=False)
_ = p.set_ylabel("CV")
_ = p.set_title("Variability Salmon")

p = sns.regplot(mergedKallisto["NumReadsAvg"], mergedKallisto["NumReadsStdDev"] / mergedKallisto["NumReadsAvg"], fit_reg=False)
_ = p.set_ylabel("CV")
_ = p.set_title("Variability Kallisto")

p = sns.regplot(np.log(truthSalmon["TPM_RL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon)
p.set_xlabel("true TPM (RealLen)")
p.set_ylabel("Salmon (Sim1) TPM")
p.set_title("Salmon TPM (Sim1) correlation")
sr = truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPM_salmon"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthSalmon["TPM_EL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon)
p.set_xlabel("true TPM (EffLen)")
p.set_ylabel("Salmon (Sim1) TPM")
p.set_title("Salmon TPM (Sim1) correlation")
sr = truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPM_salmon"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthKallisto["TPM_RL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto)
p.set_xlabel("true TPM (RealLen)")
p.set_ylabel("Kallisto (FullData) TPM")
p.set_title("Kallisto TPM correlation")
sr = truthKallisto.corr(method='spearman')["TPM_RL_truth"]["TPM_kallisto"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthKallisto["TPM_EL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto)
p.set_xlabel("true TPM (EffLen)")
p.set_ylabel("Kallisto (FullData) TPM")
p.set_title("Kallisto TPM correlation")
sr = truthKallisto.corr(method='spearman')["TPM_EL_truth"]["TPM_kallisto"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthSalmon["NumReads_salmon"]+1), truthSalmon)
p.set_xlabel("true NumReads")
p.set_ylabel("Salmon (Sim1) NumReads")
p.set_title("Salmon NumReads correlation")
sr = truthSalmon.corr(method='spearman')["NumReads_truth"]["NumReads_salmon"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthKallisto["NumReads_kallisto"]+1), truthKallisto)
p.set_xlabel("true NumReads")
p.set_ylabel("Kallisto (Sim1) NumReads")
p.set_title("Kallisto NumReads correlation")
sr = truthKallisto.corr(method="spearman")["NumReads_truth"]["NumReads_kallisto"]
print("Spearman r = {}".format(sr))

# Maintain an un-filtered copy
truthSalmonUnfiltered = truthSalmon.copy()
truthKallistoUnfiltered = truthKallisto.copy()

# No big changes, but let's see how everything looks if we apply some "reasonable" cutoffs
filterExpression(truthSalmon, colname="NumReads_salmon", val=1)
filterExpression(truthSalmon, colname="TPM_salmon", val=0.01)
filterExpression(truthSalmon, colname="TPMAvg", val=0.01)
filterExpression(truthSalmon, colname="NumReads_truth", val=1)
filterExpression(truthSalmon, colname="TPM_RL_truth", val=0.01)
filterExpression(truthSalmon, colname="TPM_EL_truth", val=0.01)

filterExpression(truthKallisto, colname="NumReads_kallisto", val=1)
filterExpression(truthKallisto, colname="TPM_kallisto", val=0.01)
filterExpression(truthKallisto, colname="NumReads_truth", val=1)
filterExpression(truthKallisto, colname="TPM_RL_truth", val=0.01)
filterExpression(truthKallisto, colname="TPM_EL_truth", val=0.01)

p = sns.regplot(np.log(truthSalmon["TPM_RL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon)
p.set_xlabel("true TPM (RealLen)")
p.set_ylabel("Salmon (Sim1) TPM")
p.set_title("Salmon TPM (Sim1) correlation")
sr = truthSalmon.corr(method='spearman')["TPM_RL_truth"]["TPM_salmon"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthSalmon["TPM_EL_truth"].values+1), np.log(truthSalmon["TPM_salmon"]+1), truthSalmon)
p.set_xlabel("true TPM (EffLen)")
p.set_ylabel("Salmon (Sim1) TPM")
p.set_title("Salmon TPM (Sim1) correlation")
sr = truthSalmon.corr(method='spearman')["TPM_EL_truth"]["TPM_salmon"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthKallisto["TPM_RL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto)
p.set_xlabel("true TPM (RealLen)")
p.set_ylabel("Kallisto (FullData) TPM")
p.set_title("Kallisto TPM correlation")
sr = truthKallisto.corr(method='spearman')["TPM_RL_truth"]["TPM_kallisto"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthKallisto["TPM_EL_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto)
p.set_xlabel("true TPM (EffLen)")
p.set_ylabel("Kallisto (FullData) TPM")
p.set_title("Kallisto TPM correlation")
sr = truthKallisto.corr(method='spearman')["TPM_EL_truth"]["TPM_kallisto"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthSalmon["NumReads_salmon"]+1), truthSalmon)
p.set_xlabel("true NumReads")
p.set_ylabel("Salmon (Sim1) NumReads")
p.set_title("Salmon NumReads correlation")
sr = truthSalmon.corr(method='spearman')["NumReads_truth"]["NumReads_salmon"]
print("Spearman r = {}".format(sr))

p = sns.regplot(np.log(truthSalmon["NumReads_truth"].values+1), np.log(truthKallisto["NumReads_kallisto"]+1), truthKallisto)
p.set_xlabel("true NumReads")
p.set_ylabel("Kallisto (Sim1) NumReads")
p.set_title("Kallisto NumReads correlation")
sr = truthKallisto.corr(method="spearman")["NumReads_truth"]["NumReads_kallisto"]
print("Spearman r = {}".format(sr))

def relDiff(c1, c2, DF, verbose=False):
    rd = pd.DataFrame(data = {"Name" : DF.index, "relDiff" : np.zeros(len(DF.index))*np.nan})
    rd.set_index("Name", inplace=True)
    bothZero = DF[(DF[c1] == 0) & (DF[c2] == 0)].index
    nonZero = DF.index - bothZero
    if (verbose):
        print("Zero occurs in both columns {} times".format(len(rd.loc[bothZero])))
        print("Nonzero occurs in at least 1 column {} times".format(len(rd.loc[nonZero])))
    allRD = 2.0 * ((DF[c1] - DF[c2]) / (DF[c1] + DF[c2]).abs())
    assert(len(rd.loc[nonZero]["relDiff"]) == len(allRD[nonZero]))
    rd["relDiff"][nonZero] = allRD[nonZero]
    rd["relDiff"][bothZero] = 0.0
    return rd, nonZero

rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", truthSalmonUnfiltered)
medRelDiffSalmonUnfilt = rdS["relDiff"][nzS].median()

X = truthSalmonUnfiltered.copy()
filterExpression(X, colname='TPM_salmon', val=1e-30)

rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", X)
medRelDiffSalmonFilt = rdS["relDiff"][nzS].median()

filterExpression(X, colname='TPM_RL_truth', val=1e-30)

rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", X)
medRelDiffSalmonFiltBoth = rdS["relDiff"][nzS].median()

print("Median (non-zero) relative differences\n========================================")
print("Unfiltered = {}".format(medRelDiffSalmonUnfilt))
print("Filtered (salmon TPM < 1e-30 => 0.0) = {}".format(medRelDiffSalmonFilt))
print("Filtered (salmon TPM < 1e-30 => 0.0 and true TPM < 1e-30 => 0.0) = {}".format(medRelDiffSalmonFiltBoth))

rdK, nzK = relDiff("TPM_kallisto", "TPM_RL_truth", truthKallisto)
medRelDiffKallisto = rdK["relDiff"].median()

rdS, nzS = relDiff("TPM_salmon", "TPM_RL_truth", truthSalmon)
medRelDiffSalmon = rdS["relDiff"].median()
relDiffTPM = pd.DataFrame(rdK).join(pd.DataFrame(rdS), lsuffix='_kallisto', rsuffix='_salmon')

print("Kallisto")
print("===============================")
print("Median relative difference = {}".format(medRelDiffKallisto))
print("Median relative difference (non-zero) = {}".format(rdK["relDiff"][nzK].median()))
print("Mean relative difference = {}".format(rdK["relDiff"].mean()))

print("\n\nSalmon")
print("===============================")
print("Median relative difference = {}".format(medRelDiffSalmon))
print("Median relative difference (non-zero) = {}".format(rdS["relDiff"][nzS].median()))
print("Mean relative difference = {}".format(rdS["relDiff"].mean()))

p = plt.hist(relDiffTPM["relDiff_salmon"].values, bins=100)
_ = plt.axes().set_ylim((0,700))
_ = plt.axes().set_title("Salmon TPM Relative Difference")

p = plt.hist(relDiffTPM["relDiff_kallisto"].values, bins=100)
_ = plt.axes().set_ylim((0,700))
_ = plt.axes().set_title("Kallisto TPM Relative Difference")

f, ax = plt.subplots(2, 1, sharex=True)
p = sns.regplot(truthSalmon["TPM_RL_truth"].values, rdS["relDiff"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[0])
p2 = sns.regplot(truthKallisto["TPM_RL_truth"].values, rdK["relDiff"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[1])
_ = ax[0].semilogx()
_ = ax[1].semilogx()
_ = ax[0].set_xlim(1, 2e5)
_ = ax[0].set_ylabel("Rel. diff Salmon")
_ = ax[1].set_ylabel("Rel. diff Kallisto")
_ = ax[1].set_xlabel("True TPM")
_ = ax[0].set_title("Rel. diff vs. TPM")

rdK, nzK = relDiff("NumReads_kallisto", "NumReads_truth", truthKallisto)
medRelDiffKallisto = rdK["relDiff"].median()

rdS, nzS = relDiff("NumReads_salmon", "NumReads_truth", truthSalmon)
medRelDiffSalmon = rdS["relDiff"].median()
relDiffReads = pd.DataFrame(rdK).join(pd.DataFrame(rdS), lsuffix='_kallisto', rsuffix='_salmon')

print("Kallisto")
print("===============================")
print("Median relative difference = {}".format(medRelDiffKallisto))
print("Median relative difference (non-zero) = {}".format(rdK["relDiff"][nzK].median()))
print("Mean relative difference = {}\n".format(rdK["relDiff"].mean()))

print("\n\nSalmon")
print("===============================")
print("Median relative difference = {}".format(medRelDiffSalmon))
print("Median relative difference (non-zero) = {}".format(rdS["relDiff"][nzS].median()))
print("Mean relative difference = {}\n".format(rdS["relDiff"].mean()))

p = plt.hist(np.clip(relDiffReads["relDiff_salmon"], -2.0, 2.0), bins=100)
_ = plt.axes().set_ylim((0,800))
_ = plt.axes().set_title("Salmon Reads Relative Difference")

p = plt.hist(np.clip(relDiffReads["relDiff_kallisto"], -2.0, 2.0), bins=100)
_ = plt.axes().set_ylim((0,800))
_ = plt.axes().set_title("Kallisto Reads Relative Difference")

f, ax = plt.subplots(2, 1, sharex=True)
p = sns.regplot(truthSalmon["NumReads_truth"].values, relDiffReads["relDiff_salmon"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[0])
p2 = sns.regplot(truthKallisto["NumReads_truth"].values, relDiffReads["relDiff_kallisto"].values, scatter_kws={'alpha' : 0.3}, fit_reg=False, ax=ax[1])
_ = ax[0].semilogx()
_ = ax[1].semilogx()
_ = ax[0].set_xlim(1, 2e5)
_ = ax[0].set_ylabel("Rel. diff Salmon")
_ = ax[1].set_ylabel("Rel. diff Kallisto")
_ = ax[1].set_xlabel("True Counts")
_ = ax[0].set_title("Rel. diff vs. Read Count")