%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.stats as stats

# Truncate values in colname < val to 0
def filterExpression(df, colname="TPM", val=0.01):
    df[df[colname] < val] = 0

# Join a list of DataFrames together, merging on the index and renaming conflicting columns
def joinFrames(frames):
    current = frames[0]
    for i, frame in enumerate(frames[1:], 2):
        current = current.join(frame, rsuffix='{}'.format(i))
    return current

salmon_files = ["sim_test_salmon_{}/quant.sf".format(i) for i in xrange(1,11)]
salmonDataFrames = []
for fn in salmon_files:
    # Rename #Name => Name
    df = pd.read_csv(fn, sep='\t', skiprows=11)
    df = df.rename(columns={'# Name': 'Name'})
    df = df.set_index('Name')
    salmonDataFrames.append(df)

kallisto_files = ["sim_test_kallisto_bs/bs_abundance_{}.txt".format(i) for i in xrange(10)]
kallistoDataFrames = []
for fn in kallisto_files:
    # Rename #Name => Name
    df = pd.read_csv(fn, sep='\t')
    df = df.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'})
    df = df.set_index('Name')
    kallistoDataFrames.append(df)

mergedSalmon = joinFrames(salmonDataFrames)
mergedSalmon["NumReadsAvg"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedSalmon["TPMAvg"] = mergedSalmon[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedSalmon["NumReadsStdDev"] = mergedSalmon[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1)

mergedKallisto = joinFrames(kallistoDataFrames)
mergedKallisto["NumReadsAvg"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedKallisto["TPMAvg"] = mergedKallisto[["TPM"] + ["TPM{}".format(i) for i in xrange(2,11)]].mean(axis=1)
mergedKallisto["NumReadsStdDev"] = mergedKallisto[["NumReads"] + ["NumReads{}".format(i) for i in xrange(2,11)]].std(axis=1)

truth = pd.read_csv('data/quant_bias_corrected.sf', sep='\t', skiprows=12)
# Rename #Name => Name
truth = truth.rename(columns={'# Name':'Name'})
truth = truth.set_index('Name')
filterExpression(truth)

salmon = pd.read_csv('sim_test_salmon_1/quant.sf', sep='\t', skiprows=11)
salmon = salmon.rename(columns={'# Name': 'Name'})
salmon = salmon.set_index('Name')
filterExpression(salmon)

kallisto = pd.read_csv('sim_test_kallisto/abundance.txt', sep='\t')
kallisto = kallisto.rename(columns={'target_id' : 'Name', 'length' : 'Length', 'est_counts' : 'NumReads', 'tpm' : 'TPM'})
kallisto = kallisto.set_index('Name')
filterExpression(kallisto)

# The original TPMs were on a larger dataset, so re-normalize them
truth = truth.loc[kallisto.index]
truth = truth.fillna(0)
x = 1000000.0 * (truth["TPM"] / truth["TPM"].sum())
truth["TPM"] = x

# Apply some reasonable filters (Salmon does better with no filtering i.e. on the very low end)
filterExpression(mergedSalmon, colname="NumReads", val=1)
filterExpression(mergedSalmon, colname="TPM", val=0.01)
truthSalmon = truth.join(mergedSalmon, lsuffix='_truth', rsuffix='_salmon')

# Apply some reasonable filters 
filterExpression(mergedKallisto, colname="NumReads", val=1)
filterExpression(mergedKallisto, colname="TPM", val=0.01)

truthKallisto = truth.join(kallisto, lsuffix='_truth', rsuffix='_kallisto')

# correlation with the average salmon TPM 
truthSalmon.corr(method='spearman')["TPM_truth"]["TPMAvg"]

# and with the first replicate
truthSalmon.corr(method='spearman')["TPM_truth"]["TPM_salmon"]

# correlation with Kallisto
truthKallisto.corr(method='spearman')["TPM_truth"]["TPM_kallisto"]

p = sns.regplot(mergedSalmon["NumReadsAvg"], mergedSalmon["NumReadsStdDev"] / mergedSalmon["NumReadsAvg"], fit_reg=False)
p.set_ylabel("CV")
p.set_title("Variability Salmon")

p = sns.regplot(mergedKallisto["NumReadsAvg"], mergedKallisto["NumReadsStdDev"] / mergedKallisto["NumReadsAvg"], fit_reg=False)
p.set_ylabel("CV")
p.set_title("Variability Kallisto")

p = sns.regplot(np.log(truthSalmon["TPM_truth"].values+1), np.log(truthSalmon["TPMAvg"]+1), truthSalmon)
p.set_xlabel("true TPM")
p.set_ylabel("Salmon TPM")
p.set_title("Salmon TPM correlation")
stats.spearmanr(truthSalmon["TPM_truth"], truthSalmon["TPMAvg"])

p = sns.regplot(np.log(truthKallisto["TPM_truth"].values+1), np.log(truthKallisto["TPM_kallisto"].values+1), truthKallisto)
p.set_xlabel("true TPM")
p.set_ylabel("Kallisto TPM")
p.set_title("Kallisto TPM correlation")
stats.spearmanr(truthKallisto["TPM_truth"], truthKallisto["TPM_kallisto"])