%matplotlib inline

import time
import pandas as pd
import os
from matplotlib_venn import venn3

# CONSTANTS
DEBUG = True
DBDIR = os.path.join("..", "databases")

# HELPER FUNCTIONS
def debug(*args):
    """ Prints to stdout only if DEBUG is True. """
    if DEBUG:
        print(*args)
            
def timed(function):
    """ A decorator that prints the running time of the function call. """
    def inner(*args, **kwargs):
        start_time = time.time()
        result = function(*args, **kwargs)
        print("{} seconds".format(time.time() - start_time))
        return result
    return inner

#@timed
def read_csv(path, sep='\t', filter=None, chunksize=100000, skiprows=None):
    """
    Reads a csv file from a given filepath into a pandas Data frame.
    Files are iterated over with a chunksize of 100000 by default.
    Rows can be filtered so that only rows with certain values in a column pass.
    All rows that do not pass the filter are immediately discarded.
    When using this to load eQTLs, you can include a tuple for filtering based on gene names
    (such as VIP genes from PharmGKB).
    The first element of the tuple is the name of the field to be filtered on,
    the second element is a list of values that pass the filter.
    """
    print("Reading file: {}...".format(path))
    iterator = pd.read_csv(path, sep=sep, chunksize=chunksize, skiprows=skiprows)
    if filter is None:
        df = pd.concat(iterator)
    else:
        df = pd.concat([chunk[chunk[filter[0]].isin(filter[1])] for chunk in iterator], ignore_index=True)
    print("Returning {} rows from {}".format(len(df), path))
    return df

def read_xls(path, sheet=0, filter=None, skiprows=0):
    df = pd.read_excel(path, sheet, skiprows=skiprows)
    if filter is not None:
        df = df[df[filter[0]].isin(filter[1])]
        df = df.reset_index(drop=True)
    print("Returning {} rows from {} ({})".format(len(df), path, sheet))
    return df

def pass_below(df, filter):
    """
    Filters a pandas Data frame, so that all rows where column filter[0] < filter[1] pass.
    E.g. filter(df, ('p-value', 1e-8)) returns a dataframe consisting of all rows in df where
    the value of column 'p-value' is below 1e-8.
    """
    df = df[df[filter[0]] < filter[1]]
    df = df.reset_index(drop=True)
    print("{} rows left after filtering by column '{}' < {}".format(len(df), filter[0], filter[1]))
    return df


# dataframe containing the data for VIP genes
vip_genes_df = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'genes.tsv'), filter=('Is VIP', [True]))
# set of the VIP gene names
vip_genes = set(vip_genes_df['Symbol'])

vip_genes_df.head()

# SNPs in VIP genes with annotation in PharmGKB
vip_snps = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'rsid.tsv'), skiprows=1, filter=('Gene Symbols', vip_genes))
vip_snps.head()

# clinical annotations in PharmGKB for VIP genes
vip_annotations = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'clinicalAnnotations.csv'), sep=',', filter=('gene', vip_genes))
# set of refSNP IDs with clinical annotation
vip_clinical_snps = set(vip_annotations.variant)

vip_annotations.head()

# counts of clinical annotations in VIP genes
vip_annotations_by_gene = vip_annotations.groupby(['gene', 'level of evidence'])
vip_annotations_by_gene.count().head()              # summary
#print(vip_annotations_by_gene.count().to_string())  # all values as string
#from IPython.core.display import HTML, display      # all values as HTML
#display(HTML(vip_annotations_by_gene.count().to_html()))

vip_blood_eqtls = pass_below(read_csv(os.path.join(DBDIR, 'eqtl-blood', '2012-12-21-CisAssociationsProbeLevelFDR0.5.txt'), filter=('HUGO', vip_genes)), 
                             ('PValue', 1e-8))
vip_blood_eqtls.head()

vip_blood_eqtls.groupby(['HUGO'])['HUGO'].count()

def read_adme_xls(sheet):
    return read_xls(os.path.join(DBDIR, 'eqtl-liver', 'tpj201144x3.xls'), sheet=sheet, skiprows=7, filter=('SNP-gene HGNC', vip_genes))

vip_liver_eqtls_1 = pass_below(pd.concat([read_adme_xls('Table S2 (A)'), read_adme_xls('Table S2 (B)'), read_adme_xls('Table S2 (C)')]),
                               ('p-value', 1e-8))
vip_liver_eqtls_1.head()

vip_liver_eqtls_2 = pass_below(read_csv(os.path.join(DBDIR, 'eqtl-liver', 'journal.pgen.1002078.s013'), filter=('Gene', vip_genes)),
                               ('UC Best RS UC lm pvalue', 1e-4))
vip_liver_eqtls_2.head()

vip_intestine_eqtls = pass_below(read_xls(os.path.join(DBDIR, 'eqtl-intestine', 'mmc1.xls'), filter=('Gene', vip_genes)),
                                 ('P-value', 1e-8))
vip_intestine_eqtls.head()

def eqtl_statistics(df, rsid_column, vip_snps):
    count = len(df[df[rsid_column].isin(vip_snps)])
    return "Out of {} eQTLs, {} are already annotated in PharmGKB.".format(len(df), count)

# compare eQTLs and PharmGKB SNPs
print("Blood:", eqtl_statistics(vip_blood_eqtls, 'SNPName', vip_snps.RSID))
print("Liver 1:", eqtl_statistics(vip_liver_eqtls_1, 'SNP ID', vip_snps.RSID))
print("Liver 2:", eqtl_statistics(vip_liver_eqtls_2, 'UC Best RSID', vip_snps.RSID))
print("Intestines:", eqtl_statistics(vip_intestine_eqtls, 'SNP', vip_snps.RSID))

blood_set = set(vip_blood_eqtls['SNPName'].tolist())
liver_set = set(vip_liver_eqtls_1['SNP ID'].tolist()) | set(vip_liver_eqtls_2['UC Best RSID'])
intestine_set = set(vip_intestine_eqtls['SNP'])

list(map(len, [blood_set, liver_set, intestine_set]))

venn3([liver_set, intestine_set, blood_set], ('Liver', 'Intestine', 'Blood'))

intestine_set.intersection(liver_set)

blood_set.intersection(liver_set)

blood_set

liver_set

intestine_set