%matplotlib inline import time import pandas as pd import os from matplotlib_venn import venn3 # CONSTANTS DEBUG = True DBDIR = os.path.join("..", "databases") # HELPER FUNCTIONS def debug(*args): """ Prints to stdout only if DEBUG is True. """ if DEBUG: print(*args) def timed(function): """ A decorator that prints the running time of the function call. """ def inner(*args, **kwargs): start_time = time.time() result = function(*args, **kwargs) print("{} seconds".format(time.time() - start_time)) return result return inner #@timed def read_csv(path, sep='\t', filter=None, chunksize=100000, skiprows=None): """ Reads a csv file from a given filepath into a pandas Data frame. Files are iterated over with a chunksize of 100000 by default. Rows can be filtered so that only rows with certain values in a column pass. All rows that do not pass the filter are immediately discarded. When using this to load eQTLs, you can include a tuple for filtering based on gene names (such as VIP genes from PharmGKB). The first element of the tuple is the name of the field to be filtered on, the second element is a list of values that pass the filter. """ print("Reading file: {}...".format(path)) iterator = pd.read_csv(path, sep=sep, chunksize=chunksize, skiprows=skiprows) if filter is None: df = pd.concat(iterator) else: df = pd.concat([chunk[chunk[filter[0]].isin(filter[1])] for chunk in iterator], ignore_index=True) print("Returning {} rows from {}".format(len(df), path)) return df def read_xls(path, sheet=0, filter=None, skiprows=0): df = pd.read_excel(path, sheet, skiprows=skiprows) if filter is not None: df = df[df[filter[0]].isin(filter[1])] df = df.reset_index(drop=True) print("Returning {} rows from {} ({})".format(len(df), path, sheet)) return df def pass_below(df, filter): """ Filters a pandas Data frame, so that all rows where column filter[0] < filter[1] pass. E.g. filter(df, ('p-value', 1e-8)) returns a dataframe consisting of all rows in df where the value of column 'p-value' is below 1e-8. """ df = df[df[filter[0]] < filter[1]] df = df.reset_index(drop=True) print("{} rows left after filtering by column '{}' < {}".format(len(df), filter[0], filter[1])) return df # dataframe containing the data for VIP genes vip_genes_df = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'genes.tsv'), filter=('Is VIP', [True])) # set of the VIP gene names vip_genes = set(vip_genes_df['Symbol']) vip_genes_df.head() # SNPs in VIP genes with annotation in PharmGKB vip_snps = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'rsid.tsv'), skiprows=1, filter=('Gene Symbols', vip_genes)) vip_snps.head() # clinical annotations in PharmGKB for VIP genes vip_annotations = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'clinicalAnnotations.csv'), sep=',', filter=('gene', vip_genes)) # set of refSNP IDs with clinical annotation vip_clinical_snps = set(vip_annotations.variant) vip_annotations.head() # counts of clinical annotations in VIP genes vip_annotations_by_gene = vip_annotations.groupby(['gene', 'level of evidence']) vip_annotations_by_gene.count().head() # summary #print(vip_annotations_by_gene.count().to_string()) # all values as string #from IPython.core.display import HTML, display # all values as HTML #display(HTML(vip_annotations_by_gene.count().to_html())) vip_blood_eqtls = pass_below(read_csv(os.path.join(DBDIR, 'eqtl-blood', '2012-12-21-CisAssociationsProbeLevelFDR0.5.txt'), filter=('HUGO', vip_genes)), ('PValue', 1e-8)) vip_blood_eqtls.head() vip_blood_eqtls.groupby(['HUGO'])['HUGO'].count() def read_adme_xls(sheet): return read_xls(os.path.join(DBDIR, 'eqtl-liver', 'tpj201144x3.xls'), sheet=sheet, skiprows=7, filter=('SNP-gene HGNC', vip_genes)) vip_liver_eqtls_1 = pass_below(pd.concat([read_adme_xls('Table S2 (A)'), read_adme_xls('Table S2 (B)'), read_adme_xls('Table S2 (C)')]), ('p-value', 1e-8)) vip_liver_eqtls_1.head() vip_liver_eqtls_2 = pass_below(read_csv(os.path.join(DBDIR, 'eqtl-liver', 'journal.pgen.1002078.s013'), filter=('Gene', vip_genes)), ('UC Best RS UC lm pvalue', 1e-4)) vip_liver_eqtls_2.head() vip_intestine_eqtls = pass_below(read_xls(os.path.join(DBDIR, 'eqtl-intestine', 'mmc1.xls'), filter=('Gene', vip_genes)), ('P-value', 1e-8)) vip_intestine_eqtls.head() def eqtl_statistics(df, rsid_column, vip_snps): count = len(df[df[rsid_column].isin(vip_snps)]) return "Out of {} eQTLs, {} are already annotated in PharmGKB.".format(len(df), count) # compare eQTLs and PharmGKB SNPs print("Blood:", eqtl_statistics(vip_blood_eqtls, 'SNPName', vip_snps.RSID)) print("Liver 1:", eqtl_statistics(vip_liver_eqtls_1, 'SNP ID', vip_snps.RSID)) print("Liver 2:", eqtl_statistics(vip_liver_eqtls_2, 'UC Best RSID', vip_snps.RSID)) print("Intestines:", eqtl_statistics(vip_intestine_eqtls, 'SNP', vip_snps.RSID)) blood_set = set(vip_blood_eqtls['SNPName'].tolist()) liver_set = set(vip_liver_eqtls_1['SNP ID'].tolist()) | set(vip_liver_eqtls_2['UC Best RSID']) intestine_set = set(vip_intestine_eqtls['SNP']) list(map(len, [blood_set, liver_set, intestine_set])) venn3([liver_set, intestine_set, blood_set], ('Liver', 'Intestine', 'Blood')) intestine_set.intersection(liver_set) blood_set.intersection(liver_set) blood_set liver_set intestine_set