In [1]:
%matplotlib inline

import time
import pandas as pd
import os
from matplotlib_venn import venn3

DEBUG = True
DBDIR = os.path.join("..", "databases")

def debug(*args):
    """ Prints to stdout only if DEBUG is True. """
    if DEBUG:
def timed(function):
    """ A decorator that prints the running time of the function call. """
    def inner(*args, **kwargs):
        start_time = time.time()
        result = function(*args, **kwargs)
        print("{} seconds".format(time.time() - start_time))
        return result
    return inner

def read_csv(path, sep='\t', filter=None, chunksize=100000, skiprows=None):
    Reads a csv file from a given filepath into a pandas Data frame.
    Files are iterated over with a chunksize of 100000 by default.
    Rows can be filtered so that only rows with certain values in a column pass.
    All rows that do not pass the filter are immediately discarded.
    When using this to load eQTLs, you can include a tuple for filtering based on gene names
    (such as VIP genes from PharmGKB).
    The first element of the tuple is the name of the field to be filtered on,
    the second element is a list of values that pass the filter.
    print("Reading file: {}...".format(path))
    iterator = pd.read_csv(path, sep=sep, chunksize=chunksize, skiprows=skiprows)
    if filter is None:
        df = pd.concat(iterator)
        df = pd.concat([chunk[chunk[filter[0]].isin(filter[1])] for chunk in iterator], ignore_index=True)
    print("Returning {} rows from {}".format(len(df), path))
    return df

def read_xls(path, sheet=0, filter=None, skiprows=0):
    df = pd.read_excel(path, sheet, skiprows=skiprows)
    if filter is not None:
        df = df[df[filter[0]].isin(filter[1])]
        df = df.reset_index(drop=True)
    print("Returning {} rows from {} ({})".format(len(df), path, sheet))
    return df

def pass_below(df, filter):
    Filters a pandas Data frame, so that all rows where column filter[0] < filter[1] pass.
    E.g. filter(df, ('p-value', 1e-8)) returns a dataframe consisting of all rows in df where
    the value of column 'p-value' is below 1e-8.
    df = df[df[filter[0]] < filter[1]]
    df = df.reset_index(drop=True)
    print("{} rows left after filtering by column '{}' < {}".format(len(df), filter[0], filter[1]))
    return df

Get PharmGKB data

Get VIP genes

Very Important Pharmacogenes (VIP genes) are genes that are involved in drug response and have a summary in PharmGKB.

In [2]:
# dataframe containing the data for VIP genes
vip_genes_df = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'genes.tsv'), filter=('Is VIP', [True]))
# set of the VIP gene names
vip_genes = set(vip_genes_df['Symbol'])

Reading file: ..\databases\pharmgkb clinical annotations\genes.tsv...
Returning 54 rows from ..\databases\pharmgkb clinical annotations\genes.tsv
PharmGKB Accession Id Entrez Id Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP Has Variant Annotation Cross-references Has CPIC Dosing Guideline Chromosome Chromosomal Start Chromosomal Stop
0 PA109 1080 ENSG00000001626 cystic fibrosis transmembrane conductance regu... CFTR ATP-binding cassette sub-family C, member 7, ABC35,"CFTR/MRP","MRP7","TNR-CFTR","dJ760C5.1", True True HumanCycGene:HS00075,alfred:LO000194O,ctd:1080... True chr7 117110017 117311719
1 PA117 1312 ENSG00000093010 catechol-O-methyltransferase COMT NaN NaN True True HumanCycGene:HS01791,alfred:LO000195P,ctd:1312... False chr22 019919263 019960498
2 PA121 1548 ENSG00000198077 cytochrome P450, family 2, subfamily A, polype... CYP2A6 NaN CPA6,"CYP2A", True True HumanCycGene:HS10343,alfred:LO027806X,ctd:1548... False chr19 041346443 041366352
3 PA123 1555 ENSG00000197408 cytochrome P450, family 2, subfamily B, polype... CYP2B6 NaN CPB6,"CYPIIB6", True True HumanCycGene:HS09587,ctd:1555,ensembl:ENSG0000... False chr19 041487204 041527301
4 PA124 1557 ENSG00000165841 cytochrome P450, family 2, subfamily C, polype... CYP2C19 NaN CPCJ,"P450IIC19", True True HumanCycGene:HS09293,alfred:LO008778E,ctd:1557... True chr10 096512463 096615671

Get SNPs annotated in PharmGKB

These SNPs are from the PharmGKB website. It is a list of ~25M SNPs that are within genes and are annotated in PharmGKB. Around 70k of those SNPs correspond to VIP genes. However, the only available data in this file is the name of the SNP and identifiers for the related gene.

In [3]:
# SNPs in VIP genes with annotation in PharmGKB
vip_snps = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'rsid.tsv'), skiprows=1, filter=('Gene Symbols', vip_genes))
Reading file: ..\databases\pharmgkb clinical annotations\rsid.tsv...
Returning 69182 rows from ..\databases\pharmgkb clinical annotations\rsid.tsv
RSID Gene IDs Gene Symbols
0 rs5270 PA293 PTGS2
1 rs5271 PA293 PTGS2
2 rs5272 PA293 PTGS2
3 rs5273 PA293 PTGS2
4 rs5274 PA293 PTGS2

Get clinical annotations

Load PharmGKB's clinical annotations from clinicalAnnotations.csv.

In [4]:
# clinical annotations in PharmGKB for VIP genes
vip_annotations = read_csv(os.path.join(DBDIR, 'pharmgkb clinical annotations', 'clinicalAnnotations.csv'), sep=',', filter=('gene', vip_genes))
# set of refSNP IDs with clinical annotation
vip_clinical_snps = set(vip_annotations.variant)

Reading file: ..\databases\pharmgkb clinical annotations\clinicalAnnotations.csv...
Returning 673 rows from ..\databases\pharmgkb clinical annotations\clinicalAnnotations.csv
variant gene type level of evidence drugs diseases
0 rs4149056 SLCO1B1 Toxicity/ADR 1A simvastatin Muscular Diseases,Myopathy, Central Core
1 rs28399504 CYP2C19 Efficacy 1A clopidogrel Acute coronary syndrome,Cardiovascular Diseases
2 HLA-B *57:01:01 HLA-B Toxicity/ADR 1A abacavir Drug Hypersensitivity
3 rs4244285 CYP2C19 Efficacy 1A amitriptyline NaN
4 CYP2C19 *1, CYP2C19 *17, CYP2C19 *2, CYP2C19 *3 CYP2C19 Efficacy,Toxicity/ADR 1A amitriptyline NaN
In [5]:
# counts of clinical annotations in VIP genes
vip_annotations_by_gene = vip_annotations.groupby(['gene', 'level of evidence'])
vip_annotations_by_gene.count().head()              # summary
#print(vip_annotations_by_gene.count().to_string())  # all values as string
#from IPython.core.display import HTML, display      # all values as HTML
variant type drugs diseases
gene level of evidence
ABCB1 2A 3 3 3 2
3 86 85 86 71
4 28 28 28 6
ACE 2A 1 1 1 1
3 14 14 14 10

Get eQTL data

Load blood eQTLs

Westra et al.

The eQTLs for blood have been downloaded from

The data are in a single tsv file (2012-12-21-CisAssociationsProbeLevelFDR0.5.txt), ~900k cis-eQTLs (Trans-eQTLs don't have a gene identifier, so those are not used).

In [6]:
vip_blood_eqtls = pass_below(read_csv(os.path.join(DBDIR, 'eqtl-blood', '2012-12-21-CisAssociationsProbeLevelFDR0.5.txt'), filter=('HUGO', vip_genes)), 
                             ('PValue', 1e-8))
Reading file: ..\databases\eqtl-blood\2012-12-21-CisAssociationsProbeLevelFDR0.5.txt...
Returning 2175 rows from ..\databases\eqtl-blood\2012-12-21-CisAssociationsProbeLevelFDR0.5.txt
760 rows left after filtering by column 'PValue' < 1e-08
PValue SNPName SNPChr SNPChrPos ProbeName ProbeChr ProbeCenterChrPos CisTrans SNPType AlleleAssessed OverallZScore DatasetsWhereSNPProbePairIsAvailableAndPassesQC DatasetsZScores HUGO FDR
0 9.813428e-198 rs407257 22 22676550 7400537 22 22706401 cis C/G G -69.193035 EGCUT,SHIP_TREND,Groningen-HT12,Groningen-H8v2... -30.195047,-30.32998,-32.38628,-12.45152,-25.1... GSTT1 0
1 9.813428e-198 rs5760147 22 22664948 7400537 22 22706401 cis A/C C -59.187917 EGCUT,SHIP_TREND,Groningen-HT12,Groningen-H8v2... -27.385191,-25.535507,-27.544918,-9.413573,-20... GSTT1 0
2 9.813428e-198 rs4822466 22 22642204 7400537 22 22706401 cis A/G G -57.828173 EGCUT,SHIP_TREND,Groningen-HT12,Groningen-H8v2... -27.614359,-23.631924,-28.399845,-11.038388,-2... GSTT1 0
3 9.813428e-198 rs5760176 22 22732321 7400537 22 22706401 cis A/G G -57.720846 EGCUT,SHIP_TREND,Groningen-HT12,Groningen-H8v2... -26.823101,-22.730206,-28.169338,-10.8359165,-... GSTT1 0
4 9.813428e-198 rs4822458 22 22595659 7400537 22 22706401 cis T/C C -56.486053 EGCUT,SHIP_TREND,Groningen-HT12,Groningen-H8v2... -27.78793,-23.153557,-27.657444,-10.880648,-19... GSTT1 0

Show counts of different genes in filtered blood eQTL list.

In [7]:
ADRB2       11
BRCA1      179
CYP2J2      45
DPYD        18
F5          84
GSTP1       50
GSTT1      220
KCNH2        3
NQO1        55
PTGS2       33
SLC19A1      6
TPMT        56
Name: HUGO, dtype: int64

Load liver eQTLs

Schröder et al.

Genomics of ADME gene expression: mapping expression quantitative trait loci relevant for absorption, distribution, metabolism and excretion of drugs in human liver (, Table S2.

The data are in three worksheets of a single xls file (tpj201144x3.xls, with sheets A, B and C), ~1k eQTLs altogether.

In [8]:
def read_adme_xls(sheet):
    return read_xls(os.path.join(DBDIR, 'eqtl-liver', 'tpj201144x3.xls'), sheet=sheet, skiprows=7, filter=('SNP-gene HGNC', vip_genes))

vip_liver_eqtls_1 = pass_below(pd.concat([read_adme_xls('Table S2 (A)'), read_adme_xls('Table S2 (B)'), read_adme_xls('Table S2 (C)')]),
                               ('p-value', 1e-8))
Returning 1 rows from ..\databases\eqtl-liver\tpj201144x3.xls (Table S2 (A))
Returning 0 rows from ..\databases\eqtl-liver\tpj201144x3.xls (Table S2 (B))
Returning 4 rows from ..\databases\eqtl-liver\tpj201144x3.xls (Table S2 (C))
5 rows left after filtering by column 'p-value' < 1e-08
Description Description.1 Linkage (r2) SNP ID SNP location SNP-chromosome SNP-gene HGNC SNP-gene RefSeq ID SNPs Seattle study Seattle p-value Trait HGNC Trait RefSeq ID Trait-chromosome p-value
0 Homo sapiens cytochrome P450, family 3, subfam... Homo sapiens cytochrome P450, family 3, subfam... NaN rs10242455 DOWNSTREAM 7 CYP3A5 NM_000777.2 rs10242455 3.350000e-22 CYP3A5 NM_000777.2 7 3.360000e-10
1 Homo sapiens enolase superfamily member 1 (ENO... Homo sapiens thymidylate synthetase (TYMS), mRNA. NaN rs2847153 INTRONIC 18 TYMS NM_001071.1 NaN NaN ENOSF1 NM_017512.2 18 1.430000e-11
2 Homo sapiens nudix (nucleoside diphosphate lin... Homo sapiens glutathione S-transferase pi (GST... NaN rs1695 NaN 11 GSTP1 NM_000852.2 NaN NaN NUDT8 NM_181843.1 11 1.940000e-14
3 Homo sapiens nudix (nucleoside diphosphate lin... Homo sapiens glutathione S-transferase pi (GST... NaN rs6591256 UPSTREAM 11 GSTP1 NM_000852.2 NaN NaN NUDT8 NM_181843.1 11 5.900000e-10
4 Homo sapiens vitamin K epoxide reductase compl... Homo sapiens vitamin K epoxide reductase compl... NaN rs7294 3PRIME_UTR 16 VKORC1 NM_024006.4 NaN NaN VKORC1 NM_024006.4 16 1.470000e-09

Innocenti et al.

Identification, Replication, and Functional Fine-Mapping of Expression Quantitative Trait Loci in Primary Human Liver Tissue (, Table S1.

The data are in a single tsv file (pgen.1002078.s013), ~12k eQTLs.

In [9]:
vip_liver_eqtls_2 = pass_below(read_csv(os.path.join(DBDIR, 'eqtl-liver', 'journal.pgen.1002078.s013'), filter=('Gene', vip_genes)),
                               ('UC Best RS UC lm pvalue', 1e-4))
Reading file: ..\databases\eqtl-liver\journal.pgen.1002078.s013...
Returning 43 rows from ..\databases\eqtl-liver\journal.pgen.1002078.s013
14 rows left after filtering by column 'UC Best RS UC lm pvalue' < 0.0001
Gene Chr Strand TSS coord UC gene cluster UC probe count in cluster UC probe ID UC dbSnp131/1KG in probe count UC Exon number UC refseq transcript ID ... UC Best RS Merck alleles UC Best RS Merck MAF UC Best RS Merck imputed bool UC Best RS Merck lm beta UC Best RS Merck pvalue UC Best RS UC n=60 simulations UC Best RS UC genotype variance UC Best RS UW genotype variance UC Best RS Merck genotype variance UC Best RS hapmap variance
0 MTHFR 1 - 11788747 3 1 A_23_P404045 3 7 NM_005957 ... G.T 0.7085 1 0.009211 0.286432 0.85 0.429156 0.274213 0.405950 0.232063
1 DPYD 1 - 98159203 1 2 NaN NaN NaN NaN ... C.T 0.9195 1 -0.020213 0.381781 0.62 0.160711 0.066100 0.157506 0.058594
2 F5 1 - 167822393 1 2 NaN NaN NaN NaN ... A.C 0.4035 1 0.003945 0.773724 0.94 0.442370 0.259345 0.469296 0.227500
3 P2RY12 3 - 152585234 1 1 A_23_P143902 2 2 NM_022788 ... G.A 0.5950 1 -0.000442 0.963123 0.61 0.475411 0.374225 0.493117 0.232687
4 SLC22A1 6 + 160462852 1 1 A_23_P145569 2 7 NM_153187 ... A.G 0.6210 1 -0.096047 0.000010 0.81 0.486203 0.468644 0.489380 0.236527

5 rows × 45 columns

Load intestine eQTLs

Kabakchiev et al.

Expression Quantitative Trait Loci Analysis Identifies Associations Between Genotype and Gene Expression in Human Intestine (, Supplementary Table 1.

The data are in mmc1.xls on worksheet '50 Kb', ~14k Cis eQTLs.

In [10]:
vip_intestine_eqtls = pass_below(read_xls(os.path.join(DBDIR, 'eqtl-intestine', 'mmc1.xls'), filter=('Gene', vip_genes)),
                                 ('P-value', 1e-8))
Returning 128 rows from ..\databases\eqtl-intestine\mmc1.xls (0)
27 rows left after filtering by column 'P-value' < 1e-08
Gene Gene Chromosome SNP SNP Chromosome SNP Position SNP Locus P-value FDR P-value
0 VKORC1 16 rs9923231 16 31107689 312832 3.709740e-21 3.490030e-17
1 VKORC1 16 rs9934438 16 31104878 312832 3.709740e-21 3.441560e-17
2 VKORC1 16 rs749671 16 31088347 312832 2.647910e-20 2.056600e-16
3 VKORC1 16 rs2359612 16 31103796 312832 3.784890e-20 2.633460e-16
4 VKORC1 16 rs14235 16 31121793 312845 6.003060e-20 3.855530e-16


In [11]:
def eqtl_statistics(df, rsid_column, vip_snps):
    count = len(df[df[rsid_column].isin(vip_snps)])
    return "Out of {} eQTLs, {} are already annotated in PharmGKB.".format(len(df), count)

# compare eQTLs and PharmGKB SNPs
print("Blood:", eqtl_statistics(vip_blood_eqtls, 'SNPName', vip_snps.RSID))
print("Liver 1:", eqtl_statistics(vip_liver_eqtls_1, 'SNP ID', vip_snps.RSID))
print("Liver 2:", eqtl_statistics(vip_liver_eqtls_2, 'UC Best RSID', vip_snps.RSID))
print("Intestines:", eqtl_statistics(vip_intestine_eqtls, 'SNP', vip_snps.RSID))
Blood: Out of 760 eQTLs, 191 are already annotated in PharmGKB.
Liver 1: Out of 5 eQTLs, 4 are already annotated in PharmGKB.
Liver 2: Out of 14 eQTLs, 7 are already annotated in PharmGKB.
Intestines: Out of 27 eQTLs, 13 are already annotated in PharmGKB.
In [12]:
blood_set = set(vip_blood_eqtls['SNPName'].tolist())
liver_set = set(vip_liver_eqtls_1['SNP ID'].tolist()) | set(vip_liver_eqtls_2['UC Best RSID'])
intestine_set = set(vip_intestine_eqtls['SNP'])
In [13]:
list(map(len, [blood_set, liver_set, intestine_set]))
[668, 19, 27]
In [14]:
venn3([liver_set, intestine_set, blood_set], ('Liver', 'Intestine', 'Blood'))
<matplotlib_venn._common.VennDiagram at 0x9d81748>
In [15]:
In [16]:
{'rs10489185', 'rs407257', 'rs6591256'}
In [18]:
In [19]:
In [20]: