In [1]:

#!/usr/bin/env python3
import os
import re
import sys
import collections
import argparse
#import tables
import itertools
import matplotlib
import glob
import math
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.sparse as sp_sparse

from multiprocessing import Pool
from collections import defaultdict
from scipy import sparse, io
from scipy.sparse import csr_matrix
from multiprocessing import Pool
#from matplotlib_venn import venn2, venn2_circles
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:

GWAS_df = pd.read_csv('./gwas_catalog_v1.0.2-associations_e95_r2019-03-01.tsv', sep='\t', header=0, low_memory=False)

In [3]:

GWAS_df.columns

Out[3]:

Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL',
       'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',
       'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',
       'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',
       'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',
       'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)',
       'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV',
       'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'STUDY ACCESSION',
       'GENOTYPING TECHNOLOGY'],
      dtype='object')

In [5]:

#chr6:135252920-135391745
#chr6:135089817-135228642
#region = 'chr6:135252920-135421745'

## Now we get all the SNPs within the MYB enhancer region
region = 'chr6:135152920-135921745'
chrom, left, right = re.split(':|-', region)
snp_idx = []
hits_df = pd.DataFrame()
for i, row in GWAS_df.loc[(GWAS_df.CHR_ID == '6')].iterrows():
    try: 
        pos = int(row.CHR_POS)
#        print(pos)
        if  (pos > int(left)) & (pos < int(right)):
            snp_idx.append(i)
            hits_df = hits_df.append(row)
        sys.exit(0)
    except:
        next

In [6]:

pd.set_option("display.max_rows", 200)

hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS', 'LINK']].sort_values(by='CHR_POS').head()

Out[6]:

	CHR_ID	CHR_POS	PVALUE_MLOG	DISEASE/TRAIT	SNPS	LINK
116684	6	135161428	9.301030	Balding type 1	rs6569999	www.ncbi.nlm.nih.gov/pubmed/30595370
105128	6	135165003	14.397940	Red cell distribution width	rs113617776	www.ncbi.nlm.nih.gov/pubmed/30595370
24037	6	135173737	5.698970	Multiple sclerosis	rs9321490	www.ncbi.nlm.nih.gov/pubmed/21833088
103354	6	135174088	252.522879	Mean corpuscular hemoglobin	rs2327586	www.ncbi.nlm.nih.gov/pubmed/30595370
6928	6	135178322	8.221849	White blood cell count (basophil)	rs9376098	www.ncbi.nlm.nih.gov/pubmed/27863252

Print a bed format file for all the SNPs¶

In [9]:

final_hits_df = hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS']].sort_values(by='CHR_POS')

snp_list = []
for i, row in final_hits_df.iterrows():
    snp = '\t'.join(row[['CHR_ID', 'CHR_POS', 'SNPS']].values)
    snp_list.append(snp)

In [10]:

for i in np.unique(snp_list):
    chrom, left, snp = i.split('\t')
    right = int(left) + 1
    print('chr', end = '')
    print('\t'.join([chrom, str(int(left)-300), str(right+300), snp]))

chr6	135161128	135161729	rs6569999
chr6	135164703	135165304	rs113617776
chr6	135173437	135174038	rs9321490
chr6	135173788	135174389	rs2327586
chr6	135178022	135178623	rs9376098
chr6	135182347	135182948	rs210962
chr6	135193392	135193993	rs34931156
chr6	135201688	135202289	rs181880988
chr6	135211846	135212447	rs210946
chr6	135218942	135219543	rs210937
chr6	135264540	135265141	rs118038583
chr6	135304910	135305511	rs6928977
chr6	135318206	135318807	rs6914831
chr6	135328183	135328784	rs146318841
chr6	135347593	135348194	rs17064440
chr6	135347659	135348260	rs2746427
chr6	135370078	135370679	rs2246943
chr6	135375159	135375760	rs2746432
chr6	135386048	135386649	rs7773987
chr6	135391000	135391601	rs12190426
chr6	135396630	135397231	rs2746438
chr6	135399847	135400448	rs13208164
chr6	135417917	135418518	rs11154801
chr6	135538259	135538860	rs9402715
chr6	135539603	135540204	rs144124553
chr6	135568367	135568968	rs12663042
chr6	135581161	135581762	rs761357
chr6	135589473	135590074	rs116990752
chr6	135629979	135630580	rs2327650
chr6	135679565	135680166	rs9389316
chr6	135679596	135680197	rs9402743
chr6	135744375	135744976	rs1119555
chr6	135746578	135747179	rs4895455
chr6	135756491	135757092	rs79846291
chr6	135812221	135812822	rs947583
chr6	135840984	135841585	rs117976152
chr6	135883067	135883668	rs10872447
chr6	135892717	135893318	rs2027518
chr6	135905816	135906417	rs9376164
chr6	135907179	135907780	rs78928932
chr6	135907237	135907838	rs12209685
chr6	135909573	135910174	rs12529766
chr6	135914130	135914731	rs12208105

The major catalog of traits¶

For similicity, we group all the traits to a few major catalogs, and then use WashU genome browser to visualize all of them.

Red blood cell count
Red cell distribution width
Mean corpuscular hemoglobin

White blood cell count (basophil)
Neutrophil count
Sum basophil neutrophil counts
White blood cell count
Basophil percentage of granulocytes
Basophil percentage of white cells
Granulocyte count
Myeloid white cell count

Other Blood¶

Blood osmolality (transformed sodium)
Diastolic blood pressure
Selective IgA deficiency

Immune / Autoimmune¶

Autoimmune traits
Hodgkin's lymphoma
Multiple sclerosis
Systemic lupus erythematosus
Nodular sclerosis Hodgkin lymphoma
Immune reponse to smallpox (secreted IL-2)

Other¶

Balding type 1
Eczema
Heel bone mineral density
Hippocampal volume in Alzheimer's disease dementia
Lung function (FVC)
Menarche (age at onset)
Obesity-related traits
Peripheral arterial disease (traffic-related air pollution interaction)
Phosphorus levels
Respiratory diseases
Hypothyroidism

Print a bed format file for all the SNPs¶

The major catalog of traits¶

Red blood cell related¶

Myeloid cell related¶

Other Blood¶

Immune / Autoimmune¶

Other¶