#!/usr/bin/env python3
import os
import re
import sys
import collections
import argparse
#import tables
import itertools
import matplotlib
import glob
import math
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.sparse as sp_sparse
from multiprocessing import Pool
from collections import defaultdict
from scipy import sparse, io
from scipy.sparse import csr_matrix
from multiprocessing import Pool
#from matplotlib_venn import venn2, venn2_circles
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
GWAS_df = pd.read_csv('./gwas_catalog_v1.0.2-associations_e95_r2019-03-01.tsv', sep='\t', header=0, low_memory=False)
GWAS_df.columns
Index(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE', 'JOURNAL', 'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE', 'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS', 'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID', 'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE', 'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS', 'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC', 'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG', 'P-VALUE (TEXT)', 'OR or BETA', '95% CI (TEXT)', 'PLATFORM [SNPS PASSING QC]', 'CNV', 'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'STUDY ACCESSION', 'GENOTYPING TECHNOLOGY'], dtype='object')
#chr6:135252920-135391745
#chr6:135089817-135228642
#region = 'chr6:135252920-135421745'
## Now we get all the SNPs within the MYB enhancer region
region = 'chr6:135152920-135921745'
chrom, left, right = re.split(':|-', region)
snp_idx = []
hits_df = pd.DataFrame()
for i, row in GWAS_df.loc[(GWAS_df.CHR_ID == '6')].iterrows():
try:
pos = int(row.CHR_POS)
# print(pos)
if (pos > int(left)) & (pos < int(right)):
snp_idx.append(i)
hits_df = hits_df.append(row)
sys.exit(0)
except:
next
pd.set_option("display.max_rows", 200)
hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS', 'LINK']].sort_values(by='CHR_POS').head()
CHR_ID | CHR_POS | PVALUE_MLOG | DISEASE/TRAIT | SNPS | LINK | |
---|---|---|---|---|---|---|
116684 | 6 | 135161428 | 9.301030 | Balding type 1 | rs6569999 | www.ncbi.nlm.nih.gov/pubmed/30595370 |
105128 | 6 | 135165003 | 14.397940 | Red cell distribution width | rs113617776 | www.ncbi.nlm.nih.gov/pubmed/30595370 |
24037 | 6 | 135173737 | 5.698970 | Multiple sclerosis | rs9321490 | www.ncbi.nlm.nih.gov/pubmed/21833088 |
103354 | 6 | 135174088 | 252.522879 | Mean corpuscular hemoglobin | rs2327586 | www.ncbi.nlm.nih.gov/pubmed/30595370 |
6928 | 6 | 135178322 | 8.221849 | White blood cell count (basophil) | rs9376098 | www.ncbi.nlm.nih.gov/pubmed/27863252 |
final_hits_df = hits_df[['CHR_ID', 'CHR_POS', 'PVALUE_MLOG', 'DISEASE/TRAIT', 'SNPS']].sort_values(by='CHR_POS')
snp_list = []
for i, row in final_hits_df.iterrows():
snp = '\t'.join(row[['CHR_ID', 'CHR_POS', 'SNPS']].values)
snp_list.append(snp)
for i in np.unique(snp_list):
chrom, left, snp = i.split('\t')
right = int(left) + 1
print('chr', end = '')
print('\t'.join([chrom, str(int(left)-300), str(right+300), snp]))
chr6 135161128 135161729 rs6569999 chr6 135164703 135165304 rs113617776 chr6 135173437 135174038 rs9321490 chr6 135173788 135174389 rs2327586 chr6 135178022 135178623 rs9376098 chr6 135182347 135182948 rs210962 chr6 135193392 135193993 rs34931156 chr6 135201688 135202289 rs181880988 chr6 135211846 135212447 rs210946 chr6 135218942 135219543 rs210937 chr6 135264540 135265141 rs118038583 chr6 135304910 135305511 rs6928977 chr6 135318206 135318807 rs6914831 chr6 135328183 135328784 rs146318841 chr6 135347593 135348194 rs17064440 chr6 135347659 135348260 rs2746427 chr6 135370078 135370679 rs2246943 chr6 135375159 135375760 rs2746432 chr6 135386048 135386649 rs7773987 chr6 135391000 135391601 rs12190426 chr6 135396630 135397231 rs2746438 chr6 135399847 135400448 rs13208164 chr6 135417917 135418518 rs11154801 chr6 135538259 135538860 rs9402715 chr6 135539603 135540204 rs144124553 chr6 135568367 135568968 rs12663042 chr6 135581161 135581762 rs761357 chr6 135589473 135590074 rs116990752 chr6 135629979 135630580 rs2327650 chr6 135679565 135680166 rs9389316 chr6 135679596 135680197 rs9402743 chr6 135744375 135744976 rs1119555 chr6 135746578 135747179 rs4895455 chr6 135756491 135757092 rs79846291 chr6 135812221 135812822 rs947583 chr6 135840984 135841585 rs117976152 chr6 135883067 135883668 rs10872447 chr6 135892717 135893318 rs2027518 chr6 135905816 135906417 rs9376164 chr6 135907179 135907780 rs78928932 chr6 135907237 135907838 rs12209685 chr6 135909573 135910174 rs12529766 chr6 135914130 135914731 rs12208105
For similicity, we group all the traits to a few major catalogs, and then use WashU genome browser to visualize all of them.