Method based on our previously reported protocol, but replacing HapMap linkage data from DAPPLE with 1000 Genomes Pilot data from SNAP:
import re
import json
import math
import collections
import requests
import numpy
import pandas
# Read EBI's GWAS Catalog with ontology annotations
# https://www.ebi.ac.uk/gwas/docs/fileheaders
path = 'download/gwas_catalog_v1.0.1-downloaded_2015-06-08.tsv.gz'
# Force the gene-ID columns to string so Entrez GeneIDs are not parsed as
# floats. numpy.character was removed in NumPy 2.0; plain str is the
# supported way to request a string dtype here.
dtype_dict = {'UPSTREAM_GENE_ID': str,
              'DOWNSTREAM_GENE_ID': str,
              'SNP_GENE_IDS': str}
ebi_df = pandas.read_table(path, compression='gzip', low_memory=False, dtype=dtype_dict)
ebi_df.head()
DATE ADDED TO CATALOG | PUBMEDID | FIRST AUTHOR | DATE | JOURNAL | LINK | STUDY | DISEASE/TRAIT | INITIAL SAMPLE DESCRIPTION | REPLICATION SAMPLE DESCRIPTION | ... | RISK ALLELE FREQUENCY | P-VALUE | PVALUE_MLOG | P-VALUE (TEXT) | OR or BETA | 95% CI (TEXT) | PLATFORM [SNPS PASSING QC] | CNV | MAPPED_TRAIT | MAPPED_TRAIT_URI | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 25231870 | Perry JR | 23-Jul-2014 | Nature | http://europepmc.org/abstract/MED/25231870 | Parent-of-origin-specific allelic associations... | Menarche (age at onset) | Up to 182,413 European ancestry women | NaN | ... | 0.47 | 2.000000e-12 | 11.698970004336019 | NaN | 0.04 | [0.03-0.05] (unit increase) | Illumina & Affymetrix [2,441,815] (imputed) | N | age at menarche | http://www.ebi.ac.uk/efo/EFO_0004703 |
1 | NaN | 25231870 | Perry JR | 23-Jul-2014 | Nature | http://europepmc.org/abstract/MED/25231870 | Parent-of-origin-specific allelic associations... | Menarche (age at onset) | Up to 182,413 European ancestry women | NaN | ... | 0.29 | 7.000000e-20 | 19.154901959985743 | NaN | 0.05 | [0.038-0.062] (unit increase) | Illumina & Affymetrix [2,441,815] (imputed) | N | age at menarche | http://www.ebi.ac.uk/efo/EFO_0004703 |
2 | NaN | 25231870 | Perry JR | 23-Jul-2014 | Nature | http://europepmc.org/abstract/MED/25231870 | Parent-of-origin-specific allelic associations... | Menarche (age at onset) | Up to 182,413 European ancestry women | NaN | ... | 0.29 | 7.000000e-20 | 19.154901959985743 | NaN | 0.05 | [0.038-0.062] (unit increase) | Illumina & Affymetrix [2,441,815] (imputed) | N | age at menarche | http://www.ebi.ac.uk/efo/EFO_0004703 |
3 | NaN | 25231870 | Perry JR | 23-Jul-2014 | Nature | http://europepmc.org/abstract/MED/25231870 | Parent-of-origin-specific allelic associations... | Menarche (age at onset) | Up to 182,413 European ancestry women | NaN | ... | 0.46 | 3.000000e-08 | 7.522878745280337 | NaN | 0.03 | [0.02-0.04] (unit increase) | Illumina & Affymetrix [2,441,815] (imputed) | N | age at menarche | http://www.ebi.ac.uk/efo/EFO_0004703 |
4 | NaN | 25231870 | Perry JR | 23-Jul-2014 | Nature | http://europepmc.org/abstract/MED/25231870 | Parent-of-origin-specific allelic associations... | Menarche (age at onset) | Up to 182,413 European ancestry women | NaN | ... | 0.46 | 3.000000e-08 | 7.522878745280337 | NaN | 0.03 | [0.02-0.04] (unit increase) | Illumina & Affymetrix [2,441,815] (imputed) | N | age at menarche | http://www.ebi.ac.uk/efo/EFO_0004703 |
5 rows × 36 columns
# Read entrez symbol to gene mapping
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/symbols-human.json'
symbol_to_id = requests.get(url).json()

# Read entrez synonym to genes mapping
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/synonyms-human.json'
synonym_to_ids = requests.get(url).json()

# Read entrez genes
url = 'https://raw.githubusercontent.com/dhimmel/entrez-gene/6e133f9ef8ce51a4c5387e58a6cc97564a66cec8/data/genes-human.tsv'
entrez_df = pandas.read_table(url)

# Restrict to protein-coding genes; GeneIDs are kept as strings so they can
# be intersected with the string gene sets built later in this notebook
is_coding = entrez_df.type_of_gene == 'protein-coding'
coding_genes = set(entrez_df.GeneID[is_coding].astype(str))
len(coding_genes)

# GeneID-to-symbol lookup table for the final merge
symbol_df = entrez_df[['GeneID', 'Symbol']].rename(columns={'GeneID': 'gene', 'Symbol': 'symbol'})
symbol_df.gene = symbol_df.gene.astype(str)
# Create a uri_df (for cross-references): split each mapped trait URI into
# its resource (e.g. EFO) and resource-local identifier
rows = []
for uri in set(ebi_df['MAPPED_TRAIT_URI']):
    if pandas.isnull(uri):
        continue
    trait_term = uri.rsplit('/', 1)[1]
    resource, resource_id = trait_term.split('_', 1)
    rows.append([uri, resource, resource_id])
uri_df = pandas.DataFrame(rows, columns=['MAPPED_TRAIT_URI', 'resource', 'resource_id'])
# Read DO Slim propagated cross-references
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-prop-slim.tsv'
doxref_df = pandas.read_table(url)

# Inner join the GWAS catalog with the DO slim mapping; associations whose
# trait does not map to a DO slim term are dropped by the inner merge
map_df = uri_df.merge(doxref_df)[['MAPPED_TRAIT_URI', 'doid_code', 'doid_name']]
ebi_df = ebi_df.merge(map_df)
len(ebi_df)
10342
The SNAP proxy search must be run manually. Windows are computed in a companion notebook.
# Write all lead SNPs for SNAP input, one rs-identifier per line
gwas_snps = sorted({'rs{}'.format(x) for x in ebi_df.SNP_ID_CURRENT if pandas.notnull(x)})
with open('data/snap/do-slim-lead-SNPs.txt', 'w') as write_file:
    write_file.write('\n'.join(gwas_snps))
len(gwas_snps)
5255
# High-confidence thresholds: genome-wide significance and a minimum sample size
HC_mlogp = -math.log10(5e-8)
HC_samples = 1000

def extract_sample_size(text):
    """Parse case, control, and total sample counts from a free-text description.

    Returns a dict with keys 'cases', 'controls', and 'samples' (None when a
    count cannot be extracted). When both cases and controls are found, samples
    is their sum; when neither is found, the first number in the text is taken
    as the total sample count.
    """
    def to_int(number_text):
        # strip thousands separators, e.g. '1,234' -> 1234
        return int(number_text.replace(',', ''))

    count_patterns = {
        'cases': r'([\d,]+)[^\d]*?cases',
        'controls': r'([\d,]+)[^\d]*?controls',
    }
    sample_size = {'cases': None, 'controls': None, 'samples': None}
    for key in 'cases', 'controls':
        try:
            sample_size[key] = to_int(re.search(count_patterns[key], text).group(1))
        except (IndexError, AttributeError, ValueError):
            continue
    cases = sample_size['cases']
    controls = sample_size['controls']
    if cases is not None and controls is not None:
        sample_size['samples'] = cases + controls
    elif cases is None and controls is None:
        # No case/control breakdown: fall back to the first number in the text
        try:
            sample_size['samples'] = to_int(re.search(r'[\d,]+', text).group(0))
        except (IndexError, AttributeError, ValueError):
            pass
    return sample_size
def process_reported_symbols(text):
    """Convert a comma/space-delimited string of reported gene symbols to a set
    of Entrez GeneID strings.

    Symbols are resolved through the primary symbol mapping first; unresolved
    symbols fall back to the synonym mapping, but only when the synonym is
    unambiguous (maps to exactly one gene). Placeholder values such as
    'Intergenic', 'NR', and 'Pending' are discarded.
    """
    if pandas.isnull(text):
        return set()
    symbols = set(re.split(r'[, ]+', text)) - {'Intergenic', 'NR', 'Pending'}
    entrez_ids = set()
    for symbol in symbols:
        gene_id = symbol_to_id.get(symbol)
        if not gene_id:
            candidates = synonym_to_ids.get(symbol, [])
            if len(candidates) != 1:
                # unknown or ambiguous synonym: skip
                continue
            gene_id = candidates[0]
        entrez_ids.add(str(gene_id))
    return entrez_ids
# rename and order columns.
# NOTE: the original rename_dict contained 'REPORTED GENE(S)' twice
# ('symbols' then 'reported_symbols'); only the last entry took effect,
# so the dead duplicate has been removed.
rename_dict = {'SNP_ID_CURRENT': 'lead_snp', 'REPORTED GENE(S)': 'reported_symbols',
               'PUBMEDID': 'pubmed_id',
               'INITIAL SAMPLE DESCRIPTION': 'sample_description', 'PVALUE_MLOG': 'mlog_pvalue',
               'DATE': 'date', 'UPSTREAM_GENE_ID': 'upstream_gene',
               'DOWNSTREAM_GENE_ID': 'downstream_gene', 'SNP_GENE_IDS': 'containing_genes'}
columns = ['doid_code', 'doid_name'] + list(rename_dict.values())
association_df = ebi_df.rename(columns = rename_dict)[columns]
# split containing genes (semicolon-delimited GeneIDs) into sets
association_df.containing_genes = association_df.containing_genes.map(
    lambda x: set(x.split(';')) if pandas.notnull(x) else set())
# convert reported symbols to entrez GeneIDs
association_df['reported_genes'] = association_df.reported_symbols.map(process_reported_symbols)
# convert dates
association_df.date = pandas.to_datetime(association_df.date)
# Add rs prefix to SNP names
association_df.lead_snp = 'rs' + association_df.lead_snp
# extract sample size information from the free-text descriptions
sample_size_df = pandas.DataFrame(list(map(extract_sample_size, association_df.sample_description)))
association_df = association_df.drop(['sample_description'], axis=1)
association_df = pandas.concat([association_df, sample_size_df], axis=1)
# convert mlog_pvalue to numeric. Series.convert_objects was removed from
# pandas; pandas.to_numeric(errors='coerce') is the replacement and likewise
# leaves unparseable values as NaN.
association_df.mlog_pvalue = pandas.to_numeric(association_df.mlog_pvalue, errors='coerce')
# flag high-confidence associations: genome-wide significant AND >= HC_samples
association_df['high_confidence'] = (
    (association_df.mlog_pvalue >= HC_mlogp) &
    (association_df.samples >= HC_samples)).astype(int)
# Add window information computed by the companion SNAP notebook
window_df = pandas.read_table('data/snap/windows.tsv')
association_df = association_df.merge(window_df[['lead_snp', 'lead_chrom', 'lower_coord', 'upper_coord']])
association_df.head()
doid_code | doid_name | reported_symbols | pubmed_id | upstream_gene | date | containing_genes | lead_snp | mlog_pvalue | downstream_gene | reported_genes | cases | controls | samples | high_confidence | lead_chrom | lower_coord | upper_coord | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | DOID:1686 | glaucoma | ABCA1 | 25173107 | NaN | 2014-08-31 | set([]) | rs2487032 | 13.154902 | NaN | set([19]) | 966 | 1005 | 1971 | 1 | chr9 | 106731644 | 106747174 |
1 | DOID:1686 | glaucoma | PMM2 | 25173107 | NaN | 2014-08-31 | set([]) | rs3785176 | 6.698970 | NaN | set([5373]) | 966 | 1005 | 1971 | 0 | chr16 | 8804432 | 8854102 |
2 | DOID:1686 | glaucoma | CDKN2B-AS1 | 25173105 | NaN | 2014-08-31 | set([100048912]) | rs4977756 | 29.154902 | NaN | set([100048912]) | 1155 | 1922 | 3077 | 1 | chr9 | 21993367 | 22111349 |
3 | DOID:1686 | glaucoma | CDKN2B-AS1 | 25173105 | NaN | 2014-08-31 | set([102724137]) | rs4977756 | 29.154902 | NaN | set([100048912]) | 1155 | 1922 | 3077 | 1 | chr9 | 21993367 | 22111349 |
4 | DOID:1686 | glaucoma | CDKN2B-AS1 | 21532571 | NaN | 2011-05-01 | set([100048912]) | rs4977756 | 14.000000 | NaN | set([100048912]) | 590 | 3956 | 4546 | 1 | chr9 | 21993367 | 22111349 |
def identify_loci(disease_df):
    """Group a disease's associations into loci of overlapping genomic windows.

    Associations are processed in precedence order (high-confidence first,
    then by p-value strength, then recency). Each association is merged into
    the first existing locus whose window overlaps its own window on the same
    chromosome; otherwise it seeds a new locus. Returns the input rows with an
    added integer `locus` column.
    """
    # DataFrame.sort was removed from pandas; sort_values is the replacement
    disease_df = disease_df.sort_values(
        ['high_confidence', 'mlog_pvalue', 'date'], ascending=False)
    loci = dict()
    for i, row in disease_df.iterrows():
        interval = row.lead_chrom, row.lower_coord, row.upper_coord
        for chrom, lower, upper in loci:
            # same chromosome and either endpoint falls inside the locus window
            if interval[0] == chrom and (
                    (lower <= interval[1] and interval[1] <= upper) or
                    (lower <= interval[2] and interval[2] <= upper)):
                loci[(chrom, lower, upper)].append(row)
                break
        else:
            loci[interval] = [row]
    # convert each locus to a dataframe tagged with its locus index
    loci = list(map(pandas.DataFrame, loci.values()))
    for i, locus_df in enumerate(loci):
        locus_df['locus'] = i
    return pandas.concat(loci)
# Assign loci independently within each disease
association_df = association_df.groupby('doid_code', as_index=False).apply(identify_loci)

# Save associations to tsv, serializing the gene sets as pipe-delimited strings
association_write_df = association_df.copy()
for column in 'reported_genes', 'containing_genes':
    association_write_df[column] = association_write_df[column].map('|'.join)
association_write_df.to_csv('data/snp-associations.tsv', sep='\t', index=False)
def max_counter_keys(counter, key_subset=None):
    """Return every key whose count equals the maximum count in the counter.

    When key_subset is given, only keys in the subset are considered. An empty
    (or fully filtered-out) counter yields an empty list.
    """
    if key_subset is not None:
        filtered = {key: count for key, count in counter.items() if key in key_subset}
        counter = collections.Counter(filtered)
    # include 0 so an empty counter yields max_value == 0 (and no keys)
    max_value = max([0, *counter.values()])
    return [key for key, count in counter.items() if count == max_value]
def associate_by_gene(locus_df):
    """Resolve one locus's associations to a primary gene plus secondary genes.

    The primary gene is chosen by, in order: the single most-reported coding
    gene across all studies in the locus; then, per association (in precedence
    order), the single coding gene containing the lead SNP, the most-reported
    of multiple containing genes, or the most-reported of the flanking
    upstream/downstream genes. All other coding genes seen in the locus are
    secondary. Returns a dataframe with one row per (primary or secondary)
    gene for the locus.
    """
    # sort associations by precedence
    # (DataFrame.sort was removed from pandas; sort_values is the replacement)
    locus_df = locus_df.sort_values(
        ['high_confidence', 'mlog_pvalue', 'date'], ascending=False)
    # Count the number of times each gene is reported across all studies in the locus
    reported_counts = collections.Counter()
    for genes in locus_df.reported_genes:
        reported_counts.update(genes)
    primary_gene = None
    # Identify a single mode reported gene
    mode_reported = max_counter_keys(reported_counts, coding_genes)
    if len(mode_reported) == 1:
        primary_gene, = mode_reported
    # Iterate through associations attempting to resolve primary-gene ambiguity
    for i, row in locus_df.iterrows():
        if primary_gene:
            break
        # Find coding genes containing the lead SNP
        containing_genes = row.containing_genes & coding_genes
        # If a lead SNP is in exactly one gene, consider the containing gene primary
        if len(containing_genes) == 1:
            primary_gene, = containing_genes
            break
        # If the lead SNP is in multiple genes, take the mode reported of those genes
        mode_reported = max_counter_keys(reported_counts, containing_genes)
        if len(mode_reported) == 1:
            primary_gene, = mode_reported
            break
        # Take the more reported gene from the upstream and downstream gene
        stream_genes = {gene for gene in [row.downstream_gene, row.upstream_gene] if pandas.notnull(gene)}
        mode_reported = max_counter_keys(reported_counts, coding_genes & stream_genes)
        if len(mode_reported) == 1:
            primary_gene, = mode_reported
            break
    # All coding genes except the primary are considered secondary
    secondary_genes = set()
    for column in ['reported_genes', 'containing_genes']:
        for genes in locus_df[column]:
            secondary_genes |= genes
    secondary_genes |= set(locus_df.upstream_gene)
    secondary_genes |= set(locus_df.downstream_gene)
    secondary_genes = set(filter(pandas.notnull, secondary_genes))
    secondary_genes.discard(primary_gene)
    secondary_genes &= coding_genes
    # Create a dataframe to return: one row per gene, flagged primary (1) or not (0)
    columns = ['doid_code', 'doid_name', 'locus', 'high_confidence', 'primary', 'gene']
    rows = list()
    first_row = locus_df.iloc[0]
    base_row = first_row.doid_code, first_row.doid_name, first_row.locus, first_row.high_confidence
    if primary_gene:
        rows.append(base_row + (1, primary_gene))
    for secondary_gene in secondary_genes:
        rows.append(base_row + (0, secondary_gene))
    gene_df = pandas.DataFrame(rows, columns=columns)
    return gene_df
# Resolve each (disease, locus) group to primary/secondary genes
grouped = association_df.groupby(['doid_code', 'locus'], as_index=False)
gene_df = grouped.apply(associate_by_gene)
for column in 'locus', 'high_confidence', 'primary':
    gene_df[column] = gene_df[column].astype(int)
# Encode confidence/primacy as a compact status code, e.g. 'HC-P', 'LC-S'
status = gene_df.apply(lambda x: '{}C-{}'.format('H' if x.high_confidence else 'L', 'P' if x.primary else 'S'), axis=1)
gene_df.insert(5, 'status', status)
gene_df = gene_df.merge(symbol_df, how='left')
# Remove duplicated associations, keeping the highest-precedence status.
# DataFrame.sort was removed from pandas; sort_values is the replacement.
gene_df = gene_df.sort_values(['doid_code', 'high_confidence', 'primary'], ascending=False)
associations_with_dups = len(gene_df)
gene_df = gene_df.drop_duplicates(['doid_code', 'gene'])
'{} associations removed that were duplicated across statuses'.format(associations_with_dups - len(gene_df))
'274 associations removed that were duplicated across statuses'
# save gene_df (one row per disease-gene association) as a tsv
gene_df.to_csv('data/gene-associations.tsv', sep='\t', index=False)
gene_df.head()
doid_code | doid_name | locus | high_confidence | primary | status | gene | symbol | |
---|---|---|---|---|---|---|---|---|
6097 | DOID:9970 | obesity | 0 | 1 | 1 | HC-P | 3953 | LEPR |
6114 | DOID:9970 | obesity | 14 | 1 | 1 | HC-P | 4094 | MAF |
6116 | DOID:9970 | obesity | 15 | 1 | 1 | HC-P | 3667 | IRS1 |
6123 | DOID:9970 | obesity | 20 | 1 | 1 | HC-P | 7021 | TFAP2B |
6124 | DOID:9970 | obesity | 21 | 1 | 1 | HC-P | 79068 | FTO |
# Status codes ordered by precedence: high/low confidence x primary/secondary
statuses = ['HC-P', 'HC-S', 'LC-P', 'LC-S']

def summarize_gene_df(df):
    """Count associations per status code, returning a Series indexed by status.

    Statuses absent from df get a count of 0. Constructing the Series in one
    shot (rather than filling an empty, object-dtype pandas.Series()) avoids
    the deprecated empty-Series construction and yields an integer dtype.
    """
    counter = collections.Counter(df.status)
    return pandas.Series([counter[status] for status in statuses],
                         index=statuses, dtype=int)
# Count the number of associations per disease, sorted by status precedence.
# DataFrame.sort was removed from pandas; sort_values is the replacement.
disease_summary_df = gene_df.groupby(['doid_code', 'doid_name']).apply(summarize_gene_df)
disease_summary_df = disease_summary_df.reset_index()
disease_summary_df = disease_summary_df.sort_values(statuses, ascending=False)
disease_summary_df.to_csv('data/diseases.tsv', sep='\t', index=False)
disease_summary_df.head()
doid_code | doid_name | HC-P | HC-S | LC-P | LC-S | |
---|---|---|---|---|---|---|
84 | DOID:8778 | Crohn's disease | 86 | 131 | 10 | 13 |
72 | DOID:5419 | schizophrenia | 79 | 236 | 76 | 57 |
78 | DOID:7148 | rheumatoid arthritis | 79 | 43 | 48 | 30 |
92 | DOID:9352 | type 2 diabetes mellitus | 67 | 31 | 52 | 29 |
43 | DOID:1612 | breast cancer | 58 | 68 | 42 | 30 |
# Count the number of associations per gene, sorted by status precedence.
# DataFrame.sort was removed from pandas; sort_values is the replacement.
gene_summary_df = gene_df.groupby(['gene', 'symbol']).apply(summarize_gene_df)
gene_summary_df = gene_summary_df.reset_index()
gene_summary_df = gene_summary_df.sort_values(statuses, ascending=False)
gene_summary_df.to_csv('data/genes.tsv', sep='\t', index=False)
gene_summary_df.head()
gene | symbol | HC-P | HC-S | LC-P | LC-S | |
---|---|---|---|---|---|---|
1971 | 4609 | MYC | 7 | 0 | 0 | 0 |
3051 | 7015 | TERT | 6 | 1 | 0 | 0 |
2564 | 57178 | ZMIZ1 | 6 | 0 | 5 | 0 |
1574 | 3394 | IRF8 | 6 | 0 | 1 | 0 |
1651 | 3559 | IL2RA | 6 | 0 | 0 | 2 |