import sys
import pandas
# local imports
sys.path.insert(0, '../')
import utils
# Disease Ontology slim terms, read from the dhimmel/disease-ontology
# repository at a fixed commit (utils.rawgit presumably builds the raw-file
# URL -- confirm against utils).
commit = '72614ade9f1cc5a5317b8f6836e1e464b31d5587'
url = utils.rawgit('dhimmel', 'disease-ontology', commit, 'data/slim-terms.tsv')
slim_terms = pandas.read_table(url)
# Rename to the doid_* column convention used by the other tables, then keep
# only the identifier and name columns.
disease_columns = {'doid': 'doid_id', 'name': 'doid_name'}
disease_df = slim_terms.rename(columns=disease_columns)[['doid_id', 'doid_name']]
disease_df.head(2)
| | doid_id | doid_name |
---|---|---|
0 | DOID:2531 | Hematologic cancer |
1 | DOID:1319 | Brain cancer |
# Human genes from the dhimmel/entrez-gene repository at a fixed commit.
commit = '6e133f9ef8ce51a4c5387e58a6cc97564a66cec8'
url = utils.rawgit('dhimmel', 'entrez-gene', commit, 'data/genes-human.tsv')
all_genes = pandas.read_table(url)
# Restrict to protein-coding genes before renaming and trimming columns.
is_protein_coding = all_genes.type_of_gene == 'protein-coding'
gene_columns = {'GeneID': 'entrez_gene_id', 'Symbol': 'gene_symbol'}
gene_df = all_genes[is_protein_coding].rename(columns=gene_columns)
gene_df = gene_df[['entrez_gene_id', 'gene_symbol']]
gene_df.head(2)
| | entrez_gene_id | gene_symbol |
---|---|---|
0 | 1 | A1BG |
1 | 2 | A2M |
# DISEASES: disease-gene associations from the dhimmel/diseases repository
# at a fixed commit. The file is tab-separated.
commit = 'e0089ef89a56348d7d4e0684a9c51c5747b16237'
url = utils.rawgit('dhimmel', 'diseases', commit, 'data/merged-slim.tsv')
diseases_df = pandas.read_csv(url, sep='\t')
diseases_df.head(2)
| | doid_id | doid_name | entrez_gene_id | gene_symbol | score_text | score_knowledge | score_cosmic | score_distild | score_integrated_no_distild | score_integrated |
---|---|---|---|---|---|---|---|---|---|---|
0 | DOID:13223 | uterine fibroid | 60 | ACTB | 0.8 | NaN | NaN | NaN | 0.8 | 0.8 |
1 | DOID:13223 | uterine fibroid | 71 | ACTG1 | 0.8 | NaN | NaN | NaN | 0.8 | 0.8 |
# DOAF: associations from the dhimmel/doaf repository at a fixed commit.
commit = 'bbe1c326aa385416e36d02b144e89e2b99e700b6'
url = utils.rawgit('dhimmel', 'doaf', commit, 'data/doaf.tsv')
# Align the key columns with the doid_id / entrez_gene_id names used by the
# other tables.
doaf_columns = {'doid_code': 'doid_id', 'GeneID': 'entrez_gene_id'}
doaf_df = pandas.read_csv(url, sep='\t').rename(columns=doaf_columns)
doaf_df.head(3)
| | doid_id | doid_name | entrez_gene_id | Symbol | count |
---|---|---|---|---|---|
0 | DOID:0001816 | angiosarcoma | 302 | ANXA2 | 1 |
1 | DOID:0001816 | angiosarcoma | 595 | CCND1 | 1 |
2 | DOID:0001816 | angiosarcoma | 2324 | FLT4 | 1 |
# DisGeNET: associations from the dhimmel/disgenet repository at a fixed
# commit.
commit = 'fdc5f42f2da745cbf71d7b4cc5021de5685e4a11'
url = utils.rawgit('dhimmel', 'disgenet', commit, 'data/consolidated.tsv')
# Align the key columns with the doid_id / entrez_gene_id names used by the
# other tables.
disgenet_columns = {'doid_code': 'doid_id', 'geneId': 'entrez_gene_id'}
disgenet_df = pandas.read_csv(url, sep='\t').rename(columns=disgenet_columns)
disgenet_df.head(2)
| | doid_id | doid_name | entrez_gene_id | geneSymbol | count | pubmeds_max | score_max | score_mean | associationType | source |
---|---|---|---|---|---|---|---|---|---|---|
0 | DOID:0050156 | idiopathic pulmonary fibrosis | 729238 | SFTPA2 | 1 | 1 | 0.620284 | 0.620284 | Biomarker|GeneticVariation | BeFree|CLINVAR|CTD_human|UNIPROT |
1 | DOID:0050156 | idiopathic pulmonary fibrosis | 7015 | TERT | 1 | 10 | 0.422153 | 0.422153 | Biomarker|GeneticVariation | BeFree|CLINVAR|CTD_human|GAD|LHGDN |
# hetio GWAS: gene associations from the dhimmel/gwas-catalog repository at
# a fixed commit.
commit = '0617ea7ea8268f21f5ca1b8dbe487dd12671fc7b'
url = utils.rawgit('dhimmel', 'gwas-catalog', commit, 'data/gene-associations.tsv')
# Align the key columns with the doid_id / entrez_gene_id names used by the
# other tables.
gwas_columns = {'doid_code': 'doid_id', 'gene': 'entrez_gene_id'}
gwas_df = pandas.read_csv(url, sep='\t').rename(columns=gwas_columns)
gwas_df.head(2)
| | doid_id | doid_name | locus | high_confidence | primary | status | entrez_gene_id | symbol |
---|---|---|---|---|---|---|---|---|
0 | DOID:9970 | obesity | 0 | 1 | 1 | HC-P | 3953 | LEPR |
1 | DOID:9970 | obesity | 14 | 1 | 1 | HC-P | 4094 | MAF |
# Filter each resource on its own confidence column, then tag the surviving
# rows with the resource name and license.
#
# The tags are attached with ``assign`` (which returns a new DataFrame)
# instead of assigning columns onto the result of ``query`` / boolean
# indexing, which raises pandas' SettingWithCopyWarning.
#
# DOAF gets an empty license string; ``condense`` discards empty strings
# when it determines the license of a consolidated association.
diseases_df = diseases_df.query('score_integrated_no_distild >= 2').assign(
    provenance='DISEASES', license='CC BY 4.0')
doaf_df = doaf_df.query('count >= 3').assign(
    provenance='DOAF', license='')
disgenet_df = disgenet_df.query('score_max >= 0.06').assign(
    provenance='DisGeNET', license='ODbL 1.0')
gwas_df = gwas_df[gwas_df.status == 'HC-P'].assign(
    provenance='GWAS Catalog', license='CC BY 4.0')

# Stack the four resources using only the columns they all share.
shared_columns = ['doid_id', 'entrez_gene_id', 'provenance', 'license']
dfs = [df[shared_columns]
       for df in (diseases_df, doaf_df, disgenet_df, gwas_df)]
concat_df = pandas.concat(dfs)

# Inner-join against the gene and disease tables: this drops associations
# whose gene or disease is absent from those tables and attaches the
# gene_symbol / doid_name columns. The join keys are spelled out so that a
# future shared column cannot silently change the join.
concat_df = disease_df.merge(
    gene_df.merge(concat_df, on='entrez_gene_id'), on='doid_id')
concat_df.provenance.value_counts()
DisGeNET        7552
DISEASES        4990
DOAF            1649
GWAS Catalog    1284
Name: provenance, dtype: int64
def condense(df):
    """Consolidate multiple associations into a single Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to consolidate; must provide ``provenance`` and ``license``
        columns.

    Returns
    -------
    pandas.Series
        Object-dtype Series with two entries, in this order:

        ``sources``
            The ``provenance`` values joined with ``|`` in row order.
        ``license``
            The single non-empty license shared by the rows, or ``None``
            when the rows carry no non-empty license or more than one
            distinct license.
    """
    # Empty strings mark rows without a license; they must not count as a
    # distinct license.
    licenses = set(df.license) - {''}
    # Only an unambiguous license is reported.
    license_ = licenses.pop() if len(licenses) == 1 else None
    # Build the Series in one step with an explicit object dtype: a bare
    # ``pandas.Series()`` has no explicit dtype (newer pandas warns about the
    # changing default), and object dtype keeps ``None`` from being coerced.
    return pandas.Series(
        ['|'.join(df.provenance), license_],
        index=['sources', 'license'],
        dtype=object,
    )
# Collapse the per-resource rows into one row per (disease, gene) pair;
# condense builds the 'sources' and 'license' values for each pair.
pair_groups = concat_df.groupby(['doid_id', 'entrez_gene_id'])
short_df = pair_groups.apply(condense).reset_index()
# The grouped result only carries the two key columns, so merge with the
# gene and disease tables again to bring back gene_symbol and doid_name.
genes_with_pairs = gene_df.merge(short_df)
short_df = disease_df.merge(genes_with_pairs)
short_df.head()
| | doid_id | doid_name | entrez_gene_id | gene_symbol | sources | license |
---|---|---|---|---|---|---|
0 | DOID:2531 | Hematologic cancer | 25 | ABL1 | DISEASES|DisGeNET | None |
1 | DOID:2531 | Hematologic cancer | 27 | ABL2 | DisGeNET | ODbL 1.0 |
2 | DOID:2531 | Hematologic cancer | 54 | ACP5 | DISEASES | CC BY 4.0 |
3 | DOID:2531 | Hematologic cancer | 113 | ADCY7 | DisGeNET | ODbL 1.0 |
4 | DOID:2531 | Hematologic cancer | 142 | PARP1 | DISEASES|DisGeNET | None |
# Write the consolidated associations as a tab-separated file, leaving out
# the DataFrame index.
output_path = 'DaG-association.tsv'
short_df.to_csv(output_path, sep='\t', index=False)