import bz2
import json
import requests
import pandas
import sqlite3
import l1000
# Read symbol to entrez_gene_id mapping
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/symbol-map.json'
symbol_map = json.loads(requests.get(url).text)
# Read dataframe of entrez gene records
url = 'https://github.com/dhimmel/entrez-gene/raw/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv'
entrez_gene_df = pandas.read_table(url)
renamer = {
'GeneID': 'entrez_gene_id',
'Symbol': 'symbol',
'type_of_gene': 'type_of_gene',
'description': 'description',
}
entrez_gene_df = entrez_gene_df.rename(columns=renamer)[list(renamer.values())]
entrez_gene_df.entrez_gene_id = entrez_gene_df.entrez_gene_id.astype(str)
# construct probe_df
probe_df = pandas.read_table('data/geneinfo/geneinfo.tsv.gz')
probe_to_gene = dict(zip(probe_df.pr_id, probe_df.pr_gene_id))
# Landmark probes
landmark_probe_df = probe_df[probe_df.pr_pool_id.str.startswith('epsilon').fillna(False)]
print('landmark genes', len(landmark_probe_df))
# BIGS + Landmark probes
probe_df = probe_df[probe_df.is_bing.fillna(False)]
# If a gene is an epsilon landmark, do not include imputed probes
probe_df.query("pr_gene_id not in @landmark_probe_df.pr_gene_id or pr_id in @landmark_probe_df.pr_id")
print('best inferred gene set size', len(probe_df))
probe_df.head(2)
('landmark genes', 978) ('best inferred gene set size', 10638)
pr_id | pr_gene_id | pr_gene_symbol | pr_gene_title | is_lm | is_l1000 | is_bing | pr_pool_id | |
---|---|---|---|---|---|---|---|---|
1831 | 218075_at | 8086 | AAAS | achalasia, adrenocortical insufficiency, alacr... | False | True | True | inferred |
1833 | 218434_s_at | 65985 | AACS | acetoacetyl-CoA synthetase | False | True | True | inferred |
# Create a dataset of the genes represented by probes
gene_df = probe_df.groupby('pr_gene_id').apply(
lambda df: pandas.Series({'status': 'measured' if df.pr_pool_id.iloc[0].startswith('epsilon') else 'imputed'})
)
gene_df.index.name = 'entrez_gene_id'
gene_df = gene_df.reset_index()
gene_df = gene_df.merge(entrez_gene_df).sort_values('entrez_gene_id')
gene_df.to_csv('data/consensi/genes.tsv', index=False, sep='\t')
gene_df.head(2)
entrez_gene_id | status | symbol | type_of_gene | description | |
---|---|---|---|---|---|
0 | 100 | imputed | ADA | protein-coding | adenosine deaminase |
1 | 1000 | imputed | CDH2 | protein-coding | cadherin 2, type 1, N-cadherin (neuronal) |
# Number of genes after converting from probe to gene
gene_df.status.value_counts()
imputed 6489 measured 978 Name: status, dtype: int64
# Filter BING probes for valid entrez gene IDs
probe_df = probe_df.query("pr_gene_id in @gene_df.entrez_gene_id")
# Probes in BING
probe_df.pr_pool_id.value_counts()
inferred 9490 epsilon|deltap 786 epsilon 192 deltap 165 Name: pr_pool_id, dtype: int64
def run_consensi(sig_expr_df, pert_to_sigs, name):
"""Compute consensi signatures"""
pert_expr_df = l1000.get_consensus_signatures(sig_expr_df, pert_to_sigs, weighting_subset=landmark_probe_df.pr_id)
pert_expr_df = l1000.probes_to_genes(pert_expr_df, probe_to_gene)
pert_expr_df = pert_expr_df.transpose()
pert_expr_df.index.name = 'perturbagen'
print(pert_expr_df.shape)
path = 'data/consensi/consensi-{}.tsv.bz2'.format(name)
with bz2.BZ2File(path, 'w') as write_file:
pert_expr_df.reset_index().to_csv(write_file, sep='\t', index=False, float_format='%.3f')
return pert_expr_df
# open database connection
connection = sqlite3.connect('data/l1000.db')
%%time
# get all gold signatures
query = "SELECT sigs.sig_id FROM sigs WHERE sigs.is_gold = 1"
sigs = pandas.read_sql(query, connection).sig_id.tolist()
# get probes and extract signatures
probes = probe_df.pr_id.tolist()
path = 'download/modzs.gctx'
sig_expr_df = l1000.extract_from_gctx(path, probes, sigs)
CPU times: user 56min 42s, sys: 9min 9s, total: 1h 5min 52s Wall time: 1h 6min 47s
query = """
SELECT unichem.resource_id AS drugbank_id, perts.pert_id, sigs.sig_id, sigs.is_gold
FROM unichem, perts, sigs
WHERE unichem.resource = 'drugbank'
AND unichem.pert_uid = perts.pert_uid
AND sigs.pert_id = perts.pert_id
"""
sig_df = pandas.read_sql(query, connection)
sig_df = sig_df.query("is_gold == 1")
%%time
pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('drugbank_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='drugbank')
(1170, 7467) CPU times: user 1h 54min 57s, sys: 1min 16s, total: 1h 56min 14s Wall time: 1h 56min 28s
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND pert_type = 'trt_sh'
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)
sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)
sig_df.head(2)
pert_id | pert_iname | pert_type | sig_id | pertubation_entrez_gene_id | |
---|---|---|---|---|---|
0 | TRCN0000139323 | PTMS | trt_sh | DER001_HA1E_96H:TRCN0000139323:-666 | 5763 |
1 | TRCN0000005420 | RIOK3 | trt_sh | DER001_HA1E_96H:TRCN0000005420:-666 | 8780 |
%%time
# Condense to perturbagens
pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='knockdown')
(4326, 7467) CPU times: user 5h 38min 35s, sys: 1min 5s, total: 5h 39min 40s Wall time: 5h 40min 16s
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND pert_type = 'trt_oe'
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)
sig_df['pertubation_entrez_gene_id'] = sig_df.pert_iname.map(symbol_map.get)
sig_df.head(2)
pert_id | pert_iname | pert_type | sig_id | pertubation_entrez_gene_id | |
---|---|---|---|---|---|
0 | CMAP-HSF-DSTYK | DSTYK | trt_oe | HSF001_HEK293T_48H:CMAP-HSF-DSTYK:200 | 25778 |
1 | CMAP-HSF-DYRK1B | DYRK1B | trt_oe | HSF001_HEK293T_48H:CMAP-HSF-DYRK1B:200 | 9149 |
%%time
# Condense to perturbagens
pert_to_sigs = {int(k): g['sig_id'].tolist() for k, g in sig_df.groupby('pertubation_entrez_gene_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='overexpression')
(2413, 7467) CPU times: user 2h 25min 22s, sys: 6.9 s, total: 2h 25min 29s Wall time: 2h 25min 42s
query = """
SELECT perts.pert_id, perts.pert_iname, perts.pert_type, sigs.sig_id
FROM perts, sigs
WHERE sigs.pert_id = perts.pert_id
AND sigs.is_gold = 1
"""
sig_df = pandas.read_sql(query, connection)
sig_df.head(2)
pert_id | pert_iname | pert_type | sig_id | |
---|---|---|---|---|
0 | BRD-K07762753 | aminopurvalanol-a | trt_cp | CVD001_HUH7_24H:BRD-K07762753-001-03-6:50 |
1 | BRD-A46393198 | tetramisole | trt_cp | CPC004_VCAP_6H:BRD-A46393198-003-10-9:10 |
%%time
# Condense to perturbagens
pert_to_sigs = {k: g['sig_id'].tolist() for k, g in sig_df.groupby('pert_id')}
pert_expr_df = run_consensi(sig_expr_df, pert_to_sigs, name='pert_id')
(38327, 7467) CPU times: user 1d 18h 22min 52s, sys: 6min 7s, total: 1d 18h 29min Wall time: 1d 18h 33min 36s
pert_expr_df.head(2)
100 | 1000 | 10000 | 10001 | 10005 | 10006 | 10007 | 1001 | 10010 | 100129250 | ... | 998 | 9984 | 9986 | 9987 | 9988 | 9989 | 999 | 9991 | 9994 | 9997 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
perturbagen | |||||||||||||||||||||
56582 | -0.433639 | 0.797953 | -0.089733 | -0.627366 | 0.132508 | -0.114487 | -1.254363 | 0.089211 | -0.327051 | -0.492944 | ... | -0.128962 | -0.018879 | -0.026018 | -0.300755 | 0.447996 | 0.39047 | 2.441479 | -0.325544 | 0.481472 | -0.690715 |
5981 | 0.953652 | -0.170241 | -1.344758 | 0.200088 | 3.032086 | 1.175347 | -0.768220 | 1.000292 | 1.883492 | 0.335095 | ... | 0.045078 | -0.595802 | -0.201252 | -0.149790 | -1.290155 | 1.35908 | 0.803665 | -1.481666 | 0.970973 | 0.088374 |
2 rows × 7467 columns