import rdkit
import rdkit.Chem
import rdkit.Chem.inchi
import rdkit.Chem.AllChem
import rdkit.DataStructs
import pandas
import sqlite3
# Establish database connection
connection = sqlite3.connect('data/l1000.db')
cursor = connection.cursor()
query = """
SELECT * FROM perts
WHERE pert_type == 'trt_cp'
AND inchi_string NOTNULL;
"""
pert_df = pandas.read_sql(query, connection)
pert_df.head()
pert_uid | pert_id | pert_iname | pert_type | num_gold | num_inst | num_sig | in_summly | inchi_string | inchi_key | pubchem_cid | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12 | BRD-K68741898 | BRD-K68741898 | trt_cp | 0 | 3 | 1 | 0 | InChI=1S/C21H37N5O5/c1-15(2)13-31-21(29)24(5)1... | MNAJUJYQFCFYAB-YQVWRLOYSA-N | 44505553 |
1 | 13 | BRD-A05457250 | BAY-K8644 | trt_cp | 1 | 23 | 8 | 0 | InChI=1S/C16H15F3N2O4/c1-8-12(15(22)25-3)13(14... | ZFLWDHHVRRZMEI-UHFFFAOYSA-N | 2303 |
2 | 14 | BRD-K72034655 | peucedanin | trt_cp | 2 | 33 | 8 | 0 | InChI=1S/C15H14O4/c1-8(2)14-15(17-3)10-6-9-4-5... | YQBNJPACAUPNLV-UHFFFAOYSA-N | 8616 |
3 | 15 | BRD-K02458594 | KU-C103869 | trt_cp | 2 | 44 | 13 | 0 | InChI=1S/C15H13NO/c1-11-6-2-5-9-14(11)16-10-12... | YCUIAYUVYLNFFS-UHFFFAOYSA-N | 21785456 |
4 | 16 | BRD-K18814832 | BRD-K18814832 | trt_cp | 1 | 49 | 13 | 0 | InChI=1S/C27H23ClN2O3S/c1-18-12-13-19(2)25(16-... | ZQJTYJZLKBRKPC-UHFFFAOYSA-N | 2228302 |
rows = list()
for i, series in pert_df.iterrows():
# check InChI Keys
inchi = series.inchi_string
inchi_key = rdkit.Chem.inchi.InchiToInchiKey(inchi)
assert inchi_key == series.inchi_key
# molecule
mol = rdkit.Chem.MolFromInchi(inchi)
# fingerprint
fingerprint = rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)
rows.append([series.pert_uid, inchi_key, inchi, mol, fingerprint])
inchi_df = pandas.DataFrame(rows, columns=['pert_uid', 'inchi_key', 'inchi_string', 'mol', 'fingerprint'])
inchi_df.sort('inchi_key', inplace=True)
inchi_df.head()
pert_uid | inchi_key | inchi_string | mol | fingerprint | |
---|---|---|---|---|---|
9081 | 39929 | AAALVYBICLMAMA-UHFFFAOYSA-N | InChI=1S/C20H15N3O2/c24-19-15-11-17(21-13-7-3-... | <rdkit.Chem.rdchem.Mol object at 0x7fdfc0ee07b8> | <rdkit.DataStructs.cDataStructs.UIntSparseIntV... |
2664 | 9777 | AACFPJSJOWQNBN-UHFFFAOYSA-N | InChI=1S/C12H11NO3/c14-7-3-4-10-9(6-7)8-2-1-5-... | <rdkit.Chem.rdchem.Mol object at 0x7fdfc18cf668> | <rdkit.DataStructs.cDataStructs.UIntSparseIntV... |
789 | 2195 | AADCDMQTJNYOSS-LBPRGKRZSA-N | InChI=1S/C17H25ClN2O3/c1-4-11-9-13(18)16(23-3)... | <rdkit.Chem.rdchem.Mol object at 0x7fdfc1917518> | <rdkit.DataStructs.cDataStructs.UIntSparseIntV... |
16462 | 47318 | AADVJQLQUVDEBP-GQIGUUNPSA-N | InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... | <rdkit.Chem.rdchem.Mol object at 0x7fdfc0860828> | <rdkit.DataStructs.cDataStructs.UIntSparseIntV... |
19637 | 50497 | AADVJQLQUVDEBP-GUXCAODWSA-N | InChI=1S/C22H36N4O4/c1-14(2)23-22(28)24-17-8-9... | <rdkit.Chem.rdchem.Mol object at 0x7fdfc06fdb38> | <rdkit.DataStructs.cDataStructs.UIntSparseIntV... |
# # Delete similarities table
# cursor.execute('DELETE FROM similarities;')
# connection.commit()
pert_uids = [int(x) for x in inchi_df.pert_uid]
fingerprints = list(inchi_df.fingerprint)
for i, series in inchi_df.iterrows():
print(series.inchi_key, '\r', end='')
similarities = rdkit.DataStructs.BulkDiceSimilarity(series.fingerprint, fingerprints)
similarities = [round(x, 4) for x in similarities]
rows = ((series.pert_uid, pert_uid, similarity) for pert_uid, similarity in zip(pert_uids, similarities))
cursor.executemany('INSERT INTO similarities VALUES (?,?,?)', rows)
connection.commit()
# close database connection
connection.close()
connection = sqlite3.connect('file:data/l1000.db', uri=True)
cursor = connection.cursor()
def get_similarities(pert_ids, connection):
"""Retrieve chemical similarity scores between perts."""
cursor = connection.cursor()
pandas.DataFrame({'pert_id': pert_ids}).to_sql('pert_id_subset', connection, if_exists='replace', index=False)
query = """
CREATE TEMPORARY TABLE perts_subset AS
SELECT *
FROM pert_id_subset
JOIN perts USING (pert_id);
"""
cursor.execute(query)
query = """
SELECT P0.pert_id as pert_id_0, P1.pert_id as pert_id_1, chemical
FROM perts_subset P0, perts_subset P1, similarities
WHERE P0.pert_uid = similarities.pert_uid_0 AND P1.pert_uid = similarities.pert_uid_1
"""
similarity_df = pandas.read_sql(query, connection)
query = """
DROP TABLE pert_id_subset;
DROP TABLE perts_subset;
"""
cursor.executescript(query)
return similarity_df
# test simple case
pert_ids = ['BRD-K68741898', 'BRD-A05457250']
get_similarities(pert_ids, connection)
pert_id_0 | pert_id_1 | chemical | |
---|---|---|---|
0 | BRD-K68741898 | BRD-K68741898 | 1.0000 |
1 | BRD-K68741898 | BRD-A05457250 | 0.1589 |
2 | BRD-A05457250 | BRD-K68741898 | 0.1589 |
3 | BRD-A05457250 | BRD-A05457250 | 1.0000 |
# test scalability
pert_ids = pert_df.pert_id[:1100]
large_df = get_similarities(pert_ids, connection)
large_df.tail()
pert_id_0 | pert_id_1 | chemical | |
---|---|---|---|
1209995 | BRD-A25143711 | BRD-K99946902 | 0.2205 |
1209996 | BRD-A25143711 | BRD-K86465814 | 0.2041 |
1209997 | BRD-A25143711 | BRD-A77722753 | 0.0650 |
1209998 | BRD-A25143711 | BRD-K02715688 | 0.1786 |
1209999 | BRD-A25143711 | BRD-A25143711 | 1.0000 |
# head of similarities table
pandas.read_sql('SELECT * FROM similarities LIMIT 5', connection)
pert_uid_0 | pert_uid_1 | chemical | |
---|---|---|---|
0 | 39929 | 39929 | 1.0000 |
1 | 39929 | 9777 | 0.2689 |
2 | 39929 | 2195 | 0.1324 |
3 | 39929 | 47318 | 0.2078 |
4 | 39929 | 50497 | 0.2078 |
# head of perts table
pandas.read_sql('SELECT * FROM perts LIMIT 5', connection)
pert_uid | pert_id | pert_iname | pert_type | num_gold | num_inst | num_sig | in_summly | inchi_string | inchi_key | pubchem_cid | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CSS001-ATTGCAT | ATTGCAT | trt_sh.css | 0 | 0 | 7 | 0 | None | None | None |
1 | 2 | CSS001-GAGGATA | GAGGATA | trt_sh.css | 0 | 0 | 1 | 0 | None | None | None |
2 | 3 | CSS001-TCAATGA | TCAATGA | trt_sh.css | 0 | 0 | 7 | 0 | None | None | None |
3 | 4 | CSS001-TCAGTTC | TCAGTTC | trt_sh.css | 0 | 0 | 7 | 0 | None | None | None |
4 | 5 | CSS001-TCCATCA | TCCATCA | trt_sh.css | 0 | 0 | 1 | 0 | None | None | None |