import pandas as pd
import numpy as np
import os
from rdkit import Chem
from rdkit import RDPaths
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import MolDraw2DSVG
from bokeh.plotting import ColumnDataSource, figure, output_file, show
from sklearn.decomposition import PCA
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from bokeh.io import output_notebook
output_notebook()
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Table
from sqlalchemy import Column, BIGINT, Index
from sqlalchemy.orm import mapper
from sqlalchemy.ext.declarative import declarative_base
from razi import rdkit_postgresql
from razi.rdkit_postgresql.types import Mol
from razi.rdkit_postgresql.types import Bfp
from razi.rdkit_postgresql.functions import mol_amw
# Engine for the local database named "chembl_27".
# The dialect is spelled "postgresql": the legacy "postgres" alias was
# deprecated for a long time and removed in SQLAlchemy 1.4, where it fails
# with NoSuchModuleError. "postgresql" is accepted by every version.
db = create_engine(
    'postgresql://iwatobipen@localhost:5432/chembl_27',
    # echo=True,  # uncomment to log the SQL emitted by the engine
)

# All tables declared against this MetaData live in the "rdk" schema.
metadata = MetaData(schema='rdk')

# Declarative base class shared by the ORM models below.
Base = declarative_base()
class Mols(Base):
    """ORM mapping for the ``mols`` table.

    Columns:
        molregno: BIGINT primary key.
        m: molecule column of razi's ``Mol`` type.
    """

    # The GiST index is declared inline on the Table and targets the real
    # molecule column ``m``. It used to sit in ``__table_args__`` and name a
    # non-existent column ``structure``; declarative does not apply
    # ``__table_args__`` when an explicit ``__table__`` is supplied, so that
    # index was never attached to the table.
    __table__ = Table(
        'mols',
        metadata,
        Column('molregno', BIGINT, primary_key=True),
        Column('m', Mol),
        Index('molidx', 'm', postgresql_using='gist'),
        extend_existing=True,
    )

    def __repr__(self):
        """Return ``(molregno) < structure >``.

        The structure is shown as SMILES when ``m`` is an RDKit ``Chem.Mol``;
        otherwise the raw column value is formatted as-is.
        """
        structure = self.m
        if isinstance(structure, Chem.Mol):
            structure = Chem.MolToSmiles(structure)
        return '(%s) < %s >' % (self.molregno, structure)
class Fps(Base):
    """ORM mapping for the ``fps`` table.

    Holds a BIGINT ``molregno`` primary key and three ``Bfp`` fingerprint
    columns: ``torsionbv``, ``mfp2`` and ``ffp2``.
    """

    __table__ = Table(
        'fps',
        metadata,
        Column('molregno', BIGINT, primary_key=True),
        # The fingerprint columns share one type, so build them in one pass
        # (same names, same order).
        *[Column(fp_name, Bfp) for fp_name in ('torsionbv', 'mfp2', 'ffp2')],
        extend_existing=True
    )
from IPython.display import display

# Session factory bound to the engine, and the session used for all queries.
Session = sessionmaker(bind=db)
session = Session()

# Unfiltered query over the Mols mapping.
cpds = session.query(Mols)

# Preview: show the molecule and the textual repr of the first four rows.
first_rows = cpds[:4]
for row in first_rows:
    display(row.m)
    print(row)
# Output of the preview loop above:
# (1) < Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl >
# (2) < Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1 >
# (3) < Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1 >
# (4) < Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1 >
# Compounds containing the substructure 'n1cccc2ccccc12' (a quinoline ring
# system) whose mol_amw value lies in the interval (250, 350].
cpds = (
    session.query(Mols)
    .filter(Mols.m.hassubstruct('n1cccc2ccccc12'))
    .filter(mol_amw(Mols.m) <= 350)
    .filter(mol_amw(Mols.m) > 250)
)

# Materialise the hits; each row exposes the ``m`` and ``molregno`` columns.
hitrows = list(cpds)
def mol2svg(mol, width=200, height=100):
    """Render a molecule as an SVG document string.

    Args:
        mol: RDKit molecule to draw.
        width: Canvas width passed to ``MolDraw2DSVG``. Defaults to 200, the
            size that was previously hard-coded.
        height: Canvas height passed to ``MolDraw2DSVG``. Defaults to 100,
            the size that was previously hard-coded.

    Returns:
        The drawing text produced by the SVG drawer.
    """
    drawer = rdMolDraw2D.MolDraw2DSVG(width, height)
    drawer.DrawMolecule(mol)
    # FinishDrawing must be called before the drawing text is read back.
    drawer.FinishDrawing()
    return drawer.GetDrawingText()
def mol2fp(mol, radi=2, nBits=1024):
    """Return the Morgan bit-vector fingerprint of ``mol`` as a NumPy array.

    Args:
        mol: RDKit molecule to fingerprint.
        radi: Radius passed to ``GetMorganFingerprintAsBitVect``.
        nBits: Number of bits requested for the fingerprint.

    Returns:
        The NumPy array that ``DataStructs.ConvertToNumpyArray`` filled in.
    """
    bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radi, nBits=nBits)
    # One-element placeholder handed to ConvertToNumpyArray, which is
    # expected to resize and fill it in place.
    out = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bitvect, out)
    return out
# Work on at most the first 400 hits so the plot stays manageable.
top_hits = hitrows[:400]
mols = [hit.m for hit in top_hits]
ids = [hit.molregno for hit in top_hits]

# One SVG depiction and one fingerprint row per molecule.
molsvgs = list(map(mol2svg, mols))
fparrs = np.array(list(map(mol2fp, mols)), dtype=np.int32)
fparrs.shape
# Output: (400, 1024)
# Reduce the fingerprint matrix to two PCA components for plotting.
pca = PCA(n_components=2)
pca_res = pca.fit_transform(fparrs)

# Column-oriented payload for the plot: the two component columns, the
# molregno ids and the SVG depictions.
data = {
    'x': pca_res[:, 0],
    'y': pca_res[:, 1],
    'ids': ids,
    'img': molsvgs,
}
# Hover template: shows the molregno and the SVG from the ``img`` column.
# ``{safe}`` asks Bokeh to insert the value as raw markup, not escaped text.
TOOLTIPS = """
<div>
ChEMBL-molregno: @ids<br>
@img{safe}
</div>
"""

source = ColumnDataSource(data)

# ``width``/``height`` replace ``plot_width``/``plot_height``, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=600, height=400, tooltips=TOOLTIPS)
p.circle('x', 'y', size=10, source=source)
show(p)