Functions defined in this notebook is:
from rdkit.Chem import rdmolops
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import DrawingOptions
smi='CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'
mol=Chem.MolFromSmiles(smi)
def show_mol(m):
DrawingOptions.includeAtomNumbers=True
view= Chem.Draw.MolToImage(m)
DrawingOptions.includeAtomNumbers=False
return view
show_mol(mol)
Warning: unable to load font metrics from dir c:\users\shojiro_shibayama\miniconda3\envs\cheminfo\lib\site-packages\rdkit\sping\PIL\pilfonts
rdmolops.FindAllSubgraphsOfLengthN
is not useful this time, because we want to extract linear fragment, not branched fragments.
import networkx as nx
n_len=3 # 3-lengths paths
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)
G = nx.Graph()
lst=[]
for bond_ix in lst_paths[16]:# 1, 16
bond=mol.GetBondWithIdx(bond_ix)
lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
G.add_edges_from(lst)
print(G.adj)
print(G.adjacency())
{7: {10: {}}, 10: {7: {}, 12: {}}, 12: {10: {}, 14: {}}, 14: {12: {}}} <dict_itemiterator object at 0x000001BA6C083C28>
def bond_indices_to_nodes(m, bond_indices):
G = nx.Graph()
lst=[]
for bond_ix in bond_indices:
bond= m.GetBondWithIdx(bond_ix)
lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
G.add_edges_from(lst)
return G
def list_end_points_indices(g):
return [k for k,v in g.adjacency()if len(v)==1]
def sort_atoms_in_indices(ix_start,g):
indices_sorted = []
atom_indices = set(g.nodes)
atom_indices -= {ix_start}
next_ix = ix_start
indices_sorted += [next_ix]
while any(k in atom_indices for k in list(g.adj[next_ix].keys())):
if list(g.adj[next_ix].keys())[0] in atom_indices:
next_ix = list(g.adj[next_ix].keys())[0]
elif list(g.adj[next_ix].keys())[1] in atom_indices:
next_ix = list(g.adj[next_ix].keys())[1]
indices_sorted += [next_ix]
atom_indices -= {next_ix}
return indices_sorted
gg = bond_indices_to_nodes(mol,list(lst_paths[-1]))
ind = list_end_points_indices(gg)
print(ind)
print(sort_atoms_in_indices(ind[0], gg),sort_atoms_in_indices(ind[1], gg))
[11, 14] [11, 10, 12, 14] [14, 12, 10, 11]
for p in lst_paths:
print('='*10)
lst=[]
for bond_ix in p:
bond = mol.GetBondWithIdx(bond_ix)
lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
print(lst)
gg = bond_indices_to_nodes(mol, list(p))
ind = list_end_points_indices(gg)
print('start and end', ind)
print('function output',sort_atoms_in_indices(ind[0],gg))
========== [(0, 1), (1, 3), (3, 4)] start and end [0, 4] function output [0, 1, 3, 4] ========== [(1, 3), (3, 4), (4, 5)] start and end [1, 5] function output [1, 3, 4, 5] ========== [(1, 3), (3, 4), (9, 4)] start and end [1, 9] function output [1, 3, 4, 9] ========== [(1, 2), (1, 3), (3, 4)] start and end [2, 4] function output [2, 1, 3, 4] ========== [(3, 4), (4, 5), (5, 6)] start and end [3, 6] function output [3, 4, 5, 6] ========== [(3, 4), (9, 4), (8, 9)] start and end [3, 8] function output [3, 4, 9, 8] ========== [(4, 5), (5, 6), (6, 7)] start and end [4, 7] function output [4, 5, 6, 7] ========== [(9, 4), (8, 9), (7, 8)] start and end [4, 7] function output [4, 9, 8, 7] ========== [(4, 5), (9, 4), (8, 9)] start and end [5, 8] function output [5, 4, 9, 8] ========== [(5, 6), (6, 7), (7, 8)] start and end [5, 8] function output [5, 6, 7, 8] ========== [(5, 6), (6, 7), (7, 10)] start and end [5, 10] function output [5, 6, 7, 10] ========== [(5, 6), (4, 5), (9, 4)] start and end [6, 9] function output [6, 5, 4, 9] ========== [(6, 7), (7, 8), (8, 9)] start and end [6, 9] function output [6, 7, 8, 9] ========== [(6, 7), (7, 10), (10, 11)] start and end [6, 11] function output [6, 7, 10, 11] ========== [(6, 7), (7, 10), (10, 12)] start and end [6, 12] function output [6, 7, 10, 12] ========== [(7, 10), (10, 12), (12, 13)] start and end [7, 13] function output [7, 10, 12, 13] ========== [(7, 10), (10, 12), (12, 14)] start and end [7, 14] function output [7, 10, 12, 14] ========== [(7, 8), (7, 10), (10, 11)] start and end [8, 11] function output [8, 7, 10, 11] ========== [(7, 8), (7, 10), (10, 12)] start and end [8, 12] function output [8, 7, 10, 12] ========== [(8, 9), (7, 8), (7, 10)] start and end [9, 10] function output [9, 8, 7, 10] ========== [(10, 11), (10, 12), (12, 13)] start and end [11, 13] function output [11, 10, 12, 13] ========== [(10, 11), (10, 12), (12, 14)] start and end [11, 14] function output [11, 10, 12, 14]
def generate_linear_fragments(mol, n_len=3):
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)
smiles = []
lst_indices = []
for p in lst_paths:
lst=[]
for bond_ix in p:
bond = mol.GetBondWithIdx(bond_ix)
lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
gg = bond_indices_to_nodes(mol, list(p))
ind = list_end_points_indices(gg)
atom_indices = list(sort_atoms_in_indices(ind[0],gg))
lst_indices += [atom_indices]
smi=''
for ix in range(n_len):
smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()
bond = mol.GetBondBetweenAtoms(atom_indices[ix],atom_indices[ix+1])
if bond.GetBondType() is Chem.BondType.SINGLE:
smi += '-'# explicitly shown, compared to aromatic bond.
elif bond.GetBondType() is Chem.BondType.DOUBLE:
smi += '='
elif bond.GetBondType() is Chem.BondType.TRIPLE:
smi += '#'
elif bond.GetBondType() is Chem.BondType.QUADRUPLE:
smi += '$'
else:
smi += mol.GetAtomWithIdx(atom_indices[ix+1]).GetSymbol()
smiles += [smi]
return smiles, lst_indices
from pprint import pprint
for l in range(1, 6):
smis, atom_ixs = generate_linear_fragments(mol, l)
else:
pprint(atom_ixs)
# print('length items:', l,len(smis))
[[0, 1, 3, 4, 5, 6], [0, 1, 3, 4, 9, 8], [1, 3, 4, 5, 6, 7], [1, 3, 4, 9, 8, 7], [2, 1, 3, 4, 5, 6], [2, 1, 3, 4, 9, 8], [3, 4, 5, 6, 7, 8], [3, 4, 5, 6, 7, 10], [3, 4, 9, 8, 7, 6], [3, 4, 9, 8, 7, 10], [4, 5, 6, 7, 8, 9], [4, 5, 6, 7, 10, 11], [4, 5, 6, 7, 10, 12], [4, 9, 8, 7, 6, 5], [4, 9, 8, 7, 10, 11], [4, 9, 8, 7, 10, 12], [5, 4, 9, 8, 7, 6], [5, 4, 9, 8, 7, 10], [5, 6, 7, 10, 12, 13], [5, 6, 7, 10, 12, 14], [6, 5, 4, 9, 8, 7], [7, 6, 5, 4, 9, 8], [8, 7, 6, 5, 4, 9], [9, 4, 5, 6, 7, 10], [9, 8, 7, 10, 12, 13], [9, 8, 7, 10, 12, 14]]
smis, atom_ixs = generate_linear_fragments(mol, l)
smis[:5]
['C-C-C-CCC', 'C-C-C-CCC', 'C-C-CCCC', 'C-C-CCCC', 'C-C-C-CCC']
n_len=3 # 3-lengths paths
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)
for p in lst_paths:
print('='*10)
lst=[]
for bond_ix in p:
bond = mol.GetBondWithIdx(bond_ix)
lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
gg = bond_indices_to_nodes(mol, list(p))
ind = list_end_points_indices(gg)
atom_indices = list(sort_atoms_in_indices(ind[0],gg))
smi=''
for ix in range(n_len):
smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()
bond = mol.GetBondBetweenAtoms(atom_indices[ix],atom_indices[ix+1])
if bond.GetBondType() is Chem.BondType.DOUBLE:
smi += '='
elif bond.GetBondType() is Chem.BondType.TRIPLE:
smi += '#'
else:
print(bond.GetBondType())
else:
smi += mol.GetAtomWithIdx(atom_indices[ix+1]).GetSymbol()
print(smi)
========== SINGLE SINGLE SINGLE CCCC ========== SINGLE SINGLE AROMATIC CCCC ========== SINGLE SINGLE AROMATIC CCCC ========== SINGLE SINGLE SINGLE CCCC ========== SINGLE AROMATIC AROMATIC CCCC ========== SINGLE AROMATIC AROMATIC CCCC ========== AROMATIC AROMATIC AROMATIC CCCC ========== AROMATIC AROMATIC AROMATIC CCCC ========== AROMATIC AROMATIC AROMATIC CCCC ========== AROMATIC AROMATIC AROMATIC CCCC ========== AROMATIC AROMATIC SINGLE CCCC ========== AROMATIC AROMATIC AROMATIC CCCC ========== AROMATIC AROMATIC AROMATIC CCCC ========== AROMATIC SINGLE SINGLE CCCC ========== AROMATIC SINGLE SINGLE CCCC ========== SINGLE SINGLE CCC=O ========== SINGLE SINGLE SINGLE CCCO ========== AROMATIC SINGLE SINGLE CCCC ========== AROMATIC SINGLE SINGLE CCCC ========== AROMATIC AROMATIC SINGLE CCCC ========== SINGLE SINGLE CCC=O ========== SINGLE SINGLE SINGLE CCCO
The code below using FindAllPathsOfLengthN
is also not the best, because atom indices in tup_atom_inter_ix
are randomly aligned.
n_len=4
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,
useBonds=True,
useHs=True)
for pathset in lst_paths:
tup_atom_inter_ix = {mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx()
for bond_ix in pathset[1:-1]} | {
mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()
for bond_ix in pathset[1:-1]}
bond_ix = pathset[0]
atom_ix_start={mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),
mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()}-tup_atom_inter_ix
bond_ix = pathset[-1]
atom_ix_end={mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),
mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()}-tup_atom_inter_ix
print(list(atom_ix_start)[0],list(atom_ix_end)[0])
print(list(atom_ix_start)[0],*list(tup_atom_inter_ix),list(atom_ix_end)[0])
print(*rdmolops.GetShortestPath(mol,
list(atom_ix_start)[0],list(atom_ix_end)[0]))
0 5 0 1 3 4 5 0 1 3 4 5 0 9 0 1 3 4 9 0 1 3 4 9 1 6 1 3 4 5 6 1 3 4 5 6 1 8 1 9 3 4 8 1 3 4 9 8 2 5 2 1 3 4 5 2 1 3 4 5 2 9 2 1 3 4 9 2 1 3 4 9 3 7 3 4 5 6 7 3 4 5 6 7 3 7 3 8 9 4 7 3 4 5 6 7 4 8 4 5 6 7 8 4 9 8 4 10 4 5 6 7 10 4 5 6 7 10 4 6 4 8 9 7 6 4 5 6 4 10 4 8 9 7 10 4 5 6 7 10 5 7 5 8 9 4 7 5 6 7 5 9 5 8 6 7 9 5 4 9 5 11 5 10 6 7 11 5 6 7 10 11 5 12 5 10 6 7 12 5 6 7 10 12 6 8 6 9 4 5 8 6 7 8 6 13 6 10 12 7 13 6 7 10 12 13 6 14 6 10 12 7 14 6 7 10 12 14 7 9 7 4 5 6 9 7 8 9 8 13 8 10 12 7 13 8 7 10 12 13 8 14 8 10 12 7 14 8 7 10 12 14 9 11 9 8 10 7 11 9 8 7 10 11 9 12 9 8 10 7 12 9 8 7 10 12