Notebook

Extract of Linear Fragments¶

Functions defined in this notebook is:

bond_indices_to_nodes(mol, bond_indices)
- input: mol Chem.Mol
- input: bond_indices possible pairs of bond indices, that form linear fragments
- generate pairs of bond indices that form linear fragments
list_end_points_indices(G)
- input: G networkx.Graph
sort_atoms_in_indices(ix_start,g)
- input: ix_start index of the endpoint
- input: g networkx.Graph
generate_linear_fragments(mol, n_len=3)
- input: mol Chem.Mol
- input: n_len number of length of the fragment
- generate all possible linear fragments in SMILES-like format with explicit single bond

In [1]:

from rdkit.Chem import rdmolops
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw import DrawingOptions

In [2]:

smi='CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'
mol=Chem.MolFromSmiles(smi)

In [3]:

def show_mol(m):
    DrawingOptions.includeAtomNumbers=True
    view= Chem.Draw.MolToImage(m)
    DrawingOptions.includeAtomNumbers=False
    return view
show_mol(mol)

Warning: unable to load font metrics from dir c:\users\shojiro_shibayama\miniconda3\envs\cheminfo\lib\site-packages\rdkit\sping\PIL\pilfonts

Out[3]:

Irrelavant method¶

rdmolops.FindAllSubgraphsOfLengthN is not useful this time, because we want to extract linear fragment, not branched fragments.

In [4]:

import networkx as nx


n_len=3 # 3-lengths paths
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)

G = nx.Graph()
lst=[]
for bond_ix in lst_paths[16]:# 1, 16
    bond=mol.GetBondWithIdx(bond_ix)
    lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
G.add_edges_from(lst)
print(G.adj)
print(G.adjacency())

{7: {10: {}}, 10: {7: {}, 12: {}}, 12: {10: {}, 14: {}}, 14: {12: {}}}
<dict_itemiterator object at 0x000001BA6C083C28>

In [5]:

def bond_indices_to_nodes(m, bond_indices):
    G = nx.Graph()
    lst=[]
    for bond_ix in bond_indices:
        bond= m.GetBondWithIdx(bond_ix)
        lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
    G.add_edges_from(lst)
    return G
def list_end_points_indices(g):
    return [k for k,v in g.adjacency()if len(v)==1]
def sort_atoms_in_indices(ix_start,g):
    indices_sorted = []
    atom_indices = set(g.nodes)
    atom_indices -= {ix_start}
    next_ix = ix_start
    indices_sorted += [next_ix]
    while any(k in atom_indices for k in list(g.adj[next_ix].keys())):
        if list(g.adj[next_ix].keys())[0] in atom_indices:
            next_ix = list(g.adj[next_ix].keys())[0]
        elif list(g.adj[next_ix].keys())[1] in atom_indices:
            next_ix = list(g.adj[next_ix].keys())[1]
        indices_sorted += [next_ix]
        atom_indices -= {next_ix}
    return indices_sorted

gg = bond_indices_to_nodes(mol,list(lst_paths[-1]))
ind = list_end_points_indices(gg)
print(ind)
print(sort_atoms_in_indices(ind[0], gg),sort_atoms_in_indices(ind[1], gg))

[11, 14]
[11, 10, 12, 14] [14, 12, 10, 11]

In [6]:

for p in lst_paths:
    print('='*10)
    lst=[]
    for bond_ix in p:
        bond = mol.GetBondWithIdx(bond_ix)
        lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
    print(lst)
    gg = bond_indices_to_nodes(mol, list(p))
    ind = list_end_points_indices(gg)
    print('start and end', ind)
    print('function output',sort_atoms_in_indices(ind[0],gg))

==========
[(0, 1), (1, 3), (3, 4)]
start and end [0, 4]
function output [0, 1, 3, 4]
==========
[(1, 3), (3, 4), (4, 5)]
start and end [1, 5]
function output [1, 3, 4, 5]
==========
[(1, 3), (3, 4), (9, 4)]
start and end [1, 9]
function output [1, 3, 4, 9]
==========
[(1, 2), (1, 3), (3, 4)]
start and end [2, 4]
function output [2, 1, 3, 4]
==========
[(3, 4), (4, 5), (5, 6)]
start and end [3, 6]
function output [3, 4, 5, 6]
==========
[(3, 4), (9, 4), (8, 9)]
start and end [3, 8]
function output [3, 4, 9, 8]
==========
[(4, 5), (5, 6), (6, 7)]
start and end [4, 7]
function output [4, 5, 6, 7]
==========
[(9, 4), (8, 9), (7, 8)]
start and end [4, 7]
function output [4, 9, 8, 7]
==========
[(4, 5), (9, 4), (8, 9)]
start and end [5, 8]
function output [5, 4, 9, 8]
==========
[(5, 6), (6, 7), (7, 8)]
start and end [5, 8]
function output [5, 6, 7, 8]
==========
[(5, 6), (6, 7), (7, 10)]
start and end [5, 10]
function output [5, 6, 7, 10]
==========
[(5, 6), (4, 5), (9, 4)]
start and end [6, 9]
function output [6, 5, 4, 9]
==========
[(6, 7), (7, 8), (8, 9)]
start and end [6, 9]
function output [6, 7, 8, 9]
==========
[(6, 7), (7, 10), (10, 11)]
start and end [6, 11]
function output [6, 7, 10, 11]
==========
[(6, 7), (7, 10), (10, 12)]
start and end [6, 12]
function output [6, 7, 10, 12]
==========
[(7, 10), (10, 12), (12, 13)]
start and end [7, 13]
function output [7, 10, 12, 13]
==========
[(7, 10), (10, 12), (12, 14)]
start and end [7, 14]
function output [7, 10, 12, 14]
==========
[(7, 8), (7, 10), (10, 11)]
start and end [8, 11]
function output [8, 7, 10, 11]
==========
[(7, 8), (7, 10), (10, 12)]
start and end [8, 12]
function output [8, 7, 10, 12]
==========
[(8, 9), (7, 8), (7, 10)]
start and end [9, 10]
function output [9, 8, 7, 10]
==========
[(10, 11), (10, 12), (12, 13)]
start and end [11, 13]
function output [11, 10, 12, 13]
==========
[(10, 11), (10, 12), (12, 14)]
start and end [11, 14]
function output [11, 10, 12, 14]

In [7]:

def generate_linear_fragments(mol, n_len=3):
    lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)
    smiles = []
    lst_indices = []
    for p in lst_paths:
        lst=[]
        for bond_ix in p:
            bond = mol.GetBondWithIdx(bond_ix)
            lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
        gg = bond_indices_to_nodes(mol, list(p))
        ind = list_end_points_indices(gg)
        atom_indices = list(sort_atoms_in_indices(ind[0],gg))
        lst_indices += [atom_indices]
        smi=''
        for ix in range(n_len):
            smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()
            bond = mol.GetBondBetweenAtoms(atom_indices[ix],atom_indices[ix+1])        
            if bond.GetBondType() is Chem.BondType.SINGLE:
                smi += '-'# explicitly shown, compared to aromatic bond.
            elif bond.GetBondType() is Chem.BondType.DOUBLE:
                smi += '='
            elif bond.GetBondType() is Chem.BondType.TRIPLE:
                smi += '#'
            elif bond.GetBondType() is Chem.BondType.QUADRUPLE:
                smi += '$'
        else:
            smi += mol.GetAtomWithIdx(atom_indices[ix+1]).GetSymbol()
        smiles += [smi]
    return smiles, lst_indices

In [8]:

from pprint import pprint
for l in range(1, 6):
    smis, atom_ixs = generate_linear_fragments(mol, l)
else:
    pprint(atom_ixs)
#     print('length items:', l,len(smis))

[[0, 1, 3, 4, 5, 6],
 [0, 1, 3, 4, 9, 8],
 [1, 3, 4, 5, 6, 7],
 [1, 3, 4, 9, 8, 7],
 [2, 1, 3, 4, 5, 6],
 [2, 1, 3, 4, 9, 8],
 [3, 4, 5, 6, 7, 8],
 [3, 4, 5, 6, 7, 10],
 [3, 4, 9, 8, 7, 6],
 [3, 4, 9, 8, 7, 10],
 [4, 5, 6, 7, 8, 9],
 [4, 5, 6, 7, 10, 11],
 [4, 5, 6, 7, 10, 12],
 [4, 9, 8, 7, 6, 5],
 [4, 9, 8, 7, 10, 11],
 [4, 9, 8, 7, 10, 12],
 [5, 4, 9, 8, 7, 6],
 [5, 4, 9, 8, 7, 10],
 [5, 6, 7, 10, 12, 13],
 [5, 6, 7, 10, 12, 14],
 [6, 5, 4, 9, 8, 7],
 [7, 6, 5, 4, 9, 8],
 [8, 7, 6, 5, 4, 9],
 [9, 4, 5, 6, 7, 10],
 [9, 8, 7, 10, 12, 13],
 [9, 8, 7, 10, 12, 14]]

In [9]:

smis, atom_ixs = generate_linear_fragments(mol, l)
smis[:5]

Out[9]:

['C-C-C-CCC', 'C-C-C-CCC', 'C-C-CCCC', 'C-C-CCCC', 'C-C-C-CCC']

In [10]:

n_len=3 # 3-lengths paths
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,useBonds=True,useHs=True)
for p in lst_paths:
    print('='*10)
    lst=[]
    for bond_ix in p:
        bond = mol.GetBondWithIdx(bond_ix)
        lst+=[(bond.GetBeginAtomIdx(),bond.GetEndAtomIdx())]
    gg = bond_indices_to_nodes(mol, list(p))
    ind = list_end_points_indices(gg)
    atom_indices = list(sort_atoms_in_indices(ind[0],gg))
    
    smi=''
    for ix in range(n_len):
        smi += mol.GetAtomWithIdx(atom_indices[ix]).GetSymbol()
        bond = mol.GetBondBetweenAtoms(atom_indices[ix],atom_indices[ix+1])        
        if bond.GetBondType() is Chem.BondType.DOUBLE:
            smi += '='
        elif bond.GetBondType() is Chem.BondType.TRIPLE:
            smi += '#'
        else:
            print(bond.GetBondType())
    else:
        smi += mol.GetAtomWithIdx(atom_indices[ix+1]).GetSymbol()
    print(smi)

==========
SINGLE
SINGLE
SINGLE
CCCC
==========
SINGLE
SINGLE
AROMATIC
CCCC
==========
SINGLE
SINGLE
AROMATIC
CCCC
==========
SINGLE
SINGLE
SINGLE
CCCC
==========
SINGLE
AROMATIC
AROMATIC
CCCC
==========
SINGLE
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
AROMATIC
SINGLE
CCCC
==========
AROMATIC
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
AROMATIC
AROMATIC
CCCC
==========
AROMATIC
SINGLE
SINGLE
CCCC
==========
AROMATIC
SINGLE
SINGLE
CCCC
==========
SINGLE
SINGLE
CCC=O
==========
SINGLE
SINGLE
SINGLE
CCCO
==========
AROMATIC
SINGLE
SINGLE
CCCC
==========
AROMATIC
SINGLE
SINGLE
CCCC
==========
AROMATIC
AROMATIC
SINGLE
CCCC
==========
SINGLE
SINGLE
CCC=O
==========
SINGLE
SINGLE
SINGLE
CCCO

Proof that GetShortestPath is not the best in this case¶

The code below using FindAllPathsOfLengthN is also not the best, because atom indices in tup_atom_inter_ix are randomly aligned.

In [11]:

n_len=4
lst_paths = rdmolops.FindAllPathsOfLengthN(mol,n_len,
                                           useBonds=True,
                                           useHs=True)
for pathset in lst_paths:
    tup_atom_inter_ix = {mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx()
    for bond_ix in pathset[1:-1]} | {
        mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()
    for bond_ix in pathset[1:-1]}
    bond_ix = pathset[0]
    atom_ix_start={mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),
     mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()}-tup_atom_inter_ix

    bond_ix = pathset[-1]
    atom_ix_end={mol.GetBondWithIdx(bond_ix).GetBeginAtomIdx(),
     mol.GetBondWithIdx(bond_ix).GetEndAtomIdx()}-tup_atom_inter_ix
    print(list(atom_ix_start)[0],list(atom_ix_end)[0])
    print(list(atom_ix_start)[0],*list(tup_atom_inter_ix),list(atom_ix_end)[0])
    print(*rdmolops.GetShortestPath(mol,
                             list(atom_ix_start)[0],list(atom_ix_end)[0]))

0 5
0 1 3 4 5
0 1 3 4 5
0 9
0 1 3 4 9
0 1 3 4 9
1 6
1 3 4 5 6
1 3 4 5 6
1 8
1 9 3 4 8
1 3 4 9 8
2 5
2 1 3 4 5
2 1 3 4 5
2 9
2 1 3 4 9
2 1 3 4 9
3 7
3 4 5 6 7
3 4 5 6 7
3 7
3 8 9 4 7
3 4 5 6 7
4 8
4 5 6 7 8
4 9 8
4 10
4 5 6 7 10
4 5 6 7 10
4 6
4 8 9 7 6
4 5 6
4 10
4 8 9 7 10
4 5 6 7 10
5 7
5 8 9 4 7
5 6 7
5 9
5 8 6 7 9
5 4 9
5 11
5 10 6 7 11
5 6 7 10 11
5 12
5 10 6 7 12
5 6 7 10 12
6 8
6 9 4 5 8
6 7 8
6 13
6 10 12 7 13
6 7 10 12 13
6 14
6 10 12 7 14
6 7 10 12 14
7 9
7 4 5 6 9
7 8 9
8 13
8 10 12 7 13
8 7 10 12 13
8 14
8 10 12 7 14
8 7 10 12 14
9 11
9 8 10 7 11
9 8 7 10 11
9 12
9 8 10 7 12
9 8 7 10 12