In [14]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

# Example compounds, given as SMILES and parsed into RDKit Mol objects.
m1 = Chem.MolFromSmiles('O=C1CN(N=Cc2ccc([N+](=O)[O-])o2)C(=O)N1')
m2 = Chem.MolFromSmiles('CCCC1COC(Cn2cncn2)(c2ccc(Cl)cc2Cl)O1')
m3 = Chem.MolFromSmiles('CCCCCC=O')
# Close analogue of m2 (not m1): identical SMILES except that one ring Cl is replaced by O.
m4 = Chem.MolFromSmiles('CCCC1COC(Cn2cncn2)(c2ccc(Cl)cc2O)O1')

# Order matters: the position in this list becomes the molecule index used later.
mols = [m1, m2, m3, m4]
# MolFromSmiles returns None for a SMILES it cannot parse; fail here rather than
# with a less obvious error when fingerprints are computed.
assert all(mol is not None for mol in mols), "at least one SMILES string failed to parse"
In [15]:
# Morgan fingerprints (radius 2, atom-identity invariants) folded into
# 1024-bit vectors, one per molecule and in the same order as `mols`.
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024, useFeatures=False) for m in mols]
# Map list position -> fingerprint; this is the mapping passed to diversity().
indx_fps = dict(enumerate(fps))
In [16]:
def diversity(mols_fp, threshold):
    """Split molecules into "diverse" and "similar" by nearest-neighbour similarity.

    For each entry, the Tanimoto similarity to every *other* entry is computed.
    The entry is "similar" when its highest such similarity is >= threshold,
    otherwise it is "diverse".

    Args:
        mols_fp: mapping of molecule key -> fingerprint, where the fingerprints
            are accepted by DataStructs.FingerprintSimilarity.
        threshold: similarity cut-off; a maximum similarity exactly equal to
            the threshold counts as similar.

    Returns:
        Tuple (diverse, similar) of lists of keys, each in the iteration
        order of mols_fp. An entry with no other entries to compare against
        (single-element input) is reported as diverse.
    """
    diverse = []
    similar = []

    for key, fp in mols_fp.items():
        # Compare against every other entry. Entries are told apart by key, so
        # two different molecules with identical fingerprints are still compared.
        neighbour_sims = [
            DataStructs.FingerprintSimilarity(fp, other_fp, metric=DataStructs.TanimotoSimilarity)
            for other_key, other_fp in mols_fp.items()
            if other_key != key
        ]
        # With no neighbours there is nothing to be similar to; guard the empty
        # list so max() does not raise ValueError.
        if neighbour_sims and max(neighbour_sims) >= threshold:
            similar.append(key)
        else:
            diverse.append(key)

    return (diverse, similar)
                
In [17]:
# A molecule counts as "similar" when its nearest neighbour has Tanimoto >= 0.8.
diverse, similar = diversity(indx_fps, 0.8)
# Print the pair as a single tuple so the output is identical under Python 2
# (print statement) and Python 3 (print function).
print((diverse, similar))
([0, 2], [1, 3])
In [ ]: