from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

# Example molecules, parsed from SMILES strings.
m1 = Chem.MolFromSmiles('O=C1CN(N=Cc2ccc([N+](=O)[O-])o2)C(=O)N1')
m2 = Chem.MolFromSmiles('CCCC1COC(Cn2cncn2)(c2ccc(Cl)cc2Cl)O1')
m3 = Chem.MolFromSmiles('CCCCCC=O')
# m4 has the same SMILES as m2 except for one ring substituent (Cl -> O).
# NOTE(review): the original comment here read "similar to m1"; presumably it
# meant the molecule at index 1 of `mols` (i.e. m2) -- confirm.
m4 = Chem.MolFromSmiles('CCCC1COC(Cn2cncn2)(c2ccc(Cl)cc2O)O1')
mols = [m1, m2, m3, m4]

# Morgan fingerprints (radius 2, 1024 bits, atom-identity invariants).
# NOTE(review): newer RDKit releases deprecate GetMorganFingerprintAsBitVect
# in favour of rdFingerprintGenerator; kept as-is for compatibility.
fps = [
    AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024, useFeatures=False)
    for m in mols
]

# Key every fingerprint by its position in `mols`.
indx_fps = dict(enumerate(fps))


def diversity(mols_fp, threshold):
    """Partition fingerprint keys into diverse and similar groups.

    For every entry, the Tanimoto similarity to each *other* entry is
    computed; the entry is "similar" when its highest similarity reaches
    ``threshold`` and "diverse" otherwise.

    Args:
        mols_fp: Mapping of key -> fingerprint accepted by
            ``DataStructs.FingerprintSimilarity``.
        threshold: Similarity cut-off; a maximum similarity ``>= threshold``
            marks the entry as similar.

    Returns:
        A ``(diverse, similar)`` tuple of key lists, each in the iteration
        order of ``mols_fp``.
    """
    diverse = []
    similar = []
    for key, fp in mols_fp.items():
        # Similarity of this fingerprint to every other one (self excluded).
        sims = [
            DataStructs.FingerprintSimilarity(
                fp, other_fp, metric=DataStructs.TanimotoSimilarity
            )
            for other_key, other_fp in mols_fp.items()
            if other_key != key
        ]
        # `sims` is empty when there is nothing to compare against (a mapping
        # with a single entry); max() would raise ValueError on it, so such an
        # entry is classified as diverse instead.
        if sims and max(sims) >= threshold:
            similar.append(key)
        else:
            diverse.append(key)
    return (diverse, similar)


(diverse, similar) = diversity(indx_fps, 0.8)
# Explicit tuple so the output is the same under Python 2 and Python 3.
print((diverse, similar))