In [27]:
import itertools
import gzip

import pandas
import rdkit.Chem
import rdkit.Chem.AllChem
import rdkit.DataStructs
In [ ]:
 
In [4]:
# Download DrugBank SDF file of structures
! wget --timestamping --directory-prefix download http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip
! unzip -d download download/all.sdf.zip
! rm download/all.sdf.zip
--2015-05-18 14:03:23--  http://www.drugbank.ca/system/downloads/current/structures/all.sdf.zip
Resolving www.drugbank.ca (www.drugbank.ca)... 162.243.242.197
Connecting to www.drugbank.ca (www.drugbank.ca)|162.243.242.197|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4362948 (4.2M) [application/zip]
Server file no newer than local file ‘download/all.sdf.zip’ -- not retrieving.

Archive:  download/all.sdf.zip
  inflating: download/all.sdf        
In [9]:
# Load every structure from the DrugBank SDF file.
# Entries that the supplier yields as None are left out of the list.
supplier = rdkit.Chem.SDMolSupplier('download/all.sdf')
molecules = []
for mol in supplier:
    if mol is None:
        continue
    molecules.append(mol)
len(molecules)
Out[9]:
6743
In [21]:
# Map each molecule's DATABASE_ID property to its Morgan fingerprint
# (second argument to GetMorganFingerprint is 2).
fingerprints = {
    mol.GetProp('DATABASE_ID'): rdkit.Chem.AllChem.GetMorganFingerprint(mol, 2)
    for mol in molecules
}
In [29]:
# Dice similarity for every unordered pair of compounds, rounded to 4 decimals.
# Each row is [first id, second id, similarity].
compound_pairs = itertools.combinations(fingerprints.items(), 2)
similarity_rows = [
    [id_a, id_b, round(rdkit.DataStructs.DiceSimilarity(fp_a, fp_b), 4)]
    for (id_a, fp_a), (id_b, fp_b) in compound_pairs
]
In [30]:
# Create a DataFrame of pairwise similarities and save it as a gzipped TSV.
similarity_df = pandas.DataFrame(similarity_rows, columns=['compound0', 'compound1', 'similarity'])
# Hand pandas the path and let it perform the gzip compression itself:
# gzip.open(..., 'w') returns a binary handle, and writing text through it
# fails on Python 3 with pandas releases that lack binary-handle support.
similarity_df.to_csv('data/similarity.tsv.gz', sep='\t', index=False, compression='gzip')
similarity_df.head()
Out[30]:
compound0 compound1 similarity
0 DB05107 DB08426 0.0966
1 DB05107 DB05105 0.0441
2 DB05107 DB05104 0.1457
3 DB05107 DB08423 0.1268
4 DB05107 DB05102 0.1000

5 rows × 3 columns

In [44]:
# Save a similarity tsv with only compounds in our slim drugbank set
drugbank_slim_df = pandas.read_table('data/drugbank-slim.tsv')
slim_ids = set(drugbank_slim_df.drugbank_id)
# Keep a pair only when both of its compounds are in the slim set.
similarity_slim_df = similarity_df[similarity_df.compound0.isin(slim_ids) & similarity_df.compound1.isin(slim_ids)]
# Let pandas do the gzip compression from the path: gzip.open(..., 'w') gives
# a binary handle, and writing text through it fails on Python 3 with pandas
# releases that lack binary-handle support.
similarity_slim_df.to_csv('data/similarity-slim.tsv.gz', sep='\t', index=False, compression='gzip')
# Fraction of all compound pairs that remain in the slim subset.
float(len(similarity_slim_df)) / len(similarity_df)
Out[44]:
0.05267679727458775
In [ ]:
 
In [39]:
# histogram of similarities
import matplotlib.pyplot as plt
%matplotlib inline

plt.hist(similarity_df.similarity, 100);
In [49]:
# histogram of similarities in slim subset
# The Series is passed directly (no list() copy) and the axes are labelled so
# the plot can be read on its own.
fig, ax = plt.subplots()
ax.hist(similarity_slim_df.similarity, bins=100)
ax.set_xlabel('Dice similarity')
ax.set_ylabel('Number of compound pairs')
ax.set_title('Pairwise compound similarities (slim subset)');
In [ ]: