If you are using NumPy >= 1.16, PyTables > 3.4.4 will be required. Install PyTables 3.5.1 if you're running this notebook in Colab. Any other dependency is already installed in Colab's default env.
Remember to restart the runtime after upgrading PyTables with pip!!!
You will also need to download ChEMBL25 FPSim2 database file.
Did you know, BTW, that we recently updated FPSim2, replacing its Cython core with C++ bound via PyBind11, with improved performance, and that it's now also compatible with Windows and Python 3.7?
Preflight config, run this cell only the first time
# update PyTables, you'll need to restart the environment in colab after the install!!!
# !pip install tables==3.5.1
# download ChEMBL25 FPSim2 FP db
# !wget "http://ftp.ebi.ac.uk/pub/databases/chembl/fpsim2/chembl_25.h5"
import cupy as cp
import tables as tb
import numpy as np
The fps variable contains fingerprints (2048-bit hashed Morgan, radius 2) for all ChEMBL25 database molecules. Each row represents a molecule and its structure is the following:
array([84419, 0,140737488355328, 17592186044416, 1024, 1099549376512, 0, 0, 0, 0, 0, 9007199254741248, 0,16777216, 0, 2305843009213693952, 0, 1073741824, 0, 0, 2199023255552, 0, 0, 0, 0, 0, 0, 32, 0, 0, 34359738372, 0, 0, 15], dtype=uint64)
The array's first element is the ChEMBL molregno and the last one is the count of ON bits in its fingerprint (popcount). The 32 values in between are the 2048 fingerprint bits grouped as 64-bit unsigned integers.
Molecules in FP db are sorted by popcount, which is needed to apply the bounds for sublinear time found in this classic paper: 10.1021/ci600358f
# reuse the same FPSim2 ChEMBL fingerprint database :)
fp_filename = "chembl_25.h5"
with tb.open_file(fp_filename, mode="r") as fp_file:
    fps = fp_file.root.fps[:]
    num_fields = len(fps[0])
    # reinterpret the structured rows as a flat uint64 matrix:
    # one row per molecule, num_fields columns
    fps = fps.view("u8").reshape(-1, num_fields)
    # popcnt_ranges drives the bounds optimisation: it stores
    # the row ranges for each popcount in the database
    popcnt_ranges = fp_file.root.config[3]
# aspirin, ChEMBL molregno 1280
query_molregno = 1280
CuPy's ElementwiseKernel will apply the same operation to each row. This makes sense for us because we would like to calculate the similarity against all molecules in the FP db file for a given query molecule.
You probably noticed that we are using the variable `i`, which is not declared in the code... this is a special variable that indicates the index within the loop.
__popcll is a GPU instruction, similar to the ones found in CPUs, which efficiently counts the number of 1's in a bit array.
# Elementwise kernel computing the Tanimoto coefficient between the query
# fingerprint and every row of the db matrix.  `i` is CuPy's implicit
# per-element index: one kernel invocation per db row, selected by size=
# at call time.  Field 0 (molregno) and the last field (popcount) of each
# row are skipped by the j-loop; the popcount field is read directly for
# the denominator.  Coefficients below `threshold` are written as 0.0.
taniEW = cp.ElementwiseKernel(
    # "raw" arrays are indexed manually instead of broadcast element-wise
    in_params="raw T db, raw U query, uint64 in_width, float32 threshold",
    out_params="raw V out",
    operation=r"""
int comm_sum = 0;
for(int j = 1; j < in_width - 1; ++j){
int pos = i * in_width + j;
comm_sum += __popcll(db[pos] & query[j]);
}
float coeff = 0.0;
coeff = query[in_width - 1] + db[i * in_width + in_width - 1] - comm_sum;
if (coeff != 0.0)
coeff = comm_sum / coeff;
out[i] = coeff >= threshold ? coeff : 0.0;
""",
    name='taniEW',
    options=('-std=c++14',),
    # keep the raw arrays' original dimensions inside the kernel
    reduce_dims=False
)
# locate the query fingerprint row by its molregno...
query_row = fps[fps[:, 0] == query_molregno][0]
# ...and move both the query and the full database to GPU memory
query = cp.asarray(query_row)
database = cp.asarray(fps)
def cupy_elementwise_search(db, query, threshold):
    """Run a Tanimoto similarity search of `query` against `db`.

    Parameters
    ----------
    db : cupy.ndarray
        2D uint64 array; column 0 is the molregno, the last column the
        popcount, the columns in between the fingerprint bits.
    query : cupy.ndarray
        A single fingerprint row with the same layout as a db row.
    threshold : float
        Minimum Tanimoto coefficient for a molecule to be reported.

    Returns
    -------
    numpy structured array with fields ("mol_id", "coeff"), sorted by
    coeff in descending order.
    """
    # results buffer sized from the *db parameter*
    # (bug fix: the original used the global `database`, which broke or
    # mis-sized searches over any other array passed as `db`)
    sim = cp.zeros(db.shape[0], dtype="f4")
    # move the threshold to the device and run the search
    threshold = cp.asarray(threshold, dtype="f4")
    taniEW(db, query, db.shape[1], threshold, sim, size=db.shape[0])
    # the kernel writes 0.0 below the threshold, so nonzero() keeps hits only
    mask = sim.nonzero()[0]
    np_sim = cp.asnumpy(sim[mask])
    np_ids = cp.asnumpy(db[:, 0][mask])
    # pack ids and coefficients into a structured array, best hits first
    dtype = np.dtype([("mol_id", "u4"), ("coeff", "f4")])
    results = np.empty(len(np_ids), dtype=dtype)
    results["mol_id"] = np_ids
    results["coeff"] = np_sim
    # in-place ascending sort of the reversed view == descending order
    results[::-1].sort(order='coeff')
    return results
results = cupy_elementwise_search(database, query, 0.7)
results
array([( 1280, 1. ), (2096455, 0.8888889 ), ( 271022, 0.85714287), ( 875057, 0.7 )], dtype=[('mol_id', '<u4'), ('coeff', '<f4')])
We got aspirin back :) let's time it
%%timeit
results = cupy_elementwise_search(database, query, 0.7)
10 loops, best of 3: 32.4 ms per loop
def get_tanimoto_bounds(qpopcnt, ranges, threshold):
    """Return the (start, end) row range of molecules worth screening.

    For each (popcount, (start, end)) entry in `ranges`, the best possible
    Tanimoto coefficient against a query with `qpopcnt` ON bits is
    min/max of the two popcounts; ranges that cannot reach `threshold`
    are discarded.  Returns an empty list when no range qualifies.
    """
    candidates = [
        c_range
        for count, c_range in ranges
        if min(qpopcnt, count) / max(qpopcnt, count) >= threshold
    ]
    if not candidates:
        return candidates
    # ranges are contiguous and popcount-sorted: merge into a single span
    return (candidates[0][0], candidates[-1][1])
def cupy_elementwise_search_bounds(db, query, popcnt_ranges, threshold):
    """Bounds-optimised Tanimoto search: screen only the popcount-feasible rows."""
    # row range whose popcounts can still reach the threshold
    bounds = get_tanimoto_bounds(int(query[-1]), popcnt_ranges, threshold)
    # device-side copy of the threshold
    threshold = cp.asarray(threshold, dtype="f4")
    # molregnos of the candidate subset
    ids = db[:, 0][slice(*bounds)]
    subset_size = int(bounds[1] - bounds[0])
    # results buffer, one slot per candidate
    sim = cp.zeros(subset_size, dtype=cp.float32)
    # run the search; the kernel is compiled only on its first use
    taniEW(db[slice(*bounds)], query, db.shape[1], threshold, sim, size=subset_size)
    # keep non-zero coefficients and their ids
    hits = sim.nonzero()[0]
    np_sim = cp.asnumpy(sim[hits])
    np_ids = cp.asnumpy(ids[hits])
    # assemble the structured results array, sorted best-first
    dtype = np.dtype([("mol_id", "u4"), ("coeff", "f4")])
    results = np.empty(len(np_ids), dtype=dtype)
    results["mol_id"] = np_ids
    results["coeff"] = np_sim
    results[::-1].sort(order='coeff')
    return results
results = cupy_elementwise_search_bounds(database, query, popcnt_ranges, 0.7)
results
array([( 1280, 1. ), (2096455, 0.8888889 ), ( 271022, 0.85714287), ( 875057, 0.7 )], dtype=[('mol_id', '<u4'), ('coeff', '<f4')])
%%timeit
results = cupy_elementwise_search_bounds(database, query, popcnt_ranges, 0.7)
100 loops, best of 3: 3.47 ms per loop
We'll be directly running a bounds optimised version
I would recommend reading at least the first two chapters of the CUDA C Programming Guide in order to better understand what's going on here.
In this example we are setting a Grid of Thread Blocks with the size of the database. Each Thread Block will have a size of 32 which is the number of uint64 fields storing the 2048 bit fingerprint.
Each Thread Block calculates, in parallel, the popcount of the intersection between each uint64 field of the query and the db molecule assigned to the block. When all threads finish, in a synchronized way, thread 0 in each block sums the popcounts and calculates the similarity.
# split the fingerprint matrix into three GPU arrays:
# bit fields, molregnos and popcounts
database = cp.asarray(fps[:, 1:-1])
ids = cp.asarray(fps[:, 0])
popcnts = cp.asarray(fps[:, -1])
# querying with aspirin (molregno 1280) as usual :P
query = fps[fps[:, 0] == query_molregno]
# Raw CUDA kernel: launched with one thread block per candidate molecule and
# one thread per uint64 fingerprint field.  Each thread popcounts the AND of
# its query/db field pair into the block-shared `common` array; after
# __syncthreads(), thread 0 reduces the partial counts and writes the
# Tanimoto coefficient (0.0 when below *threshold) for the block's molecule.
# Braces are doubled ({{ }}) because .format() injects the block size (the
# number of uint64 fields) into the source as {block}.
taniRAW = cp.RawKernel(
    r"""
extern "C" __global__
void taniRAW(const unsigned long long int* query,
const unsigned long long int* qcount,
const unsigned long long int* db,
const unsigned long long int* popcnts,
float* threshold,
float* out) {{
// Shared block array. Only visible for threads in same block
__shared__ int common[{block}];
int tid = blockDim.x * blockIdx.x + threadIdx.x;
common[threadIdx.x] = __popcll(query[threadIdx.x] & db[tid]);
// threads need to wait until all threads finish
__syncthreads();
// thread 0 in each block sums the common bits
// and calcs the final coeff
if(0 == threadIdx.x)
{{
int comm_sum = 0;
for(int i=0; i<{block}; i++)
comm_sum += common[i];
float coeff = 0.0;
coeff = *qcount + popcnts[blockIdx.x] - comm_sum;
if (coeff != 0.0)
coeff = comm_sum / coeff;
out[blockIdx.x] = coeff >= *threshold ? coeff : 0.0;
}}
}}
""".format(block=database.shape[1]),
    name="taniRAW",
    options=('-std=c++14',),
)
def cupy_sim_search_bounds(db, db_popcnts, db_ids, query, popcnt_ranges, threshold):
    """Bounds-optimised Tanimoto search using the raw CUDA kernel."""
    # move the query's bit fields and popcount to the device
    c_query = cp.asarray(query[:, 1:-1])
    qpopcnt = cp.asarray(query[:, -1])
    # candidate row range whose popcounts can still reach the threshold
    bounds = get_tanimoto_bounds(int(query[:, -1]), popcnt_ranges, threshold)
    threshold = cp.asarray(threshold, dtype="f4")
    subset_size = int(bounds[1] - bounds[0])
    # molregnos of the candidate subset
    subset_ids = db_ids[slice(*bounds)]
    # results buffer, one slot per candidate
    sim = cp.zeros(subset_size, dtype=cp.float32)
    # launch: one block per candidate molecule, one thread per uint64 field;
    # the kernel is compiled only the first time it runs
    taniRAW(
        (subset_size,),
        (db.shape[1],),
        (c_query, qpopcnt, db[slice(*bounds)], db_popcnts[slice(*bounds)], threshold, sim),
    )
    # keep non-zero coefficients and their ids
    hits = sim.nonzero()[0]
    np_sim = cp.asnumpy(sim[hits])
    np_ids = cp.asnumpy(subset_ids[hits])
    # assemble the structured results array, sorted best-first
    dtype = np.dtype([("mol_id", "u4"), ("coeff", "f4")])
    results = np.empty(len(np_ids), dtype=dtype)
    results["mol_id"] = np_ids
    results["coeff"] = np_sim
    results[::-1].sort(order='coeff')
    return results
results = cupy_sim_search_bounds(database, popcnts, ids, query, popcnt_ranges, 0.7)
results
array([( 1280, 1. ), (2096455, 0.8888889 ), ( 271022, 0.85714287), ( 875057, 0.7 )], dtype=[('mol_id', '<u4'), ('coeff', '<f4')])
%%timeit
results = cupy_sim_search_bounds(database, popcnts, ids, query, popcnt_ranges, 0.7)
100 loops, best of 3: 2.64 ms per loop
The RawKernel implementation looks faster! :)