# update PyTables, you'll need to restart the environment in colab after the install!!!
# !pip install tables==3.5.1

# download ChEMBL25 FPSim2 FP db
# !wget "http://ftp.ebi.ac.uk/pub/databases/chembl/fpsim2/chembl_25.h5"

import cupy as cp
import tables as tb
import numpy as np

# using same FPsim2 ChEMBL FP database :)
fp_filename = "chembl_25.h5"
with tb.open_file(fp_filename, mode="r") as fp_file:
    fps = fp_file.root.fps[:]
    num_fields = len(fps[0])
    fps = fps.view("u8")
    fps = fps.reshape(int(fps.size / num_fields), num_fields)
    # we'll use popcnt_ranges for the bounds optimisaiton, it stores 
    # the ranges for each popcount in the database
    popcnt_ranges = fp_file.root.config[3]

# aspirin, ChEMBL molregno 1280
query_molregno = 1280

taniEW = cp.ElementwiseKernel(
    in_params="raw T db, raw U query, uint64 in_width, float32 threshold",
    out_params="raw V out",
    operation=r"""
    int comm_sum = 0;
    for(int j = 1; j < in_width - 1; ++j){
        int pos = i * in_width + j;
        comm_sum += __popcll(db[pos] & query[j]);
    }
    float coeff = 0.0;
    coeff = query[in_width - 1] + db[i * in_width + in_width - 1] - comm_sum;
    if (coeff != 0.0)
        coeff = comm_sum / coeff;
    out[i] = coeff >= threshold ? coeff : 0.0;
    """,
    name='taniEW',
    options=('-std=c++14',),
    reduce_dims=False
)

# get the query molecule from the FP database
query = cp.asarray(fps[(fps[:,0] == query_molregno)][0])
# copy the database to GPU
database = cp.asarray(fps)

def cupy_elementwise_search(db, query, threshold):
    # init the results variable  
    sim = cp.zeros(database.shape[0], dtype="f4")
    
    # set the threshold variable and run the search
    threshold = cp.asarray(threshold, dtype="f4")
    taniEW(db, query, db.shape[1], threshold, sim, size=db.shape[0])

    mask = sim.nonzero()[0]
    np_sim = cp.asnumpy(sim[mask])
    np_ids = cp.asnumpy(db[:,0][mask])
    
    dtype = np.dtype([("mol_id", "u4"), ("coeff", "f4")])
    results = np.empty(len(np_ids), dtype=dtype)
    results["mol_id"] = np_ids
    results["coeff"] = np_sim
    results[::-1].sort(order='coeff')
    return results

results = cupy_elementwise_search(database, query, 0.7)
results

%%timeit
results = cupy_elementwise_search(database, query, 0.7)


def get_tanimoto_bounds(qpopcnt, ranges, threshold):
    range_to_screen = []
    for count, c_range in ranges:
        max_sim = min(qpopcnt, count) / max(qpopcnt, count)
        if max_sim >= threshold:
            range_to_screen.append(c_range)
    if range_to_screen:
        range_to_screen = (range_to_screen[0][0], 
                           range_to_screen[len(range_to_screen) - 1][1])
    return range_to_screen

def cupy_elementwise_search_bounds(db, query, popcnt_ranges, threshold):
  
    # get the range of molecules to screen
    rk = get_tanimoto_bounds(int(query[-1]), popcnt_ranges, threshold)
    
    # set the threshold variable
    threshold = cp.asarray(threshold, dtype="f4")

    # get the subset of molecule ids
    ids = db[:,0][slice(*rk)]
    subset_size = int(rk[1]-rk[0])

    # init the results variable
    sim = cp.zeros(subset_size, dtype=cp.float32)

    # run the search. It will compile the kernel only the first time it runs
    taniEW(db[slice(*rk)], query, db.shape[1], threshold, sim, size=subset_size)

    # get all non 0 values and ids
    mask = sim.nonzero()[0]
    np_sim = cp.asnumpy(sim[mask])
    np_ids = cp.asnumpy(ids[mask])

    # create results numpy array
    dtype = np.dtype([("mol_id", "u4"), ("coeff", "f4")])
    results = np.empty(len(np_ids), dtype=dtype)
    results["mol_id"] = np_ids
    results["coeff"] = np_sim
    results[::-1].sort(order='coeff')
    return results

results = cupy_elementwise_search_bounds(database, query, popcnt_ranges, 0.7)
results

%%timeit
results = cupy_elementwise_search_bounds(database, query, popcnt_ranges, 0.7)


# load database as cupy arrays
database = cp.asarray(fps[:,1:-1])
ids = cp.asarray(fps[:,0])
popcnts = cp.asarray(fps[:,-1])

# quering with aspirin (molregno 1280) as usual :P
query = fps[(fps[:,0] == query_molregno)]


taniRAW = cp.RawKernel(
    r"""
    extern "C" __global__
    void taniRAW(const unsigned long long int* query, 
                 const unsigned long long int* qcount, 
                 const unsigned long long int* db,
                 const unsigned long long int* popcnts,
                 float* threshold,
                 float* out) {{

        // Shared block array. Only visible for threads in same block
        __shared__ int common[{block}];

        int tid = blockDim.x * blockIdx.x + threadIdx.x;
        common[threadIdx.x] = __popcll(query[threadIdx.x] & db[tid]);

        // threads need to wait until all threads finish
        __syncthreads();

        // thread 0 in each block sums the common bits 
        // and calcs the final coeff
        if(0 == threadIdx.x) 
        {{
            int comm_sum = 0;
            for(int i=0; i<{block}; i++)
                comm_sum += common[i];

            float coeff = 0.0;
            coeff = *qcount + popcnts[blockIdx.x] - comm_sum;
            if (coeff != 0.0)
                coeff = comm_sum / coeff;
            out[blockIdx.x] = coeff >= *threshold ? coeff : 0.0;
        }}
    }}
    """.format(block=database.shape[1]),
    name="taniRAW",
    options=('-std=c++14',),
)

def cupy_sim_search_bounds(db, db_popcnts, db_ids, query, popcnt_ranges, threshold):
  
    c_query = cp.asarray(query[:,1:-1])
    qpopcnt = cp.asarray(query[:,-1])

    # get the range of the molecule subset to screen
    rk = get_tanimoto_bounds(int(query[:,-1]), popcnt_ranges, threshold)
    
    threshold = cp.asarray(threshold, dtype="f4")

    # get the subset of molecule ids
    subset_size = int(rk[1]-rk[0])
    ids2 = db_ids[slice(*rk)]

    # init results array
    sim = cp.zeros(subset_size, dtype=cp.float32)

    # run the search, it compiles the kernel only the first time it runs
    # grid, block and arguments
    taniRAW((subset_size,), 
            (db.shape[1],), 
            (c_query, qpopcnt, db[slice(*rk)], db_popcnts[slice(*rk)], threshold, sim))

    # get all non 0 values and ids
    mask = sim.nonzero()[0]
    np_sim = cp.asnumpy(sim[mask])
    np_ids = cp.asnumpy(ids2[mask])

    # create results numpy array
    dtype = np.dtype([("mol_id", "u4"), ("coeff", "f4")])
    results = np.empty(len(np_ids), dtype=dtype)
    results["mol_id"] = np_ids
    results["coeff"] = np_sim
    results[::-1].sort(order='coeff')
    return results

results = cupy_sim_search_bounds(database, popcnts, ids, query, popcnt_ranges, 0.7)
results

%%timeit
results = cupy_sim_search_bounds(database, popcnts, ids, query, popcnt_ranges, 0.7)