import pandas as pd
import os
import pickle
import numpy as np
import scipy.sparse as sp
import scipy.io as spio
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import isolearn.io as isoio
import isolearn.keras as iso
import scipy.optimize as spopt
from scipy.stats import pearsonr
from analyze_random_mpra_logistic_regression_helpers import *
#Load plasmid data
plasmid_dict = isoio.load('../data/random_mpra_legacy/combined_library/processed_data_lifted/apa_plasmid_data_legacy')
df = plasmid_dict['plasmid_df']
#Filter data on target sublibrary (library_index == 20)
keep_index = np.nonzero((df['library_index'] == 20).values)[0]
df = df.iloc[keep_index].copy().reset_index(drop=True)
#Filter on minimum total read count
keep_index = np.nonzero((df['total_count'] >= 1).values)[0]
df = df.iloc[keep_index].copy().reset_index(drop=True)
print('n = ' + str(len(df)))
n = 602450
#Generate training and test set indexes
test_set_size=8000
plasmid_index = np.arange(len(df), dtype=int)
train_index = plasmid_index[:-test_set_size]
test_index = plasmid_index[train_index.shape[0]:]
print('Training set size = ' + str(train_index.shape[0]))
print('Test set size = ' + str(test_index.shape[0]))
Training set size = 594450
Test set size = 8000
df = mask_constant_sequence_regions(df)
df = align_on_cse(df)
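#mask_constant_sequence_regions and align_on_cse are imported from the helpers
#module via the * import above. Judging by the region slices used below (an
#assumption, not verified against the helper code), alignment presumably places
#the CSE hexamer at positions 50:56 of 'seq_var_aligned', with the USE upstream
#([:46]) and the DSE/FDSE downstream ([59:99] and [99:]).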
#Initialize hexamer count data generator (separated by USE, CSE and DSE regions)
hexamer_gens = {
    gen_id : iso.DataGenerator(
        idx,
        {
            'df' : df
        },
        batch_size=len(idx),
        inputs = [
            {
                'id' : 'use',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['seq_var_aligned'][:46],
                'encoder' : iso.NMerEncoder(n_mer_len=6, count_n_mers=True),
                'sparse' : True,
                'sparse_mode' : 'col'
            },
            {
                'id' : 'cse',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['seq_var_aligned'][50:56],
                'encoder' : iso.NMerEncoder(n_mer_len=6, count_n_mers=True),
                'sparse' : True,
                'sparse_mode' : 'col'
            },
            {
                'id' : 'dse',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['seq_var_aligned'][59:99],
                'encoder' : iso.NMerEncoder(n_mer_len=6, count_n_mers=True),
                'sparse' : True,
                'sparse_mode' : 'col'
            },
            {
                'id' : 'fdse',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['seq_var_aligned'][99:],
                'encoder' : iso.NMerEncoder(n_mer_len=6, count_n_mers=True),
                'sparse' : True,
                'sparse_mode' : 'col'
            },
            {
                'id' : 'lib',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['library_index'],
                'encoder' : iso.CategoricalEncoder(n_categories=36, categories=np.arange(36, dtype=int).tolist()),
                'sparsify' : True
            },
        ],
        outputs = [
            {
                'id' : 'proximal_usage',
                'source_type' : 'dataframe',
                'source' : 'df',
                'extractor' : lambda row, index: row['proximal_count'] / row['total_count'],
                'transformer' : lambda t: t
            }
        ],
        randomizers = [],
        shuffle = False,
    ) for gen_id, idx in [('train', train_index), ('test', test_index)]
}
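#For reference: iso.NMerEncoder is isolearn's encoder. The sketch below is a
#simplified illustration (not isolearn's actual implementation) of what counting
#overlapping 6-mers of a sequence window into a 4**6 = 4096-dim vector looks like.
def count_n_mers_sketch(seq, n_mer_len=6):
    base_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    counts = np.zeros(4 ** n_mer_len)
    for i in range(len(seq) - n_mer_len + 1):
        n_mer = seq[i:i + n_mer_len]
        if all(base in base_index for base in n_mer):
            #Interpret the n-mer as a base-4 number to get its column index
            col = 0
            for base in n_mer:
                col = col * 4 + base_index[base]
            counts[col] += 1.
    return counts
#e.g. count_n_mers_sketch('AATAAAAATAAA') counts 'AATAAA' twice (positions 0 and 6)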
#Generate hexamer occurrence count matrices and corresponding isoform proportions
[X_train_use, X_train_cse, X_train_dse, X_train_fdse, X_train_lib], y_train = hexamer_gens['train'][0]
y_train = y_train[0]
[X_test_use, X_test_cse, X_test_dse, X_test_fdse, X_test_lib], y_test = hexamer_gens['test'][0]
y_test = y_test[0]
#Concatenate hexamer count matrices
X_train = sp.csc_matrix(sp.hstack([X_train_lib, X_train_use, X_train_cse, X_train_dse, X_train_fdse]))
X_test = sp.csc_matrix(sp.hstack([X_test_lib, X_test_use, X_test_cse, X_test_dse, X_test_fdse]))
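#Sanity check on the design matrix layout: 36 library one-hot columns followed by
#four regions (USE, CSE, DSE, FDSE) of 4**6 = 4096 hexamer-count columns each
assert X_train.shape[1] == 36 + 4 * 4096
assert X_test.shape[1] == X_train.shape[1]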
print("Starting logistic n-mer regression...")
w_init = np.zeros(X_train.shape[1] + 1)
lambda_penalty = 0
(w_bundle, _, _) = spopt.fmin_l_bfgs_b(log_loss, w_init, fprime=log_loss_gradient, args=(X_train, y_train, lambda_penalty), maxiter = 200)
print("Regression finished.")
#Collect weights
w_0 = w_bundle[0]
w_L = w_bundle[1:1 + 36]
w = w_bundle[1 + 36:]
Starting logistic n-mer regression...
Regression finished.
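#log_loss and log_loss_gradient are imported from
#analyze_random_mpra_logistic_regression_helpers via the * import. As a hedged
#sketch of their assumed form (not the actual helper code): an L2-penalized
#logistic cross-entropy on proportions, with w_bundle packed as [intercept, weights].
def log_loss_sketch(w_bundle, X, y, lambda_penalty):
    w_0, w = w_bundle[0], w_bundle[1:]
    y_hat = 1. / (1. + np.exp(-(X.dot(w) + w_0)))
    eps = 1e-12
    ce = -np.sum(y * np.log(y_hat + eps) + (1. - y) * np.log(1. - y_hat + eps))
    return ce + lambda_penalty * np.dot(w, w)

def log_loss_gradient_sketch(w_bundle, X, y, lambda_penalty):
    w_0, w = w_bundle[0], w_bundle[1:]
    y_hat = 1. / (1. + np.exp(-(X.dot(w) + w_0)))
    resid = y_hat - y
    #Gradient w.r.t. intercept is sum(resid); w.r.t. weights it is X^T resid + 2*lambda*w
    return np.concatenate([np.array([np.sum(resid)]), X.T.dot(resid) + 2. * lambda_penalty * w])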
#Store weights
data_version = 'doubledope'
model_version = '6mer_v_pasaligned_margin'
#Intercept + n-mer weights without the library weights (kept for reference; the full w_bundle is what gets saved below)
w_bundle_no_lib = np.concatenate([np.array([w_0]), w], axis=0)
np.save('apa_regression_' + model_version + '_' + data_version + '_weights', w_bundle)
stored_nmer_weights = {
'nmer' : [t[1] for t in sorted(hexamer_gens['train'].encoders['use'].encoder.decode_map.items(), key=lambda t: t[0])],
'use' : w[: 4096].tolist(),
'cse' : w[4096: 2 * 4096].tolist(),
'dse' : w[2 * 4096: 3 * 4096].tolist(),
'fdse' : w[3 * 4096: 4 * 4096].tolist(),
}
nmer_df = pd.DataFrame(stored_nmer_weights)
nmer_df = nmer_df[['nmer', 'use', 'cse', 'dse', 'fdse']]
nmer_df.to_csv('apa_regression_' + model_version + '_' + data_version + '_weights.csv', index=False, sep='\t')
#Load weights
#data_version = 'doubledope'
#model_version = '6mer_v_pasaligned_margin'
#w_bundle = np.load('apa_regression_' + model_version + '_' + data_version + '_weights.npy')
#Collect weights
#w_0 = w_bundle[0]
#w_L = w_bundle[1:1 + 36]
#w = w_bundle[1 + 36:]
#Evaluate isoform proportion predictions on test set
y_test_pred = get_y_pred(X_test, np.concatenate([w_L, w]), w_0)
r_val, p_val = pearsonr(y_test_pred, y_test)
print("Test set R^2 = " + str(round(r_val * r_val, 2)) + ", p = " + str(p_val) + ", n = " + str(X_test.shape[0]))
#Plot test set scatter
f = plt.figure(figsize=(5, 5))
plt.scatter(y_test_pred, y_test, color='black', s=5, alpha=0.05)
plt.xticks([0.0, 0.25, 0.5, 0.75, 1.0], fontsize=14)
plt.yticks([0.0, 0.25, 0.5, 0.75, 1.0], fontsize=14)
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('Pred Proximal Usage', fontsize=14)
plt.ylabel('True Proximal Usage', fontsize=14)
plt.title(data_version + ' (R^2 = ' + str(round(r_val * r_val, 2)) + ', n = ' + str(X_test.shape[0]) + ')', fontsize=14)
plt.tight_layout()
plt.show()
Test set R^2 = 0.57, p = 0.0, n = 8000
#Evaluate isoform logodds predictions on test set
def safe_log(x, minval=0.01):
    return np.log(x.clip(min=minval))
y_test_pred = get_y_pred(X_test, np.concatenate([w_L, w]), w_0)
#Compute Log Odds values
keep_index = (y_test < 0.99999)
y_test_valid = y_test[keep_index]
y_test_pred_valid = y_test_pred[keep_index]
logodds_test = np.ravel(safe_log(y_test_valid / (1. - y_test_valid)))
logodds_test_pred = np.ravel(safe_log(y_test_pred_valid / (1. - y_test_pred_valid)))
r_val, p_val = pearsonr(logodds_test_pred, logodds_test)
print("Test set R^2 = " + str(round(r_val * r_val, 2)) + ", p = " + str(p_val) + ", n = " + str(X_test.shape[0]))
#Plot test set scatter
f = plt.figure(figsize=(5, 5))
plt.scatter(logodds_test_pred, logodds_test, s=np.pi * 2**2, alpha=0.05, color='black')
min_val = max(np.min(logodds_test_pred), np.min(logodds_test))
max_val = min(np.max(logodds_test_pred), np.max(logodds_test))
plt.plot([min_val, max_val], [min_val, max_val], alpha=0.5, color='darkblue', linewidth=3)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlabel('Pred Proximal Logodds', fontsize=14)
plt.ylabel('True Proximal Logodds', fontsize=14)
plt.axis([np.min(logodds_test_pred), np.max(logodds_test_pred), np.min(logodds_test), np.max(logodds_test)])
plt.title(data_version + ' (R^2 = ' + str(round(r_val * r_val, 2)) + ', n = ' + str(X_test.shape[0]) + ')', fontsize=14)
plt.tight_layout()
plt.show()
Test set R^2 = 0.64, p = 0.0, n = 8000