import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM, ConvLSTM2D, GRU, BatchNormalization, LocallyConnected2D, Permute
from keras.layers import Concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import regularizers
from keras import backend as K
import keras.losses
import tensorflow as tf
from tensorflow.python.framework import ops
import isolearn.keras as iso
import numpy as np
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
import pandas as pd
import os
import pickle
import scipy.sparse as sp
import scipy.io as spio
import matplotlib.pyplot as plt
import isolearn.io as isoio
import isolearn.keras as isol
from genesis.visualization import *
from genesis.generator import *
from genesis.predictor import *
from genesis.optimizer import *
from definitions.generator.aparent_deconv_conv_generator_concat_alllayers import load_generator_network
from definitions.predictor.aparent import load_saved_predictor
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
Using TensorFlow backend.
#Define target cleavage (cut distribution) loss function
def get_cleavage_loss(target_poses, use_start=0, use_end=70, use_target_bits=1.8, cse_start=70, cse_end=76, cse_target_bits=1.8, dse_start=76, dse_end=125, dse_target_bits=1.8, entropy_weight=0.0, similarity_weight=0.0, punish_dn_cse=0.0, punish_up_c=0.0, punish_dn_c=0.0, reward_dn_ggcc=0.0, punish_up_aa=0.0, punish_dn_aa=0.0) :
target_cuts = np.zeros((len(target_poses), 206))
for i, target_pos in enumerate(target_poses) :
target_cuts[i, target_pos] = 1.0
masked_use_entropy_mse = get_target_entropy_sme_masked(pwm_start=use_start, pwm_end=use_end, target_bits=use_target_bits)
cse_entropy_mse = get_target_entropy_sme(pwm_start=cse_start, pwm_end=cse_end, target_bits=cse_target_bits)
masked_dse_entropy_mse = get_target_entropy_sme_masked(pwm_start=dse_start, pwm_end=dse_end, target_bits=dse_target_bits)
punish_dn_cse_func = get_punish_cse(pwm_start=74, pwm_end=dse_end)
punish_up_c_func = get_punish_c(pwm_start=use_start, pwm_end=use_end)
punish_dn_c_func = get_punish_c(pwm_start=dse_start, pwm_end=dse_end)
punish_dn_gg_func = get_punish_gg(pwm_start=80, pwm_end=110)
punish_dn_cc_func = get_punish_cc(pwm_start=80, pwm_end=110)
punish_up_aa_func = get_punish_aa(pwm_start=use_start, pwm_end=use_end)
punish_dn_aa_func = get_punish_aa(pwm_start=dse_start, pwm_end=dse_end)
pwm_sample_entropy_func = get_pwm_margin_sample_entropy_masked(pwm_start=70-60, pwm_end=76+60, margin=0.5, shift_1_nt=True)
#pwm_sample_entropy_func = get_pwm_sample_entropy_masked(pwm_start=70-60, pwm_end=76+60)
#pwm_sample_entropy_up_func = get_pwm_margin_sample_entropy_masked(pwm_start=70-60, pwm_end=70-4, margin=0.5, shift_1_nt=True)
#pwm_sample_entropy_dn_func = get_pwm_margin_sample_entropy_masked(pwm_start=76, pwm_end=76+60, margin=0.5, shift_1_nt=True)
extra_sim = np.ones((len(target_poses), 1, 205, 4, 1))
for i in range(len(target_poses)) :
extra_sim[i, 0, 70-4:76, :, 0] = 0.0
def loss_func(loss_tensors) :
_, _, _, sequence_class, pwm_logits_1, pwm_logits_2, pwm_1, pwm_2, sampled_pwm_1, sampled_pwm_2, mask, sampled_mask, iso_pred, cut_pred, iso_score_pred, cut_score_pred = loss_tensors
#Create target cuts with sample axis (ax=1)
cut_targets = K.constant(target_cuts)
cut_true = K.gather(cut_targets, sequence_class[:, 0])
cut_true = K.tile(K.expand_dims(cut_true, axis=1), (1, K.shape(sampled_pwm_1)[1], 1))
#Specify costs
cut_loss = 1.0 * K.mean(kl_divergence(cut_true, cut_pred), axis=1)
seq_loss = 0.0
seq_loss += punish_dn_cse * K.mean(punish_dn_cse_func(sampled_pwm_1), axis=1)
seq_loss += punish_up_c * K.mean(punish_up_c_func(sampled_pwm_1), axis=1)
seq_loss += punish_dn_c * K.mean(punish_dn_c_func(sampled_pwm_1), axis=1)
seq_loss += punish_up_aa * K.mean(punish_up_aa_func(sampled_pwm_1), axis=1)
seq_loss += punish_dn_aa * K.mean(punish_dn_aa_func(sampled_pwm_1), axis=1)
seq_loss += reward_dn_ggcc * (-1.) * punish_dn_gg_func(pwm_1) * punish_dn_cc_func(pwm_1)
extra_sims = K.constant(extra_sim)
extra_sim_mask = K.gather(extra_sims, sequence_class[:, 0])
extra_sim_mask = K.tile(extra_sim_mask, (1, K.shape(sampled_pwm_1)[1], 1, 1, 1))
entropy_loss = entropy_weight * (masked_use_entropy_mse(pwm_1, mask) + cse_entropy_mse(pwm_1) + masked_dse_entropy_mse(pwm_1, mask))
entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_func(sampled_pwm_1, sampled_pwm_2, sampled_mask * extra_sim_mask), axis=1)
#entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_up_func(sampled_pwm_1, sampled_pwm_2, sampled_mask), axis=1) / 2.0
#entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_dn_func(sampled_pwm_1, sampled_pwm_2, sampled_mask), axis=1) / 2.0
#Compute total loss
total_loss = cut_loss + seq_loss + entropy_loss
return total_loss
return loss_func
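#Note: kl_divergence above refers to a helper from the genesis library; the saved-model name suggests a
#symmetric KL variant is used. A minimal, hypothetical sketch of such a helper (an assumption, not the
#library's exact implementation) for distributions over cut positions:
def kl_divergence_sketch(y_true, y_pred, eps=1e-7) :
    #Clip both distributions away from zero and sum the KL terms over the cut-position axis
    y_true_c = K.clip(y_true, eps, 1.0)
    y_pred_c = K.clip(y_pred, eps, 1.0)
    kl_pq = K.sum(y_true_c * K.log(y_true_c / y_pred_c), axis=-1)
    kl_qp = K.sum(y_pred_c * K.log(y_pred_c / y_true_c), axis=-1)
    return 0.5 * (kl_pq + kl_qp)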
class EpochVariableCallback(Callback):
def __init__(self, my_variable, my_func):
self.my_variable = my_variable
self.my_func = my_func
def on_epoch_end(self, epoch, logs={}):
K.set_value(self.my_variable, self.my_func(K.get_value(self.my_variable), epoch))
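#Hypothetical usage example (not from the original run): EpochVariableCallback can anneal a Keras
#variable, e.g. a loss weight, at the end of every epoch.
example_weight = K.variable(0.0)
anneal_example_weight = EpochVariableCallback(example_weight, lambda val, epoch: min(0.5, 0.05 * (epoch + 1)))
#Passing callbacks=[anneal_example_weight] to a fit(...) call would ramp example_weight from 0.05 up to 0.5.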
#Function for running GENESIS
def run_genesis(sequence_templates, loss_func, library_contexts, model_path, batch_size=32, n_samples=1, n_epochs=10, steps_per_epoch=100) :
#Build Generator Network
_, generator = build_generator(batch_size, len(sequence_templates[0]), load_generator_network, n_classes=len(sequence_templates), n_samples=n_samples, sequence_templates=sequence_templates, batch_normalize_pwm=False)
#Build Predictor Network and hook it on the generator PWM output tensor
_, pwm_predictor = build_predictor(generator, load_saved_predictor(model_path, library_contexts=library_contexts), batch_size, n_samples=1, eval_mode='pwm')
_, sample_predictor = build_predictor(generator, load_saved_predictor(model_path, library_contexts=library_contexts), batch_size, n_samples=n_samples, eval_mode='sample')
for layer in pwm_predictor.layers :
if 'aparent' in layer.name :
layer.name += "_pwmversion"
#Build Loss Model (In: Generator seed, Out: Loss function)
_, pwm_loss_model = build_loss_model(pwm_predictor, loss_func)
_, sample_loss_model = build_loss_model(sample_predictor, loss_func)
dual_loss_out = Lambda(lambda x: 0.5 * x[0] + 0.5 * x[1])([pwm_loss_model.outputs[0], sample_loss_model.outputs[0]])
loss_model = Model(inputs=pwm_loss_model.inputs, outputs=dual_loss_out)
#Specify Optimizer to use
#opt = keras.optimizers.SGD(lr=0.1)
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
#Compile Loss Model (Minimize self)
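#The loss model's only output is the scalar loss itself, so compiling with loss=lambda true, pred: pred
#makes Keras minimize that output directly; the dummy target passed to fit() below is ignored and the
#gradients flow back through the predictor into the generator weights.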
loss_model.compile(loss=lambda true, pred: pred, optimizer=opt)
#Fit Loss Model
train_history = loss_model.fit(
[], np.ones((1, 1)), #Dummy training example
epochs=n_epochs,
steps_per_epoch=steps_per_epoch
)
return generator, sample_predictor, train_history
#Specify file path to the pre-trained predictor network
save_dir = os.path.join(os.getcwd(), '../../../aparent/saved_models')
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)
#Define one sequence template per target cleavage position (the fixed 'AT' dinucleotide marks each intended cut site)
sequence_templates = [
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNNNNNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT',
'CCCTACACGACGCTCTTCCGATCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATNNGTGTCCTGCCCGGTCGGCTTGAGCGCGTGGGTCTCGTTTAGATGCTGCGCCTAACCCTAAGCAGATTCTTCATGCAATTGT'
]
library_contexts = [
'simple',
'simple',
'simple',
'simple',
'simple',
'simple',
'simple',
'simple',
'simple'
]
cut_positions = [
80,
85,
90,
95,
100,
105,
110,
115,
120
]
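#Optional sanity check (not part of the original notebook): print each template's length, the index of the
#fixed AATAAA hexamer, and the template context around the intended cut position.
for _template, _cut_pos in zip(sequence_templates, cut_positions) :
    print(len(_template), _template.find('AATAAA'), _template[_cut_pos - 2:_cut_pos + 3])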
#Train APA Cleavage GENESIS Network
print("Training GENESIS (multi-class)")
#Number of PWMs to generate per objective
batch_size = 36
#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 10
#Number of epochs per objective to optimize
n_epochs = 50
#Number of steps (grad updates) per epoch
steps_per_epoch = 500
loss = get_cleavage_loss(
cut_positions,
use_start=22,
use_end=70,
use_target_bits=1.95,
cse_start=70,
cse_end=76,
cse_target_bits=1.95,
dse_start=76,
dse_end=125,
dse_target_bits=1.95,
entropy_weight=1.0,
similarity_weight=5.0,
punish_dn_cse=1.0,
punish_up_c=0.007,
punish_dn_c=0.010,
reward_dn_ggcc=0.0,
punish_up_aa=0.0,
punish_dn_aa=0.025
)
genesis_generator, genesis_predictor, train_history = run_genesis(sequence_templates, loss, library_contexts, saved_predictor_model_path, batch_size, n_samples, n_epochs, steps_per_epoch)
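#Replace the seed-randomizing lambda layers with identity functions so that, at inference time, the
#sequence class and latent seeds supplied as inputs are used directly (deterministic, reproducible generation).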
genesis_generator.get_layer('lambda_rand_sequence_class').function = lambda inp: inp
genesis_generator.get_layer('lambda_rand_input_1').function = lambda inp: inp
genesis_generator.get_layer('lambda_rand_input_2').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_sequence_class').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_input_1').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_input_2').function = lambda inp: inp
# Save model and weights
save_dir = 'saved_models'
if not os.path.isdir(save_dir):
os.makedirs(save_dir)
model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_generator.h5'
model_path = os.path.join(save_dir, model_name)
genesis_generator.save(model_path)
print('Saved trained model at %s ' % model_path)
model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_predictor.h5'
model_path = os.path.join(save_dir, model_name)
genesis_predictor.save(model_path)
print('Saved trained model at %s ' % model_path)
Training GENESIS (multi-class)
/home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.
  warnings.warn('No training configuration found in save file: '
Epoch 1/50  500/500 - 52s 103ms/step - loss: 1.4310
Epoch 2/50  500/500 - 46s 91ms/step - loss: 0.7629
Epoch 3/50  500/500 - 46s 92ms/step - loss: 0.6554
Epoch 4/50  500/500 - 46s 92ms/step - loss: 0.6108
Epoch 5/50  500/500 - 46s 92ms/step - loss: 0.5951
Epoch 6/50  500/500 - 46s 92ms/step - loss: 0.5700
Epoch 7/50  500/500 - 46s 92ms/step - loss: 0.5618
Epoch 8/50  500/500 - 46s 92ms/step - loss: 0.5458
Epoch 9/50  500/500 - 46s 92ms/step - loss: 0.5316
Epoch 10/50 500/500 - 46s 92ms/step - loss: 0.5167
Epoch 11/50 500/500 - 46s 92ms/step - loss: 0.5141
Epoch 12/50 500/500 - 46s 92ms/step - loss: 0.5107
Epoch 13/50 500/500 - 46s 92ms/step - loss: 0.5009
Epoch 14/50 500/500 - 46s 92ms/step - loss: 0.4928
Epoch 15/50 500/500 - 46s 92ms/step - loss: 0.4927
Epoch 16/50 500/500 - 46s 92ms/step - loss: 0.4815
Epoch 17/50 500/500 - 46s 92ms/step - loss: 0.4808
Epoch 18/50 500/500 - 46s 92ms/step - loss: 0.4748
Epoch 19/50 500/500 - 46s 92ms/step - loss: 0.4691
Epoch 20/50 500/500 - 46s 92ms/step - loss: 0.4670
Epoch 21/50 500/500 - 46s 92ms/step - loss: 0.4580
Epoch 22/50 500/500 - 46s 92ms/step - loss: 0.4430
Epoch 23/50 500/500 - 46s 92ms/step - loss: 0.4308
Epoch 24/50 500/500 - 46s 92ms/step - loss: 0.4281
Epoch 25/50 500/500 - 46s 92ms/step - loss: 0.4306
Epoch 26/50 500/500 - 46s 92ms/step - loss: 0.4193
Epoch 27/50 500/500 - 46s 92ms/step - loss: 0.4190
Epoch 28/50 500/500 - 46s 92ms/step - loss: 0.4191
Epoch 29/50 500/500 - 46s 92ms/step - loss: 0.4167
Epoch 30/50 500/500 - 46s 93ms/step - loss: 0.4169
Epoch 31/50 500/500 - 46s 92ms/step - loss: 0.4097
Epoch 32/50 500/500 - 46s 92ms/step - loss: 0.4141
Epoch 33/50 500/500 - 46s 92ms/step - loss: 0.4129
Epoch 34/50 500/500 - 46s 92ms/step - loss: 0.4084
Epoch 35/50 500/500 - 46s 93ms/step - loss: 0.4085
Epoch 36/50 500/500 - 46s 92ms/step - loss: 0.4053
Epoch 37/50 500/500 - 46s 92ms/step - loss: 0.4010
Epoch 38/50 500/500 - 46s 92ms/step - loss: 0.4020
Epoch 39/50 500/500 - 46s 92ms/step - loss: 0.4022
Epoch 40/50 500/500 - 46s 92ms/step - loss: 0.4023
Epoch 41/50 500/500 - 46s 92ms/step - loss: 0.4008
Epoch 42/50 500/500 - 46s 92ms/step - loss: 0.3940
Epoch 43/50 500/500 - 46s 92ms/step - loss: 0.3982
Epoch 44/50 500/500 - 46s 93ms/step - loss: 0.3982
Epoch 45/50 500/500 - 46s 93ms/step - loss: 0.3942
Epoch 46/50 500/500 - 46s 93ms/step - loss: 0.3944
Epoch 47/50 500/500 - 46s 93ms/step - loss: 0.3939
Epoch 48/50 500/500 - 46s 93ms/step - loss: 0.3949
Epoch 49/50 500/500 - 46s 93ms/step - loss: 0.3954
Epoch 50/50 500/500 - 46s 92ms/step - loss: 0.3954
Saved trained model at saved_models/genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_generator.h5
Saved trained model at saved_models/genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_predictor.h5
#(Re-)Load GENESIS model
save_dir = os.path.join(os.getcwd(), 'saved_models/target_cleavage')
model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_generator.h5'
model_path = os.path.join(save_dir, model_name)
generator = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
save_dir = os.path.join(os.getcwd(), 'saved_models/target_cleavage')
model_name = 'genesis_cleavage_pwm_and_multisample_marginsimilarity_hardersimilarity_simple_sym_kl_concat_retry_1_predictor.h5'
model_path = os.path.join(save_dir, model_name)
predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually.
  warnings.warn('No training configuration found in save file: '
n = 36 * 6
sequence_class = np.array([j for i in [[k] * 20 for k in range(9)] for j in i] + [0] * 200)[:n].reshape(-1, 1)
noise_1 = np.random.uniform(-1, 1, (n, 100))
noise_2 = np.random.uniform(-1, 1, (n, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
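#Per the unpacking below, the 4th predictor output is the optimized PWM and the 10th/11th outputs are the
#isoform and cut-distribution predictions; the remaining outputs (logits, sampled one-hots, masks, scores) are unused here.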
_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
#Plot one PWM sequence logo per optimized objective (Experiment 'Punish A-runs')
for pwm_index in range(9 * 20) :
sequence_template = sequence_templates[sequence_class[pwm_index, 0]]
target_cut = cut_positions[sequence_class[pwm_index, 0]]
pwm = np.expand_dims(optimized_pwm[pwm_index, :, :, 0], axis=0)
cut = np.expand_dims(cut_pred[pwm_index, 0, :], axis=0)
iso = np.expand_dims(np.sum(cut[:, target_cut:target_cut + 1], axis=-1), axis=-1)
plot_seqprop_logo(pwm, iso, cut, annotate_peaks='max', sequence_template=sequence_template, figsize=(12.5, 1.5), width_ratios=[1, 8], logo_height=0.8, usage_unit='fraction', plot_start=70-49, plot_end=76+49, save_figs=False, fig_name='cleavage_genesis_target_cut_' + str(target_cut) + "_pwm_index_" + str(pwm_index), fig_dpi=150)
#Estimate duplication rates
def get_consensus_sequence(pwm) :
consensus = ''
for j in range(pwm.shape[0]) :
nt_ix = np.argmax(pwm[j, :])
if nt_ix == 0 :
consensus += 'A'
elif nt_ix == 1 :
consensus += 'C'
elif nt_ix == 2 :
consensus += 'G'
elif nt_ix == 3 :
consensus += 'T'
return consensus
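#A vectorized alternative to the helper above (a sketch; equivalent for PWM or one-hot inputs of shape (L, 4)):
def get_consensus_sequence_fast(pwm) :
    nts = np.array(list('ACGT'))
    return ''.join(nts[np.argmax(pwm, axis=-1)])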
f = plt.figure(figsize=(5, 4))
n_classes = 9
n_sequences_large_list = [10, 100, 1000, 10000, 100000]
dup_rates = []
save_figs = False
for n_sequences_large in n_sequences_large_list :
n_sequences_ceil_large = int(n_sequences_large / 36) * 36 + 36
print("N Sequences = " + str(n_sequences_large))
sequence_class = np.array([np.random.randint(n_classes)] * n_sequences_ceil_large).reshape(-1, 1)
noise_1 = np.random.uniform(-1, 1, (n_sequences_ceil_large, 100))
noise_2 = np.random.uniform(-1, 1, (n_sequences_ceil_large, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm_large, _, sampled_pwm_large, _, _, _, _, _, _, _ = pred_outputs
onehots_large = sampled_pwm_large[:, 0, :, :, :]
consensus_seqs_large = []
for i in range(onehots_large.shape[0]) :
consensus_seqs_large.append(get_consensus_sequence(onehots_large[i, :, :, 0]))
consensus_seqs_large = np.array(consensus_seqs_large, dtype=np.object)
#Sample first n_sequences
onehots_large_kept = onehots_large[:n_sequences_large, :, :]
consensus_large_seqs_kept = consensus_seqs_large[:n_sequences_large]
n_unique_seqs_kept = len(np.unique(consensus_large_seqs_kept))
print("Number of unique sequences = " + str(n_unique_seqs_kept))
dup_rate = 1. - n_unique_seqs_kept / n_sequences_large
dup_rates.append(dup_rate)
print("Duplication rate = " + str(round(dup_rate, 4)))
plt.plot(np.arange(len(n_sequences_large_list)), dup_rates, linewidth=3, linestyle='-', color='black')
plt.scatter(np.arange(len(n_sequences_large_list)), dup_rates, c='blue', s=45)
plt.xlabel("# Sequences", fontsize=14)
plt.ylabel("Duplication Rate", fontsize=14)
plt.xticks(np.arange(len(n_sequences_large_list)), n_sequences_large_list, fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.ylim(0, np.max(dup_rates) * 1.10 + 0.001)
plt.title("Duplication rate curve", fontsize=14)
plt.tight_layout()
if save_figs :
plt.savefig("apa_cleavage_dup_rate_curve.eps")
plt.savefig("apa_cleavage_dup_rate_curve.svg")
plt.savefig("apa_cleavage_dup_rate_curve.png", transparent=True, dpi=150)
plt.show()
N Sequences = 10
Number of unique sequences = 10
Duplication rate = 0.0
N Sequences = 100
Number of unique sequences = 100
Duplication rate = 0.0
N Sequences = 1000
Number of unique sequences = 1000
Duplication rate = 0.0
N Sequences = 10000
Number of unique sequences = 10000
Duplication rate = 0.0
N Sequences = 100000
Number of unique sequences = 100000
Duplication rate = 0.0
import scipy
import scipy.sparse as sp
n = 36 * 100
sequence_class = np.array([np.random.randint(n_classes)] * n).reshape(-1, 1)
noise_1 = np.random.uniform(-1, 1, (n, 100))
noise_2 = np.random.uniform(-1, 1, (n, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm, _, sampled_pwm, _, _, _, _, _, _, _ = pred_outputs
onehots = sampled_pwm[:, 0, :, :, :]
#Evaluate Sequence diversity
def get_consensus_sequence(pwm) :
consensus = ''
for j in range(pwm.shape[0]) :
nt_ix = np.argmax(pwm[j, :])
if nt_ix == 0 :
consensus += 'A'
elif nt_ix == 1 :
consensus += 'C'
elif nt_ix == 2 :
consensus += 'G'
elif nt_ix == 3 :
consensus += 'T'
return consensus
n_sequences = 1000
consensus_seqs = []
for i in range(onehots.shape[0]) :
consensus_seqs.append(get_consensus_sequence(onehots[i, :, :, 0]))
consensus_seqs = np.array(consensus_seqs, dtype=np.object)
#Sample first n_sequences
onehots_kept = onehots[:n_sequences, :, :]
consensus_seqs_kept = consensus_seqs[:n_sequences]
print("Number of unique sequences = " + str(len(np.unique(consensus_seqs_kept))))
#Calculate average/std nucleotide entropy
nt_entropies = []
for j in range(onehots_kept.shape[1]) :
if sequence_templates[0][j] == 'N' :
p_A = np.sum(onehots_kept[:, j, 0, 0]) / n_sequences
p_C = np.sum(onehots_kept[:, j, 1, 0]) / n_sequences
p_G = np.sum(onehots_kept[:, j, 2, 0]) / n_sequences
p_T = np.sum(onehots_kept[:, j, 3, 0]) / n_sequences
nt_entropy = 0
if p_A * p_C * p_G * p_T > 0. :
nt_entropy = - (p_A * np.log2(p_A) + p_C * np.log2(p_C) + p_G * np.log2(p_G) + p_T * np.log2(p_T))
nt_entropies.append(nt_entropy)
nt_entropies = np.array(nt_entropies)
print("Mean NT Entropy = " + str(round(np.mean(nt_entropies), 4)))
print("Std NT Entropy = " + str(round(np.std(nt_entropies), 4)))
#Calculate hexamer entropies
hexamer_encoder = isol.NMerEncoder(n_mer_len=6, count_n_mers=True)
hexamers = isol.SparseBatchEncoder(encoder=hexamer_encoder)([x[20:128] for x in consensus_seqs_kept])
print(hexamers.shape)
hexamer_sum = np.ravel(hexamers.sum(axis=0))
hexamers_probs = hexamer_sum / np.sum(hexamer_sum)
n_nonzero_hexamers = len(np.nonzero(hexamer_sum > 0)[0])
print("Number of unique hexamers = " + str(n_nonzero_hexamers))
hexamer_entropy = -1. * np.sum(hexamers_probs[hexamer_sum > 0] * np.log2(hexamers_probs[hexamer_sum > 0]))
print("Hexamer Entropy = " + str(hexamer_entropy))
#Calculate average/std hexamer entropy
nonzero_index = np.nonzero(hexamer_sum > 0)[0]
hexamer_entropies = []
for j in range(n_nonzero_hexamers) :
p_on = len(np.nonzero(hexamers[:, nonzero_index[j]] > 0)[0]) / hexamers.shape[0]
p_off = 1. - p_on
hexamer_entropy = 0
if p_on * p_off > 0. :
hexamer_entropy = -(p_on * np.log2(p_on) + p_off * np.log2(p_off))
hexamer_entropies.append(hexamer_entropy)
hexamer_entropies = np.array(hexamer_entropies)
print("Mean Binary Hexamer Entropy = " + str(round(np.mean(hexamer_entropies), 4)))
print("Std Binary Hexamer Entropy = " + str(round(np.std(hexamer_entropies), 4)))
Number of unique sequences = 1000
Mean NT Entropy = 0.8703
Std NT Entropy = 0.7867
(1000, 4096)
Number of unique hexamers = 1727
Hexamer Entropy = 9.068884708802472
Mean Binary Hexamer Entropy = 0.2089
Std Binary Hexamer Entropy = 0.2554
#Helper function to get the consensus sequence from a PWM
def get_consensus_sequence(pwm) :
consensus = ''
for j in range(pwm.shape[0]) :
nt_ix = np.argmax(pwm[j, :])
if nt_ix == 0 :
consensus += 'A'
elif nt_ix == 1 :
consensus += 'C'
elif nt_ix == 2 :
consensus += 'G'
elif nt_ix == 3 :
consensus += 'T'
return consensus
n_classes = 9
n_sequences = 1000
n_sequences_ceil = int(n_sequences / 36) * 36 + 36
pwms = []
consensus_seqs = []
onehot_seqs = []
iso_preds = []
cut_preds = []
objectives = []
onehot_encoder = isol.OneHotEncoder(205)
for k in range(n_classes) :
print('Predicting sequences for objective ' + str(k) + '...')
sequence_class = np.tile(np.array([[k]]), (n_sequences_ceil, 1))
noise_1 = np.random.uniform(-1, 1, (n_sequences_ceil, 100))
noise_2 = np.random.uniform(-1, 1, (n_sequences_ceil, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
optimized_pwm = np.expand_dims(optimized_pwm[:n_sequences, :, :, 0], axis=0)
cut_pred = cut_pred[:n_sequences, 0, :]
obj_ix = cut_positions[k]
iso_pred = np.expand_dims(np.sum(cut_pred[:, obj_ix-1:obj_ix+2], axis=-1), axis=0)
cut_pred = np.expand_dims(cut_pred, axis=0)
pwms.append(optimized_pwm)
iso_preds.append(iso_pred)
cut_preds.append(cut_pred)
consensus_seqs.append([])
onehot_seqs.append([])
for i in range(n_sequences) :
consensus_seqs[k].append(get_consensus_sequence(optimized_pwm[0, i, :, :]))
onehot_seqs[k].append(onehot_encoder(get_consensus_sequence(optimized_pwm[0, i, :, :])))
consensus_seqs[k] = np.expand_dims(np.array(consensus_seqs[k], dtype=np.object), axis=0)
onehot_seqs[k] = np.expand_dims(onehot_seqs[k], axis=0)
objectives.append(np.zeros((1, n_sequences)) + k)
pwms = np.concatenate(pwms, axis=0)
iso_preds = np.concatenate(iso_preds, axis=0)
cut_preds = np.concatenate(cut_preds, axis=0)
consensus_seqs = np.concatenate(consensus_seqs, axis=0)
onehot_seqs = np.concatenate(onehot_seqs, axis=0)
objectives = np.concatenate(objectives, axis=0)
print('pwms.shape = ' + str(pwms.shape))
print('iso_preds.shape = ' + str(iso_preds.shape))
print('cut_preds.shape = ' + str(cut_preds.shape))
print('consensus_seqs.shape = ' + str(consensus_seqs.shape))
print('onehot_seqs.shape = ' + str(onehot_seqs.shape))
print('objectives.shape = ' + str(objectives.shape))
Predicting sequences for objective 0...
Predicting sequences for objective 1...
Predicting sequences for objective 2...
Predicting sequences for objective 3...
Predicting sequences for objective 4...
Predicting sequences for objective 5...
Predicting sequences for objective 6...
Predicting sequences for objective 7...
Predicting sequences for objective 8...
pwms.shape = (9, 1000, 205, 4)
iso_preds.shape = (9, 1000)
cut_preds.shape = (9, 1000, 206)
consensus_seqs.shape = (9, 1000)
onehot_seqs.shape = (9, 1000, 205, 4)
objectives.shape = (9, 1000)
flat_onehot_seqs = np.reshape(onehot_seqs, (n_classes * n_sequences, 205 * 4))
flat_objectives = np.ravel(objectives)
obj_poses = cut_positions
flat_labels = np.array(['CSE+' + str(obj_poses[int(k)] - 75) for k in flat_objectives.tolist()], dtype=np.object)
flat_onehot_seqs_opt = flat_onehot_seqs[:, 24*4:124*4]
flat_onehot_seqs_dse = flat_onehot_seqs[:, 75*4:124*4]
from scipy.stats import pearsonr
import seaborn as sns
#sns.set()
#Uniquely generated sequences per objective
unique_counts = np.array([len(np.unique(consensus_seqs[k, :])) for k in range(n_classes)])
f = plt.figure(figsize=(6, 4))
plt.bar(np.arange(n_classes), unique_counts)#, edgecolor='darkgrey', linewidth=2, color='black'
plt.xticks(np.arange(n_classes), ['CSE+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.xlabel('Objective', fontsize=18)
plt.ylabel('Unique sequences', fontsize=18)
plt.tight_layout()
plt.show()
#Objective cut log odds violin plot
save_figs = False
cut_logodds = [np.log(iso_preds[k, :] / (1.0 - iso_preds[k, :])) for k in range(n_classes)]
for k in range(n_classes) :
print("Mean Cut Log Odds (CSE+" + str(obj_poses[int(k)] - 75) + ") = " + str(round(np.mean(cut_logodds[k]), 4)))
print("Std Cut Log Odds (CSE+" + str(obj_poses[int(k)] - 75) + ") = " + str(round(np.std(cut_logodds[k]), 4)))
f = plt.figure(figsize=(6, 4))
sns.violinplot(data=cut_logodds)
plt.xticks(np.arange(n_classes), ['CSE+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.xlabel('Objective', fontsize=18)
plt.ylabel('Objective cut odds (log)', fontsize=18)
plt.tight_layout()
fig_name = 'cleavage_genesis_logodds_violin'
if save_figs :
plt.savefig(fig_name + '.png', transparent=True, dpi=150)
plt.savefig(fig_name + '.svg')
plt.savefig(fig_name + '.eps')
plt.show()
#Average cut profile per objective
cut_preds_prox = cut_preds[:, :, :-1] / np.expand_dims(np.sum(cut_preds[:, :, :-1], axis=-1), axis=-1)
flat_cut_preds = np.reshape(cut_preds_prox, (n_classes * n_sequences, 205))
avg_cut_poses = np.sum(np.arange(205).reshape(1, -1) * flat_cut_preds, axis=-1)
targ_cut_poses = np.array([obj_poses[int(k)] for k in flat_objectives])
avg_pos_r, avg_pos_p_val = pearsonr(avg_cut_poses, targ_cut_poses)
avg_pos_r2 = avg_pos_r**2
print("Avg pos R^2 = " + str(round(avg_pos_r2, 4)) + ", p = " + str(round(avg_pos_p_val, 10)))
avgcut = np.mean(cut_preds[:, :, :-1] / np.expand_dims(np.sum(cut_preds[:, :, :-1], axis=-1), axis=-1), axis=1)
f = plt.figure(figsize=(6, 4))
for k in range(n_classes) :
plt.plot(np.arange(205-75), avgcut[k, 75:], linewidth=2)
plt.xticks([obj_poses[int(k)] - 75 for k in range(n_classes)], ['CSE+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.xlim(0, 50)
plt.xlabel('Objective', fontsize=18)
plt.ylabel('Objective cut proportion (%)', fontsize=18)
plt.tight_layout()
fig_name = 'cleavage_genesis_average_cut'
if save_figs :
plt.savefig(fig_name + '.png', transparent=True, dpi=150)
plt.savefig(fig_name + '.svg')
plt.savefig(fig_name + '.eps')
plt.show()
Mean Cut Log Odds (CSE+5) = 2.0981
Std Cut Log Odds (CSE+5) = 0.3425
Mean Cut Log Odds (CSE+10) = 3.1699
Std Cut Log Odds (CSE+10) = 0.4999
Mean Cut Log Odds (CSE+15) = 3.5382
Std Cut Log Odds (CSE+15) = 0.467
Mean Cut Log Odds (CSE+20) = 3.0305
Std Cut Log Odds (CSE+20) = 0.4574
Mean Cut Log Odds (CSE+25) = 2.9291
Std Cut Log Odds (CSE+25) = 0.3765
Mean Cut Log Odds (CSE+30) = 2.6425
Std Cut Log Odds (CSE+30) = 0.3897
Mean Cut Log Odds (CSE+35) = 1.7261
Std Cut Log Odds (CSE+35) = 0.3207
Mean Cut Log Odds (CSE+40) = 1.6062
Std Cut Log Odds (CSE+40) = 0.2851
Mean Cut Log Odds (CSE+45) = 1.3136
Std Cut Log Odds (CSE+45) = 0.239
Avg pos R^2 = 0.9981, p = 0.0
#PCA visualization, full sequence
save_figs = False
print('Starting PCA...')
pca = PCA(n_components=50)
flat_onehot_seqs_pca = pca.fit_transform(flat_onehot_seqs_opt)
print('Done with PCA.')
#tSNE visualization
print('Starting tSNE...')
tsne = TSNE(n_components=2)
flat_onehot_seqs_tsne = tsne.fit_transform(flat_onehot_seqs_pca)
print('Done with tSNE.')
f = plt.figure(figsize = (8, 8))
for label in ['CSE+5', 'CSE+10', 'CSE+15', 'CSE+20', 'CSE+25', 'CSE+30', 'CSE+35', 'CSE+40', 'CSE+45'] :
plt.scatter(flat_onehot_seqs_tsne[flat_labels == label, 0], flat_onehot_seqs_tsne[flat_labels == label, 1], label=label)
plt.xlabel('tSNE 1', fontsize=18)
plt.ylabel('tSNE 2', fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(fontsize=14)
plt.tight_layout()
fig_name = 'cleavage_genesis_tsne'
if save_figs :
plt.savefig(fig_name + '.png', transparent=True, dpi=150)
plt.savefig(fig_name + '.svg')
plt.savefig(fig_name + '.eps')
plt.show()
Starting PCA...
Done with PCA.
Starting tSNE...
Done with tSNE.
#PCA visualization, DSE only
save_figs = False
print('Starting PCA...')
pca = PCA(n_components=50)
flat_onehot_seqs_pca = pca.fit_transform(flat_onehot_seqs_dse)
print('Done with PCA.')
#tSNE visualization
print('Starting tSNE...')
tsne = TSNE(n_components=2)
flat_onehot_seqs_tsne = tsne.fit_transform(flat_onehot_seqs_pca)
print('Done with tSNE.')
f = plt.figure(figsize = (8, 8))
for label in ['CSE+5', 'CSE+10', 'CSE+15', 'CSE+20', 'CSE+25', 'CSE+30', 'CSE+35', 'CSE+40', 'CSE+45'] :
plt.scatter(flat_onehot_seqs_tsne[flat_labels == label, 0], flat_onehot_seqs_tsne[flat_labels == label, 1], label=label)
plt.xlabel('tSNE 1', fontsize=18)
plt.ylabel('tSNE 2', fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(fontsize=14)
plt.tight_layout()
fig_name = 'cleavage_genesis_tsne_dse_only'
if save_figs :
plt.savefig(fig_name + '.png', transparent=True, dpi=150)
plt.savefig(fig_name + '.svg')
plt.savefig(fig_name + '.eps')
plt.show()
Starting PCA...
Done with PCA.
Starting tSNE...
Done with tSNE.
#Load designed MPRA data (Collapsed over experiment replicates)
seq_dict = isoio.load('~/aparent/data/prepared_data/apa_array_data/apa_array_data_master_seq')
print("n = " + str(len(seq_dict['array_df'])))
print('Avg read count (avg over barcodes) = ' + str(np.mean(seq_dict['array_df'].query("n_barcodes >= 2")['mean_total_count'])))
print('Pooled read count (pooled over barcodes) = ' + str(np.mean(seq_dict['array_df'].query("n_barcodes >= 2")['pooled_total_count'])))
n = 39833
Avg read count (avg over barcodes) = 431.92676310713546
Pooled read count (pooled over barcodes) = 3656.5562880452626
seq_df = seq_dict['array_df']
seq_cuts = seq_dict['pooled_cuts']
#Get observed cleavage count matrix
cut_true = np.concatenate([np.array(seq_cuts[:, 180 + 20: 180 + 205].todense()), np.array(seq_cuts[:, -1].todense()).reshape(-1, 1)], axis=-1)
#Add small pseudo count to true cuts
cut_true += 0.0005
cut_true = cut_true / np.sum(cut_true, axis=-1).reshape(-1, 1)
seq_df['cut_prob_true'] = [cut_true[i, :] for i in range(len(seq_df))]
unique_experiments = seq_df.query("experiment == 'max_cut'")['subexperiment'].unique()
seq_df_filtered = seq_df.query("mean_total_count >= 100 and n_barcodes >= 5")
cut_df_filtered = seq_df.query("experiment == 'max_cut' and mean_total_count >= 10").copy()
#Map and evaluate sequences
def eval_logloss(cut_prob, target_pos) :
target = np.zeros(cut_prob.shape[0])
target[target_pos] = 0.8
target[target_pos - 1] = 0.10
target[target_pos + 1] = 0.10
ent = -np.dot(target, np.log((cut_prob + 0.001) / np.sum(cut_prob + 0.001)))
return ent
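#Quick illustrative check of eval_logloss (hypothetical example, not part of the original analysis): a cut
#profile concentrated on the target position scores a much lower loss than a uniform profile.
_example_profile = np.zeros(50)
_example_profile[23:26] = [0.1, 0.8, 0.1]
print(eval_logloss(_example_profile, 24), eval_logloss(np.ones(50) / 50., 24))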
start_pos = 58
end_pos = 108
cut_poses = np.arange(end_pos - start_pos) + (start_pos - 56)
true_cut_column = 'cut_prob_true'
avg_position_true = []
cut_probs_true = []
log_loss = []
for index, row in cut_df_filtered.iterrows() :
cut_prob_true = np.ravel(row[true_cut_column])
cut_prob_true = cut_prob_true[start_pos:end_pos] / np.sum(cut_prob_true[start_pos:end_pos])
cut_probs_true.append(cut_prob_true)
avg_pos_true = np.dot(cut_prob_true, (np.arange(cut_prob_true.shape[0]) + start_pos))
avg_position_true.append(avg_pos_true)
cut_pos_caught = False
for cut_pos in [60, 65, 70, 75, 80, 85, 90, 95, 100] :
if str(cut_pos) in row['subexperiment'] :
log_loss.append(eval_logloss(np.copy(cut_prob_true), cut_pos - 55 - (start_pos - 56)))
cut_pos_caught = True
break
if not cut_pos_caught :
log_loss.append(0)
cut_df_filtered['proxcut_prob_true'] = cut_probs_true
cut_df_filtered['avgpos_true'] = avg_position_true
cut_df_filtered['loss_logloss'] = log_loss
cut_snv_df_filtered = cut_df_filtered.query("variant == 'snv'").copy()
cut_df_filtered = cut_df_filtered.query("variant != 'snv'").copy()
print("Total number of Max Cut sequences = " + str(len(cut_df_filtered)))
df_human = seq_df_filtered.query("experiment == 'clinvar_wt' and variant == 'wt'")
human_cutprobs = np.array(df_human[true_cut_column].values.tolist())[:, start_pos:end_pos]
human_cutprobs = human_cutprobs / np.sum(human_cutprobs, axis=1).reshape(-1, 1)
human_cutprob = np.mean(human_cutprobs, axis=0)
objective_poses = [5, 10, 15, 20, 25, 30, 35, 40, 45]
seq_objective_df = cut_df_filtered.copy().reset_index()[['master_seq', 'experiment', 'subexperiment', 'variant', 'gene', 'cut_prob_true', 'avgpos_true']]
Total number of Max Cut sequences = 1274
#Map and collect array data
cut_nt = 'AT'
aruns_objective = ''#_aruns
ggcc_objective = ''#_GGCC
subexperiments = {
0 : [cut_nt + aruns_objective + '_60', cut_nt + ggcc_objective + aruns_objective + '_ent_60'],#, cut_nt + aruns_objective + '_ent_60'
1 : [cut_nt + aruns_objective + '_65', cut_nt + ggcc_objective + aruns_objective + '_ent_65'],#, cut_nt + aruns_objective + '_ent_65'],
2 : [cut_nt + aruns_objective + '_70', cut_nt + ggcc_objective + aruns_objective + '_ent_70'],#, cut_nt + aruns_objective + '_ent_70'],
3 : [cut_nt + aruns_objective + '_75', cut_nt + ggcc_objective + aruns_objective + '_ent_75'],#, cut_nt + aruns_objective + '_ent_75'],
4 : [cut_nt + aruns_objective + '_80', cut_nt + ggcc_objective + aruns_objective + '_ent_80'],#, cut_nt + aruns_objective + '_ent_80'],
5 : [cut_nt + ggcc_objective + aruns_objective + '_85', cut_nt + ggcc_objective + aruns_objective + '_ent_85'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_85'],
6 : [cut_nt + ggcc_objective + aruns_objective + '_90', cut_nt + ggcc_objective + aruns_objective + '_ent_90'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_90'],
7 : [cut_nt + ggcc_objective + aruns_objective + '_95', cut_nt + ggcc_objective + aruns_objective + '_ent_95'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_95'],
8 : [cut_nt + ggcc_objective + aruns_objective + '_100', cut_nt + ggcc_objective + aruns_objective + '_ent_100'],#, cut_nt + ggcc_objective + aruns_objective + '_ent_100'],
}
array_seqs = []
array_onehot_seqs = []
array_cut_preds = []
array_avg_poses = []
array_objectives = []
array_seqtypes = []
onehot_encoder = isol.OneHotEncoder(163)
for k in range(n_classes) :
print('Collecting sequences for objective ' + str(k) + '...')
sel_df = seq_objective_df.loc[seq_objective_df.subexperiment.isin(subexperiments[k])].copy()
sel_df['master_seq'] = sel_df.master_seq.str.slice(1, 164)
print('n_sequences = ' + str(len(sel_df)))
array_seqs.append(list(sel_df.master_seq.values))
array_seqs[k] = np.array(array_seqs[k], dtype=np.object)
array_onehot_seqs.append([])
for i in range(len(sel_df)) :
array_onehot_seqs[k].append(onehot_encoder(array_seqs[k][i]))
array_onehot_seqs[k] = np.array(array_onehot_seqs[k])
cut_pred = np.array(list(sel_df['cut_prob_true'].values))
array_cut_preds.append(cut_pred)
avg_pos = np.array(list(sel_df['avgpos_true'].values))
array_avg_poses.append(avg_pos)
array_objectives.append(np.zeros(len(sel_df)) + k)
array_seqtypes.append([])
array_seqtypes[k] = np.array(list(sel_df['variant'].values), dtype=np.object)
Collecting sequences for objective 0...
n_sequences = 34
Collecting sequences for objective 1...
n_sequences = 32
Collecting sequences for objective 2...
n_sequences = 28
Collecting sequences for objective 3...
n_sequences = 33
Collecting sequences for objective 4...
n_sequences = 34
Collecting sequences for objective 5...
n_sequences = 34
Collecting sequences for objective 6...
n_sequences = 33
Collecting sequences for objective 7...
n_sequences = 34
Collecting sequences for objective 8...
n_sequences = 34
#One-hot cluster centroid confusion matrix, DSE only (distances use ord=1, i.e. the L1 norm, despite the l2_* variable names)
save_figs = False
cluster_means = np.mean(onehot_seqs[:, :, 76:125, :], axis=1).reshape(n_classes, -1)
l2_min_assignments = np.zeros((n_classes, n_classes))
n_correct = 0
n_tot = 0
for k in range(0, n_classes) :
for i in range(0, array_onehot_seqs[k].shape[0]) :
l2_distances = np.linalg.norm(cluster_means - array_onehot_seqs[k][i, 56:105, :].reshape(1, -1), ord=1, axis=-1)
l2_min_ix = np.argmin(l2_distances)
l2_min_assignments[k, l2_min_ix] += 1
if l2_min_ix == k :
n_correct += 1
n_tot += 1
print("Accuracy = " + str(n_correct) + " / " + str(n_tot) + " = " + str(round(n_correct / n_tot, 2)))
l2_min_assignments = l2_min_assignments / np.sum(l2_min_assignments, axis=1).reshape(-1, 1)
f = plt.figure(figsize=(6, 6))
plt.pcolor(l2_min_assignments, cmap='Reds', vmin=0, vmax=1)
plt.xticks(np.arange(n_classes) + 0.5, ['GENESIS+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.yticks(np.arange(n_classes) + 0.5, ['ARRAY+' + str(obj_poses[int(k)] - 75) for k in range(n_classes)], fontsize=14, rotation=45)
plt.title('Mean L1-norm assignments (DSE only)', fontsize=18)
plt.xlabel('GENESIS', fontsize=18)
plt.ylabel('ARRAY', fontsize=18)
plt.tight_layout()
fig_name = 'cleavage_genesis_array_knn'
if save_figs :
plt.savefig(fig_name + '.png', transparent=True, dpi=150)
plt.savefig(fig_name + '.svg')
plt.savefig(fig_name + '.eps')
plt.show()
Accuracy = 257 / 296 = 0.87
from scipy.stats import pearsonr
import seaborn as sns
from matplotlib import colors
flat_pwms_collection = []
for class_i in range(0, n_classes) :
print("Target cut pos = " + str(obj_poses[class_i] - 75))
flat_pwms = np.zeros((pwms.shape[1], 205))
for i in range(pwms.shape[1]) :
for j in range(205) :
max_nt_ix = np.argmax(pwms[class_i, i, j, :])
flat_pwms[i, j] = max_nt_ix + 1
flat_pwms = flat_pwms[:20, 70-50: 76+50]#flat_pwms[:20, 70-45: 76+45]
print(flat_pwms.shape)
flat_pwms_collection.append(flat_pwms)
flat_pwms = np.concatenate(flat_pwms_collection[::-1], axis=0)
Target cut pos = 5
(20, 106)
Target cut pos = 10
(20, 106)
Target cut pos = 15
(20, 106)
Target cut pos = 20
(20, 106)
Target cut pos = 25
(20, 106)
Target cut pos = 30
(20, 106)
Target cut pos = 35
(20, 106)
Target cut pos = 40
(20, 106)
Target cut pos = 45
(20, 106)
#Plot diversity grid
save_figs = False
cmap = colors.ListedColormap(['red', 'blue', 'orange', 'darkgreen'])
bounds=[0, 1, 2, 3, 4, 5]
norm = colors.BoundaryNorm(bounds, cmap.N)
w = flat_pwms.shape[1]
f = plt.figure(figsize=(4, 12))
plt.imshow(flat_pwms[:20 * n_classes, :], aspect='equal', interpolation='nearest', origin='lower', cmap=cmap, norm=norm)
for class_i in range(1, n_classes) :
plt.plot([0, w], [class_i * 20, class_i * 20], linewidth=2, color='black', linestyle='--')
plt.xticks([], [])
plt.yticks([], [])
plt.xlim(0, w)
plt.ylim(0, 20 * n_classes)
plt.tight_layout()
if save_figs :
plt.savefig('genesis_apa_cleavage_simple_image_seqs.png', transparent=True, dpi=150)
plt.savefig('genesis_apa_cleavage_simple_image_seqs.svg')
plt.savefig('genesis_apa_cleavage_simple_image_seqs.eps')
plt.show()