import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM, ConvLSTM2D, GRU, BatchNormalization, LocallyConnected2D, Permute
from keras.layers import Concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import regularizers
from keras import backend as K
import keras.losses
import tensorflow as tf
from tensorflow.python.framework import ops
import isolearn.keras as iso
import numpy as np
import tensorflow as tf
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
import pandas as pd
import os
import pickle
import numpy as np
import scipy.sparse as sp
import scipy.io as spio
import matplotlib.pyplot as plt
import isolearn.io as isoio
import isolearn.keras as isol
from genesis.visualization import *
from genesis.generator import *
from genesis.predictor import *
from genesis.optimizer import *
from definitions.generator.aparent_deconv_conv_generator_concat import load_generator_network
from definitions.predictor.aparent import load_saved_predictor
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.stats import pearsonr
import seaborn as sns
from matplotlib import colors
Using TensorFlow backend.
#Specfiy file path to pre-trained predictor network
save_dir = os.path.join(os.getcwd(), 'saved_models')
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)
#Define target isoform loss function
def get_isoform_loss(target_isos, isoform_start=80, isoform_end=115, use_start=0, use_end=70, use_target_bits=1.8, cse_start=70, cse_end=76, cse_target_bits=1.8, dse_start=76, dse_end=125, dse_target_bits=1.8, entropy_weight=0.0, similarity_weight=0.0, similarity_margin=0.5, punish_dn_cse=0.0, punish_up_c=0.0, punish_dn_c=0.0, punish_up_g=0.0, punish_dn_g=0.0, punish_up_aa=0.0, punish_dn_aa=0.0) :
target_iso = np.zeros((len(target_isos), 1))
for i, t_iso in enumerate(target_isos) :
target_iso[i, 0] = t_iso
masked_use_entropy_mse = get_target_entropy_sme_masked(pwm_start=use_start, pwm_end=use_end, target_bits=use_target_bits)
cse_entropy_mse = get_target_entropy_sme(pwm_start=cse_start, pwm_end=cse_end, target_bits=cse_target_bits)
masked_dse_entropy_mse = get_target_entropy_sme_masked(pwm_start=dse_start, pwm_end=dse_end, target_bits=dse_target_bits)
punish_dn_cse_func = get_punish_cse(pwm_start=74, pwm_end=dse_end)
punish_up_c_func = get_punish_c(pwm_start=use_start, pwm_end=use_end)
punish_dn_c_func = get_punish_c(pwm_start=dse_start, pwm_end=dse_end)
punish_up_g_func = get_punish_g(pwm_start=use_start, pwm_end=use_end)
punish_dn_g_func = get_punish_g(pwm_start=use_start, pwm_end=use_end)
punish_up_aa_func = get_punish_aa(pwm_start=use_start, pwm_end=use_end)
punish_dn_aa_func = get_punish_aa(pwm_start=dse_start, pwm_end=dse_end)
pwm_sample_entropy_func = get_pwm_margin_sample_entropy_masked(pwm_start=70-60, pwm_end=76+60, margin=similarity_margin, shift_1_nt=True)
extra_sim = np.ones((len(target_isos), 1, 205, 4, 1))
for i in range(len(target_isos)) :
extra_sim[i, 0, 70-4:76, :, 0] = 0.0
def loss_func(loss_tensors) :
_, _, _, sequence_class, pwm_logits_1, pwm_logits_2, pwm_1, pwm_2, sampled_pwm_1, sampled_pwm_2, mask, sampled_mask, iso_pred, cut_pred, iso_score_pred, cut_score_pred = loss_tensors
#Create target isoform with sample axis
iso_targets = K.constant(target_iso)
iso_true = K.gather(iso_targets, sequence_class[:, 0])
iso_true = K.tile(K.expand_dims(iso_true, axis=-1), (1, K.shape(sampled_pwm_1)[1], 1))
#Specify costs
iso_loss = 2.0 * K.mean(symmetric_sigmoid_kl_divergence(iso_true, iso_pred), axis=1)
seq_loss = 0.0
seq_loss += punish_dn_cse * K.mean(punish_dn_cse_func(sampled_pwm_1), axis=1)
seq_loss += punish_up_c * K.mean(punish_up_c_func(sampled_pwm_1), axis=1)
seq_loss += punish_dn_c * K.mean(punish_dn_c_func(sampled_pwm_1), axis=1)
seq_loss += punish_up_g * K.mean(punish_up_g_func(sampled_pwm_1), axis=1)
seq_loss += punish_dn_g * K.mean(punish_dn_g_func(sampled_pwm_1), axis=1)
seq_loss += punish_up_aa * K.mean(punish_up_aa_func(sampled_pwm_1), axis=1)
seq_loss += punish_dn_aa * K.mean(punish_dn_aa_func(sampled_pwm_1), axis=1)
extra_sims = K.constant(extra_sim)
extra_sim_mask = K.gather(extra_sims, sequence_class[:, 0])
extra_sim_mask = K.tile(extra_sim_mask, (1, K.shape(sampled_pwm_1)[1], 1, 1, 1))
entropy_loss = entropy_weight * (masked_use_entropy_mse(pwm_1, mask) + cse_entropy_mse(pwm_1) + masked_dse_entropy_mse(pwm_1, mask))
entropy_loss += similarity_weight * K.mean(pwm_sample_entropy_func(sampled_pwm_1, sampled_pwm_2, sampled_mask * extra_sim_mask), axis=1)
#Compute total loss
total_loss = iso_loss + seq_loss + entropy_loss
return total_loss
return loss_func
class EpochVariableCallback(Callback):
def __init__(self, my_variable, my_func):
self.my_variable = my_variable
self.my_func = my_func
def on_epoch_end(self, epoch, logs={}):
K.set_value(self.my_variable, self.my_func(K.get_value(self.my_variable), epoch))
#Function for running GENESIS
def run_genesis(sequence_templates, loss_func, library_contexts, model_path, batch_size=32, n_samples=1, n_epochs=10, steps_per_epoch=100) :
#Build Generator Network
_, generator = build_generator(batch_size, len(sequence_templates[0]), load_generator_network, n_classes=len(sequence_templates), n_samples=n_samples, sequence_templates=sequence_templates, batch_normalize_pwm=False)
#Build Predictor Network and hook it on the generator PWM output tensor
_, pwm_predictor = build_predictor(generator, load_saved_predictor(model_path, library_contexts=library_contexts), batch_size, n_samples=1, eval_mode='pwm')
_, sample_predictor = build_predictor(generator, load_saved_predictor(model_path, library_contexts=library_contexts), batch_size, n_samples=n_samples, eval_mode='sample')
for layer in pwm_predictor.layers :
if 'aparent' in layer.name :
layer.name += "_pwmversion"
#Build Loss Model (In: Generator seed, Out: Loss function)
_, pwm_loss_model = build_loss_model(pwm_predictor, loss_func)
_, sample_loss_model = build_loss_model(sample_predictor, loss_func)
dual_loss_out = Lambda(lambda x: 0.5 * x[0] + 0.5 * x[1])([pwm_loss_model.outputs[0], sample_loss_model.outputs[0]])
loss_model = Model(inputs=pwm_loss_model.inputs, outputs=dual_loss_out)
#Specify Optimizer to use
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
#Compile Loss Model (Minimize self)
loss_model.compile(loss=lambda true, pred: pred, optimizer=opt)
#Fit Loss Model
train_history = loss_model.fit(
[], np.ones((1, 1)),
epochs=n_epochs,
steps_per_epoch=steps_per_epoch
)
return generator, sample_predictor, train_history
#Specfiy file path to pre-trained predictor network
save_dir = os.path.join(os.getcwd(), '../../../aparent/saved_models')
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)
#Maximize isoform proportions for all native minigene libraries
sequence_templates = [
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG'
]
library_contexts = [
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20'
]
target_isos = [
0.05,
0.25,
0.5,
0.75,
1.0
]
margin_similarities = [
0.3,
0.3,
0.3,
0.3,
0.5
]
#Train APA Cleavage GENESIS Network
print("Training GENESIS")
#Number of PWMs to generate per objective
batch_size = 36
#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 10
#Number of epochs per objective to optimize
n_epochs = 50
#Number of steps (grad updates) per epoch
steps_per_epoch = 500
for class_i in range(len(sequence_templates)) :
print("Target iso = " + str(target_isos[class_i]))
lib_name = library_contexts[class_i].split("_")[0]
print("Library context = " + str(lib_name))
K.clear_session()
loss = get_isoform_loss(
[target_isos[class_i]],
use_start=22,
use_end=70,
use_target_bits=1.95,
cse_start=70,
cse_end=76,
cse_target_bits=1.95,
dse_start=76,
dse_end=121,
dse_target_bits=1.95,
entropy_weight=1.0,
similarity_weight=5.0,
similarity_margin=margin_similarities[class_i],
punish_dn_cse=1.0,
punish_up_c=0.0015,
punish_dn_c=0.0001,
punish_up_g=0.0001,
punish_dn_g=0.0001,
punish_up_aa=0.00025,
punish_dn_aa=0.005
)
genesis_generator, genesis_predictor, train_history = run_genesis([sequence_templates[class_i]], loss, [library_contexts[class_i]], saved_predictor_model_path, batch_size, n_samples, n_epochs, steps_per_epoch)
genesis_generator.get_layer('lambda_rand_sequence_class').function = lambda inp: inp
genesis_generator.get_layer('lambda_rand_input_1').function = lambda inp: inp
genesis_generator.get_layer('lambda_rand_input_2').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_sequence_class').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_input_1').function = lambda inp: inp
genesis_predictor.get_layer('lambda_rand_input_2').function = lambda inp: inp
# Save model and weights
save_dir = 'saved_models'
if not os.path.isdir(save_dir):
os.makedirs(save_dir)
model_name = 'genesis_target_isoform_' + str(target_isos[class_i]).replace(".", "") + '_pwm_and_multisample_marginsimilarity_hardersimilarity_' + str(lib_name) + '_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_' + str(margin_similarities[class_i]).replace(".", "") + '_generator.h5'
model_path = os.path.join(save_dir, model_name)
genesis_generator.save(model_path)
print('Saved trained model at %s ' % model_path)
model_name = 'genesis_target_isoform_' + str(target_isos[class_i]).replace(".", "") + '_pwm_and_multisample_marginsimilarity_hardersimilarity_' + str(lib_name) + '_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_' + str(margin_similarities[class_i]).replace(".", "") + '_predictor.h5'
model_path = os.path.join(save_dir, model_name)
genesis_predictor.save(model_path)
print('Saved trained model at %s ' % model_path)
#Specfiy file path to pre-trained predictor network
save_dir = os.path.join(os.getcwd(), 'saved_models')
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)
#Load GENESIS models and predict sample sequences
model_names = [
'genesis_target_isoform_005_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_025_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_05_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_075_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_10_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_05_retry_1',
]
sequence_templates = [
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG'
]
library_contexts = [
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20'
]
target_isos = [
0.05,
0.25,
0.5,
0.75,
1.0
]
for class_i in range(len(sequence_templates)-1, 0-1, -1) :
print("Target iso = " + str(target_isos[class_i]))
save_dir = os.path.join(os.getcwd(), 'saved_models/target_isoform')
model_name = model_names[class_i] + '_predictor.h5'
model_path = os.path.join(save_dir, model_name)
predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
n = 36
sequence_class = np.array([0] * n).reshape(-1, 1) #np.random.uniform(-6, 6, (n, 1)) #
noise_1 = np.random.uniform(-1, 1, (n, 100))
noise_2 = np.random.uniform(-1, 1, (n, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
#Plot one PWM sequence logo per optimized objective (Experiment 'Punish A-runs')
for pwm_index in range(5) :
sequence_template = sequence_templates[class_i]
pwm = np.expand_dims(optimized_pwm[pwm_index, :, :, 0], axis=0)
cut = np.expand_dims(cut_pred[pwm_index, 0, :], axis=0)
iso = np.expand_dims(np.sum(cut[:, 80: 115], axis=-1), axis=-1)
plot_seqprop_logo(pwm, iso, cut, annotate_peaks='max', sequence_template=sequence_template, figsize=(12, 1.5), width_ratios=[1, 8], logo_height=0.8, usage_unit='fraction', plot_start=70-49, plot_end=76+49, save_figs=False, fig_name='target_isoform_genesis_tomm5_' + str(target_isos[class_i]).replace(".", "") + "_pwm_index_" + str(pwm_index), fig_dpi=150)
Target iso = 1.0
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
Target iso = 0.75
Target iso = 0.5
Target iso = 0.25
Target iso = 0.05
#Enrich for near-optimal sampled PAS sequences
model_names = [
'genesis_target_isoform_10_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_05_retry_1',
]
sequence_templates = [
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG'
]
library_contexts = [
'tomm5_up_c20n20_dn_n20'
]
target_isos = [
1.0
]
for class_i in range(len(sequence_templates)-1, 0-1, -1) :
print("Target iso = " + str(target_isos[class_i]))
save_dir = os.path.join(os.getcwd(), 'saved_models/target_isoform')
model_name = model_names[class_i] + '_predictor.h5'
model_path = os.path.join(save_dir, model_name)
predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
n = 10000
n_ceil = int(n / 36) * 36 + 36
sequence_class = np.array([0] * n_ceil).reshape(-1, 1) #np.random.uniform(-6, 6, (n, 1)) #
noise_1 = np.random.uniform(-1, 1, (n_ceil, 100))
noise_2 = np.random.uniform(-1, 1, (n_ceil, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
#Plot one PWM sequence logo per optimized objective (Experiment 'Punish A-runs')
n_chosen = 0
for pwm_index in range(n) :
sequence_template = sequence_templates[class_i]
pwm = np.expand_dims(optimized_pwm[pwm_index, :, :, 0], axis=0)
cut = np.expand_dims(cut_pred[pwm_index, 0, :], axis=0)
iso = np.expand_dims(np.sum(cut[:, 80: 115], axis=-1), axis=-1)
logodds = np.log(iso[0, 0] / (1. - iso[0, 0]))
if logodds >= 5. :
n_chosen += 1
plot_seqprop_logo(pwm, iso, cut, annotate_peaks='max', sequence_template=sequence_template, figsize=(12, 1.5), width_ratios=[1, 8], logo_height=0.8, usage_unit='log', plot_start=70-49, plot_end=76+49, save_figs=False, fig_name='max_isoform_genesis_tomm5_' + str(target_isos[class_i]).replace(".", "") + "_pwm_index_" + str(pwm_index), fig_dpi=150)
if n_chosen > 10 :
break
Target iso = 1.0
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
model_names = [
'genesis_target_isoform_005_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_025_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_05_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_075_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_10_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_05_retry_1',
]
sequence_templates = [
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG'
]
library_contexts = [
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20'
]
target_isos = [
0.05,
0.25,
0.5,
0.75,
1.0
]
n_classes = 5
isoform_start = 80
isoform_end = 115
#Estimate duplication rates
def get_consensus_sequence(pwm) :
consensus = ''
for j in range(pwm.shape[0]) :
nt_ix = np.argmax(pwm[j, :])
if nt_ix == 0 :
consensus += 'A'
elif nt_ix == 1 :
consensus += 'C'
elif nt_ix == 2 :
consensus += 'G'
elif nt_ix == 3 :
consensus += 'T'
return consensus
f = plt.figure(figsize=(5, 4))
n_sequences_large_list = [10, 100, 1000, 10000, 100000]
ls = []
save_figs = False
for k in range(n_classes) :
print('Predicting sequences for objective ' + str(k) + '...')
dup_rates = []
for n_sequences_large in n_sequences_large_list :
n_sequences_ceil_large = int(n_sequences_large / 36) * 36 + 36
print("N Sequences = " + str(n_sequences_large))
save_dir = os.path.join(os.getcwd(), 'saved_models/target_isoform')
model_name = model_names[k] + '_predictor.h5'
model_path = os.path.join(save_dir, model_name)
predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
sequence_class = np.tile(np.array([[0]]), (n_sequences_ceil_large, 1))
noise_1 = np.random.uniform(-1, 1, (n_sequences_ceil_large, 100))
noise_2 = np.random.uniform(-1, 1, (n_sequences_ceil_large, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36, verbose=True)
_, _, _, optimized_pwm_large, _, sampled_pwm_large, _, _, _, _, _, _, _ = pred_outputs
onehots_large = sampled_pwm_large[:, 0, :, :, :]
consensus_seqs_large = []
for i in range(onehots_large.shape[0]) :
consensus_seqs_large.append(get_consensus_sequence(onehots_large[i, :, :, 0]))
consensus_seqs_large = np.array(consensus_seqs_large, dtype=np.object)
#Sample first n_sequences
onehots_large_kept = onehots_large[:n_sequences_large, :, :]
consensus_large_seqs_kept = consensus_seqs_large[:n_sequences_large]
n_unique_seqs_kept = len(np.unique(consensus_large_seqs_kept))
print("Number of unique sequences = " + str(n_unique_seqs_kept))
dup_rate = 1. - n_unique_seqs_kept / n_sequences_large
dup_rates.append(dup_rate)
print("Duplication rate = " + str(round(dup_rate, 4)))
l1 = plt.plot(np.arange(len(n_sequences_large_list)), dup_rates, linewidth=3, linestyle='-', label="" + str(target_isos[k]))
plt.scatter(np.arange(len(n_sequences_large_list)), dup_rates, s=45)
ls.append(l1[0])
plt.xlabel("# Sequences", fontsize=14)
plt.ylabel("Duplication Rate", fontsize=14)
plt.xticks(np.arange(len(n_sequences_large_list)), n_sequences_large_list, fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.ylim(0, np.max(dup_rates) * 1.10 + 0.001)
plt.title("Duplication rate curve", fontsize=14)
plt.legend(handles=ls, fontsize=12)
plt.tight_layout()
if save_figs :
plt.savefig("apa_isoform_tomm5_dup_rate_curve.eps")
plt.savefig("apa_isoform_tomm5_dup_rate_curve.svg")
plt.savefig("apa_isoform_tomm5_dup_rate_curve.png", transparent=True, dpi=150)
plt.show()
Predicting sequences for objective 0... N Sequences = 10
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
36/36 [==============================] - 1s 22ms/step Number of unique sequences = 10 Duplication rate = 0.0 N Sequences = 100
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
108/108 [==============================] - 2s 15ms/step Number of unique sequences = 100 Duplication rate = 0.0 N Sequences = 1000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
1008/1008 [==============================] - 11s 11ms/step Number of unique sequences = 1000 Duplication rate = 0.0 N Sequences = 10000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
10008/10008 [==============================] - 116s 12ms/step Number of unique sequences = 10000 Duplication rate = 0.0 N Sequences = 100000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
100008/100008 [==============================] - 1097s 11ms/step Number of unique sequences = 100000 Duplication rate = 0.0 Predicting sequences for objective 1... N Sequences = 10
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
36/36 [==============================] - 1s 32ms/step Number of unique sequences = 10 Duplication rate = 0.0 N Sequences = 100
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
108/108 [==============================] - 2s 18ms/step Number of unique sequences = 100 Duplication rate = 0.0 N Sequences = 1000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
1008/1008 [==============================] - 11s 11ms/step Number of unique sequences = 1000 Duplication rate = 0.0 N Sequences = 10000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
10008/10008 [==============================] - 111s 11ms/step Number of unique sequences = 10000 Duplication rate = 0.0 N Sequences = 100000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
100008/100008 [==============================] - 1187s 12ms/step Number of unique sequences = 100000 Duplication rate = 0.0 Predicting sequences for objective 2... N Sequences = 10
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
36/36 [==============================] - 2s 62ms/step Number of unique sequences = 10 Duplication rate = 0.0 N Sequences = 100
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
108/108 [==============================] - 3s 27ms/step Number of unique sequences = 100 Duplication rate = 0.0 N Sequences = 1000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
1008/1008 [==============================] - 15s 15ms/step Number of unique sequences = 1000 Duplication rate = 0.0 N Sequences = 10000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
10008/10008 [==============================] - 139s 14ms/step Number of unique sequences = 10000 Duplication rate = 0.0 N Sequences = 100000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
100008/100008 [==============================] - 1286s 13ms/step Number of unique sequences = 100000 Duplication rate = 0.0 Predicting sequences for objective 3... N Sequences = 10
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
36/36 [==============================] - 2s 61ms/step Number of unique sequences = 10 Duplication rate = 0.0 N Sequences = 100
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
108/108 [==============================] - 3s 28ms/step Number of unique sequences = 100 Duplication rate = 0.0 N Sequences = 1000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
1008/1008 [==============================] - 12s 12ms/step Number of unique sequences = 1000 Duplication rate = 0.0 N Sequences = 10000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
10008/10008 [==============================] - 117s 12ms/step Number of unique sequences = 10000 Duplication rate = 0.0 N Sequences = 100000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
100008/100008 [==============================] - 1153s 12ms/step Number of unique sequences = 99999 Duplication rate = 0.0 Predicting sequences for objective 4... N Sequences = 10
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
36/36 [==============================] - 3s 80ms/step Number of unique sequences = 10 Duplication rate = 0.0 N Sequences = 100
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
108/108 [==============================] - 4s 33ms/step Number of unique sequences = 100 Duplication rate = 0.0 N Sequences = 1000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
1008/1008 [==============================] - 13s 13ms/step Number of unique sequences = 1000 Duplication rate = 0.0 N Sequences = 10000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
10008/10008 [==============================] - 116s 12ms/step Number of unique sequences = 10000 Duplication rate = 0.0 N Sequences = 100000
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
100008/100008 [==============================] - 1177s 12ms/step Number of unique sequences = 99904 Duplication rate = 0.001
#Helper function to get the conensus sequence from a PWM
def get_consensus_sequence(pwm) :
consensus = ''
for j in range(pwm.shape[0]) :
nt_ix = np.argmax(pwm[j, :])
if nt_ix == 0 :
consensus += 'A'
elif nt_ix == 1 :
consensus += 'C'
elif nt_ix == 2 :
consensus += 'G'
elif nt_ix == 3 :
consensus += 'T'
return consensus
n_sequences = 1000
n_sequences_ceil = int(n_sequences / 36) * 36 + 36
pwms = []
consensus_seqs = []
onehot_seqs = []
iso_preds = []
cut_preds = []
objectives = []
onehot_encoder = isol.OneHotEncoder(205)
for k in range(n_classes) :
print('Predicting sequences for objective ' + str(k) + '...')
save_dir = os.path.join(os.getcwd(), 'saved_models/target_isoform')
model_name = model_names[k] + '_predictor.h5'
model_path = os.path.join(save_dir, model_name)
predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
sequence_class = np.tile(np.array([[0]]), (n_sequences_ceil, 1))
noise_1 = np.random.uniform(-1, 1, (n_sequences_ceil, 100))
noise_2 = np.random.uniform(-1, 1, (n_sequences_ceil, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
optimized_pwm = np.expand_dims(optimized_pwm[:n_sequences, :, :, 0], axis=0)
iso_pred = iso_pred[:n_sequences, 0, 0]
cut_pred = cut_pred[:n_sequences, 0, :]
#iso_pred = np.expand_dims(np.sum(cut_pred[:, obj_ix-1:obj_ix+2], axis=-1), axis=0)
iso_pred = np.expand_dims(iso_pred, axis=0)
cut_pred = np.expand_dims(cut_pred, axis=0)
pwms.append(optimized_pwm)
iso_preds.append(iso_pred)
cut_preds.append(cut_pred)
consensus_seqs.append([])
onehot_seqs.append([])
for i in range(n_sequences) :
consensus_seqs[k].append(get_consensus_sequence(optimized_pwm[0, i, :, :]))
onehot_seqs[k].append(onehot_encoder(get_consensus_sequence(optimized_pwm[0, i, :, :])))
consensus_seqs[k] = np.expand_dims(np.array(consensus_seqs[k], dtype=np.object), axis=0)
onehot_seqs[k] = np.expand_dims(onehot_seqs[k], axis=0)
objectives.append(np.zeros((1, n_sequences)) + k)
pwms = np.concatenate(pwms, axis=0)
iso_preds = np.concatenate(iso_preds, axis=0)
cut_preds = np.concatenate(cut_preds, axis=0)
consensus_seqs = np.concatenate(consensus_seqs, axis=0)
onehot_seqs = np.concatenate(onehot_seqs, axis=0)
objectives = np.concatenate(objectives, axis=0)
print('pwms.shape = ' + str(pwms.shape))
print('iso_preds.shape = ' + str(iso_preds.shape))
print('cut_preds.shape = ' + str(cut_preds.shape))
print('consensus_seqs.shape = ' + str(consensus_seqs.shape))
print('onehot_seqs.shape = ' + str(onehot_seqs.shape))
print('objectives.shape = ' + str(objectives.shape))
Predicting sequences for objective 0...
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
Predicting sequences for objective 1...
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
Predicting sequences for objective 2...
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
Predicting sequences for objective 3...
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
Predicting sequences for objective 4...
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
pwms.shape = (5, 1000, 205, 4) iso_preds.shape = (5, 1000) cut_preds.shape = (5, 1000, 206) consensus_seqs.shape = (5, 1000) onehot_seqs.shape = (5, 1000, 205, 4) objectives.shape = (5, 1000)
import scipy
import scipy.sparse as sp
for k in range(n_classes) :
print('Computing sequence diversity for objective ' + str(k) + '...')
consensus_seqs_ent = consensus_seqs[k]
consensus_seqs_ent = np.array(consensus_seqs_ent, dtype=np.object)
#Sample first n_sequences
onehots_kept = onehot_seqs[:n_sequences, :, :]
consensus_seqs_kept = consensus_seqs_ent[:n_sequences]
print("Number of unique sequences = " + str(len(np.unique(consensus_seqs_kept))))
#Calculate average/std nucleotide entropy
nt_entropies = []
for j in range(onehots_kept.shape[2]) :
if sequence_templates[0][j] == 'N' :
p_A = np.sum(onehots_kept[:, j, 0, 0]) / n_sequences
p_C = np.sum(onehots_kept[:, j, 1, 0]) / n_sequences
p_G = np.sum(onehots_kept[:, j, 2, 0]) / n_sequences
p_T = np.sum(onehots_kept[:, j, 3, 0]) / n_sequences
nt_entropy = 0
if p_A * p_C * p_G * p_T > 0. :
nt_entropy = - (p_A * np.log2(p_A) + p_C * np.log2(p_C) + p_G * np.log2(p_G) + p_T * np.log2(p_T))
nt_entropies.append(nt_entropy)
nt_entropies = np.array(nt_entropies)
print("Mean NT Entropy = " + str(round(np.mean(nt_entropies), 4)))
print("Std NT Entropy = " + str(round(np.std(nt_entropies), 4)))
#Calculate hexamer entropies
hexamer_encoder = isol.NMerEncoder(n_mer_len=6, count_n_mers=True)
hexamers = isol.SparseBatchEncoder(encoder=hexamer_encoder)([x[20:128] for x in consensus_seqs_kept])
print(hexamers.shape)
hexamer_sum = np.ravel(hexamers.sum(axis=0))
hexamers_probs = hexamer_sum / np.sum(hexamer_sum)
n_nonzero_hexamers = len(np.nonzero(hexamer_sum > 0)[0])
print("Number of unique hexamers = " + str(n_nonzero_hexamers))
hexamer_entropy = -1. * np.sum(hexamers_probs[hexamer_sum > 0] * np.log2(hexamers_probs[hexamer_sum > 0]))
print("Hexamer Entropy = " + str(hexamer_entropy))
#Calculate average/std hexamer entropy
nonzero_index = np.nonzero(hexamer_sum > 0)[0]
hexamer_entropies = []
for j in range(n_nonzero_hexamers) :
p_on = len(np.nonzero(hexamers[:, nonzero_index[j]] > 0)[0]) / hexamers.shape[0]
p_off = 1. - p_on
hexamer_entropy = 0
if p_on * p_off > 0. :
hexamer_entropy = -(p_on * np.log2(p_on) + p_off * np.log2(p_off))
hexamer_entropies.append(hexamer_entropy)
hexamer_entropies = np.array(hexamer_entropies)
print("Mean Binary Hexamer Entropy = " + str(round(np.mean(hexamer_entropies), 4)))
print("Std Binary Hexamer Entropy = " + str(round(np.std(hexamer_entropies), 4)))
Computing sequence diversity for objective 0... Number of unique sequences = 1000 Mean NT Entropy = 0.0 Std NT Entropy = 0.0 (1000, 4096) Number of unique hexamers = 2295 Hexamer Entropy = 9.006434105858517 Mean Binary Hexamer Entropy = 0.1415 Std Binary Hexamer Entropy = 0.1868 Computing sequence diversity for objective 1... Number of unique sequences = 1000 Mean NT Entropy = 0.0 Std NT Entropy = 0.0 (1000, 4096) Number of unique hexamers = 1874 Hexamer Entropy = 8.812055046748004 Mean Binary Hexamer Entropy = 0.1623 Std Binary Hexamer Entropy = 0.2119 Computing sequence diversity for objective 2... Number of unique sequences = 1000 Mean NT Entropy = 0.0 Std NT Entropy = 0.0 (1000, 4096) Number of unique hexamers = 1820 Hexamer Entropy = 8.971126120816717 Mean Binary Hexamer Entropy = 0.1771 Std Binary Hexamer Entropy = 0.203 Computing sequence diversity for objective 3... Number of unique sequences = 1000 Mean NT Entropy = 0.0 Std NT Entropy = 0.0 (1000, 4096) Number of unique hexamers = 2071 Hexamer Entropy = 8.792038050113916 Mean Binary Hexamer Entropy = 0.1457 Std Binary Hexamer Entropy = 0.1999 Computing sequence diversity for objective 4... Number of unique sequences = 1000 Mean NT Entropy = 0.0 Std NT Entropy = 0.0 (1000, 4096) Number of unique hexamers = 1142 Hexamer Entropy = 7.886487174677595 Mean Binary Hexamer Entropy = 0.1752 Std Binary Hexamer Entropy = 0.2703
#Specfiy file path to pre-trained predictor network
save_dir = os.path.join(os.getcwd(), 'saved_models')
saved_predictor_model_name = 'aparent_plasmid_iso_cut_distalpas_all_libs_no_sampleweights_sgd.h5'
saved_predictor_model_path = os.path.join(save_dir, saved_predictor_model_name)
aparent_model = load_model(saved_predictor_model_path)
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
#Create a new model that outputs the conv layer activation maps together with the isoform proportion
dense_out_model = Model(
inputs = aparent_model.inputs,
outputs = [
aparent_model.get_layer('dropout_1').output #dense_1
]
)
genesis_dense_outs = []
for k in range(onehot_seqs.shape[0]) :
genesis_dense_out = dense_out_model.predict(x=[np.expand_dims(onehot_seqs[k], axis=-1), np.zeros((onehot_seqs.shape[1], 13)), np.ones((onehot_seqs.shape[1], 1))])
print(genesis_dense_out.shape)
genesis_dense_outs.append(np.expand_dims(genesis_dense_out, axis=0))
genesis_dense_outs = np.concatenate(genesis_dense_outs, axis=0)
print(genesis_dense_outs.shape)
(1000, 256) (1000, 256) (1000, 256) (1000, 256) (1000, 256) (5, 1000, 256)
flat_onehot_seqs = np.reshape(onehot_seqs, (n_classes * n_sequences, 205 * 4))
flat_objectives = np.ravel(objectives)
class_targs = ['0.05', '0.25', '0.50', '0.75', '1.00']
flat_labels = np.array([class_targs[int(k)] for k in flat_objectives.tolist()], dtype=np.object)
flat_onehot_seqs_opt = flat_onehot_seqs[:, 25*4:118*4]
flat_onehot_seqs_dse = flat_onehot_seqs[:, 75*4:124*4]
flat_dense_outs = np.reshape(genesis_dense_outs, (n_classes * n_sequences, 256))
#Uniquely generated sequences per objective
unique_count = len(np.unique(consensus_seqs[0, :]))
print('n_unique_sequences = ' + str(unique_count))
#Target vs. Engineered Isoform Log Odds
save_figs = False
iso_props = [iso_preds[k, :] for k in range(n_classes)]
iso_logodds = [np.log(iso_preds[k, :] / (1.0 - iso_preds[k, :])) for k in range(n_classes)]
for k in range(n_classes) :
print("Mean Iso Prop (Obj " + str(k) + ") = " + str(round(np.mean(iso_props[k]), 4)))
print("Std Iso Prop (Obj " + str(k) + ") = " + str(round(np.std(iso_props[k]), 4)))
print("Mean Iso Log Odds (Obj " + str(k) + ") = " + str(round(np.mean(iso_logodds[k]), 4)))
print("Std Iso Log Odds (Obj " + str(k) + ") = " + str(round(np.std(iso_logodds[k]), 4)))
print("-------------------------")
f = plt.figure(figsize=(6, 4))
sns.violinplot(data=iso_logodds, axis=0)
plt.xticks(np.arange(5), ['0.05', '0.25', '0.50', '0.75', '1.00'], fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.ylabel('Isoform Log Odds', fontsize=18)
plt.tight_layout()
if save_figs :
plt.savefig('target_isoform_genesis_tomm5_iso_magnitude.png', transparent=True, dpi=150)
plt.savefig('target_isoform_genesis_tomm5_iso_magnitude.eps')
plt.savefig('target_isoform_genesis_tomm5_iso_magnitude.svg')
plt.show()
n_unique_sequences = 1000 Mean Iso Prop (Obj 0) = 0.0435 Std Iso Prop (Obj 0) = 0.0196 Mean Iso Log Odds (Obj 0) = -3.1873 Std Iso Log Odds (Obj 0) = 0.4638 ------------------------- Mean Iso Prop (Obj 1) = 0.2259 Std Iso Prop (Obj 1) = 0.0605 Mean Iso Log Odds (Obj 1) = -1.2656 Std Iso Log Odds (Obj 1) = 0.3552 ------------------------- Mean Iso Prop (Obj 2) = 0.4913 Std Iso Prop (Obj 2) = 0.0661 Mean Iso Log Odds (Obj 2) = -0.0356 Std Iso Log Odds (Obj 2) = 0.2704 ------------------------- Mean Iso Prop (Obj 3) = 0.7399 Std Iso Prop (Obj 3) = 0.068 Mean Iso Log Odds (Obj 3) = 1.0734 Std Iso Log Odds (Obj 3) = 0.3513 ------------------------- Mean Iso Prop (Obj 4) = 0.9967 Std Iso Prop (Obj 4) = 0.0016 Mean Iso Log Odds (Obj 4) = 5.8079 Std Iso Log Odds (Obj 4) = 0.4612 -------------------------
#Load APA random MPRA data
plasmid_dict = pickle.load(open('../../../apa/apa_plasmid_data.pickle', 'rb'))
plasmid_df = plasmid_dict['plasmid_df']
plasmid_cuts = plasmid_dict['plasmid_cuts']
unique_libraries = plasmid_df['library'].unique()
#Filter out chosen library context
keep_index = (plasmid_df['library'] == 'tomm5_up_c20n20_dn_n20') & (plasmid_df['total_count'] >= 20)
keep_index = np.nonzero(keep_index)[0]
library_df = plasmid_df.iloc[keep_index].copy().reset_index()
library_cuts = plasmid_cuts[keep_index]
print(len(library_df))
print(library_cuts.shape)
125949 (125949, 507)
encoder = isol.OneHotEncoder(205)
library_onehot = [np.reshape(encoder(row['seq'][:205]), (1, 205, 4, 1)) for _, row in library_df.iterrows()]
library_onehot = np.concatenate(library_onehot, axis=0)
library_iso = library_df['proximal_count'] / library_df['total_count']
library_dense_out = dense_out_model.predict(x=[library_onehot, np.zeros((library_onehot.shape[0], 13)), np.ones((library_onehot.shape[0], 1))])
print(library_dense_out.shape)
(125949, 256)
from sklearn.neighbors import KNeighborsRegressor
nn = KNeighborsRegressor(n_neighbors=50)
nn.fit(library_dense_out, library_iso)
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=50, p=2, weights='uniform')
nn_iso = nn.predict(flat_dense_outs)
#Target vs. Engineered Isoform Log Odds
save_figs = False
nn_iso_clipped = np.clip(nn_iso, 0.01, 0.99)
nn_iso_by_class = [np.ravel(nn_iso_clipped)[flat_labels == lab] for lab in class_targs]
nn_logodds_by_class = [np.log(nn_iso_by_class[k] / (1.0 - nn_iso_by_class[k])) for k in range(len(nn_iso_by_class))]
for k in range(len(nn_iso_by_class)) :
print("Mean Iso Prop (Obj " + str(k) + ") = " + str(round(np.mean(nn_iso_by_class[k]), 4)))
print("Std Iso Prop (Obj " + str(k) + ") = " + str(round(np.std(nn_iso_by_class[k]), 4)))
print("Mean Iso Log Odds (Obj " + str(k) + ") = " + str(round(np.mean(nn_logodds_by_class[k]), 4)))
print("Std Iso Log Odds (Obj " + str(k) + ") = " + str(round(np.std(nn_logodds_by_class[k]), 4)))
print("-------------------------")
f = plt.figure(figsize=(6, 4))
sns.violinplot(data=nn_iso_by_class[:4], axis=0)
#plt.xticks(np.arange(5), ['0.05', '0.25', '0.50', '0.75', '1.00'], fontsize=14, rotation=45)
plt.xticks(np.arange(4), ['0.05', '0.25', '0.50', '0.75'], fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.ylabel('Isoform Log Odds', fontsize=18)
plt.tight_layout()
if save_figs :
plt.savefig('target_isoform_genesis_tomm5_nn_iso_magnitude.png', transparent=True, dpi=150)
plt.savefig('target_isoform_genesis_tomm5_nn_iso_magnitude.eps')
plt.savefig('target_isoform_genesis_tomm5_nn_iso_magnitude.svg')
plt.show()
f = plt.figure(figsize=(6, 4))
sns.violinplot(data=nn_iso_by_class[1:], axis=0)
plt.xticks(np.arange(4), ['0.25', '0.50', '0.75', '1.00'], fontsize=14, rotation=45)
plt.yticks(fontsize=14)
plt.ylabel('Isoform Log Odds', fontsize=18)
plt.tight_layout()
if save_figs :
plt.savefig('max_isoform_genesis_tomm5_nn_iso_magnitude.png', transparent=True, dpi=150)
plt.savefig('max_isoform_genesis_tomm5_nn_iso_magnitude.eps')
plt.savefig('max_isoform_genesis_tomm5_nn_iso_magnitude.svg')
plt.show()
Mean Iso Prop (Obj 0) = 0.0599 Std Iso Prop (Obj 0) = 0.0303 Mean Iso Log Odds (Obj 0) = -2.8676 Std Iso Log Odds (Obj 0) = 0.5108 ------------------------- Mean Iso Prop (Obj 1) = 0.251 Std Iso Prop (Obj 1) = 0.0695 Mean Iso Log Odds (Obj 1) = -1.1258 Std Iso Log Odds (Obj 1) = 0.3686 ------------------------- Mean Iso Prop (Obj 2) = 0.5322 Std Iso Prop (Obj 2) = 0.0705 Mean Iso Log Odds (Obj 2) = 0.1311 Std Iso Log Odds (Obj 2) = 0.2895 ------------------------- Mean Iso Prop (Obj 3) = 0.7366 Std Iso Prop (Obj 3) = 0.0718 Mean Iso Log Odds (Obj 3) = 1.0563 Std Iso Log Odds (Obj 3) = 0.3565 ------------------------- Mean Iso Prop (Obj 4) = 0.8805 Std Iso Prop (Obj 4) = 0.0049 Mean Iso Log Odds (Obj 4) = 1.998 Std Iso Log Odds (Obj 4) = 0.0468 -------------------------
#Load GENESIS models and predict sample sequences
model_names = [
'genesis_target_isoform_005_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_025_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_05_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_075_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_03_retry_1',
'genesis_target_isoform_10_pwm_and_multisample_marginsimilarity_hardersimilarity_tomm5_sym_kl_concat_lesserpunishupc_lesspunishaa_marginsim_05_retry_1',
]
sequence_templates = [
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG',
'TATTACCTGCGGCCGCAATTCTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTAAAATATAAAACTATTTGGGAAGTATGAAACNNNNNNNNNNNNNNNNNNNNACCCTTATCCCTGTGACGTTTGGCCTCTGACAATACTGGTATAATTGTAAATAATGTCAAACTCCGTTTTCTAGCAAGTATTAAGGG'
]
library_contexts = [
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20',
'tomm5_up_c20n20_dn_n20'
]
target_isos = [
0.05,
0.25,
0.5,
0.75,
1.0
]
flat_pwms_collection = []
for class_i in range(len(sequence_templates)) :
print("Target iso = " + str(target_isos[class_i]))
save_dir = os.path.join(os.getcwd(), 'saved_models/target_isoform')
model_name = model_names[class_i] + '_predictor.h5'
model_path = os.path.join(save_dir, model_name)
predictor = load_model(model_path, custom_objects={'st_sampled_softmax': st_sampled_softmax, 'st_hardmax_softmax': st_hardmax_softmax})
n = 36
sequence_class = np.array(([0] * n)).reshape(-1, 1)
noise_1 = np.random.uniform(-1, 1, (n, 100))
noise_2 = np.random.uniform(-1, 1, (n, 100))
pred_outputs = predictor.predict([sequence_class, noise_1, noise_2], batch_size=36)
_, _, _, optimized_pwm, _, _, _, _, _, iso_pred, cut_pred, _, _ = pred_outputs
pwms = optimized_pwm[:, :, :, 0]
flat_pwms = np.zeros((n, 205))
for i in range(n) :
for j in range(205) :
max_nt_ix = np.argmax(pwms[i, j, :])
flat_pwms[i, j] = max_nt_ix + 1
flat_pwms = flat_pwms[:20, 20: 120]
print(flat_pwms.shape)
flat_pwms_collection.append(flat_pwms)
flat_pwms = np.concatenate(flat_pwms_collection[::-1], axis=0)
Target iso = 0.05
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
(20, 100) Target iso = 0.25
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
(20, 100) Target iso = 0.5
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
(20, 100) Target iso = 0.75
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
(20, 100) Target iso = 1.0
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/saving.py:292: UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually. warnings.warn('No training configuration found in save file: '
(20, 100)
#Plot diversity grid
save_figs = False
cmap = colors.ListedColormap(['red', 'blue', 'orange', 'darkgreen'])
bounds=[0, 1, 2, 3, 4, 5]
norm = colors.BoundaryNorm(bounds, cmap.N)
f = plt.figure(figsize=(4, 12))
plt.imshow(flat_pwms[:100, :], aspect='equal', interpolation='nearest', origin='lower', cmap=cmap, norm=norm)
plt.plot([0, 100], [20, 20], linewidth=2, color='black', linestyle='--')
plt.plot([0, 100], [40, 40], linewidth=2, color='black', linestyle='--')
plt.plot([0, 100], [60, 60], linewidth=2, color='black', linestyle='--')
plt.plot([0, 100], [80, 80], linewidth=2, color='black', linestyle='--')
plt.xticks([], [])
plt.yticks([], [])
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.tight_layout()
if save_figs :
plt.savefig('genesis_apa_target_isoform_tomm5_image_seqs.png', transparent=True, dpi=150)
plt.savefig('genesis_apa_target_isoform_tomm5_image_seqs.svg')
plt.savefig('genesis_apa_target_isoform_tomm5_image_seqs.eps')
plt.show()