import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM, ConvLSTM2D, GRU, BatchNormalization, LocallyConnected2D, Permute
from keras.layers import Concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import regularizers
from keras import backend as K
import keras.losses
import tensorflow as tf
from tensorflow.python.framework import ops
import isolearn.keras as iso
import numpy as np
import tensorflow as tf
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
import pandas as pd
import os
import pickle
import numpy as np
import scipy.sparse as sp
import scipy.io as spio
import matplotlib.pyplot as plt
import isolearn.keras as iso
from seqprop.visualization import *
from seqprop.generator import *
from seqprop.predictor import *
from seqprop.optimizer import *
from definitions.dragonn import load_saved_predictor
import warnings
warnings.simplefilter("ignore")
from keras.backend.tensorflow_backend import set_session
def contain_tf_gpu_mem_usage() :
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
set_session(sess)
contain_tf_gpu_mem_usage()
Using TensorFlow backend.
#Download DragoNN Tutorial 4 models
#These saved models are broken/deleted...
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/case1_spi1_model.hdf5
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/case1_ctcf_model.hdf5
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/old/case2_spi1_model.hdf5
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/case2_ctcf_model.hdf5
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/case3_model.hdf5
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/case4_spi1_model.hdf5
## Download SPI1 classification model
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/SPI1.classification.model.hdf5
#spi1_classification_model=load_dragonn_model("SPI1.classification.model.hdf5")
## Download SPI1 regression model
#!wget http://mitra.stanford.edu/kundaje/projects/dragonn/SPI1.regression.model.hdf5
#spi1_regression_model=load_dragonn_model("SPI1.regression.model.hdf5")
#Simple evolution algorithm
import random
import math
def vectorizeSequence(seq):
# the order of the letters is not arbitrary.
# Flip the matrix up-down and left-right for reverse compliment
ltrdict = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}
return np.array([ltrdict[x] for x in seq])
def ret_rand_nuc():
x = random.randint(0,3)
if x == 0:
return [1,0,0,0] # A
if x == 1:
return [0,1,0,0] # C
if x == 2:
return [0,0,1,0] # G
if x == 3:
return [0,0,0,1] # T
def vector_to_nuc(arr, seq_len=50):
seq = ''
for i in range(seq_len):
if arr[i,0] == 1:
seq = seq + 'A'
if arr[i,1] == 1:
seq = seq + 'C'
if arr[i,2] == 1:
seq = seq + 'G'
if arr[i,3] == 1:
seq = seq + 'T'
return seq
def make_random_sequences(nbr_sequences, length, constant='', no_uaug=False, no_stop=False):
# Make randomize sequences, allowing for the inclusion / exclusion of uATGs / stop codons
seqs = []
nucs = {0:'A', 1:'T', 2:'C', 3:'G'}
i = 0
while i < nbr_sequences:
new_seq = ''
for n in range(length - len(constant)):
new_seq = new_seq + nucs[random.randint(0,3)]
new_seq = new_seq + constant
seqs.append(new_seq)
i+=1
return seqs
def simple_mutate(seq, nbr_bases=1, prob=1):
if nbr_bases > 1 and prob > random.random():
nbr_bases = nbr_bases
else:
nbr_bases = 1
for i in range(nbr_bases):
pos = random.randint(0, len(seq)-1)
seq[pos] = ret_rand_nuc()
return seq
def selection(seq, model, target_output_ix, scaler, nbr_bases_to_mutate=1, multi_mutate_prob=1, seq_len=50):
seqs = np.empty([2, seq_len, 4])
seqs[0] = seq.copy()
seqs[1] = simple_mutate(seq.copy(), nbr_bases=nbr_bases_to_mutate, prob=multi_mutate_prob)
scores = model.predict(np.expand_dims(seqs, axis=1))[:, target_output_ix].reshape(-1)
if scaler is not None :
scores = scaler.inverse_transform(scores)
if scores[1] > scores[0]:
return seqs[1]
else:
return seqs[0]
def evolve(model, target_output_ix, scaler, seq_length=50, iterations=200, nbr_sequences=10) :
# Dictionary where new sequences are saved
evolved_seqs = {}
# Number of bases to mutate if the probability to 'multi-mutate' is exceeded
nbr_bases_to_mutate = 2
# Probability to change multiple bases in an iteration
prob_of_multi_mutation = 0.5
# If using the original evolution model, set seq_len to 54. That model was
# trained on UTRs that included the first for basees of the CDS (ATGG).
seq_len = seq_length
rand_seqs = make_random_sequences(nbr_sequences, seq_len)
e_seqs = np.empty([len(rand_seqs), seq_len, 4])
i = 0
for seq in rand_seqs:
e_seqs[i]=vectorizeSequence(seq.lower())
i += 1
for x in range(nbr_sequences):
evolved_seqs[x] = np.empty((iterations+1,2), dtype=object)
for x in range(nbr_sequences):
evolved_seqs[x][0, 0] = vector_to_nuc(e_seqs[x])
evolved_seqs[x][0, 1] = model.predict(np.expand_dims(e_seqs, axis=1))[:, target_output_ix].reshape(-1)[x]
for gen in range(0, iterations):
for i in range(len(e_seqs)):
# Evolve to highest score
e_seqs[i] = selection(seq=e_seqs[i], model=model, target_output_ix=target_output_ix, scaler=scaler, nbr_bases_to_mutate=nbr_bases_to_mutate, multi_mutate_prob=prob_of_multi_mutation, seq_len=seq_len)
for x in range(nbr_sequences):
evolved_seqs[x][gen+1, 0] = vector_to_nuc(e_seqs[x])
evolved_seqs[x][gen+1, 1] = model.predict(np.expand_dims(e_seqs, axis=1))[:, target_output_ix].reshape(-1)[x]
if gen % 100 == 0:
print(gen)
if scaler is not None :
for x in range(nbr_sequences):
evolved_seqs[x][:,1] = scaler.inverse_transform(evolved_seqs[x][:, 1])
return evolved_seqs
#Simulated annealing (Basin hopping)
from scipy.optimize import basinhopping, OptimizeResult
class IdentityEncoder(iso.SequenceEncoder) :
def __init__(self, seq_len, channel_map) :
super(IdentityEncoder, self).__init__('identity', (seq_len, len(channel_map)))
self.seq_len = seq_len
self.n_channels = len(channel_map)
self.encode_map = channel_map
self.decode_map = {
nt: ix for ix, nt in self.encode_map.items()
}
def encode(self, seq) :
encoding = np.zeros((self.seq_len, self.n_channels))
for i in range(len(seq)) :
if seq[i] in self.encode_map :
channel_ix = self.encode_map[seq[i]]
encoding[i, channel_ix] = 1.
return encoding
def encode_inplace(self, seq, encoding) :
for i in range(len(seq)) :
if seq[i] in self.encode_map :
channel_ix = self.encode_map[seq[i]]
encoding[i, channel_ix] = 1.
def encode_inplace_sparse(self, seq, encoding_mat, row_index) :
raise NotImplementError()
def decode(self, encoding) :
seq = ''
for pos in range(0, encoding.shape[0]) :
argmax_nt = np.argmax(encoding[pos, :])
max_nt = np.max(encoding[pos, :])
seq += self.decode_map[argmax_nt]
return seq
def decode_sparse(self, encoding_mat, row_index) :
raise NotImplementError()
def get_step_func(predictor, sequence_template, acgt_encoder, n_swaps=1) :
available_positions = [
j for j in range(len(sequence_template)) if sequence_template[j] == 'N'
]
available_nt_dict = {
0 : [1, 2, 3],
1 : [0, 2, 3],
2 : [1, 0, 3],
3 : [1, 2, 0]
}
_predict_func = get_predict_func(predictor, len(sequence_template))
def _step_func(x, sequence_template=sequence_template, available_positions=available_positions, available_nt_dict=available_nt_dict, n_swaps=n_swaps) :
onehot = np.expand_dims(np.expand_dims(x.reshape((len(sequence_template), 4)), axis=0), axis=-1)
#Choose random position and nucleotide identity
for swap_i in range(n_swaps) :
rand_pos = np.random.choice(available_positions)
curr_nt = np.argmax(onehot[0, rand_pos, :, 0])
rand_nt = np.random.choice(available_nt_dict[curr_nt])
#Swap nucleotides
onehot[0, rand_pos, :, 0] = 0.
onehot[0, rand_pos, rand_nt, 0] = 1.
new_x = np.ravel(onehot)
return new_x
return _step_func
def get_predict_func(predictor, seq_len) :
def _predict_func(x, predictor=predictor, seq_len=seq_len) :
onehot = np.expand_dims(np.expand_dims(x.reshape((seq_len, 4)), axis=0), axis=0)
score_pred = predictor.predict(x=[onehot], batch_size=1)
return -score_pred[0, 0]
return _predict_func
def run_simulated_annealing(predictor, sequence_template, acgt_encoder, n_iters=1000, n_iters_per_temperate=100, temperature_init=1.0, temperature_func=None, n_swaps=1, verbose=False) :
if temperature_func is None :
temperature_func = lambda t, curr_iter, t_init=temperature_init, total_iters=n_iters: t
n_epochs = n_iters // n_iters_per_temperate
predict_func = get_predict_func(predictor, len(sequence_template))
step_func = get_step_func(predictor, sequence_template, acgt_encoder, n_swaps=n_swaps)
#Random initialization
random_sequence = ''.join([
sequence_template[j] if sequence_template[j] != 'N' else np.random.choice(['A', 'C', 'G', 'T'])
for j in range(len(sequence_template))
])
x0 = np.ravel(acgt_encoder.encode(random_sequence))
x = x0
temperature = temperature_init
tracked_scores = [predict_func(x)]
for epoch_ix in range(n_epochs) :
x_opt, f_opt = run_basinhopping(x, predict_func, step_func, n_iters=n_iters_per_temperate, temperature=temperature)
score_opt = -f_opt
tracked_scores.append(score_opt)
if verbose :
print("Iter " + str((epoch_ix + 1) * n_iters_per_temperate) + ", Temp = " + str(round(temperature, 4)) + ", Score = " + str(round(score_opt, 4)) + "...")
x = x_opt
temperature = temperature_func(temperature, (epoch_ix + 1) * n_iters_per_temperate)
onehot_opt = np.expand_dims(np.expand_dims(x.reshape((len(sequence_template), 4)), axis=0), axis=-1)
seq_opt = acgt_encoder.decode(onehot_opt[0, :, :, 0])
return seq_opt, np.array(tracked_scores)
def run_basinhopping(x, predict_func, step_func, n_iters=1000, temperature=1.0) :
def _dummy_min_opt(fun, x0, args=(), **options) :
return OptimizeResult(fun=fun(x0), x=x0, nit=0, nfev=0, success=True)
minimizer_kwargs = {
'method' : _dummy_min_opt,
'options' : { 'maxiter' : 0 }
}
opt_res = basinhopping(predict_func, x, minimizer_kwargs=minimizer_kwargs, stepsize=None, niter=n_iters, T=temperature, take_step=step_func)
return opt_res.x, opt_res.fun
def run_simulated_annealing_batch(saved_predictor, sequence_template, acgt_encoder, n_sequences=1, n_iters=1000, n_iters_per_temperate=100, temperature_init=1.0, temperature_func=None, n_swaps=1, verbose=False) :
f = plt.figure(figsize=(6, 4))
it_space = [0] + [(epoch_ix + 1) * n_iters_per_temperate for epoch_ix in range(n_iters // n_iters_per_temperate)]
temp = temperature_init
temp_space = [temp]
for j in range(1, len(it_space)) :
it = it_space[j]
temp = temperature_func(temp, it)
temp_space.append(temp)
plt.plot(it_space, temp_space, linewidth=2, color='black', linestyle='-')
plt.xlabel("Iteration", fontsize=14)
plt.ylabel("Temperature", fontsize=14)
plt.title("Anneal schedule", fontsize=14)
plt.xlim(0, np.max(it_space))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
plt.show()
optimized_seqs = []
optimized_trajs = []
for sequence_ix in range(n_sequences) :
seq, scores = run_simulated_annealing(saved_predictor, sequence_template, acgt_encoder, n_iters=n_iters, n_iters_per_temperate=n_iters_per_temperate, temperature_init=temperature_init, temperature_func=temperature_func, n_swaps=n_swaps, verbose=verbose)
optimized_seqs.append(seq)
optimized_trajs.append(scores.reshape(1, -1))
optimized_trajs = np.concatenate(optimized_trajs, axis=0)
print("[Basinhopping] Finished optimizing " + str(n_sequences) + " sequences.")
return optimized_seqs, optimized_trajs
#Define target isoform loss function
def get_earthmover_loss(pwm_start=0, pwm_end=70, pwm_target_bits=1.8, pwm_entropy_weight=0.0) :
punish_c = 0.0
punish_g = 0.0
punish_aa = 0.0
entropy_mse = get_margin_entropy(pwm_start=pwm_start, pwm_end=pwm_end, min_bits=pwm_target_bits)
punish_c_func = get_punish_c(pwm_start=pwm_start, pwm_end=pwm_end)
punish_g_func = get_punish_g(pwm_start=pwm_start, pwm_end=pwm_end)
punish_aa_func = get_punish_aa(pwm_start=pwm_start, pwm_end=pwm_end)
def loss_func(predictor_outputs) :
pwm_logits, pwm, sampled_pwm, pred_bind, pred_score = predictor_outputs
#Specify costs
fitness_loss = -1.0 * K.mean(pred_score[..., 0], axis=0)
seq_loss = 0.0
seq_loss += punish_c * K.mean(punish_c_func(sampled_pwm), axis=0)
seq_loss += punish_g * K.mean(punish_g_func(sampled_pwm), axis=0)
seq_loss += punish_aa * K.mean(punish_aa_func(sampled_pwm), axis=0)
entropy_loss = pwm_entropy_weight * entropy_mse(pwm)
#Compute total loss
total_loss = fitness_loss + seq_loss + entropy_loss
return K.reshape(K.sum(total_loss, axis=0), (1,))
def val_loss_func(predictor_outputs) :
pwm_logits, pwm, sampled_pwm, pred_bind, pred_score = predictor_outputs
#Specify costs
fitness_loss = -1.0 * K.mean(pred_score[..., 0], axis=0)
seq_loss = 0.0
seq_loss += punish_c * K.mean(punish_c_func(sampled_pwm), axis=0)
seq_loss += punish_g * K.mean(punish_g_func(sampled_pwm), axis=0)
seq_loss += punish_aa * K.mean(punish_aa_func(sampled_pwm), axis=0)
entropy_loss = pwm_entropy_weight * entropy_mse(pwm)
#Compute total loss
total_loss = fitness_loss + seq_loss + entropy_loss
return K.reshape(K.mean(total_loss, axis=0), (1,))
return loss_func, val_loss_func
def get_nop_transform() :
def _transform_func(pwm) :
return pwm
return _transform_func
class ValidationCallback(Callback):
def __init__(self, val_name, val_loss_model, val_steps) :
self.val_name = val_name
self.val_loss_model = val_loss_model
self.val_steps = val_steps
self.val_loss_history = []
#Track val loss
self.val_loss_history.append(self.val_loss_model.predict(x=None, steps=self.val_steps)[0])
def on_batch_end(self, batch, logs={}) :
#Track val loss
val_loss_value = self.val_loss_model.predict(x=None, steps=self.val_steps)[0]
self.val_loss_history.append(val_loss_value)
class DummyValidationCallback(Callback):
def __init__(self, val_name) :
self.val_name = val_name
self.val_loss_history = []
class DummyFlexibleSeqPropMonitor(Callback):
def __init__(self, measure_name='Measure') :
self.measure_name = measure_name
self.measure_history = [] = []
self.entropy_history = []
self.nt_swap_history = []
self.prev_optimized_pwm = None
self.n_epochs = 0
def _tmp_load_model(model_path) :
def dummy_pred_func(y_true, y_pred) :
return y_pred
saved_model = load_model(model_path, custom_objects={
'ambig_binary_crossentropy' : dummy_pred_func,
'precision' : dummy_pred_func,
'recall' : dummy_pred_func,
'specificity' : dummy_pred_func,
'fpr' : dummy_pred_func,
'fnr' : dummy_pred_func,
'fdr' : dummy_pred_func,
'f1' : dummy_pred_func
})
return saved_model
#Function for running SeqProp on a set of objectives to optimize
def run_seqprop(sequence_templates, loss_funcs, val_loss_funcs, transform_funcs, temperature_params, n_sequences=1, n_samples=1, n_valid_samples=1, eval_mode='sample', normalize_logits=False, n_epochs=10, steps_per_epoch=100) :
if eval_mode == 'basinhopping' :
acgt_encoder = IdentityEncoder(1000, {'A':0, 'C':1, 'G':2, 'T':3})
predictor = _tmp_load_model(model_path)
predictor = Model(
inputs=predictor.inputs,
outputs = [predictor.get_layer('dense_2').output]
)
predictor.compile(
loss='mse',
optimizer=keras.optimizers.SGD(lr=0.1)
)
n_iters_per_temperate, n_swaps, t_init, t_func = temperature_params[0]
_, evolved_scores = run_simulated_annealing_batch(predictor, sequence_templates[0], acgt_encoder, n_sequences=n_sequences, n_iters=steps_per_epoch, n_iters_per_temperate=n_iters_per_temperate, temperature_init=t_init, temperature_func=t_func, n_swaps=n_swaps, verbose=False)
evolved_losses = -evolved_scores
train_history = DummyValidationCallback('loss')
train_history.val_loss_history = np.mean(evolved_losses * n_sequences, axis=0).tolist()
valid_history = DummyValidationCallback('val_loss')
valid_history.val_loss_history = np.mean(evolved_losses, axis=0).tolist()
valid_monitor = DummyFlexibleSeqPropMonitor(measure_name='Score')
valid_monitor.measure_history = [evolved_scores[:, i] for i in range(evolved_scores.shape[1])]
return [None], [valid_monitor], [train_history], [valid_history]
elif eval_mode == 'evolve' :
predictor = _tmp_load_model(model_path)
evolved_res = evolve(predictor, 0, None, seq_length=len(sequence_templates[0]), iterations=steps_per_epoch, nbr_sequences=n_sequences)
evolved_scores = np.concatenate([evolved_res[i][:, 1].reshape(1, -1) for i in range(len(evolved_res))], axis=0)
evolved_losses = -evolved_scores
train_history = DummyValidationCallback('loss')
train_history.val_loss_history = np.mean(evolved_losses, axis=0).tolist()
valid_history = DummyValidationCallback('val_loss')
valid_history.val_loss_history = np.mean(evolved_losses, axis=0).tolist()
valid_monitor = DummyFlexibleSeqPropMonitor(measure_name='Score')
valid_monitor.measure_history = [evolved_scores[:, i] for i in range(evolved_scores.shape[1])]
return [None], [valid_monitor], [train_history], [valid_history]
n_objectives = len(sequence_templates)
seqprop_predictors = []
valid_monitors = []
train_histories = []
valid_histories = []
for obj_ix in range(n_objectives) :
print("Optimizing objective " + str(obj_ix) + '...')
sequence_template = sequence_templates[obj_ix]
loss_func = loss_funcs[obj_ix]
val_loss_func = val_loss_funcs[obj_ix]
transform_func = transform_funcs[obj_ix]
#Build Generator Network
_, seqprop_generator = build_generator(seq_length=len(sequence_template), n_sequences=n_sequences, n_samples=n_samples, sequence_templates=[sequence_template * n_sequences], batch_normalize_pwm=normalize_logits, pwm_transform_func=transform_func, validation_sample_mode='gumbel' if eval_mode == 'gumbel' else 'sample')
#for layer in seqprop_generator.layers :
# if 'policy' not in layer.name :
# layer.name += "_trainversion"
_, valid_generator = build_generator(seq_length=len(sequence_template), n_sequences=n_sequences, n_samples=n_valid_samples, sequence_templates=[sequence_template * n_sequences], batch_normalize_pwm=normalize_logits, pwm_transform_func=None, validation_sample_mode='sample', master_generator=seqprop_generator)
for layer in valid_generator.layers :
#if 'policy' not in layer.name :
layer.name += "_valversion"
#Build Predictor Network and hook it on the generator PWM output tensor
_, seqprop_predictor = build_predictor(seqprop_generator, load_saved_predictor(model_path, library_context=None), n_sequences=n_sequences, n_samples=n_samples, eval_mode='pwm' if eval_mode == 'pwm' else 'sample')
#for layer in seqprop_predictor.layers :
# if '_trainversion' not in layer.name and 'policy' not in layer.name :
# layer.name += "_trainversion"
_, valid_predictor = build_predictor(valid_generator, load_saved_predictor(model_path, library_context=None), n_sequences=n_sequences, n_samples=n_valid_samples, eval_mode='sample')
for layer in valid_predictor.layers :
if '_valversion' not in layer.name :# and 'policy' not in layer.name :
layer.name += "_valversion"
#Build Loss Model (In: Generator seed, Out: Loss function)
_, loss_model = build_loss_model(seqprop_predictor, loss_func)
_, valid_loss_model = build_loss_model(valid_predictor, val_loss_func)
#Specify Optimizer to use
#opt = keras.optimizers.SGD(lr=0.5)
#opt = keras.optimizers.SGD(lr=0.1, momentum=0.9, decay=0, nesterov=True)
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
#Compile Loss Model (Minimize self)
loss_model.compile(loss=lambda true, pred: pred, optimizer=opt)
def get_logit(p) :
return np.log(p / (1. - p))
#Specify callback entities
#measure_func = lambda pred_outs: np.mean(get_logit(np.expand_dims(pred_outs[0], axis=0) if len(pred_outs[0].shape) <= 2 else pred_outs[0]), axis=0)
measure_func = lambda pred_outs: np.mean(np.expand_dims(pred_outs[1], axis=0) if len(pred_outs[1].shape) <= 2 else pred_outs[1], axis=0)
#train_monitor = FlexibleSeqPropMonitor(predictor=seqprop_predictor, plot_on_train_end=False, plot_every_epoch=False, track_every_step=True, measure_func=measure_func, measure_name='Binding Log Odds', plot_pwm_start=500, plot_pwm_end=700, sequence_template=sequence_template, plot_pwm_indices=np.arange(n_sequences).tolist(), figsize=(12, 1.0))
valid_monitor = FlexibleSeqPropMonitor(predictor=valid_predictor, plot_on_train_end=True, plot_every_epoch=False, track_every_step=True, measure_func=measure_func, measure_name='Binding Log Odds', plot_pwm_start=500, plot_pwm_end=600, sequence_template=sequence_template, plot_pwm_indices=np.arange(n_sequences).tolist(), figsize=(12, 1.0))
train_history = ValidationCallback('loss', loss_model, 1)
valid_history = ValidationCallback('val_loss', valid_loss_model, 1)
callbacks =[
#EarlyStopping(monitor='loss', min_delta=0.001, patience=5, verbose=0, mode='auto'),
valid_monitor,
train_history,
valid_history
]
#Fit Loss Model
_ = loss_model.fit(
[], np.ones((1, 1)), #Dummy training example
epochs=n_epochs,
steps_per_epoch=steps_per_epoch,
callbacks=callbacks
)
valid_monitor.predictor = None
train_history.val_loss_model = None
valid_history.val_loss_model = None
seqprop_predictors.append(seqprop_predictor)
valid_monitors.append(valid_monitor)
train_histories.append(train_history)
valid_histories.append(valid_history)
return seqprop_predictors, valid_monitors, train_histories, valid_histories
#Specfiy file path to pre-trained predictor network
save_dir = os.path.join(os.getcwd(), '')
model_name = 'SPI1.classification.model.hdf5'
model_path = os.path.join(save_dir, model_name)
import random
def set_seed(seed_value) :
# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set the `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# 3. Set the `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.set_random_seed(seed_value)
# 5. Configure a new global `tensorflow` session
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
seq_template = 'N' * 1000
rand_seed = 1177#14755
#Run SeqProp Optimization
print("Running optimization experiment 'DragoNN SPI1 Maximization'")
#Number of PWMs to generate per objective
n_sequences = 10
#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 1
#Number of epochs per objective to optimize
n_epochs = 1
#Number of steps (grad updates) per epoch
steps_per_epoch = 200
#Number of One-hot validation sequences to sample from the PWM
n_valid_samples = 10
experiment_name_list = [
'Basinhopping 1/0.01',
'Basinhopping 1/0.1',
'Basinhopping 1/1.0',
'Basinhopping 1/10.0',
'Basinhopping 2/0.1',
'Basinhopping 4/0.1',
'Basinhopping 8/0.1',
'Basinhopping 16/0.1',
'Basinhopping 32/0.1',
'Evolved',
'Sampled-IN'
]
eval_mode_list = [
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'evolve',
'sample'
]
normalize_logits_list = [
False,
False,
False,
False,
False,
False,
False,
False,
False,
False,
True
] #[False, True]
temperature_params_list = [
[10, 1, 0.01, lambda t, curr_iter, t_init=0.01, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 1.0, lambda t, curr_iter, t_init=1.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 10.0, lambda t, curr_iter, t_init=10.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 2, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 4, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 8, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 16, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 32, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[None, None, None, None],
[None, None, None, None]
]
result_dict = {
'Basinhopping 1/0.01' : {},
'Basinhopping 1/0.1' : {},
'Basinhopping 1/1.0' : {},
'Basinhopping 1/10.0' : {},
'Basinhopping 2/0.1' : {},
'Basinhopping 4/0.1' : {},
'Basinhopping 8/0.1' : {},
'Basinhopping 16/0.1' : {},
'Basinhopping 32/0.1' : {},
'Evolved' : {},
'Sampled-In' : {}
}
for experiment_name, eval_mode, normalize_logits, temperature_params in zip(experiment_name_list, eval_mode_list, normalize_logits_list, temperature_params_list) :
print("Experiment name = " + str(experiment_name))
print("Eval mode = " + str(eval_mode))
print("Normalize logits = " + str(normalize_logits))
K.clear_session()
set_seed(rand_seed)
sequence_templates = [
seq_template
]
losses, val_losses = zip(*[
get_earthmover_loss(
pwm_start=0,
pwm_end=1000,
pwm_target_bits=1.8,
pwm_entropy_weight=0.0
)
])
transforms = [
None
]
seqprop_predictors, valid_monitors, train_histories, valid_histories = run_seqprop(sequence_templates, losses, val_losses, transforms, [temperature_params], n_sequences, n_samples, n_valid_samples, eval_mode, normalize_logits, n_epochs, steps_per_epoch)
seqprop_predictor, valid_monitor, train_history, valid_history = seqprop_predictors[0], valid_monitors[0], train_histories[0], valid_histories[0]
result_dict[experiment_name] = {
'seqprop_predictor' : seqprop_predictor,
'valid_monitor' : valid_monitor,
'train_history' : train_history,
'valid_history' : valid_history,
}
Running optimization experiment 'DragoNN SPI1 Maximization' Experiment name = Basinhopping 1/0.01 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/1.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/10.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 2/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 4/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 8/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 16/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 32/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Evolved Eval mode = evolve Normalize logits = False 0 100 Experiment name = Sampled-IN Eval mode = sample Normalize logits = True Optimizing objective 0... Epoch 1/1 200/200 [==============================] - 9s 47ms/step - loss: -291.1948
linestyles = [
'-',
'-',
'-',
'-',
'--',
'--',
'--',
'--',
'--',
'-',
'-'
]
save_figs = True
fig_prefix = "eval_seqprop_dragonn_spi1_earthmover_vs_evolution_and_basinhopping_experiment_200_updates_"
print("--- Comparison of loss convergence ---")
for history_prefix in ['train', 'valid'] :
loss_normalizer = n_sequences if history_prefix == 'train' else 1.
y_label_prefix = 'Train' if history_prefix == 'train' else 'Validation'
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -60.0
max_y_val = 10.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_history = result_dict[experiment_name][history_prefix + '_history']
n_x_coords = len(np.array(curr_history.val_loss_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(curr_history.val_loss_history) / loss_normalizer, linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
max_y_val = max(max_y_val, np.max(curr_history.val_loss_history) / loss_normalizer)
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel(y_label_prefix + " Loss", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.svg')
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.eps')
plt.show()
print("--- Comparison of SPI1 score convergence ---")
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -10.0
max_y_val = 60.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_monitor = result_dict[experiment_name]['valid_monitor']
meas_history = curr_monitor.measure_history
meas_history = [np.mean(meas_history[k]) for k in range(len(meas_history))]
n_x_coords = len(np.array(meas_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(meas_history), linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
#max_y_val = max(max_y_val, np.max(meas_history))
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel("Validation Binding Log Odds", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + '_valid_logodds_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + '_valid_logodds_cmp.svg')
plt.savefig(fig_prefix + '_valid_logodds_cmp.eps')
plt.show()
--- Comparison of loss convergence ---
--- Comparison of SPI1 score convergence ---
seq_template = 'N' * 1000
rand_seed = 1177#14755
#Run SeqProp Optimization
print("Running optimization experiment 'DragoNN SPI1 Maximization'")
#Number of PWMs to generate per objective
n_sequences = 10
#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 1
#Number of epochs per objective to optimize
n_epochs = 1
#Number of steps (grad updates) per epoch
steps_per_epoch = 2000
#Number of One-hot validation sequences to sample from the PWM
n_valid_samples = 10
experiment_name_list = [
'Basinhopping 1/0.01',
'Basinhopping 1/0.1',
'Basinhopping 1/1.0',
'Basinhopping 1/10.0',
'Basinhopping 2/0.1',
'Basinhopping 4/0.1',
'Basinhopping 8/0.1',
'Basinhopping 16/0.1',
'Basinhopping 32/0.1',
'Evolved',
'Sampled-IN'
]
eval_mode_list = [
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'evolve',
'sample'
]
normalize_logits_list = [
False,
False,
False,
False,
False,
False,
False,
False,
False,
False,
True
] #[False, True]
temperature_params_list = [
[10, 1, 0.01, lambda t, curr_iter, t_init=0.01, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 1.0, lambda t, curr_iter, t_init=1.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 10.0, lambda t, curr_iter, t_init=10.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 2, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 4, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 8, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 16, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 32, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[None, None, None, None],
[None, None, None, None]
]
result_dict = {
'Basinhopping 1/0.01' : {},
'Basinhopping 1/0.1' : {},
'Basinhopping 1/1.0' : {},
'Basinhopping 1/10.0' : {},
'Basinhopping 2/0.1' : {},
'Basinhopping 4/0.1' : {},
'Basinhopping 8/0.1' : {},
'Basinhopping 16/0.1' : {},
'Basinhopping 32/0.1' : {},
'Evolved' : {},
'Sampled-In' : {}
}
for experiment_name, eval_mode, normalize_logits, temperature_params in zip(experiment_name_list, eval_mode_list, normalize_logits_list, temperature_params_list) :
print("Experiment name = " + str(experiment_name))
print("Eval mode = " + str(eval_mode))
print("Normalize logits = " + str(normalize_logits))
K.clear_session()
set_seed(rand_seed)
sequence_templates = [
seq_template
]
losses, val_losses = zip(*[
get_earthmover_loss(
pwm_start=0,
pwm_end=1000,
pwm_target_bits=1.8,
pwm_entropy_weight=0.0
)
])
transforms = [
None
]
seqprop_predictors, valid_monitors, train_histories, valid_histories = run_seqprop(sequence_templates, losses, val_losses, transforms, [temperature_params], n_sequences, n_samples, n_valid_samples, eval_mode, normalize_logits, n_epochs, steps_per_epoch)
seqprop_predictor, valid_monitor, train_history, valid_history = seqprop_predictors[0], valid_monitors[0], train_histories[0], valid_histories[0]
result_dict[experiment_name] = {
'seqprop_predictor' : seqprop_predictor,
'valid_monitor' : valid_monitor,
'train_history' : train_history,
'valid_history' : valid_history,
}
Running optimization experiment 'DragoNN SPI1 Maximization' Experiment name = Basinhopping 1/0.01 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/1.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/10.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 2/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 4/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 8/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 16/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 32/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Evolved Eval mode = evolve Normalize logits = False 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 Experiment name = Sampled-IN Eval mode = sample Normalize logits = True Optimizing objective 0... Epoch 1/1 2000/2000 [==============================] - 85s 43ms/step - loss: -714.6451
linestyles = [
'-',
'-',
'-',
'-',
'--',
'--',
'--',
'--',
'--',
'-',
'-'
]
save_figs = True
fig_prefix = "eval_seqprop_dragonn_spi1_earthmover_vs_evolution_and_basinhopping_experiment_2000_updates_"
print("--- Comparison of loss convergence ---")
for history_prefix in ['train', 'valid'] :
loss_normalizer = n_sequences if history_prefix == 'train' else 1.
y_label_prefix = 'Train' if history_prefix == 'train' else 'Validation'
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -90.0
max_y_val = 10.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_history = result_dict[experiment_name][history_prefix + '_history']
n_x_coords = len(np.array(curr_history.val_loss_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(curr_history.val_loss_history) / loss_normalizer, linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
max_y_val = max(max_y_val, np.max(curr_history.val_loss_history) / loss_normalizer)
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel(y_label_prefix + " Loss", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.svg')
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.eps')
plt.show()
print("--- Comparison of SPI1 score convergence ---")
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -10.0
max_y_val = 90.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_monitor = result_dict[experiment_name]['valid_monitor']
meas_history = curr_monitor.measure_history
meas_history = [np.mean(meas_history[k]) for k in range(len(meas_history))]
n_x_coords = len(np.array(meas_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(meas_history), linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
#max_y_val = max(max_y_val, np.max(meas_history))
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel("Validation Binding Log Odds", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + '_valid_logodds_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + '_valid_logodds_cmp.svg')
plt.savefig(fig_prefix + '_valid_logodds_cmp.eps')
plt.show()
--- Comparison of loss convergence ---
--- Comparison of SPI1 score convergence ---
seq_template = 'N' * 1000
rand_seed = 1177#14755
#Run SeqProp Optimization
print("Running optimization experiment 'DragoNN SPI1 Maximization'")
#Number of PWMs to generate per objective
n_sequences = 10
#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 1
#Number of epochs per objective to optimize
n_epochs = 1
#Number of steps (grad updates) per epoch
steps_per_epoch = 10000
#Number of One-hot validation sequences to sample from the PWM
n_valid_samples = 10
experiment_name_list = [
'Basinhopping 1/0.01',
'Basinhopping 1/0.1',
'Basinhopping 1/1.0',
'Basinhopping 1/10.0',
'Basinhopping 2/0.1',
'Basinhopping 4/0.1',
'Basinhopping 8/0.1',
'Basinhopping 16/0.1',
'Basinhopping 32/0.1',
'Evolved',
'Sampled-IN'
]
eval_mode_list = [
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'evolve',
'sample'
]
normalize_logits_list = [
False,
False,
False,
False,
False,
False,
False,
False,
False,
False,
True
] #[False, True]
temperature_params_list = [
[10, 1, 0.01, lambda t, curr_iter, t_init=0.01, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 1.0, lambda t, curr_iter, t_init=1.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 10.0, lambda t, curr_iter, t_init=10.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 2, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 4, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 8, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 16, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 32, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[None, None, None, None],
[None, None, None, None]
]
result_dict = {
'Basinhopping 1/0.01' : {},
'Basinhopping 1/0.1' : {},
'Basinhopping 1/1.0' : {},
'Basinhopping 1/10.0' : {},
'Basinhopping 2/0.1' : {},
'Basinhopping 4/0.1' : {},
'Basinhopping 8/0.1' : {},
'Basinhopping 16/0.1' : {},
'Basinhopping 32/0.1' : {},
'Evolved' : {},
'Sampled-In' : {}
}
for experiment_name, eval_mode, normalize_logits, temperature_params in zip(experiment_name_list, eval_mode_list, normalize_logits_list, temperature_params_list) :
print("Experiment name = " + str(experiment_name))
print("Eval mode = " + str(eval_mode))
print("Normalize logits = " + str(normalize_logits))
K.clear_session()
set_seed(rand_seed)
sequence_templates = [
seq_template
]
losses, val_losses = zip(*[
get_earthmover_loss(
pwm_start=0,
pwm_end=1000,
pwm_target_bits=1.8,
pwm_entropy_weight=0.0
)
])
transforms = [
None
]
seqprop_predictors, valid_monitors, train_histories, valid_histories = run_seqprop(sequence_templates, losses, val_losses, transforms, [temperature_params], n_sequences, n_samples, n_valid_samples, eval_mode, normalize_logits, n_epochs, steps_per_epoch)
seqprop_predictor, valid_monitor, train_history, valid_history = seqprop_predictors[0], valid_monitors[0], train_histories[0], valid_histories[0]
result_dict[experiment_name] = {
'seqprop_predictor' : seqprop_predictor,
'valid_monitor' : valid_monitor,
'train_history' : train_history,
'valid_history' : valid_history,
}
Running optimization experiment 'DragoNN SPI1 Maximization' Experiment name = Basinhopping 1/0.01 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/1.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/10.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 2/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 4/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 8/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 16/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 32/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Evolved Eval mode = evolve Normalize logits = False 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 Experiment name = Sampled-IN Eval mode = sample Normalize logits = True Optimizing objective 0... Epoch 1/1 10000/10000 [==============================] - 425s 43ms/step - loss: -830.0594
linestyles = [
'-',
'-',
'-',
'-',
'--',
'--',
'--',
'--',
'--',
'-',
'-'
]
save_figs = True
fig_prefix = "eval_seqprop_dragonn_spi1_earthmover_vs_evolution_and_basinhopping_experiment_10000_updates_"
print("--- Comparison of loss convergence ---")
for history_prefix in ['train', 'valid'] :
loss_normalizer = n_sequences if history_prefix == 'train' else 1.
y_label_prefix = 'Train' if history_prefix == 'train' else 'Validation'
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -90.0
max_y_val = 10.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_history = result_dict[experiment_name][history_prefix + '_history']
n_x_coords = len(np.array(curr_history.val_loss_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(curr_history.val_loss_history) / loss_normalizer, linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
max_y_val = max(max_y_val, np.max(curr_history.val_loss_history) / loss_normalizer)
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel(y_label_prefix + " Loss", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.svg')
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.eps')
plt.show()
print("--- Comparison of SPI1 score convergence ---")
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -10.0
max_y_val = 90.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_monitor = result_dict[experiment_name]['valid_monitor']
meas_history = curr_monitor.measure_history
meas_history = [np.mean(meas_history[k]) for k in range(len(meas_history))]
n_x_coords = len(np.array(meas_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(meas_history), linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
#max_y_val = max(max_y_val, np.max(meas_history))
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel("Validation Binding Log Odds", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + '_valid_logodds_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + '_valid_logodds_cmp.svg')
plt.savefig(fig_prefix + '_valid_logodds_cmp.eps')
plt.show()
--- Comparison of loss convergence ---
--- Comparison of SPI1 score convergence ---
seq_template = 'N' * 1000
rand_seed = 1177#14755
#Run SeqProp Optimization
print("Running optimization experiment 'DragoNN SPI1 Maximization'")
#Number of PWMs to generate per objective
n_sequences = 10
#Number of One-hot sequences to sample from the PWM at each grad step
n_samples = 1
#Number of epochs per objective to optimize
n_epochs = 1
#Number of steps (grad updates) per epoch
steps_per_epoch = 20000
#Number of One-hot validation sequences to sample from the PWM
n_valid_samples = 10
experiment_name_list = [
'Basinhopping 1/0.01',
'Basinhopping 1/0.1',
'Basinhopping 1/1.0',
'Basinhopping 1/10.0',
'Basinhopping 2/0.1',
'Basinhopping 4/0.1',
'Basinhopping 8/0.1',
'Basinhopping 16/0.1',
'Basinhopping 32/0.1',
'Evolved',
'Sampled-IN'
]
eval_mode_list = [
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'basinhopping',
'evolve',
'sample'
]
normalize_logits_list = [
False,
False,
False,
False,
False,
False,
False,
False,
False,
False,
True
] #[False, True]
temperature_params_list = [
[10, 1, 0.01, lambda t, curr_iter, t_init=0.01, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 1.0, lambda t, curr_iter, t_init=1.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 1, 10.0, lambda t, curr_iter, t_init=10.0, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 2, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 4, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 8, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 16, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[10, 32, 0.1, lambda t, curr_iter, t_init=0.1, total_iters=steps_per_epoch * n_epochs, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))],
[None, None, None, None],
[None, None, None, None]
]
result_dict = {
'Basinhopping 1/0.01' : {},
'Basinhopping 1/0.1' : {},
'Basinhopping 1/1.0' : {},
'Basinhopping 1/10.0' : {},
'Basinhopping 2/0.1' : {},
'Basinhopping 4/0.1' : {},
'Basinhopping 8/0.1' : {},
'Basinhopping 16/0.1' : {},
'Basinhopping 32/0.1' : {},
'Evolved' : {},
'Sampled-In' : {}
}
for experiment_name, eval_mode, normalize_logits, temperature_params in zip(experiment_name_list, eval_mode_list, normalize_logits_list, temperature_params_list) :
print("Experiment name = " + str(experiment_name))
print("Eval mode = " + str(eval_mode))
print("Normalize logits = " + str(normalize_logits))
K.clear_session()
set_seed(rand_seed)
sequence_templates = [
seq_template
]
losses, val_losses = zip(*[
get_earthmover_loss(
pwm_start=0,
pwm_end=1000,
pwm_target_bits=1.8,
pwm_entropy_weight=0.0
)
])
transforms = [
None
]
seqprop_predictors, valid_monitors, train_histories, valid_histories = run_seqprop(sequence_templates, losses, val_losses, transforms, [temperature_params], n_sequences, n_samples, n_valid_samples, eval_mode, normalize_logits, n_epochs, steps_per_epoch)
seqprop_predictor, valid_monitor, train_history, valid_history = seqprop_predictors[0], valid_monitors[0], train_histories[0], valid_histories[0]
result_dict[experiment_name] = {
'seqprop_predictor' : seqprop_predictor,
'valid_monitor' : valid_monitor,
'train_history' : train_history,
'valid_history' : valid_history,
}
Running optimization experiment 'DragoNN SPI1 Maximization' Experiment name = Basinhopping 1/0.01 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/1.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 1/10.0 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 2/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 4/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 8/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 16/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Basinhopping 32/0.1 Eval mode = basinhopping Normalize logits = False
[Basinhopping] Finished optimizing 10 sequences. Experiment name = Evolved Eval mode = evolve Normalize logits = False 0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13700 13800 13900 14000 14100 14200 14300 14400 14500 14600 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15800 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16900 17000 17100 17200 17300 17400 17500 17600 17700 17800 17900 18000 18100 18200 18300 18400 18500 18600 18700 18800 18900 19000 19100 19200 19300 19400 19500 19600 19700 19800 19900 Experiment name = Sampled-IN Eval mode = sample Normalize logits = True Optimizing objective 0... Epoch 1/1 20000/20000 [==============================] - 872s 44ms/step - loss: -846.6390
linestyles = [
'-',
'-',
'-',
'-',
'--',
'--',
'--',
'--',
'--',
'-',
'-'
]
save_figs = True
fig_prefix = "eval_seqprop_dragonn_spi1_earthmover_vs_evolution_and_basinhopping_experiment_20000_updates_"
print("--- Comparison of loss convergence ---")
for history_prefix in ['train', 'valid'] :
loss_normalizer = n_sequences if history_prefix == 'train' else 1.
y_label_prefix = 'Train' if history_prefix == 'train' else 'Validation'
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -90.0
max_y_val = 10.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_history = result_dict[experiment_name][history_prefix + '_history']
n_x_coords = len(np.array(curr_history.val_loss_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(curr_history.val_loss_history) / loss_normalizer, linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
max_y_val = max(max_y_val, np.max(curr_history.val_loss_history) / loss_normalizer)
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel(y_label_prefix + " Loss", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.svg')
plt.savefig(fig_prefix + history_prefix + '_loss_cmp.eps')
plt.show()
print("--- Comparison of SPI1 score convergence ---")
f = plt.figure(figsize=(10, 4))
ls = []
min_y_val = -10.0
max_y_val = 90.0
for experiment_ix, experiment_name in enumerate(experiment_name_list) :
curr_monitor = result_dict[experiment_name]['valid_monitor']
meas_history = curr_monitor.measure_history
meas_history = [np.mean(meas_history[k]) for k in range(len(meas_history))]
n_x_coords = len(np.array(meas_history))
x_coord_scale = (n_epochs * steps_per_epoch) / (n_x_coords - 1) #+ 1
l1 = plt.plot(np.arange(n_x_coords) * x_coord_scale, np.array(meas_history), linewidth=2, linestyle=linestyles[experiment_ix], label=experiment_name)
ls.append(l1[0])
#max_y_val = max(max_y_val, np.max(meas_history))
plt.xlabel("Weight Updates", fontsize=16)
plt.ylabel("Validation Binding Log Odds", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(0, n_epochs * steps_per_epoch)
plt.ylim(min_y_val, max_y_val)
#plt.legend(handles=ls, fontsize=14)
plt.legend(handles=ls, fontsize=14, bbox_to_anchor=(1.04,1), loc="upper left")
plt.tight_layout()
if save_figs :
plt.savefig(fig_prefix + '_valid_logodds_cmp.png', transparent=True, dpi=150)
plt.savefig(fig_prefix + '_valid_logodds_cmp.svg')
plt.savefig(fig_prefix + '_valid_logodds_cmp.eps')
plt.show()
--- Comparison of loss convergence ---
--- Comparison of SPI1 score convergence ---