import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM, ConvLSTM2D, GRU, BatchNormalization, LocallyConnected2D, Permute
from keras.layers import Concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import regularizers
from keras import backend as K
import keras.losses
import tensorflow as tf
from tensorflow.python.framework import ops
import isolearn.keras as iso
import numpy as np
import tensorflow as tf
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)
import pandas as pd
import os
import pickle
import numpy as np
import scipy.sparse as sp
import scipy.io as spio
import matplotlib.pyplot as plt
import isolearn.io as isoio
import isolearn.keras as isol
from genesis.visualization import *
from genesis.generator import *
from genesis.predictor import *
from genesis.optimizer import *
from definitions.generator.aparent_deconv_conv_generator_concat_trainmode import load_generator_network
from definitions.predictor.aparent_w_dense_functional import load_saved_predictor
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.stats import pearsonr
import seaborn as sns
from matplotlib import colors
class IdentityEncoder(iso.SequenceEncoder) :
    """One-hot sequence encoder.

    Maps each character of a sequence to a channel index via `channel_map`
    (e.g. {'A': 0, 'C': 1, 'G': 2, 'T': 3}) and produces a
    (seq_len, n_channels) one-hot matrix. Characters not present in
    `channel_map` leave an all-zero row at that position.
    """

    def __init__(self, seq_len, channel_map) :
        """
        seq_len     -- fixed sequence length (rows of the encoding matrix)
        channel_map -- dict mapping character -> channel index
        """
        super(IdentityEncoder, self).__init__('identity', (seq_len, len(channel_map)))

        self.seq_len = seq_len
        self.n_channels = len(channel_map)
        self.encode_map = channel_map
        # Inverse lookup (channel index -> character) used by decode().
        # NOTE: original comprehension named the (key, value) pair backwards
        # ("ix, nt" for a char->index dict); result was correct, names were not.
        self.decode_map = {
            ix: nt for nt, ix in self.encode_map.items()
        }

    def encode(self, seq) :
        """Return a fresh (seq_len, n_channels) one-hot matrix for `seq`."""
        encoding = np.zeros((self.seq_len, self.n_channels))
        self.encode_inplace(seq, encoding)
        return encoding

    def encode_inplace(self, seq, encoding) :
        """Write the one-hot pattern for `seq` into the pre-allocated `encoding`.

        Only sets ones; assumes `encoding` starts zeroed (it does not clear
        previously-set entries).
        """
        for i, nt in enumerate(seq) :
            if nt in self.encode_map :
                encoding[i, self.encode_map[nt]] = 1.

    def encode_inplace_sparse(self, seq, encoding_mat, row_index) :
        # Fixed: original raised `NotImplementError`, a typo that would have
        # surfaced as a NameError instead of the intended exception.
        raise NotImplementedError()

    def decode(self, encoding) :
        """Return the argmax character per position as a string."""
        seq = ''
        for pos in range(encoding.shape[0]) :
            seq += self.decode_map[np.argmax(encoding[pos, :])]
        return seq

    def decode_sparse(self, encoding_mat, row_index) :
        raise NotImplementedError()
# (notebook stdout, not code) Using TensorFlow backend.
# Load APA plasmid data (random MPRA)
# Fixed: pickle.load(open(...)) leaked the file handle; use a context manager.
with open('../../../aparent/apa_plasmid_data.pickle', 'rb') as data_f :
    plasmid_dict = pickle.load(data_f)

plasmid_df = plasmid_dict['plasmid_df']
plasmid_cuts = plasmid_dict['plasmid_cuts']

print("len(plasmid_df) = " + str(len(plasmid_df)))

# Filter data
kept_libraries = [22]
min_count = 6
min_usage = 0.0

# Keep only the selected sub-libraries.
if kept_libraries is not None :
    keep_index = np.nonzero(plasmid_df.library_index.isin(kept_libraries))[0]
    plasmid_df = plasmid_df.iloc[keep_index].copy()
    plasmid_cuts = plasmid_cuts[keep_index, :]

# Require a minimum total read count per sequence.
if min_count is not None :
    keep_index = np.nonzero(plasmid_df.total_count >= min_count)[0]
    plasmid_df = plasmid_df.iloc[keep_index].copy()
    plasmid_cuts = plasmid_cuts[keep_index, :]

# Require a minimum proximal cut-site usage fraction.
# Column windows (presumably cut-site positions within the padded sequence;
# .todense() on the last column suggests plasmid_cuts is sparse — TODO confirm):
#   proximal reads: columns 256:291, total reads: columns 180:385 plus the
#   distal/overflow count stored in the final column.
if min_usage is not None :
    prox_c = np.ravel(plasmid_cuts[:, 180+70+6:180+70+6+35].sum(axis=-1))
    total_c = np.ravel(plasmid_cuts[:, 180:180+205].sum(axis=-1)) + np.ravel(plasmid_cuts[:, -1].todense())
    keep_index = np.nonzero(prox_c / total_c >= min_usage)[0]
    plasmid_df = plasmid_df.iloc[keep_index].copy()
    plasmid_cuts = plasmid_cuts[keep_index, :]

print("len(plasmid_df) = " + str(len(plasmid_df)) + " (filtered)")
# (notebook stdout, not code) len(plasmid_df) = 3810974 / len(plasmid_df) = 702209 (filtered)
# Randomly subsample 50,000 sequences (unseeded — not reproducible across runs).
shuffle_index = np.arange(len(plasmid_df))
np.random.shuffle(shuffle_index)
keep_index = shuffle_index[:50000]

plasmid_df = plasmid_df.iloc[keep_index].copy()
plasmid_cuts = plasmid_cuts[keep_index, :]

# Proximal usage fraction per sequence (same column windows as the filter above).
prox_c = np.ravel(plasmid_cuts[:, 180+70+6:180+70+6+35].sum(axis=-1))
total_c = np.ravel(plasmid_cuts[:, 180:180+205].sum(axis=-1)) + np.ravel(plasmid_cuts[:, -1].todense())

prox_use = prox_c / total_c

# Visualize the proximal-usage distribution of the subsample.
f = plt.figure(figsize=(6, 4))

plt.hist(prox_use, bins=40, linewidth=1, edgecolor='black')

plt.tight_layout()
plt.show()

# Store cached filtered dataframe.
# Fixed: pickle.dump(..., open(..., 'wb')) never closed/flushed the write
# handle explicitly; use a context manager so the cache is always flushed.
with open('apa_simple_cached_set_sample.pickle', 'wb') as cache_f :
    pickle.dump({'plasmid_df' : plasmid_df, 'plasmid_cuts' : plasmid_cuts}, cache_f)
# Load cached dataframe
# Fixed: pickle.load(open(...)) leaked the file handle; use a context manager.
with open('apa_simple_cached_set_sample.pickle', 'rb') as cache_f :
    cached_dict = pickle.load(cache_f)

plasmid_df = cached_dict['plasmid_df']
plasmid_cuts = cached_dict['plasmid_cuts']

print("len(plasmid_df) = " + str(len(plasmid_df)) + " (loaded)")
# (notebook stdout, not code) len(plasmid_df) = 50000 (loaded)
# Recompute proximal-usage fractions for the loaded subsample
# (same column windows as the filtering step above).
prox_c = np.ravel(plasmid_cuts[:, 180+70+6:180+70+6+35].sum(axis=-1))
total_c = np.ravel(plasmid_cuts[:, 180:180+205].sum(axis=-1)) + np.ravel(plasmid_cuts[:, -1].todense())

# Export one line per sequence: the 205-nt window of padded_seq, a tab, and
# the proximal usage fraction rounded to 4 decimals.
# Fixed: replaced the manual `i = 0 ... i += 1` counter with enumerate().
with open('apa_simple_seqs_sample.txt', 'wt') as f :
    for i, (_, row) in enumerate(plasmid_df.iterrows()) :
        f.write(row['padded_seq'][180: 180 + 205] + "\t" + str(round(prox_c[i] / total_c[i], 4)) + "\n")