# ---------------------------------------------------------------
# Experiment bookkeeping: who ran the job and which model this is.
# ---------------------------------------------------------------
user = 'userName'
model_name = 'metal_prediction_CNN'

import sys
import json

print("Initializing global variables...", end=' ')
sys.stdout.flush()

# Logs, training histories, saved models and figures all share ./logs/.
output_file = './logs/results.txt'
hist_path = model_path = fig_path = './logs/'
dict_path = './dictionaries/'

print("Done")
print(" Filepath set to ./logs/")

##################################################
print("Importing modules...", end=' ')
import modules
print("Done")
print("Reading data from disk...", end=' ')
sys.stdout.flush()
import numpy as np
import pandas as pd

# Full metal-binding dataset: one row per protein sequence with its bound
# ligand id and a 90%-identity cluster number.
df = pd.read_parquet('./datasets/Metal_all_20180601.parquet')
seqs = np.array(df.sequence)
target = np.array(df.ligandId)
cluster_numbers = np.array(df.clusterNumber90)

# BUG FIX: label_dict was referenced below but only loaded much later in
# the script, so the first use raised a NameError.  Load the
# ligandId -> integer-label mapping before encoding the targets.
label_dict = {}
with open(dict_path + "metal_dict", 'r') as fp:
    label_dict = json.load(fp)

# Replace each ligandId string with a one-element list holding its integer
# class label (presumably the shape the label generator expects -- TODO
# confirm against modules.FOFEGenerator).
for i in range(target.shape[0]):
    target[i] = [label_dict[target[i]]]
print("Done")
##################################################
print("Loading dictionaries...", end=' ')
sys.stdout.flush()

# FOFE vocabulary: token -> index mapping consumed by the FOFE encoder.
with open(dict_path + "vocab_dict_fofe", 'r') as handle:
    vocab_dic_fofe = json.load(handle)

print("Done")
##################################################
print("Performing cross validation split...", end=' ')

# Hold out the trailing 10% of the dataset for validation.
# NOTE(review): the split is positional (no shuffling) and ignores
# cluster_numbers, so homologous sequences may land in both sets --
# confirm this is intentional.
ratio = 0.9
split = int(len(seqs) * ratio)
train_seqs, val_seqs = seqs[:split], seqs[split:]
train_label, val_label = target[:split], target[split:]

print("Done")
print(" Ratio :", ratio)
print(" Train_range :", 0, "-", split - 1)
print(" Val_range :", split, "-", len(seqs) - 1)
# (Removed a leftover notebook cell here: `df.groupby('ligandId').count()`
# computed a per-ligand row count and immediately discarded it.)

# ligandId -> integer class label mapping used to encode the targets.
label_dict = {}
with open(dict_path + "metal_dict", 'r') as fp:
    label_dict = json.load(fp)
# Training and validation batch generators: both use the FOFE translator
# and share one batching configuration.
common_args = {'batch_size': 100,
               'input_shape': (800,),
               'label_shape': (8, ),
               'shuffle': True}
train_gen = modules.FOFEGenerator(sequences=train_seqs,
                                  labels=train_label,
                                  translator=vocab_dic_fofe,
                                  **common_args)
val_gen = modules.FOFEGenerator(sequences=val_seqs,
                                labels=val_label,
                                translator=vocab_dic_fofe,
                                **common_args)
# Input encoding dimension. (Alternatives noted by the author:
# ProtVec:100, One-hot:20, blosum62:20, property:7 -- FOFE here is 800.)
dimension = 800
# Number of metal classes the network predicts.
cutoff = 8

import tensorflow as tf
import time
import matplotlib.pyplot as plt
# BUG FIX: removed '% matplotlib inline' -- it is an IPython notebook
# magic and a SyntaxError in a plain Python script.
np.random.seed(2017)
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv2D, MaxPooling2D, Convolution1D, MaxPooling1D, AveragePooling2D
from keras.layers import Activation, Flatten, Dense, Dropout, Reshape, Embedding, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
import numpy as np
import keras
from keras.models import Model, load_model
from keras.optimizers import Adam, SGD, RMSprop
# Visualization
from keras.utils import plot_model
# ---------------------------------------------------------------
# Model: three parallel convolutions (kernel widths 3/5/7) over the FOFE
# vector, concatenated, then two dense layers ending in a softmax over
# the `cutoff` metal classes.
# ---------------------------------------------------------------
input_shape = (dimension,)
input_0 = Input(shape=input_shape, dtype='float32')
# View the flat vector as a 1 x dimension single-channel "image" so that
# Conv2D with (1, k) kernels behaves like a 1-D convolution.
input_0_reshape = Reshape((1, dimension, 1), input_shape=(dimension,))(input_0)

# Branches are created in width order 3, 5, 7 and concatenated in the
# same order, matching the original layer construction sequence.
branches = [Conv2D(2, (1, width), padding='same')(input_0_reshape)
            for width in (3, 5, 7)]
x = keras.layers.concatenate(branches)
x = Activation('relu')(x)
x = Flatten()(x)
x = Dense(cutoff, activation='relu')(x)
output_0 = Dense(cutoff, activation='softmax')(x)
model = Model(inputs=input_0, outputs=output_0)
# end of the MODEL

# Plain SGD with momentum; no learning-rate decay, no Nesterov.
sgd = SGD(lr=0.01, momentum=0.9, decay=0, nesterov=False)
model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])
# model.summary()
# Hand the compiled model and both generators to the project Trainer;
# post_train_args carries the run metadata written to the results log.
trainer = modules.Trainer(model=model,
                          generators=[train_gen, val_gen],
                          callbacks=[],
                          post_train_args={'user': user,
                                           'model': model_name,
                                           'result': output_file,
                                           'fig_path': fig_path})

import warnings
warnings.simplefilter('ignore')

trainer.start(epoch=15)
# Persist the trained network: architecture as JSON, weights as HDF5.
with open("./models/metal_predict.json", "w") as json_file:
    json_file.write(model.to_json())
model.save_weights("./models/metal_predict.h5")
print("Saved model to disk")