This tutorial illustrates an alternative way of performing variant effect prediction with a native keras model (as opposed to using the Janggu model wrapper).
It requires janggu>=0.10.0.
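If in doubt, the installed version can be checked quickly; a minimal sketch using pkg_resources:
import pkg_resources
# sanity check: this tutorial assumes janggu>=0.10.0
print(pkg_resources.get_distribution('janggu').version)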
import os
import numpy as np
import h5py
from keras import backend as K
from keras.layers import Conv2D
from keras.layers import GlobalAveragePooling2D
from pkg_resources import resource_filename
from janggu import create_model
from janggu import predict_variant_effect
from janggu import inputlayer
from janggu import outputdense
from janggu.data import Bioseq
from janggu.data import Cover
from janggu.data import GenomicIndexer
from janggu.data import ReduceDim
from janggu.data import plotGenomeTrack
from janggu.layers import DnaConv2D
np.random.seed(1234)
First, we need to specify the output directory in which the results will be stored and load the datasets. We also specify the number of epochs to train the model and the sequence feature order (order=3 means the DNA is one-hot encoded in terms of 3-mers, i.e. a 64-letter alphabet).
order = 3
epochs = 100
os.environ['JANGGU_OUTPUT'] = '/home/wkopp/janggu_examples'
# load the dataset
# The pseudo genome represents just a concatenation of all sequences
# in sample.fa and sample2.fa. Therefore, the results should be almost
# identical to the models obtained from classify_fasta.py.
REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
VCFFILE = resource_filename('janggu', 'resources/pseudo_snps.vcf')
# ROI contains regions spanning positive and negative examples
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
# Training input and labels are purely defined via genomic coordinates
DNA = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
roi=ROI_TRAIN_FILE,
binsize=200,
order=order)
LABELS = Cover.create_from_bed('peaks', roi=ROI_TRAIN_FILE,
bedfiles=PEAK_FILE,
binsize=200,
resolution=200)
DNA_TEST = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
roi=ROI_TEST_FILE,
binsize=200,
order=order)
LABELS_TEST = Cover.create_from_bed('peaks',
roi=ROI_TEST_FILE,
bedfiles=PEAK_FILE,
binsize=200,
resolution=200)
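Before building the model, it can be useful to sanity-check the dataset dimensions. The concrete values follow from the settings above: order=3 yields a 4^3 = 64 letter alphabet and 200 - 3 + 1 = 198 positions per 200 bp bin (a minimal check; the number of regions depends on the ROI file):
# sanity check of the dataset dimensions
print(DNA.shape)     # (n_regions, 198, 1, 64)
print(LABELS.shape)  # (n_regions, 1, 1, 1), one label per 200 bp bin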
Define and fit a new model
@inputlayer
@outputdense('sigmoid')
def double_stranded_model_dnaconv(inputs, inp, oup, params):
""" keras model for scanning both DNA strands.
A more elegant way of scanning both strands for motif occurrences
is achieved by the DnaConv2D layer wrapper, which internally
performs the convolution operation with the normal kernel weights
and the reverse complemented weights.
"""
with inputs.use('dna') as layer:
# the name in inputs.use() should be the same as the dataset name.
layer = DnaConv2D(Conv2D(params[0], (params[1], 1),
activation=params[2]))(layer)
output = GlobalAveragePooling2D(name='motif')(layer)
return inputs, output
# create a new model object
model = create_model(template=double_stranded_model_dnaconv,
modelparams=(30, 21, 'relu'),
inputs=DNA,
outputs=ReduceDim(LABELS))
model.compile(optimizer='adadelta', loss='binary_crossentropy',
metrics=['acc'])
hist = model.fit(DNA, ReduceDim(LABELS), epochs=epochs)
print('#' * 40)
print('loss: {}, acc: {}'.format(hist.history['loss'][-1],
hist.history['acc'][-1]))
print('#' * 40)
Epoch 1/100
7797/7797 [==============================] - 12s 1ms/step - loss: 0.5612 - acc: 0.7353
Epoch 2/100
7797/7797 [==============================] - 11s 1ms/step - loss: 0.4607 - acc: 0.7981
Epoch 3/100
7797/7797 [==============================] - 11s 1ms/step - loss: 0.4115 - acc: 0.8222
...
Epoch 99/100
7797/7797 [==============================] - 11s 1ms/step - loss: 0.0118 - acc: 0.9991
Epoch 100/100
7797/7797 [==============================] - 11s 1ms/step - loss: 0.0116 - acc: 0.9995
########################################
loss: 0.01164846237482735, acc: 0.9994869821726305
########################################
model.summary()
Model: "model_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dna (InputLayer) (None, 198, 1, 64) 0 _________________________________________________________________ dna_conv2d_1 (DnaConv2D) (None, 178, 1, 30) 40350 _________________________________________________________________ motif (GlobalAveragePooling2 (None, 30) 0 _________________________________________________________________ peaks (Dense) (None, 1) 31 ================================================================= Total params: 40,381 Trainable params: 40,381 Non-trainable params: 0 _________________________________________________________________
In the cell below, we use predict_variant_effect (rather than Janggu.predict_variant_effect). The function performs variant effect prediction in the same way as the method Janggu.predict_variant_effect; the only difference is that it takes a keras model as its first argument.
Note also that, as of janggu>=0.10.0, both predict_variant_effect and Janggu.predict_variant_effect accept the reference genome in fasta format directly (in addition to the Bioseq object, which previously was the only supported option).
# output directory for the variant effect prediction
vcfoutput = os.path.join(os.environ['JANGGU_OUTPUT'], 'vcfoutput')
os.makedirs(vcfoutput, exist_ok=True)
# perform variant effect prediction using the reference genome (fasta)
# and a VCF file
scoresfile, variantsfile = predict_variant_effect(model,
REFGENOME,
VCFFILE,
conditions=['feature'],
output_folder=vcfoutput,
order=order)
# the returned paths point to the files generated in the vcfoutput folder
scoresfile = os.path.join(vcfoutput, 'scores.hdf5')
variantsfile = os.path.join(vcfoutput, 'snps.bed.gz')
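As mentioned above, a Bioseq object can be passed in place of the fasta path. A hedged sketch of the equivalent call (assuming store_whole_genome=True to make all variant loci accessible; the encoding order is then carried by the Bioseq object itself):
# equivalent call with a whole-genome Bioseq object instead of the fasta file
GENOME = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                      order=order,
                                      store_whole_genome=True)
scoresfile, variantsfile = predict_variant_effect(model,
                                                  GENOME,
                                                  VCFFILE,
                                                  conditions=['feature'],
                                                  output_folder=vcfoutput)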
scores.hdf5 contains a variety of scores for each variant. The most important ones are refscore and altscore, from which the score difference (diffscore) and the logoddsscore are derived.
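For intuition, the derived scores can presumably be reconstructed from refscore and altscore as sketched below (assumed formulas for illustration; the authoritative values are those stored in scores.hdf5):
import numpy as np

def derived_scores(refscore, altscore, eps=1e-7):
    """Illustrative re-derivation of diffscore and logoddsscore
    (assumed formulas; the authoritative values are in scores.hdf5)."""
    ref = np.clip(refscore, eps, 1 - eps)
    alt = np.clip(altscore, eps, 1 - eps)
    diffscore = alt - ref
    logoddsscore = np.log(alt / (1 - alt)) - np.log(ref / (1 - ref))
    return diffscore, logoddsscore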
# inspect the variant effect prediction scores (differences between
# reference and alternative alleles) stored in the hdf5 output
f = h5py.File(scoresfile, 'r')
for name in f:
print(name)
altscore
diffscore
labels
logoddsscore
refscore
f['diffscore'][:]
array([[ 0.      ],
       [-0.000977],
       [-0.000977],
       [-0.000977],
       [-0.003906],
       [ 0.      ]], dtype=float16)
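The rows of scores.hdf5 should line up with the records in snps.bed.gz, which allows tying the scores back to the individual variants. A minimal sketch, assuming tab-separated BED-like columns in the variants file:
import pandas as pd

# attach the score differences to the variant records and rank by effect size
variants = pd.read_csv(variantsfile, sep='\t', header=None)
variants['diffscore'] = f['diffscore'][:, 0]
print(variants.iloc[variants['diffscore'].abs().argsort()[::-1]].head())

f.close()  # release the hdf5 handle when done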
So far, we have illustrated variant effect prediction by using:
1. Janggu.predict_variant_effect (the method of the Janggu model wrapper), and
2. predict_variant_effect (with a pure keras model).
In case you want to use a different setup (e.g. an sklearn model), you can use the VariantStreamer to obtain the variants and feed them to your custom model directly in a loop. This also makes it possible to adjust the setup, for instance if transformations of the one-hot encoding are desired, if the output should be reported in a different way (predict_variant_effect writes an hdf5 file), or if you want to experiment with different variant effect prediction scores.
from janggu.data import VariantStreamer
variantstreamer = VariantStreamer(REFGENOME,
VCFFILE,
binsize=200,
batch_size=128,
order=order)
We shall just use our existing keras model in the following, but any other model could be plugged in (see the sklearn sketch at the end of this section), and transformations of the one-hot encoding could be applied as well if necessary.
The VariantStreamer produces mini-batches of reference/alternative allele pairs, each embedded in the sequence context around the genomic locus of the variant.
model
<keras.engine.training.Model at 0x7f058f022dd8>
for names, chroms, positions, ref_alleles, alt_alleles, references, alternatives in variantstreamer.flow():
# names, chroms, positions, ref_alleles, alt_alleles are reported as meta information.
# references, alternatives represent the one-hot encoded sequences
# here you could employ any model and if necessary transform the one-hot
# encoded sequences: references and alternatives
ref_score = model.predict_on_batch(references)
alt_score = model.predict_on_batch(alternatives)
# high score difference indicates potentially high variant effect
diff_score = alt_score.astype('float16') - ref_score.astype('float16')
# float16 is used to reduce memory requirements, if many variants are tested.
# Also, small variant effects are usually not of interest,
# but this might be adapted if necessary.
break
print(diff_score)
[[ 0.      ]
 [-0.000977]
 [-0.000977]
 [-0.000977]
 [-0.003906]
 [ 0.      ]]
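To illustrate the sklearn route mentioned above, a hypothetical sketch (LogisticRegression is just a stand-in for any estimator with predict_proba; it assumes the janggu datasets can be sliced into numpy arrays, as keras does during training above):
from sklearn.linear_model import LogisticRegression

# flatten the 4D one-hot tensors into 2D feature matrices for sklearn
X_train = DNA[:].reshape(len(DNA), -1)
y_train = ReduceDim(LABELS)[:].ravel()
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

for names, chroms, positions, ref_alleles, alt_alleles, \
        references, alternatives in variantstreamer.flow():
    # the same loop as above, with predict_on_batch swapped for predict_proba
    ref_score = clf.predict_proba(references.reshape(len(references), -1))[:, 1]
    alt_score = clf.predict_proba(alternatives.reshape(len(alternatives), -1))[:, 1]
    diff_score = alt_score - ref_score
    break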