In this notebook we illustrate several ways to predict regulatory regions (of a toy example) from the DNA sequence. The reference genome is a concatenation of Oct4 and Mafk binding sites; we use all regions on chromosome 'pseudo1' for training and all regions on 'pseudo2' for testing.
import os
import numpy as np
from keras import Model
from keras import backend as K
from keras.layers import Conv2D
from keras.layers import Dense
from keras.layers import GlobalAveragePooling2D
from keras.layers import Input
from keras.layers import Reshape
from pkg_resources import resource_filename
from janggu import Janggu
from janggu import Scorer
from janggu import inputlayer
from janggu import outputdense
from janggu.data import Bioseq
from janggu.data import Cover
from janggu.data import ReduceDim
from janggu.layers import DnaConv2D
from janggu.layers import LocalAveragePooling2D
from janggu.utils import ExportClustermap
from janggu.utils import ExportTsne
from janggu.utils import ExportTsv
from IPython.display import Image
np.random.seed(1234)
First, we need to specify the output directory in which the results are stored.
os.environ['JANGGU_OUTPUT'] = '/home/wkopp/janggu_examples'
Specify the DNA sequence feature order. Orders 1, 2 and 3 correspond to mono-, di- and tri-nucleotide-based features, respectively (see the Tutorial).
order = 3
# load the dataset
# The pseudo genome is just a concatenation of all sequences
# in sample.fa and sample2.fa. Therefore, the results should be almost
# identical to the models obtained from classify_fasta.py.
REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
# ROI contains regions spanning positive and negative examples
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
Load the datasets for training and testing.
When the datasets have been loaded with the store_whole_genome=True option, the same dataset can be reused with different regions of interest. To this end, the view method creates another view on the dataset. The advantage of this option is that the memory footprint remains the same.
from janggu.data import view
DNA_TRAIN = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                         roi=ROI_TRAIN_FILE,
                                         order=order,
                                         binsize=200,
                                         store_whole_genome=True)

LABELS_TRAIN = ReduceDim(Cover.create_from_bed('peaks', roi=ROI_TRAIN_FILE,
                                               bedfiles=PEAK_FILE,
                                               binsize=200,
                                               resolution=200,
                                               storage='sparse',
                                               store_whole_genome=True))
DNA_TEST = view(DNA_TRAIN, ROI_TEST_FILE)
LABELS_TEST = view(LABELS_TRAIN, ROI_TEST_FILE)
DNA_TRAIN.shape, DNA_TEST.shape
((7797, 198, 1, 64), (200, 198, 1, 64))
LABELS_TRAIN.shape, LABELS_TEST.shape
((7797, 1), (200, 1))
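The shapes follow directly from the parameters above: with order=3, each 200 bp bin yields 200 - 3 + 1 = 198 sliding-window positions and 4^3 = 64 tri-nucleotide channels, while the labels collapse to a single value per region. A minimal sketch of this arithmetic (the variable names are only illustrative):
order, binsize = 3, 200
n_channels = 4 ** order            # 64 possible tri-nucleotides
n_positions = binsize - order + 1  # 198 window positions per 200 bp bin
print(n_positions, n_channels)     # 198 64, matching the shapes above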
Neural networks can also be defined using the Janggu wrappers for keras. This offers a few additional advantages, including reduced redundancy when defining models and automated evaluation.
First, we define the model with keras inside a plain Python function. The decorators automatically instantiate the input and output layers with the correct dimensionality; in the previous example, this had to be specified explicitly.
@inputlayer
@outputdense('sigmoid')
def double_stranded_model_dnaconv(inputs, inp, oup, params):
    """Keras model for scanning both DNA strands.

    A more elegant way of scanning both strands for motif occurrences
    is achieved by the DnaConv2D layer wrapper, which internally
    performs the convolution operation with the normal kernel weights
    and the reverse-complemented weights.
    """
    with inputs.use('dna') as layer:
        # the name in inputs.use() should be the same as the dataset name
        layer = DnaConv2D(Conv2D(params[0], (params[1], 1),
                                 activation=params[2]))(layer)
    output = GlobalAveragePooling2D(name='motif')(layer)
    return inputs, output
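To build intuition for the reverse-complement trick that DnaConv2D performs, consider an order-1 one-hot encoding over the channels A, C, G, T: reversing the channel axis maps (A, C, G, T) to (T, G, C, A), so flipping a kernel along both the position and the channel axis yields its reverse complement. A conceptual numpy sketch (not Janggu's actual implementation; higher orders require a more involved channel permutation):
import numpy as np
# hypothetical order-1 kernel of shape (width, channels) over A, C, G, T
kernel = np.random.randn(21, 4)
# flipping positions and channels swaps A<->T and C<->G at each reversed position
rc_kernel = kernel[::-1, ::-1]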
Now the model can be instantiated accordingly. We also assign a specific model name (which is optional).
modelname = 'dna2peak_ex4_order{}'.format(order)
# create a new model object
model = Janggu.create(template=double_stranded_model_dnaconv,
                      modelparams=(30, 21, 'relu'),
                      inputs=DNA_TRAIN,
                      outputs=LABELS_TRAIN,
                      name=modelname)

model.compile(optimizer='adadelta', loss='binary_crossentropy',
              metrics=['acc'])
Model fitting works in the same way as before.
Note also the use of ReduceDim above, which converts the original 4D Cover object into a 2D, table-like data structure. This is just for convenience, since it is also possible to set up the model as in the previous example.
hist = model.fit(DNA_TRAIN, LABELS_TRAIN, epochs=100)
print('#' * 40)
print('loss: {}, acc: {}'.format(hist.history['loss'][-1],
                                 hist.history['acc'][-1]))
print('#' * 40)
Epoch 1/100
244/244 [==============================] - 4s 15ms/step - loss: 0.6372 - acc: 0.6297
Epoch 2/100
244/244 [==============================] - 3s 12ms/step - loss: 0.5187 - acc: 0.7661
Epoch 3/100
244/244 [==============================] - 3s 12ms/step - loss: 0.4618 - acc: 0.7984
...
Epoch 98/100
244/244 [==============================] - 3s 11ms/step - loss: 0.0133 - acc: 0.9994
Epoch 99/100
244/244 [==============================] - 3s 11ms/step - loss: 0.0130 - acc: 0.9992
Epoch 100/100
221/244 [==========================>...] - ETA: 0s - loss: 0.0127 - acc: 0.9994
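The training progress captured in hist can also be visualized, for instance with matplotlib (a minimal sketch; matplotlib is not used elsewhere in this notebook):
import matplotlib.pyplot as plt
plt.plot(hist.history['loss'], label='loss')
plt.plot(hist.history['acc'], label='acc')
plt.xlabel('epoch')
plt.legend()
plt.show()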
# do the evaluation on the independent test data
model.evaluate(DNA_TEST, LABELS_TEST, datatags=['test'],
               callbacks=['auc', 'auprc', 'roc', 'prc'])
evaluation_folder = os.path.join(os.environ['JANGGU_OUTPUT'], 'evaluation', modelname, 'test')
Image(os.path.join(evaluation_folder, 'prc.png'))
Image(os.path.join(evaluation_folder, 'roc.png'))
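Beyond the aggregated evaluation figures, per-region predictions can be obtained directly from the fitted model (a minimal sketch; predict wraps the underlying keras method):
pred = model.predict(DNA_TEST)
pred.shape  # expected: (200, 1), one binding probability per test region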