In this notebook we illustrate several ways of predicting regulatory regions (of a toy example) from the DNA sequence. The reference genome is made up of a concatenation of Oct4 and Mafk binding sites, and we shall use all regions on chromosome 'pseudo1' for training and 'pseudo2' as the test chromosome.
import os
import numpy as np
from keras import Model
from keras import backend as K
from keras.layers import Conv2D
from keras.layers import Dense
from keras.layers import GlobalAveragePooling2D
from keras.layers import Input
from keras.layers import Reshape
from pkg_resources import resource_filename
from janggu import Janggu
from janggu import Scorer
from janggu import inputlayer
from janggu import outputdense
from janggu.data import Bioseq
from janggu.data import Cover
from janggu.data import ReduceDim
from janggu.layers import DnaConv2D
from janggu.layers import LocalAveragePooling2D
from janggu.utils import ExportClustermap
from janggu.utils import ExportTsne
from janggu.utils import ExportTsv
from IPython.display import Image
# Fix the NumPy RNG so that weight initialisation and data shuffling
# are reproducible across runs of this notebook.
np.random.seed(1234)
/home/wkopp/anaconda3/envs/jdev/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. from ._conv import register_converters as _register_converters Using TensorFlow backend.
First, we need to specify the output directory in which the results are stored.
# All janggu artifacts (cached datasets, trained models, evaluation
# results) are written below this directory.
os.environ['JANGGU_OUTPUT'] = '/home/wkopp/janggu_examples'
Specify the DNA sequence feature order. Order 1, 2 and 3 correspond to mono-, di- and tri-nucleotide based features (see Tutorial).
# Feature order for the one-hot encoding: 3 -> tri-nucleotide features.
order = 3
# load the dataset
# The pseudo genome represents just a concatenation of all sequences
# in sample.fa and sample2.fa. Therefore, the results should be almost
# identical to the models obtained from classify_fasta.py.
REFGENOME = resource_filename('janggu', 'resources/pseudo_genome.fa')
# ROI contains regions spanning positive and negative examples
ROI_TRAIN_FILE = resource_filename('janggu', 'resources/roi_train.bed')
ROI_TEST_FILE = resource_filename('janggu', 'resources/roi_test.bed')
# PEAK_FILE only contains positive examples
PEAK_FILE = resource_filename('janggu', 'resources/scores.bed')
Load the datasets for training and testing
# Training inputs (one-hot encoded DNA) and labels are derived purely
# from genomic coordinates over the reference genome.
DNA = Bioseq.create_from_refgenome('dna',
                                   refgenome=REFGENOME,
                                   roi=ROI_TRAIN_FILE,
                                   binsize=200,
                                   order=order,
                                   cache=True)

LABELS = Cover.create_from_bed('peaks',
                               bedfiles=PEAK_FILE,
                               roi=ROI_TRAIN_FILE,
                               binsize=200,
                               resolution=200,
                               storage='sparse',
                               cache=True)

# Test-set counterparts over the held-out ROI file.
DNA_TEST = Bioseq.create_from_refgenome('dna',
                                        refgenome=REFGENOME,
                                        roi=ROI_TEST_FILE,
                                        binsize=200,
                                        order=order)

LABELS_TEST = Cover.create_from_bed('peaks',
                                    bedfiles=PEAK_FILE,
                                    roi=ROI_TEST_FILE,
                                    binsize=200,
                                    resolution=200,
                                    storage='sparse')
loading from lazy loader reload /home/wkopp/janggu_examples/datasets/dna/08ee6c24faf5ebe600a7fc361e30a94ef2db75b41ffa057dabf5d5d7ef800e49.npz loading from bed lazy loader reload /home/wkopp/janggu_examples/datasets/peaks/db9ac6eef98519c37a8604886d1029a71d95374d8b9a6c10f283df557dd1cded.npz loading from lazy loader loading from bed lazy loader
Neural networks can also be defined using the Janggu wrappers for keras. This offers a few additional advantages, including reduced redundancy for defining models or automated evaluation.
First, we define the model as a plain Python function. The decorators will automatically instantiate the input layers and the output layers with the correct dimensionality. In the example above, this needed to be specified explicitly.
@inputlayer
@outputdense('sigmoid')
def double_stranded_model_dnaconv(inputs, inp, oup, params):
    """Keras model that scans both DNA strands.

    The DnaConv2D wrapper offers an elegant way to detect motif
    occurrences on either strand: internally it runs the convolution
    with the regular kernel weights as well as with their
    reverse-complemented counterpart.
    """
    # The name passed to inputs.use() must match the dataset name.
    with inputs.use('dna') as layer:
        conv = DnaConv2D(Conv2D(params[0],
                                (params[1], 1),
                                activation=params[2]))
        layer = conv(layer)
    # Pool across positions; the layer name 'motif' lets us fetch its
    # activations later for inspection.
    output = GlobalAveragePooling2D(name='motif')(layer)
    return inputs, output
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-151433374fa4> in <module>() ----> 1 @inputlayer 2 @outputdense('sigmoid') 3 def double_stranded_model_dnaconv(inputs, inp, oup, params): 4 """ keras model for scanning both DNA strands. 5 NameError: name 'inputlayer' is not defined
Now the model can be instantiated accordingly. We will also use a specific model name (which is optional).
# Give the model an explicit (optional) name that records the feature order.
modelname = 'dna2peak_ex1_order{}'.format(order)

# Hyper-parameters for the template: (n_filters, kernel_length, activation)
conv_params = (30, 21, 'relu')

# Instantiate a new Janggu model from the template function above.
model = Janggu.create(template=double_stranded_model_dnaconv,
                      modelparams=conv_params,
                      inputs=DNA,
                      outputs=ReduceDim(LABELS),
                      name=modelname)

model.compile(optimizer='adadelta',
              loss='binary_crossentropy',
              metrics=['acc'])
Model fitting is similar as before.
Note also the use of ReduceDim, which converts the original 4D Cover object to a 2D, table-like data structure. This is just for convenience, since it is also possible to set up the model as in the previous example.
# Fit on the training data; ReduceDim flattens the 4D Cover labels
# into a 2D table.
hist = model.fit(DNA, ReduceDim(LABELS), epochs=100)

# Report the final training loss and accuracy.
final_loss = hist.history['loss'][-1]
final_acc = hist.history['acc'][-1]
separator = '#' * 40
print(separator)
print('loss: {}, acc: {}'.format(final_loss, final_acc))
print(separator)
Epoch 1/100 244/244 [==============================] - 3s 10ms/step - loss: 0.6310 - acc: 0.6425 Epoch 2/100 244/244 [==============================] - 2s 10ms/step - loss: 0.5257 - acc: 0.7636 Epoch 3/100 244/244 [==============================] - 2s 10ms/step - loss: 0.4646 - acc: 0.7997 Epoch 4/100 244/244 [==============================] - 3s 10ms/step - loss: 0.4253 - acc: 0.8145 Epoch 5/100 244/244 [==============================] - 2s 10ms/step - loss: 0.3955 - acc: 0.8304 Epoch 6/100 244/244 [==============================] - 2s 10ms/step - loss: 0.3700 - acc: 0.8416 Epoch 7/100 244/244 [==============================] - 3s 11ms/step - loss: 0.3471 - acc: 0.8575 Epoch 8/100 244/244 [==============================] - 2s 10ms/step - loss: 0.3259 - acc: 0.8679 Epoch 9/100 244/244 [==============================] - 3s 10ms/step - loss: 0.3048 - acc: 0.8776 Epoch 10/100 244/244 [==============================] - 2s 10ms/step - loss: 0.2843 - acc: 0.8899 Epoch 11/100 244/244 [==============================] - 3s 10ms/step - loss: 0.2663 - acc: 0.8973 Epoch 12/100 244/244 [==============================] - 2s 10ms/step - loss: 0.2469 - acc: 0.9066 Epoch 13/100 244/244 [==============================] - 3s 10ms/step - loss: 0.2292 - acc: 0.9163 Epoch 14/100 244/244 [==============================] - 3s 10ms/step - loss: 0.2119 - acc: 0.9257 Epoch 15/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1983 - acc: 0.9324 Epoch 16/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1852 - acc: 0.9374 Epoch 17/100 244/244 [==============================] - 3s 10ms/step - loss: 0.1728 - acc: 0.9408 Epoch 18/100 244/244 [==============================] - 3s 10ms/step - loss: 0.1621 - acc: 0.9483 Epoch 19/100 244/244 [==============================] - 3s 11ms/step - loss: 0.1530 - acc: 0.9516 Epoch 20/100 244/244 [==============================] - 3s 10ms/step - loss: 0.1434 - acc: 0.9544 Epoch 21/100 244/244 
[==============================] - 3s 10ms/step - loss: 0.1371 - acc: 0.9563 Epoch 22/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1292 - acc: 0.9597 Epoch 23/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1227 - acc: 0.9629 Epoch 24/100 244/244 [==============================] - 3s 11ms/step - loss: 0.1167 - acc: 0.9630 Epoch 25/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1111 - acc: 0.9658 Epoch 26/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1060 - acc: 0.9667 Epoch 27/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1008 - acc: 0.9687 Epoch 28/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0970 - acc: 0.9705 Epoch 29/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0926 - acc: 0.9716 Epoch 30/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0893 - acc: 0.9732 Epoch 31/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0858 - acc: 0.9758 Epoch 32/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0832 - acc: 0.9749 Epoch 33/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0796 - acc: 0.9776 Epoch 34/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0768 - acc: 0.9793 Epoch 35/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0743 - acc: 0.9804 Epoch 36/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0717 - acc: 0.9804 Epoch 37/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0694 - acc: 0.9801 Epoch 38/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0669 - acc: 0.9817 Epoch 39/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0649 - acc: 0.9835 Epoch 40/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0623 - acc: 0.9828 Epoch 41/100 244/244 [==============================] - 3s 
10ms/step - loss: 0.0606 - acc: 0.9841 Epoch 42/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0591 - acc: 0.9853 Epoch 43/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0570 - acc: 0.9846 Epoch 44/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0549 - acc: 0.9869 Epoch 45/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0536 - acc: 0.9859 Epoch 46/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0515 - acc: 0.9881 Epoch 47/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0504 - acc: 0.9876 Epoch 48/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0491 - acc: 0.9881 Epoch 49/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0472 - acc: 0.9886 Epoch 50/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0460 - acc: 0.9892 Epoch 51/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0452 - acc: 0.9895 Epoch 52/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0439 - acc: 0.9898 Epoch 53/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0424 - acc: 0.9903 Epoch 54/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0411 - acc: 0.9898 Epoch 55/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0403 - acc: 0.9914 Epoch 56/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0390 - acc: 0.9921 Epoch 57/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0377 - acc: 0.9924 Epoch 58/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0367 - acc: 0.9928 Epoch 59/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0355 - acc: 0.9933 Epoch 60/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0343 - acc: 0.9942 Epoch 61/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0338 - acc: 0.9932 
Epoch 62/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0328 - acc: 0.9935 Epoch 63/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0319 - acc: 0.9941 Epoch 64/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0310 - acc: 0.9949 Epoch 65/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0301 - acc: 0.9949 Epoch 66/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0296 - acc: 0.9949 Epoch 67/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0287 - acc: 0.9956 Epoch 68/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0282 - acc: 0.9963 Epoch 69/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0269 - acc: 0.9960 Epoch 70/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0262 - acc: 0.9965 Epoch 71/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0258 - acc: 0.9962 Epoch 72/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0251 - acc: 0.9963 Epoch 73/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0242 - acc: 0.9972 Epoch 74/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0236 - acc: 0.9968 Epoch 75/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0230 - acc: 0.9967 Epoch 76/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0225 - acc: 0.9980 Epoch 77/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0217 - acc: 0.9976 Epoch 78/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0212 - acc: 0.9974 Epoch 79/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0207 - acc: 0.9980 Epoch 80/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0204 - acc: 0.9983 Epoch 81/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0193 - acc: 0.9982 Epoch 82/100 244/244 
[==============================] - 3s 10ms/step - loss: 0.0193 - acc: 0.9985 Epoch 83/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0187 - acc: 0.9982 Epoch 84/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0182 - acc: 0.9985 Epoch 85/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0177 - acc: 0.9983 Epoch 86/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0175 - acc: 0.9988 Epoch 87/100 244/244 [==============================] - 3s 10ms/step - loss: 0.0170 - acc: 0.9986 Epoch 88/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0163 - acc: 0.9988 Epoch 89/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0160 - acc: 0.9991 Epoch 90/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0157 - acc: 0.9990 Epoch 91/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0151 - acc: 0.9991 Epoch 92/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0148 - acc: 0.9991 Epoch 93/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0145 - acc: 0.9992 Epoch 94/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0139 - acc: 0.9992 Epoch 95/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0136 - acc: 0.9992 Epoch 96/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0133 - acc: 0.9995 Epoch 97/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0131 - acc: 0.9991 Epoch 98/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0128 - acc: 0.9992 Epoch 99/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0124 - acc: 0.9996 Epoch 100/100 244/244 [==============================] - 3s 11ms/step - loss: 0.0123 - acc: 0.9996 ######################################## loss: 0.012273260853311237, acc: 0.9996152366294728 ########################################
# do the evaluation on the independent test data;
# 'auc'/'auprc' compute summary scores while 'roc'/'prc' export the
# corresponding curve plots into the JANGGU_OUTPUT evaluation folder
model.evaluate(DNA_TEST, ReduceDim(LABELS_TEST), datatags=['test'],
callbacks=['auc', 'auprc', 'roc', 'prc'])
[0.09988154709339142, 0.97]
# Path where the 'test'-tagged evaluation artifacts were exported.
evaluation_folder = os.path.join(os.environ['JANGGU_OUTPUT'], 'evaluation', modelname, 'test')
# Display the precision-recall curve produced by the 'prc' callback.
Image(os.path.join(evaluation_folder, 'prc.png'))
# Display the ROC curve produced by the 'roc' callback.
Image(os.path.join(evaluation_folder, 'roc.png'))
It is also possible to inspect the hidden layers for example by clustering the hidden feature activities. In this case, this is done for the 'motif' layer which is the first convolution layer.
# clustering plots based on hidden features
# heatmap: clustered heatmap of per-region activities (rows z-scored)
heatmap_eval = Scorer('heatmap', exporter=ExportClustermap(z_score=1.))
# tsne: 2D t-SNE embedding of the same hidden activities
tsne_eval = Scorer('tsne', exporter=ExportTsne())
# layername='motif' extracts the activations of the layer named 'motif'
# (the pooled first convolution) instead of the final model output.
model.predict(DNA_TEST, datatags=['test'],
callbacks=[heatmap_eval, tsne_eval],
layername='motif')
array([[1.600911 , 0.4191686 , 0.59081465, ..., 1.9342976 , 0.28643346, 0.5154825 ], [1.423312 , 0.4810649 , 0.59921515, ..., 1.7891194 , 0.43152866, 0.5024406 ], [1.624104 , 0.43683857, 0.7034798 , ..., 1.8071641 , 0.34016404, 0.6076513 ], ..., [1.6051155 , 0.6844147 , 0.70858073, ..., 1.7525921 , 0.6137494 , 0.86825794], [1.6730189 , 0.76110625, 0.74325955, ..., 1.97932 , 0.6314724 , 0.8976532 ], [1.5126992 , 0.40343815, 0.60530233, ..., 1.560414 , 0.4068123 , 0.57226604]], dtype=float32)
In addition it is possible to define custom Scorers if necessary.