In this tutorial we fit a simple model to distinguish Oct4 from Mafk bound sites obtained from ChIP-seq experiments.
import os
import numpy as np
import pandas as pd
import pkg_resources
from keras import backend as K
from keras.layers import Conv2D
from keras.layers import GlobalAveragePooling2D
from keras.layers import Maximum
from janggu import Janggu
from janggu import Scorer
from janggu import inputlayer
from janggu import outputdense
from janggu.data import Array
from janggu.data import Bioseq
from janggu.layers import Complement
from janggu.layers import DnaConv2D
from janggu.layers import Reverse
from janggu.utils import ExportClustermap
from janggu.utils import ExportTsv
import matplotlib
np.random.seed(1234)
from IPython.display import Image
/home/wkopp/anaconda3/envs/jdev/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. from ._conv import register_converters as _register_converters Using TensorFlow backend.
Next we choose the output directory, the sequence feature order, and whether to use a model that scans only the forward strand or a model that scans both strands simultaneously.
# Model variant: 'dnaconv' and 'double' scan both DNA strands,
# 'single' scans only the forward strand (see the model definitions below).
modeltype = 'dnaconv'
#modeltype = 'double'
#modeltype = 'single'
# Sequence feature order: 3 encodes trinucleotide (3-mer) features.
order = 3
The modeltypes 'dnaconv' and 'double' both scan the DNA sequence on both strands for motif occurrences, while 'single' scans only the forward strand.
Next we set the JANGGU_OUTPUT environment variable, which defines the folder in which all results are stored.
# All results (cached datasets, models, evaluation outputs) are stored below this folder.
os.environ['JANGGU_OUTPUT'] = '/home/wkopp/janggu_examples'
We use the following input files for the example:
# load the dataset
# Example fasta files shipped with the janggu package: Oct4- and
# Mafk-bound sites for training, plus separate held-out test files.
DATA_PATH = pkg_resources.resource_filename('janggu', 'resources/')
OCT4 = os.path.join(DATA_PATH, 'sample.fa')
MAFK = os.path.join(DATA_PATH, 'sample2.fa')
OCT4_TEST = os.path.join(DATA_PATH, 'sample_test.fa')
MAFK_TEST = os.path.join(DATA_PATH, 'sample2_test.fa')
The DNA sequences, which are used as input for the models, can be loaded into a Bioseq dataset object as follows
# Encode the training sequences from both fasta files; the dataset name
# 'dna' is referenced later via inputs.use('dna') in the model templates.
# cache=True stores the encoded data on disk so reruns reload it
# (see the 'reload ...' messages in the output below).
DNA = Bioseq.create_from_seq('dna', fastafile=[OCT4, MAFK],
                             order=order, cache=True)
# Encode the held-out test sequences the same way.
DNA_TEST = Bioseq.create_from_seq('dna', fastafile=[OCT4_TEST, MAFK_TEST],
                                  order=order, cache=True)
reload /home/wkopp/janggu_examples/datasets/dna/a670d5c6da22d5d11af94e663efe095ba034ce12572ef36bb74e1d94e915b397.npz reload /home/wkopp/janggu_examples/datasets/dna/b39e8ad5fe7ba00222dce88d0c41fff742422292deb21698862e27f0d06af087.npz
The output labels for the training and test set can be obtained as follows
# helper function
def nseqs(filename):
    """Count the number of sequences in a fasta file.

    Each fasta record begins with a '>' header line, so the number of
    sequences equals the number of header lines.

    Parameters
    ----------
    filename : str
        Path to the fasta file.

    Returns
    -------
    int
        Number of sequences in the file.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original `open()` inside the generator was never closed.
    with open(filename) as fasta:
        return sum(1 for line in fasta if line.startswith('>'))
# Binary labels: 1 for Oct4-bound sequences, 0 for Mafk-bound sequences.
# Use '_' for the unused loop variable (the original used 'line' here but
# '_' for the test labels; this makes both consistent).
Y = np.asarray([[1] for _ in range(nseqs(OCT4))] +
               [[0] for _ in range(nseqs(MAFK))])
LABELS = Array('y', Y, conditions=['TF-binding'])

# Same labelling scheme for the held-out test sequences.
Y = np.asarray([[1] for _ in range(nseqs(OCT4_TEST))] +
               [[0] for _ in range(nseqs(MAFK_TEST))])
LABELS_TEST = Array('y', Y, conditions=['TF-binding'])
Next, we define three different network topologies using the Janggu framework. The first one considers only the forward strand, whereas the latter two scan both DNA strands for motif matches.
@inputlayer
@outputdense('sigmoid')
def single_stranded_model(inputs, inp, oup, params):
    """Keras model that scans only the forward DNA strand.

    A single convolution with ``params[0]`` kernels of length
    ``params[1]`` and activation ``params[2]`` is applied to the
    sequence, followed by global average pooling over the motif scores.
    """
    with inputs.use('dna') as net:
        # 'dna' must match the name of the Bioseq dataset defined above.
        net = Conv2D(params[0], (params[1], 1), activation=params[2])(net)
        pooled = GlobalAveragePooling2D(name='motif')(net)
    return inputs, pooled
@inputlayer
@outputdense('sigmoid')
def double_stranded_model(inputs, inp, oup, params):
    """Keras model that scans both DNA strands with shared motifs.

    Sequence patterns may reside on either strand.  The reverse
    complement of the input tensor is scanned with the very same
    convolution kernels, and the element-wise maximum over the two
    strands is pooled, which generally improves model performance
    compared to a forward-only scan.
    """
    with inputs.use('dna') as fwd:
        # 'dna' must match the name of the Bioseq dataset defined above.
        shared_conv = Conv2D(params[0], (params[1], 1), activation=params[2])
        # Build the reverse complement of the input tensor.
        rc = Complement()(Reverse()(fwd))
        # Scan both orientations with the same (shared) kernels.
        fwd = shared_conv(fwd)
        rc = shared_conv(rc)
        # Flip the reverse-complement scores back into forward orientation.
        rc = Reverse()(rc)
        merged = Maximum()([fwd, rc])
        output = GlobalAveragePooling2D(name='motif')(merged)
    return inputs, output
@inputlayer
@outputdense('sigmoid')
def double_stranded_model_dnaconv(inputs, inp, oup, params):
    """Keras model scanning both DNA strands via the DnaConv2D wrapper.

    A more elegant way of scanning both strands: DnaConv2D internally
    applies the wrapped Conv2D with both the normal kernel weights and
    their reverse-complemented counterparts.
    """
    with inputs.use('dna') as seq:
        # 'dna' must match the name of the Bioseq dataset defined above.
        scanner = DnaConv2D(Conv2D(params[0], (params[1], 1),
                                   activation=params[2]), name='conv1')
        hits = scanner(seq)
        output = GlobalAveragePooling2D(name='motif')(hits)
    return inputs, output
# Map the chosen modeltype to its network template; any value other than
# 'single' or 'double' falls back to the DnaConv2D-based model, matching
# the original if/elif/else chain.
modeltemplate = {
    'single': single_stranded_model,
    'double': double_stranded_model,
}.get(modeltype, double_stranded_model_dnaconv)

# Clear any previous keras/TensorFlow session state before building the model.
K.clear_session()
Instantiate a new model.
# The model name encodes the model variant and the feature order so that
# results for different configurations end up in separate output folders.
modelname = 'fasta_seqs_m{}_o{}'.format(modeltype, order)
# create a new model object
# modelparams = (number of kernels, kernel length, activation) — consumed
# as params[0..2] inside the model templates above.
model = Janggu.create(template=modeltemplate,
                      modelparams=(30, 21, 'relu'),
                      inputs=DNA,
                      outputs=LABELS,
                      name=modelname)
model.compile(optimizer='adadelta', loss='binary_crossentropy',
              metrics=['acc'])
model.summary()
_________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dna (InputLayer) (None, 198, 1, 64) 0 _________________________________________________________________ conv1 (DnaConv2D) (None, 178, 1, 30) 40350 _________________________________________________________________ motif (GlobalAveragePooling2 (None, 30) 0 _________________________________________________________________ y (Dense) (None, 1) 31 ================================================================= Total params: 40,381 Trainable params: 40,381 Non-trainable params: 0 _________________________________________________________________
And train it
# fit the model
# Train for 100 epochs on the encoded training sequences and their labels.
hist = model.fit(DNA, LABELS, epochs=100)
Epoch 1/100 244/244 [==============================] - 4s 15ms/step - loss: 0.6248 - acc: 0.6492 Epoch 2/100 244/244 [==============================] - 2s 9ms/step - loss: 0.5205 - acc: 0.7728 Epoch 3/100 244/244 [==============================] - 2s 10ms/step - loss: 0.4627 - acc: 0.7983 Epoch 4/100 244/244 [==============================] - 2s 10ms/step - loss: 0.4228 - acc: 0.8209 Epoch 5/100 244/244 [==============================] - 3s 11ms/step - loss: 0.3940 - acc: 0.8334 Epoch 6/100 244/244 [==============================] - 2s 10ms/step - loss: 0.3690 - acc: 0.8452 Epoch 7/100 244/244 [==============================] - 3s 10ms/step - loss: 0.3471 - acc: 0.8573 Epoch 8/100 244/244 [==============================] - 3s 10ms/step - loss: 0.3261 - acc: 0.8656 Epoch 9/100 244/244 [==============================] - 2s 10ms/step - loss: 0.3049 - acc: 0.8775 Epoch 10/100 244/244 [==============================] - 3s 10ms/step - loss: 0.2845 - acc: 0.8881 Epoch 11/100 244/244 [==============================] - 2s 10ms/step - loss: 0.2661 - acc: 0.8969 Epoch 12/100 244/244 [==============================] - 2s 9ms/step - loss: 0.2469 - acc: 0.9076 Epoch 13/100 244/244 [==============================] - 2s 10ms/step - loss: 0.2293 - acc: 0.9161 Epoch 14/100 244/244 [==============================] - 2s 9ms/step - loss: 0.2147 - acc: 0.9240 Epoch 15/100 244/244 [==============================] - 2s 10ms/step - loss: 0.2006 - acc: 0.9305 Epoch 16/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1889 - acc: 0.9371 Epoch 17/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1781 - acc: 0.9392 Epoch 18/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1677 - acc: 0.9447 Epoch 19/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1579 - acc: 0.9477 Epoch 20/100 244/244 [==============================] - 2s 10ms/step - loss: 0.1494 - acc: 0.9525 Epoch 21/100 244/244 
[==============================] - 2s 10ms/step - loss: 0.1406 - acc: 0.9556 Epoch 22/100 244/244 [==============================] - 2s 9ms/step - loss: 0.1346 - acc: 0.9579 Epoch 23/100 244/244 [==============================] - 2s 9ms/step - loss: 0.1273 - acc: 0.9589 Epoch 24/100 244/244 [==============================] - 2s 8ms/step - loss: 0.1214 - acc: 0.9598 Epoch 25/100 244/244 [==============================] - 2s 9ms/step - loss: 0.1163 - acc: 0.9613 Epoch 26/100 244/244 [==============================] - 2s 9ms/step - loss: 0.1112 - acc: 0.9639 Epoch 27/100 244/244 [==============================] - 2s 9ms/step - loss: 0.1066 - acc: 0.9669 Epoch 28/100 244/244 [==============================] - 2s 9ms/step - loss: 0.1022 - acc: 0.9676 Epoch 29/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0979 - acc: 0.9696 Epoch 30/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0938 - acc: 0.9719 Epoch 31/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0904 - acc: 0.9741 Epoch 32/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0877 - acc: 0.9749 Epoch 33/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0842 - acc: 0.9757 Epoch 34/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0812 - acc: 0.9766 Epoch 35/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0781 - acc: 0.9776 Epoch 36/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0752 - acc: 0.9810 Epoch 37/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0731 - acc: 0.9798 Epoch 38/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0709 - acc: 0.9816 Epoch 39/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0681 - acc: 0.9822 Epoch 40/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0657 - acc: 0.9830 Epoch 41/100 244/244 [==============================] - 2s 9ms/step - 
loss: 0.0647 - acc: 0.9837 Epoch 42/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0624 - acc: 0.9832 Epoch 43/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0600 - acc: 0.9839 Epoch 44/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0586 - acc: 0.9847 Epoch 45/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0569 - acc: 0.9863 Epoch 46/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0552 - acc: 0.9869 Epoch 47/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0533 - acc: 0.9864 Epoch 48/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0521 - acc: 0.9888 Epoch 49/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0501 - acc: 0.9878 Epoch 50/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0493 - acc: 0.9895 Epoch 51/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0477 - acc: 0.9894 Epoch 52/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0460 - acc: 0.9901 Epoch 53/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0445 - acc: 0.9898 Epoch 54/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0434 - acc: 0.9905 Epoch 55/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0420 - acc: 0.9909 Epoch 56/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0410 - acc: 0.9914 Epoch 57/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0402 - acc: 0.9917 Epoch 58/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0388 - acc: 0.9914 Epoch 59/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0374 - acc: 0.9928 Epoch 60/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0366 - acc: 0.9927 Epoch 61/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0357 - acc: 0.9933 Epoch 62/100 244/244 
[==============================] - 2s 9ms/step - loss: 0.0349 - acc: 0.9930 Epoch 63/100 244/244 [==============================] - 2s 8ms/step - loss: 0.0337 - acc: 0.9942 Epoch 64/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0328 - acc: 0.9944 Epoch 65/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0319 - acc: 0.9940 Epoch 66/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0310 - acc: 0.9942 Epoch 67/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0305 - acc: 0.9942 Epoch 68/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0299 - acc: 0.9940 Epoch 69/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0291 - acc: 0.9952 Epoch 70/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0281 - acc: 0.9959 Epoch 71/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0275 - acc: 0.9951 Epoch 72/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0267 - acc: 0.9954 Epoch 73/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0261 - acc: 0.9960 Epoch 74/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0253 - acc: 0.9967 Epoch 75/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0245 - acc: 0.9963 Epoch 76/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0242 - acc: 0.9969 Epoch 77/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0232 - acc: 0.9965 Epoch 78/100 244/244 [==============================] - 2s 10ms/step - loss: 0.0227 - acc: 0.9978 Epoch 79/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0220 - acc: 0.9972 Epoch 80/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0216 - acc: 0.9976 Epoch 81/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0213 - acc: 0.9977 Epoch 82/100 244/244 [==============================] - 2s 9ms/step - loss: 
0.0201 - acc: 0.9981 Epoch 83/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0197 - acc: 0.9985 Epoch 84/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0193 - acc: 0.9980 Epoch 85/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0190 - acc: 0.9981 Epoch 86/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0185 - acc: 0.9985 Epoch 87/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0179 - acc: 0.9987 Epoch 88/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0173 - acc: 0.9986 Epoch 89/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0169 - acc: 0.9987 Epoch 90/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0164 - acc: 0.9987 Epoch 91/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0162 - acc: 0.9986 Epoch 92/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0158 - acc: 0.9994 Epoch 93/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0151 - acc: 0.9991 Epoch 94/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0148 - acc: 0.9994 Epoch 95/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0145 - acc: 0.9990 Epoch 96/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0143 - acc: 0.9991 Epoch 97/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0139 - acc: 0.9992 Epoch 98/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0135 - acc: 0.9994 Epoch 99/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0131 - acc: 0.9992 Epoch 100/100 244/244 [==============================] - 2s 9ms/step - loss: 0.0130 - acc: 0.9995
Finally, we can evaluate various performance measures, including the area under the ROC curve or the area under the precision recall curve.
# Evaluate on the held-out test set; the named callbacks compute and export
# the PRC/ROC curves and their areas under the curve into JANGGU_OUTPUT.
model.evaluate(DNA_TEST, LABELS_TEST, datatags=['test'],
               callbacks=['prc', 'auprc', 'roc', 'auroc'])
[0.09514768898487092, 0.96]
The results of this evaluation can be found in the output folder
# Evaluation results live under <JANGGU_OUTPUT>/evaluation/<modelname>/<datatag>.
evaluation_folder = os.path.join(os.environ['JANGGU_OUTPUT'], 'evaluation', modelname, 'test')
It contains summary figures of the PRC and ROC curve
# Display the exported precision-recall and ROC curve figures in the notebook.
Image(os.path.join(evaluation_folder, 'prc.png'))
Image(os.path.join(evaluation_folder, 'roc.png'))
The scores are also exported in table form
# Load the exported area-under-PRC scores.
# NOTE(review): the file has a .tsv extension but is read with read_csv's
# default comma separator — verify the delimiter matches the exported file.
pd.read_csv(os.path.join(evaluation_folder, 'auprc.tsv'))
TF-binding | |
---|---|
0 | 0.995258 |
# Load the exported area-under-ROC scores.
# NOTE(review): the file has a .tsv extension but is read with read_csv's
# default comma separator — verify the delimiter matches the exported file.
pd.read_csv(os.path.join(evaluation_folder, 'auroc.tsv'))
TF-binding | |
---|---|
0 | 0.995 |