import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
Using TensorFlow backend.
def read_20_newgroup_files(path_to_data_directory):
    """Load a 20 Newsgroups style corpus from a directory tree.

    Every sub-directory of ``path_to_data_directory`` is treated as one
    class. Inside each sub-directory, every entry whose name consists only
    of digits is read as one document. Directories and files are visited in
    sorted order, so label ids are assigned deterministically.

    Args:
        path_to_data_directory: Directory containing one sub-directory per
            class.

    Returns:
        A tuple ``(texts, labels_index, labels)``: ``texts`` is the list of
        document bodies, ``labels_index`` maps each sub-directory name to
        its integer label id, and ``labels`` holds the label id of the
        corresponding entry in ``texts``.
    """
    texts = []
    labels_index = {}
    labels = []

    # Python 2's built-in open() does not accept an ``encoding`` argument.
    open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

    for name in sorted(os.listdir(path_to_data_directory)):
        path = os.path.join(path_to_data_directory, name)
        if not os.path.isdir(path):
            continue

        label_id = len(labels_index)
        labels_index[name] = label_id

        for fname in sorted(os.listdir(path)):
            if not fname.isdigit():
                continue

            fpath = os.path.join(path, fname)
            # The context manager closes the file even if read() raises.
            with open(fpath, **open_kwargs) as f:
                t = f.read()

            # Skip the header, which ends at the first blank line. Only
            # strip when the separator is actually present: find() returns
            # -1 otherwise, and adding the separator length to that would
            # silently drop the first character of the document.
            header_end = t.find('\n\n')
            if header_end != -1:
                t = t[header_end + 2:]

            texts.append(t)
            labels.append(label_id)

    return (texts, labels_index, labels)
# Preprocessing hyper-parameters.
MAX_NB_WORDS = 20000        # passed to the Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 1000  # every document is padded/truncated to this length
VALIDATION_SPLIT = 0.2      # fraction of samples held out for validation

# Read the raw documents together with their class labels.
path = "/home/felipe/data/20_newsgroup/20_newsgroup/"
texts, labels_index, labels = read_20_newgroup_files(path)

# Fit the vocabulary on the corpus and turn each document into a sequence
# of integer word indices.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Fixed-length integer matrix for the network input, one-hot label matrix
# for the softmax output.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
# Shuffle samples and labels with one shared permutation so that each row of
# ``data`` stays aligned with its row of ``labels``.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Hold out the last VALIDATION_SPLIT fraction of the shuffled samples.
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit boundary instead of a negative offset. When
# num_validation_samples is 0, ``data[:-0]`` is empty and ``data[-0:]`` is the
# whole array, which would swap the training and validation sets.
split_at = data.shape[0] - num_validation_samples
X_train = data[:split_at]
y_train = labels[:split_at]
X_val = data[split_at:]
y_val = labels[split_at:]
GLOVE_DIR = "/media/felipe/SAMSUNG/GloVe"
EMBEDDING_DIM = 100

# Build a lookup from token to its pre-trained vector. Each line of the GloVe
# text file is the token followed by EMBEDDING_DIM whitespace-separated
# floats.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, "glove.6B.{0}d.txt".format(EMBEDDING_DIM))

# Decode explicitly as UTF-8 (the encoding of the published GloVe text files)
# instead of relying on the locale-dependent default, which can raise
# UnicodeDecodeError or mis-decode non-ASCII tokens on some platforms.
# Python 2's built-in open() does not accept an ``encoding`` argument.
glove_open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}

with open(glove_path, 'r', **glove_open_kwargs) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# Notebook inspection of the vocabulary size: len(word_index) evaluated to
# 174074 on the original run.

# The tokenizer was created with num_words=MAX_NB_WORDS, so the padded input
# sequences only contain indices below MAX_NB_WORDS. Sizing the matrix (and
# the Embedding layer) to that bound instead of the full vocabulary avoids
# allocating rows that can never be looked up.
num_words = min(MAX_NB_WORDS, len(word_index) + 1)

# Row i holds the GloVe vector of the word with tokenizer index i. Row 0 (the
# padding index) and words missing from GloVe stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised with the pre-trained vectors.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Network input: one padded sequence of integer word indices per document,
# mapped to vectors by the embedding layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three identical convolution stages (128 filters, kernel size 5, ReLU),
# each followed by max pooling; only the pooling window differs per stage.
# NOTE(review): the final window of 35 presumably spans the whole remaining
# time axis for 1000-step inputs -- confirm if MAX_SEQUENCE_LENGTH changes.
x = embedded_sequences
for pool_size in (5, 5, 35):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size)(x)

# Classification head: flatten, one hidden dense layer, then a softmax with
# one unit per entry in labels_index.
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
# Multi-class setup: categorical cross-entropy over the one-hot labels,
# RMSprop optimiser, accuracy reported as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

# Train for 20 epochs in mini-batches of 128, evaluating on the held-out
# validation split after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val),
)
Train on 15998 samples, validate on 3999 samples Epoch 1/20 4352/15998 [=======>......................] - ETA: 94s - loss: 2.8506 - acc: 0.0843
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-10-af0321680fa3> in <module>() 1 model.fit(X_train,y_train, validation_data=(X_val, y_val), ----> 2 epochs=20, batch_size=128) /home/felipe/tf-venv3/lib/python3.5/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs) 1484 val_f=val_f, val_ins=val_ins, shuffle=shuffle, 1485 callback_metrics=callback_metrics, -> 1486 initial_epoch=initial_epoch) 1487 1488 def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None): /home/felipe/tf-venv3/lib/python3.5/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch) 1139 batch_logs['size'] = len(batch_ids) 1140 callbacks.on_batch_begin(batch_index, batch_logs) -> 1141 outs = f(ins_batch) 1142 if not isinstance(outs, list): 1143 outs = [outs] /home/felipe/tf-venv3/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs) 2101 session = get_session() 2102 updated = session.run(self.outputs + [self.updates_op], -> 2103 feed_dict=feed_dict) 2104 return updated[:len(self.outputs)] 2105 /home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata) 776 try: 777 result = self._run(None, fetches, feed_dict, options_ptr, --> 778 run_metadata_ptr) 779 if run_metadata: 780 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr) /home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata) 980 if final_fetches or final_targets: 981 results = self._do_run(handle, final_targets, final_fetches, --> 982 feed_dict_string, options, 
run_metadata) 983 else: 984 results = [] /home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata) 1030 if handle is None: 1031 return self._do_call(_run_fn, self._session, feed_dict, fetch_list, -> 1032 target_list, options, run_metadata) 1033 else: 1034 return self._do_call(_prun_fn, self._session, handle, feed_dict, /home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args) 1037 def _do_call(self, fn, *args): 1038 try: -> 1039 return fn(*args) 1040 except errors.OpError as e: 1041 message = compat.as_text(e.message) /home/felipe/tf-venv3/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata) 1019 return tf_session.TF_Run(session, options, 1020 feed_dict, fetch_list, target_list, -> 1021 status, run_metadata) 1022 1023 def _prun_fn(session, handle, feed_dict, fetch_list): KeyboardInterrupt: