import os
embedding_dim = 200
corpus_base_dir = os.path.join('..', '..', '..', 'corpora', 'wired_it_20190821')
vocab_size = 10000
max_length = 1000
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
num_epochs = 30
training_dir = os.path.join(corpus_base_dir, 'training')
test_dir = os.path.join(corpus_base_dir, 'test')
classes = ['attualit_', 'attualit__ambiente', 'attualit__media', 'attualit__politica', 'attualit__tech',
           'economia_business', 'economia_finanza', 'economia_lavoro', 'economia_startup',
           'gadget_accessori', 'gadget_audio_e_tv', 'gadget_computer', 'gadget_elettrodomestici',
           'gadget_foto_e_video', 'gadget_motori', 'gadget_outdoor', 'gadget_videogiochi',
           'internet_regole', 'internet_social_network', 'internet_tlc', 'internet_web',
           'lifestyle_design', 'lifestyle_food', 'lifestyle_mobilit_', 'lifestyle_salute', 'lifestyle_viaggi',
           'lol', 'mobile_app', 'mobile_smartphone', 'mobile_tablet',
           'play_cinema', 'play_cultura', 'play_fumetti', 'play_libri', 'play_musica', 'play_tv',
           'scienza', 'scienza_biotech', 'scienza_ecologia', 'scienza_lab', 'scienza_medicina', 'scienza_spazio']
def read_file_to_text(file_path):
    with open(file_path, 'r', encoding="utf8") as file:
        return file.read().replace('\n', ' ')
def list_text_files(folder):
    return [os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith('.txt')]
# Flatten one level of nesting: [[a, b], [c]] -> [a, b, c].
flatten = lambda l: [item for sublist in l for item in sublist]
# For each class, pair its label with the text of every .txt file in that class's folder.
load_class_text_pairs = lambda folder: map(
    lambda classLabel: map(
        lambda file_name: (classLabel, read_file_to_text(file_name)),
        list_text_files(os.path.join(folder, classLabel))),
    classes)
# zip(*pairs) splits the flattened (label, text) pairs into two parallel tuples.
training_classes, training_texts = zip(*flatten(list(load_class_text_pairs(training_dir))))
test_classes, test_texts = zip(*flatten(list(load_class_text_pairs(test_dir))))
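# Quick sanity check (illustrative): confirm how many documents were loaded per split.
print('%d training documents, %d test documents' % (len(training_texts), len(test_texts)))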
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# Build the vocabulary from the training texts only, so the test set stays unseen.
tokenizer.fit_on_texts(training_texts)
training_sequences = tokenizer.texts_to_sequences(training_texts)
training_padded = pad_sequences(training_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)
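# Optional check (a sketch): invert the tokenizer's word index and decode the start
# of the first training sequence to verify the preprocessing looks sensible.
index_to_word = dict((index, word) for (word, index) in tokenizer.word_index.items())
print(' '.join(index_to_word.get(token, '?') for token in training_sequences[0][:20]))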
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    #tf.keras.layers.GlobalMaxPooling1D(),
    #tf.keras.layers.Dense(100, activation='relu'),
    # len(classes) + 1 output units because Tokenizer assigns indices starting from 1,
    # so label 0 is never used and the valid labels run from 1 to len(classes).
    tf.keras.layers.Dense(len(classes) + 1, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
Model: "sequential_5" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_5 (Embedding) (None, 1000, 200) 2000000 _________________________________________________________________ global_average_pooling1d_4 ( (None, 200) 0 _________________________________________________________________ dense_6 (Dense) (None, 43) 8643 ================================================================= Total params: 2,008,643 Trainable params: 2,008,643 Non-trainable params: 0 _________________________________________________________________
# Remove '_' from the filter characters: class names use '_' as a separator, and each name must stay a single, unsplit token.
classes_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
classes_tokenizer.fit_on_texts(classes)
training_classes_seq = np.array(classes_tokenizer.texts_to_sequences(training_classes))
test_classes_seq = np.array(classes_tokenizer.texts_to_sequences(test_classes))
# Each document's class is a one-element sequence; keep just the single label id.
training_classes_encoded = [class_encoded[0] for class_encoded in training_classes_seq]
classes_word_index = classes_tokenizer.word_index
print(classes_word_index)
# Invert the mapping so an encoded label can be translated back to its class name.
classes_by_index = dict([(index, key) for (key, index) in classes_word_index.items()])
{'attualit_': 1, 'attualit__ambiente': 2, 'attualit__media': 3, 'attualit__politica': 4, 'attualit__tech': 5, 'economia_business': 6, 'economia_finanza': 7, 'economia_lavoro': 8, 'economia_startup': 9, 'gadget_accessori': 10, 'gadget_audio_e_tv': 11, 'gadget_computer': 12, 'gadget_elettrodomestici': 13, 'gadget_foto_e_video': 14, 'gadget_motori': 15, 'gadget_outdoor': 16, 'gadget_videogiochi': 17, 'internet_regole': 18, 'internet_social_network': 19, 'internet_tlc': 20, 'internet_web': 21, 'lifestyle_design': 22, 'lifestyle_food': 23, 'lifestyle_mobilit_': 24, 'lifestyle_salute': 25, 'lifestyle_viaggi': 26, 'lol': 27, 'mobile_app': 28, 'mobile_smartphone': 29, 'mobile_tablet': 30, 'play_cinema': 31, 'play_cultura': 32, 'play_fumetti': 33, 'play_libri': 34, 'play_musica': 35, 'play_tv': 36, 'scienza': 37, 'scienza_biotech': 38, 'scienza_ecologia': 39, 'scienza_lab': 40, 'scienza_medicina': 41, 'scienza_spazio': 42}
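# Example (illustrative): translate an encoded label back to its class name.
print(classes_by_index[42])  # -> 'scienza_spazio'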
history = model.fit(training_padded, training_classes_seq, epochs=num_epochs, validation_data=(test_padded, test_classes_seq), verbose=2)
Train on 13351 samples, validate on 5709 samples
Epoch 1/30
13351/13351 - 4s - loss: 3.6755 - acc: 0.0517 - val_loss: 3.6100 - val_acc: 0.0855
Epoch 2/30
13351/13351 - 4s - loss: 3.5382 - acc: 0.1020 - val_loss: 3.4647 - val_acc: 0.1450
Epoch 3/30
13351/13351 - 4s - loss: 3.3164 - acc: 0.1935 - val_loss: 3.2081 - val_acc: 0.2091
Epoch 4/30
13351/13351 - 4s - loss: 3.0274 - acc: 0.2819 - val_loss: 2.9484 - val_acc: 0.2859
Epoch 5/30
13351/13351 - 4s - loss: 2.7583 - acc: 0.3591 - val_loss: 2.7162 - val_acc: 0.3614
Epoch 6/30
13351/13351 - 4s - loss: 2.5210 - acc: 0.4245 - val_loss: 2.5182 - val_acc: 0.4122
Epoch 7/30
13351/13351 - 4s - loss: 2.3152 - acc: 0.4742 - val_loss: 2.3549 - val_acc: 0.4449
Epoch 8/30
13351/13351 - 4s - loss: 2.1378 - acc: 0.5104 - val_loss: 2.2150 - val_acc: 0.4771
Epoch 9/30
13351/13351 - 5s - loss: 1.9860 - acc: 0.5459 - val_loss: 2.1024 - val_acc: 0.5024
Epoch 10/30
13351/13351 - 4s - loss: 1.8553 - acc: 0.5719 - val_loss: 2.0122 - val_acc: 0.4971
Epoch 11/30
13351/13351 - 4s - loss: 1.7406 - acc: 0.5931 - val_loss: 1.9336 - val_acc: 0.5225
Epoch 12/30
13351/13351 - 4s - loss: 1.6384 - acc: 0.6180 - val_loss: 1.8646 - val_acc: 0.5279
Epoch 13/30
13351/13351 - 4s - loss: 1.5480 - acc: 0.6328 - val_loss: 1.8047 - val_acc: 0.5530
Epoch 14/30
13351/13351 - 4s - loss: 1.4645 - acc: 0.6550 - val_loss: 1.7604 - val_acc: 0.5539
Epoch 15/30
13351/13351 - 4s - loss: 1.3898 - acc: 0.6699 - val_loss: 1.7153 - val_acc: 0.5586
Epoch 16/30
13351/13351 - 4s - loss: 1.3191 - acc: 0.6874 - val_loss: 1.6837 - val_acc: 0.5605
Epoch 17/30
13351/13351 - 4s - loss: 1.2527 - acc: 0.7028 - val_loss: 1.6510 - val_acc: 0.5738
Epoch 18/30
13351/13351 - 4s - loss: 1.1915 - acc: 0.7182 - val_loss: 1.6226 - val_acc: 0.5751
Epoch 19/30
13351/13351 - 4s - loss: 1.1320 - acc: 0.7328 - val_loss: 1.6016 - val_acc: 0.5731
Epoch 20/30
13351/13351 - 4s - loss: 1.0761 - acc: 0.7475 - val_loss: 1.5790 - val_acc: 0.5835
Epoch 21/30
13351/13351 - 4s - loss: 1.0235 - acc: 0.7616 - val_loss: 1.5623 - val_acc: 0.5866
Epoch 22/30
13351/13351 - 4s - loss: 0.9718 - acc: 0.7747 - val_loss: 1.5457 - val_acc: 0.5903
Epoch 23/30
13351/13351 - 4s - loss: 0.9213 - acc: 0.7882 - val_loss: 1.5372 - val_acc: 0.5934
Epoch 24/30
13351/13351 - 4s - loss: 0.8760 - acc: 0.8046 - val_loss: 1.5226 - val_acc: 0.5954
Epoch 25/30
13351/13351 - 4s - loss: 0.8304 - acc: 0.8145 - val_loss: 1.5140 - val_acc: 0.5919
Epoch 26/30
13351/13351 - 5s - loss: 0.7861 - acc: 0.8238 - val_loss: 1.5082 - val_acc: 0.5964
Epoch 27/30
13351/13351 - 4s - loss: 0.7437 - acc: 0.8398 - val_loss: 1.4991 - val_acc: 0.5999
Epoch 28/30
13351/13351 - 4s - loss: 0.7039 - acc: 0.8494 - val_loss: 1.4925 - val_acc: 0.6008
Epoch 29/30
13351/13351 - 4s - loss: 0.6656 - acc: 0.8593 - val_loss: 1.4927 - val_acc: 0.6012
Epoch 30/30
13351/13351 - 4s - loss: 0.6287 - acc: 0.8697 - val_loss: 1.4877 - val_acc: 0.6036
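# A minimal plotting sketch, assuming matplotlib is available and using the
# 'acc'/'val_acc' history keys that match the log above. The widening gap between
# training and validation accuracy from roughly epoch 10 onward suggests overfitting.
import matplotlib.pyplot as plt
plt.plot(history.history['acc'], label='acc')
plt.plot(history.history['val_acc'], label='val_acc')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()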
from sklearn import metrics
classes_probabilities = model.predict(test_padded, batch_size=32, verbose=1)
predicted = np.argmax(classes_probabilities, axis=1).tolist()
expected = flatten(test_classes_seq)
precision_scores = metrics.precision_score(expected, predicted, labels=np.unique(expected), average=None)
recall_scores = metrics.recall_score(expected, predicted, labels=np.unique(expected), average=None)
f1_scores = metrics.f1_score(expected, predicted, labels=np.unique(expected), average=None)
precision_recall_f1_scores = list(zip(precision_scores, recall_scores, f1_scores))
print('|Class Label'.ljust(40, ' ') + '|Prec|Rec |F1')
print('|--- |--- |--- |---')
for index, precision_recall_f1 in enumerate(precision_recall_f1_scores):
    classLabel = classes_by_index[index + 1]
    print('|%s|%.2f|%.2f|%.2f' % (classLabel.ljust(40, ' '), precision_recall_f1[0], precision_recall_f1[1], precision_recall_f1[2]))
precision_score_micro_averaged = metrics.precision_score(expected, predicted, labels=np.unique(expected), average='micro')
precision_score_macro_averaged = metrics.precision_score(expected, predicted, labels=np.unique(expected), average='macro')
recall_score_micro_averaged = metrics.recall_score(expected, predicted, labels=np.unique(expected), average='micro')
recall_score_macro_averaged = metrics.recall_score(expected, predicted, labels=np.unique(expected), average='macro')
f1_score_micro_averaged = metrics.f1_score(expected, predicted, labels=np.unique(expected), average='micro')
f1_score_macro_averaged = metrics.f1_score(expected, predicted, labels=np.unique(expected), average='macro')
print()
print('|Average Type |Prec |Rec |F1')
print('|--- |--- |--- |---')
print('|micro|%.2f|%.2f|%.2f' % (precision_score_micro_averaged, recall_score_micro_averaged, f1_score_micro_averaged))
print('|macro|%.2f|%.2f|%.2f' % (precision_score_macro_averaged, recall_score_macro_averaged, f1_score_macro_averaged))
5709/5709 [==============================] - 0s 49us/sample

|Class Label              |Prec|Rec |F1
|---                      |--- |--- |---
|attualit_                |0.22|0.21|0.21
|attualit__ambiente       |0.53|0.62|0.57
|attualit__media          |0.54|0.41|0.46
|attualit__politica       |0.56|0.59|0.57
|attualit__tech           |0.35|0.30|0.32
|economia_business        |0.39|0.43|0.41
|economia_finanza         |0.75|0.66|0.70
|economia_lavoro          |0.73|0.65|0.68
|economia_startup         |0.73|0.65|0.69
|gadget_accessori         |0.41|0.50|0.45
|gadget_audio_e_tv        |0.80|0.79|0.80
|gadget_computer          |0.69|0.64|0.66
|gadget_elettrodomestici  |0.59|0.21|0.31
|gadget_foto_e_video      |0.79|0.78|0.78
|gadget_motori            |0.81|0.77|0.79
|gadget_outdoor           |0.62|0.59|0.61
|gadget_videogiochi       |0.84|0.85|0.85
|internet_regole          |0.66|0.28|0.40
|internet_social_network  |0.64|0.72|0.67
|internet_tlc             |0.60|0.51|0.55
|internet_web             |0.50|0.47|0.48
|lifestyle_design         |0.49|0.59|0.54
|lifestyle_food           |0.74|0.70|0.72
|lifestyle_mobilit_       |0.70|0.70|0.70
|lifestyle_salute         |0.43|0.43|0.43
|lifestyle_viaggi         |0.57|0.44|0.50
|lol                      |0.30|0.74|0.43
|mobile_app               |0.62|0.60|0.61
|mobile_smartphone        |0.87|0.80|0.83
|mobile_tablet            |0.82|0.76|0.79
|play_cinema              |0.74|0.84|0.79
|play_cultura             |0.40|0.36|0.38
|play_fumetti             |0.90|0.87|0.88
|play_libri               |0.75|0.75|0.75
|play_musica              |0.68|0.83|0.75
|play_tv                  |0.84|0.76|0.80
|scienza                  |0.41|0.27|0.32
|scienza_biotech          |0.50|0.30|0.37
|scienza_ecologia         |0.52|0.59|0.55
|scienza_lab              |0.45|0.46|0.46
|scienza_medicina         |0.63|0.55|0.58
|scienza_spazio           |0.77|0.90|0.83

|Average Type |Prec |Rec |F1
|--- |--- |--- |---
|micro|0.60|0.60|0.60
|macro|0.62|0.59|0.59
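# A possible follow-up (illustrative sketch): sklearn can produce the same per-class
# table in one call, and a confusion matrix shows which classes get mixed up; the
# generic 'attualit_' and 'scienza' rows score worst above, which hints they absorb
# documents from their more specific sibling classes.
report = metrics.classification_report(expected, predicted, labels=np.unique(expected),
                                       target_names=[classes_by_index[i] for i in np.unique(expected)])
print(report)
confusion = metrics.confusion_matrix(expected, predicted, labels=np.unique(expected))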