#!/usr/bin/env python
# coding: utf-8

# # 21 - Word2Vec
# 
# by [Alejandro Correa Bahnsen](http://www.albahnsen.com/)
# 
# version 1.0, July 2018
# 
# ## Part of the class [Applied Deep Learning](https://github.com/albahnsen/AppliedDeepLearningClass)
# 
# This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US).

# # Word2Vec
# 
# Word2vec is a two-layer neural net that processes text. Its input is a text corpus and its output is a set of vectors: feature vectors for the words in that corpus. While Word2vec itself is not a deep neural network, it turns text into a numerical form that deep nets can understand.
# 
# Word2vec's applications extend beyond parsing sentences in the wild. It can be applied just as well to genes, code, likes, playlists, social media graphs and other verbal or symbolic series in which patterns may be discerned.
# 
# Why? Because words are simply discrete states like the other data mentioned above, and we are simply looking for the transitional probabilities between those states: the likelihood that they will co-occur. So gene2vec, like2vec and follower2vec are all possible. With that in mind, this notebook will help you understand how to create neural embeddings for any group of discrete, co-occurring states.
# 
# The purpose and usefulness of Word2vec is to group the vectors of similar words together in vector space. That is, it detects similarities mathematically. Word2vec creates vectors that are distributed numerical representations of word features, such as the context of individual words. It does so without human intervention.
# 
# # TensorFlow Hub
# 
# TensorFlow Hub is a library for the publication, discovery, and consumption of reusable parts of machine learning models. A module is a self-contained piece of a TensorFlow graph, along with its weights and assets, that can be reused across different tasks in a process known as transfer learning. Transfer learning can:
# 
# - train a model with a smaller dataset,
# - improve generalization, and
# - speed up training.
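# To make the idea of "grouping the vectors of similar words together in vector space"
# concrete, the toy cell below computes cosine similarity between a few made-up vectors.
# These are illustrative placeholders rather than real word2vec embeddings; the point is
# only that similarity is measured as the cosine of the angle between word vectors.

# In[ ]:


import numpy as np


def cosine_similarity(a, b):
    # Cosine of the angle between two vectors: close to 1 means a similar direction
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# Made-up vectors for illustration only (not actual embeddings):
# "cat" and "dog" are given similar directions, "car" is not
cat = np.array([0.9, 0.8, 0.1, 0.0])
dog = np.array([0.8, 0.9, 0.2, 0.1])
car = np.array([0.1, 0.0, 0.9, 0.8])

print(cosine_similarity(cat, dog))  # high -> treated as similar words
print(cosine_similarity(cat, car))  # low  -> treated as dissimilar words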
# In[1]:


get_ipython().system('pip install tensorflow-hub')


# In[10]:


import tensorflow_hub as hub
import tensorflow as tf
import numpy as np


# See the available text modules on TensorFlow Hub:
# 
# https://www.tensorflow.org/hub/modules/text

# In[30]:


hub_model = "https://tfhub.dev/google/Wiki-words-250/1"


# In[60]:


def create_text_embedding(hub_model, input_sentences):
    """Embed a batch of equal-length token sequences with a TF Hub module."""
    SEQ_LENGTH = max(map(len, input_sentences))
    EMBEDDING_DIM = 250

    with tf.Graph().as_default() as g:
        # Pre-trained token embedding module; the weights are kept frozen
        embed_layer = hub.Module(hub_model, trainable=False, name='text_embedding')

        sentences = tf.placeholder(dtype=tf.string, shape=(None, SEQ_LENGTH))
        batch_size = tf.shape(sentences)[0]

        # The module embeds a flat list of tokens, so flatten the batch,
        # embed, and reshape back to (batch, sequence, embedding)
        flat_sentences = tf.reshape(sentences, [-1])
        embeddings = embed_layer(flat_sentences)
        sentence_embedding = tf.reshape(embeddings,
                                        [batch_size, SEQ_LENGTH, EMBEDDING_DIM])

    with tf.Session(graph=g) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        output = sess.run(sentence_embedding,
                          feed_dict={sentences: input_sentences})

    return output


# In[61]:


sentences = ["cat is on the mat",
             "dog is in the fog",
             "padded sentence UNK UNK UNK"]
sentences = [s.split() for s in sentences]


# In[62]:


sentences_em = create_text_embedding(hub_model, sentences)


# In[63]:


sentences_em.shape


# In[64]:


sentences_em[0]


# # Example

# In[118]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Flatten, Dropout, Activation, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras.callbacks import History
from keras.layers import Conv1D, GlobalMaxPooling1D
from livelossplot import PlotLossesKeras

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt


# In[45]:


# read the airline tweets dataset, using the first column as the index
import zipfile

with zipfile.ZipFile('../datasets/Tweets.zip', 'r') as z:
    f = z.open('Tweets.csv')
    tweets = pd.read_csv(f, index_col=0)

tweets.head()


# In[79]:


X = tweets['text']
# Map the sentiments to non-negative class indices (negative=0, neutral=1, positive=2)
# so that to_categorical produces the expected one-hot columns
y = tweets['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
Y = np_utils.to_categorical(y, 3)


# In[66]:


# Tokenize each tweet into a list of lowercased words
X_pad = [text_to_word_sequence(x) for x in X.tolist()]


# In[72]:


# Pad the token lists with a filler value so every tweet has the length of the longest one
X_pad = pad_sequences(X_pad, dtype='str', value=0)


# In[74]:


X_pad.shape


# ### Apply word2vec

# In[75]:


X_em = create_text_embedding(hub_model, X_pad)


# In[76]:


X_em.shape


# ### Train neural network

# In[132]:


model = Sequential()
model.add(Conv1D(100, 2, padding='same', strides=1, input_shape=X_em.shape[1:]))
model.add(Activation('relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(3))
model.add(Activation('softmax'))

adam = Adam(lr=0.0002)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])


# In[133]:


model.summary()


# In[134]:


h = History()
model.fit(X_em, Y, epochs=50, batch_size=500, verbose=0,
          callbacks=[h, PlotLossesKeras()], validation_split=0.2)


# In[135]:


# Validation accuracy after the last epoch
h.history['val_acc'][-1]
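# As a closing sketch (not part of the original walkthrough), the cell below shows one way
# the trained network could score new tweets: tokenize, pad to the sequence length used for
# training (reusing "UNK" as a padding token, as in the toy sentences earlier), embed with
# create_text_embedding, and call model.predict. The two tweets are made up for illustration.

# In[ ]:


new_tweets = ["the flight was great, thank you",
              "my luggage is lost and nobody answers"]
new_tokens = [text_to_word_sequence(t) for t in new_tweets]

# Pad (or truncate) each token list to the sequence length the model was trained on
seq_length = X_pad.shape[1]
new_tokens = [(['UNK'] * (seq_length - len(t)) + t)[-seq_length:] for t in new_tokens]

# Embed and predict class probabilities; columns follow the mapping
# negative=0, neutral=1, positive=2 defined above
new_em = create_text_embedding(hub_model, new_tokens)
probs = model.predict(new_em)
probs.argmax(axis=1)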