import math
import numpy as np
import random
import zipfile
import os
import tensorflow as tf
import pandas as pd
import pickle
!pip install -U -q PyDrive
from google.colab import files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from numpy import genfromtxt
# --- Google Drive / Colab authentication and title-dictionary download ---
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

vocabulary_size = 1666577  # number of distinct title ids -- presumably matches the embedding table; TODO confirm
# tf.logging.set_verbosity(tf.logging.ERROR)
os.remove('adc.json')  # drop the application-default-credentials file written during auth

dl_id = input("Enter Gdrive file ID for Title dictionary: ") # 9-10-18 1mdF0JfrzbOxeHD26JaUW8KEfIdMEHzQG
thefile = drive.CreateFile({'id': dl_id})
thefile.GetContentFile('titleDict.pickle')
# NOTE(review): pickle.load executes arbitrary code from the downloaded file;
# only safe because the Drive file id is our own.
with open('titleDict.pickle', 'rb') as handle:
    bookDictionary = pickle.load(handle)
Enter Gdrive file ID for Title dictionary: 1mdF0JfrzbOxeHD26JaUW8KEfIdMEHzQG
# --- Download the training data: a 2-column object array of [id, list-of-ids] rows ---
dl_id = input("Enter Gdrive file ID for Data ") # 9-10-18 1RHVwT1slwbhPlNTTF1JKS7agc4hQSm5c
myDownload = drive.CreateFile({'id': dl_id})
myDownload.GetContentFile('Data.npy')
# The rows hold Python list objects, so this .npy is a pickled object array:
# NumPy >= 1.16.3 refuses to load it unless allow_pickle=True is passed.
my_data = np.load('Data.npy', allow_pickle=True)
print(my_data[0:15])
Enter Gdrive file ID for Data 1RHVwT1slwbhPlNTTF1JKS7agc4hQSm5c [[0 list([])] [1 list([421089, 510776, 6403, 1554618, 1451018, 1448326, 1411539, 734702, 263668, 276186, 374145, 712335, 1540518, 732154, 1256014, 370711])] [2 list([])] [3 list([896236, 552833, 290985, 744122, 660888, 1492583, 324439, 1497464, 906952, 890270, 800459, 656974, 464637, 432398, 672494, 1501784, 1551199, 169770, 880915, 1257202, 1647789, 431318, 167368, 1309706, 645636, 1589247, 952101, 1594224, 566783, 1020670, 1530466, 572983, 393055, 923629, 1349376, 455838, 168364, 1419708, 670762, 64953])] [4 list([377701, 646875, 1527223, 458740, 1022675, 668690, 910689, 951671, 717587, 1655779, 670477, 66465, 374116, 450320, 83567, 863721, 1328431, 1585189, 1439964])] [5 list([1394328, 658435, 1338541, 1024419, 1193128, 1416126, 600891, 1133836, 1502110, 38954, 200361, 1271103, 914246, 580300, 337729, 316423, 1631441, 75283, 153695, 294419, 904711, 234803, 341096, 350848, 344889, 146171, 610828, 475984, 462863, 768574, 1060750, 753854, 355396, 457861, 1159063, 1074007, 919943, 1045192, 550452])] [6 list([1590967])] [7 list([731413, 371576, 101514, 291861, 668641, 812990, 457315, 1428604, 216222, 313539, 475783, 1384755, 1426847, 1612089, 124271, 1259377, 1209643, 994466, 1437081, 300318, 1432000])] [8 list([537201, 743717, 194785, 886957, 877387, 405472, 145841, 662184])] [9 list([727615, 1033127, 1488761, 205826, 1278175, 1406008, 546451, 739509, 1412014, 1628720, 797494, 381440, 738525, 103954, 1293419, 778810, 292339, 906068])] [10 list([767034, 632192, 943392, 1444320, 136613, 891973, 1497365, 1580850, 305850, 1000807, 1449216, 1476570, 301317, 1500249, 1262399, 501012, 1115942, 1058776, 1447436, 1357729, 1592057, 1498628, 1618410, 987861, 1504522])] [11 list([260217, 735748, 576441, 596114])] [12 list([1139960, 983105, 295417, 557677, 1252174, 697297, 881771, 1211343, 863739, 714478, 1219226, 935298, 1043281, 1229931, 839873, 1153430, 1080857, 1654324, 137984, 1025220, 696853, 570840, 1590806, 
1351588, 595129, 964004, 1472538, 239877])] [13 list([])] [14 list([1255078, 525049, 1481675, 1231620, 894550, 127476, 384389, 737607, 1651253, 771448, 284807, 1127559, 213372, 687169, 1480889, 1395063, 369818, 1454291])]]
data_index = 0      # cursor into my_data; wraps at the end of the array
epoch_index = 0     # full passes over my_data completed so far
recEpoch_indexA = 0 # Used to help keep store of the total number of epoches with the models

def generate_batch(batch_size, inputCount, data=None):
    """Build one training batch from the [id, list-of-ids] rows.

    Walks `data` (defaults to the module-level ``my_data``) row by row;
    every row whose list holds at least ``inputCount`` distinct ids yields
    one example: ``inputCount`` ids sampled without replacement as inputs,
    with the row's own id as the label.  Rows with too few distinct ids
    are skipped.  Advances the global ``data_index`` cursor and increments
    ``epoch_index`` (with a progress print) each time the cursor wraps.

    Args:
        batch_size: number of (inputs, label) examples per batch.
        inputCount: number of input ids per label.
        data: optional 2-column object array of [id, list-of-ids] rows;
            falls back to the global ``my_data`` when omitted.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size, inputCount)
        and (batch_size, 1).
    """
    global data_index, epoch_index
    if data is None:
        data = my_data
    batch = np.ndarray(shape=(batch_size, inputCount), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    n = 0
    # NOTE(review): loops forever if no row has >= inputCount distinct ids.
    while n < batch_size:
        candidates = set(data[data_index, 1])
        if len(candidates) >= inputCount:
            labels[n, 0] = data[data_index, 0]
            # random.sample() no longer accepts a set (TypeError on
            # Python >= 3.11); sample from a sorted list instead.
            batch[n] = random.sample(sorted(candidates), inputCount)
            n = n + 1
        # Advance the cursor exactly once per row whether or not it produced
        # an example (the original duplicated this logic in both branches).
        data_index = (data_index + 1) % len(data)
        if data_index == 0:
            epoch_index = epoch_index + 1
            print('Completed %d Epochs' % epoch_index)
    return batch, labels
# Quick sanity check: draw one 20-example batch with 4 inputs per label.
here, goes = generate_batch(20, 4) # to do next, insert %len(headernumber)
for tag, arr in (('batch', here), ('labels', goes)):
    print(tag, arr)
batch [[1540518 712335 276186 510776] [1020670 890270 656974 167368] [ 83567 377701 910689 646875] [1271103 75283 344889 475984] [1384755 313539 994466 457315] [ 145841 877387 194785 886957] [1412014 1628720 797494 1278175] [1476570 1449216 1357729 1592057] [ 260217 735748 596114 576441] [1252174 1043281 935298 570840] [ 284807 1127559 1231620 1395063] [ 425605 1199985 503766 1177226] [ 883387 1249697 1369264 1606440] [ 143641 823302 1020170 1480253] [ 633950 69171 905572 319694] [ 145147 1299803 1441307 381248] [ 391622 1203282 1594386 1482127] [ 205375 1665861 400547 692811] [ 531508 22134 760494 1454629] [1628449 474993 1129303 875062]] labels [[ 1] [ 3] [ 4] [ 5] [ 7] [ 8] [ 9] [10] [11] [12] [14] [15] [17] [18] [19] [20] [21] [22] [23] [24]]
# --- Hyperparameters and TF1 graph definition (CBOW-style sampled softmax) ---
batch_size = 2048 #2^8
embedding_size = 80 # 2^8 Dimension of the embedding vector.
num_inputs = 4
num_sampled = 128 # Number of negative examples to sample.

graph = tf.Graph()
with graph.as_default(): #took out " , tf.device('/cpu:0')"
    # One row of num_inputs context ids per example, one label id each.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, num_inputs])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Persisted epoch counter so the total number of epochs survives checkpoints.
    epochCount = tf.get_variable('epochCount', initializer=0)
    update_epoch = tf.assign(epochCount, epochCount + 1)
    embeddings = tf.get_variable(
        'embeddings',
        initializer=tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.get_variable(
        'softmax_weights',
        initializer=tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 / math.sqrt(embedding_size)))
    # Biases are frozen at zero (trainable=False).
    softmax_biases = tf.get_variable(
        'softmax_biases', initializer=tf.zeros([vocabulary_size]), trainable=False)
    # Look up all inputs, flatten to (batch*num_inputs, emb), then average
    # each consecutive group of num_inputs rows back to one vector per example.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    embed_reshaped = tf.reshape(embed, [batch_size * num_inputs, embedding_size])
    segments = np.arange(batch_size).repeat(num_inputs)
    averaged_embeds = tf.segment_mean(embed_reshaped, segments, name=None)
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases,
                                   inputs=averaged_embeds, labels=train_labels,
                                   num_sampled=num_sampled,
                                   num_classes=vocabulary_size))
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) #Original learning rate was 1.0
    saver = tf.train.Saver()
def zipfolder(foldername, target_dir):
    """Zip the contents of `target_dir` (recursively) into `foldername`.zip
    in the current working directory, storing members relative to target_dir.
    """
    rootlen = len(target_dir) + 1  # strip "<target_dir>/" prefix from member names
    # Use a context manager: the original never closed the ZipFile, so the
    # central directory could be left unwritten and the archive unreadable.
    with zipfile.ZipFile(foldername + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipobj:
        for base, dirs, files in os.walk(target_dir):
            for file in files:
                fn = os.path.join(base, file)
                zipobj.write(fn, fn[rootlen:])
# --- Optionally download and unpack a previous checkpoint from Google Drive ---
loadModel = input("Would you like to load a checkpoint? Type y or n: ")
if loadModel == 'y':
    # Re-authenticate right before use: Colab Drive credentials expire.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    zip_id = input("Enter Gdrive file ID for tensorflow models: ")
    if not os.path.exists('checkpointsBook2VecCbowWindow1Downloaded'):
        os.makedirs('checkpointsBook2VecCbowWindow1Downloaded')
    # DOWNLOAD ZIP
    print ("Downloading zip file")
    myzip = drive.CreateFile({'id': zip_id})
    myzip.GetContentFile('model.zip')
    # UNZIP ZIP
    print ("Uncompressing zip file")
    with zipfile.ZipFile('model.zip', 'r') as zip_ref:
        zip_ref.extractall('checkpointsBook2VecCbowWindow1Downloaded/')
    print( os.getcwd() )
    print( os.listdir('./checkpointsBook2VecCbowWindow1Downloaded') )
Would you like to load a checkpoint? Type y or n: y Enter Gdrive file ID for tensorflow models: 14sVkBYW8SG9Rg9pjOE4KdO-8bwthgnFM Downloading zip file Uncompressing zip file /content ['checkpoint', 'Research2VecEmbedSize80.ckpt.data-00000-of-00001', 'Research2VecEmbedSize80.ckpt.meta', 'Research2VecEmbedSize80.ckpt.index']
# --- Training loop: run the graph, log loss, and periodically checkpoint to Drive ---
num_steps = 10000000
# Default to a fresh model if the checkpoint-download cell was never run.
if 'loadModel' not in locals() and 'loadModel' not in globals():
    loadModel = 'n'
uploadModel = drive.CreateFile() #used to upload checkpoints when graph is run
with tf.Session(graph=graph) as session:
    if loadModel == 'y':
        saver.restore(session, './checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt')
    else:
        tf.global_variables_initializer().run() #Don't initalize variables after a checkpoint has been restored
    print('Initialized')
    average_loss = 0
    saveIteration = 1
    for step in range(1, num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_inputs)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        # Report the running mean loss every 8000 steps, then reset it.
        if step % 8000 == 0:
            if step > 0:
                average_loss = average_loss / 8000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # Checkpoint + Drive upload every 50000 steps.
        if step % 50000 == 0:
            recEpoch_indexA = epoch_index - recEpoch_indexA #how much did the epoch_index since it was last checked
            # Replay the epoch delta into the persisted counter.
            for nE in range(0, recEpoch_indexA):
                session.run(update_epoch) #session run calls tend to be huge bottlenecks, keep in mind while determining the frequency
            recEpoch_indexA = epoch_index
            print('recEpoch_indexA is', recEpoch_indexA)
            print( 'epochCount.eval() is ', epochCount.eval() )
            print('epoch_index is ' , epoch_index)
            # NOTE(review): assumes ./checkpointsBook2Vec5Inputs exists -- TODO confirm.
            save_path = saver.save(session, "checkpointsBook2Vec5Inputs/Research2VecEmbedSize80.ckpt") #Save checkpoint
            auth.authenticate_user()
            gauth = GoogleAuth() #Gdrive authenticion code placed here since it expires after some time
            gauth.credentials = GoogleCredentials.get_application_default()
            drive = GoogleDrive(gauth)
            uploadModel = drive.CreateFile() #Need to also create drive object with updated authenticion
            chptName = 'Research2VecEmbedSize80' + str(saveIteration)
            zipfolder(chptName, 'checkpointsBook2Vec5Inputs')
            uploadModel.SetContentFile(chptName + ".zip")
            uploadModel.Upload()
            print("Checkpoint uploaded to Google Drive")
            saveIteration += 1
            os.remove(chptName + ".zip") #Remove checkpoint zip file after upload
INFO:tensorflow:Restoring parameters from ./checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt Initialized Completed 1 Epochs Completed 2 Epochs Completed 3 Epochs Completed 4 Epochs Completed 5 Epochs Completed 6 Epochs Completed 7 Epochs Completed 8 Epochs Completed 9 Epochs Completed 10 Epochs Completed 11 Epochs Average loss at step 8000: 0.015743 Completed 12 Epochs Completed 13 Epochs Completed 14 Epochs Completed 15 Epochs Completed 16 Epochs Completed 17 Epochs Completed 18 Epochs Completed 19 Epochs Completed 20 Epochs Completed 21 Epochs Completed 22 Epochs Average loss at step 16000: 0.015661 Completed 23 Epochs Completed 24 Epochs Completed 25 Epochs Completed 26 Epochs Completed 27 Epochs Completed 28 Epochs Completed 29 Epochs Completed 30 Epochs Completed 31 Epochs Completed 32 Epochs Completed 33 Epochs Average loss at step 24000: 0.015636 Completed 34 Epochs Completed 35 Epochs Completed 36 Epochs Completed 37 Epochs Completed 38 Epochs Completed 39 Epochs Completed 40 Epochs Completed 41 Epochs Completed 42 Epochs Completed 43 Epochs Completed 44 Epochs Average loss at step 32000: 0.015627 Completed 45 Epochs Completed 46 Epochs Completed 47 Epochs Completed 48 Epochs Completed 49 Epochs Completed 50 Epochs Completed 51 Epochs Completed 52 Epochs Completed 53 Epochs Completed 54 Epochs Completed 55 Epochs Average loss at step 40000: 0.015602 Completed 56 Epochs Completed 57 Epochs Completed 58 Epochs Completed 59 Epochs Completed 60 Epochs Completed 61 Epochs Completed 62 Epochs Completed 63 Epochs Completed 64 Epochs Completed 65 Epochs Completed 66 Epochs Average loss at step 48000: 0.015633 Completed 67 Epochs Completed 68 Epochs Completed 69 Epochs recEpoch_indexA is 69 epochCount.eval() is 2012 epoch_index is 69 Checkpoint uploaded to Google Drive Completed 70 Epochs Completed 71 Epochs Completed 72 Epochs Completed 73 Epochs