import math
import numpy as np
import random
import zipfile
import os
import tensorflow as tf
import pandas as pd
import pickle
!pip install -U -q PyDrive
from google.colab import files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from numpy import genfromtxt
# --- Google Drive / Colab authentication and title-dictionary download ---
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

vocabulary_size = 1666577  # number of distinct title ids -- presumably matches the embedding table; TODO confirm
# tf.logging.set_verbosity(tf.logging.ERROR)
os.remove('adc.json')  # drop the application-default-credentials file written during auth

dl_id = input("Enter Gdrive file ID for Title dictionary: ") # 9-10-18 1mdF0JfrzbOxeHD26JaUW8KEfIdMEHzQG
thefile = drive.CreateFile({'id': dl_id})
thefile.GetContentFile('titleDict.pickle')
# NOTE(review): pickle.load executes arbitrary code from the downloaded file;
# only safe because the Drive file id is our own.
with open('titleDict.pickle', 'rb') as handle:
    bookDictionary = pickle.load(handle)
Enter Gdrive file ID for Title dictionary: 1mdF0JfrzbOxeHD26JaUW8KEfIdMEHzQG
# --- Download the training data: a 2-column object array of [id, list-of-ids] rows ---
dl_id = input("Enter Gdrive file ID for Data ") # 9-10-18 1RHVwT1slwbhPlNTTF1JKS7agc4hQSm5c
myDownload = drive.CreateFile({'id': dl_id})
myDownload.GetContentFile('Data.npy')
# The rows hold Python list objects, so this .npy is a pickled object array:
# NumPy >= 1.16.3 refuses to load it unless allow_pickle=True is passed.
my_data = np.load('Data.npy', allow_pickle=True)
print(my_data[0:15])
Enter Gdrive file ID for Data 1RHVwT1slwbhPlNTTF1JKS7agc4hQSm5c [[0 list([])] [1 list([421089, 510776, 6403, 1554618, 1451018, 1448326, 1411539, 734702, 263668, 276186, 374145, 712335, 1540518, 732154, 1256014, 370711])] [2 list([])] [3 list([896236, 552833, 290985, 744122, 660888, 1492583, 324439, 1497464, 906952, 890270, 800459, 656974, 464637, 432398, 672494, 1501784, 1551199, 169770, 880915, 1257202, 1647789, 431318, 167368, 1309706, 645636, 1589247, 952101, 1594224, 566783, 1020670, 1530466, 572983, 393055, 923629, 1349376, 455838, 168364, 1419708, 670762, 64953])] [4 list([377701, 646875, 1527223, 458740, 1022675, 668690, 910689, 951671, 717587, 1655779, 670477, 66465, 374116, 450320, 83567, 863721, 1328431, 1585189, 1439964])] [5 list([1394328, 658435, 1338541, 1024419, 1193128, 1416126, 600891, 1133836, 1502110, 38954, 200361, 1271103, 914246, 580300, 337729, 316423, 1631441, 75283, 153695, 294419, 904711, 234803, 341096, 350848, 344889, 146171, 610828, 475984, 462863, 768574, 1060750, 753854, 355396, 457861, 1159063, 1074007, 919943, 1045192, 550452])] [6 list([1590967])] [7 list([731413, 371576, 101514, 291861, 668641, 812990, 457315, 1428604, 216222, 313539, 475783, 1384755, 1426847, 1612089, 124271, 1259377, 1209643, 994466, 1437081, 300318, 1432000])] [8 list([537201, 743717, 194785, 886957, 877387, 405472, 145841, 662184])] [9 list([727615, 1033127, 1488761, 205826, 1278175, 1406008, 546451, 739509, 1412014, 1628720, 797494, 381440, 738525, 103954, 1293419, 778810, 292339, 906068])] [10 list([767034, 632192, 943392, 1444320, 136613, 891973, 1497365, 1580850, 305850, 1000807, 1449216, 1476570, 301317, 1500249, 1262399, 501012, 1115942, 1058776, 1447436, 1357729, 1592057, 1498628, 1618410, 987861, 1504522])] [11 list([260217, 735748, 576441, 596114])] [12 list([1139960, 983105, 295417, 557677, 1252174, 697297, 881771, 1211343, 863739, 714478, 1219226, 935298, 1043281, 1229931, 839873, 1153430, 1080857, 1654324, 137984, 1025220, 696853, 570840, 1590806, 
1351588, 595129, 964004, 1472538, 239877])] [13 list([])] [14 list([1255078, 525049, 1481675, 1231620, 894550, 127476, 384389, 737607, 1651253, 771448, 284807, 1127559, 213372, 687169, 1480889, 1395063, 369818, 1454291])]]
data_index = 0      # cursor into my_data; wraps at the end of the array
epoch_index = 0     # full passes over my_data completed so far
recEpoch_indexA = 0 # Used to help keep store of the total number of epoches with the models

def generate_batch(batch_size, inputCount, data=None):
    """Build one training batch from the [id, list-of-ids] rows.

    Walks `data` (defaults to the module-level ``my_data``) row by row;
    every row whose list holds at least ``inputCount`` distinct ids yields
    one example: ``inputCount`` ids sampled without replacement as inputs,
    with the row's own id as the label.  Rows with too few distinct ids
    are skipped.  Advances the global ``data_index`` cursor and increments
    ``epoch_index`` (with a progress print) each time the cursor wraps.

    Args:
        batch_size: number of (inputs, label) examples per batch.
        inputCount: number of input ids per label.
        data: optional 2-column object array of [id, list-of-ids] rows;
            falls back to the global ``my_data`` when omitted.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size, inputCount)
        and (batch_size, 1).
    """
    global data_index, epoch_index
    if data is None:
        data = my_data
    batch = np.ndarray(shape=(batch_size, inputCount), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    n = 0
    # NOTE(review): loops forever if no row has >= inputCount distinct ids.
    while n < batch_size:
        candidates = set(data[data_index, 1])
        if len(candidates) >= inputCount:
            labels[n, 0] = data[data_index, 0]
            # random.sample() no longer accepts a set (TypeError on
            # Python >= 3.11); sample from a sorted list instead.
            batch[n] = random.sample(sorted(candidates), inputCount)
            n = n + 1
        # Advance the cursor exactly once per row whether or not it produced
        # an example (the original duplicated this logic in both branches).
        data_index = (data_index + 1) % len(data)
        if data_index == 0:
            epoch_index = epoch_index + 1
            print('Completed %d Epochs' % epoch_index)
    return batch, labels
# Quick sanity check: draw one 20-example batch with 4 inputs per label.
here, goes = generate_batch(20, 4) # to do next, insert %len(headernumber)
for tag, arr in (('batch', here), ('labels', goes)):
    print(tag, arr)
batch [[1540518 712335 276186 510776] [1020670 890270 656974 167368] [ 83567 377701 910689 646875] [1271103 75283 344889 475984] [1384755 313539 994466 457315] [ 145841 877387 194785 886957] [1412014 1628720 797494 1278175] [1476570 1449216 1357729 1592057] [ 260217 735748 596114 576441] [1252174 1043281 935298 570840] [ 284807 1127559 1231620 1395063] [ 425605 1199985 503766 1177226] [ 883387 1249697 1369264 1606440] [ 143641 823302 1020170 1480253] [ 633950 69171 905572 319694] [ 145147 1299803 1441307 381248] [ 391622 1203282 1594386 1482127] [ 205375 1665861 400547 692811] [ 531508 22134 760494 1454629] [1628449 474993 1129303 875062]] labels [[ 1] [ 3] [ 4] [ 5] [ 7] [ 8] [ 9] [10] [11] [12] [14] [15] [17] [18] [19] [20] [21] [22] [23] [24]]
# --- Hyperparameters and TF1 graph definition (CBOW-style sampled softmax) ---
batch_size = 2048 #2^8
embedding_size = 80 # 2^8 Dimension of the embedding vector.
num_inputs = 4
num_sampled = 128 # Number of negative examples to sample.

graph = tf.Graph()
with graph.as_default(): #took out " , tf.device('/cpu:0')"
    # One row of num_inputs context ids per example, one label id each.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size, num_inputs])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # Persisted epoch counter so the total number of epochs survives checkpoints.
    epochCount = tf.get_variable('epochCount', initializer=0)
    update_epoch = tf.assign(epochCount, epochCount + 1)
    embeddings = tf.get_variable(
        'embeddings',
        initializer=tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.get_variable(
        'softmax_weights',
        initializer=tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 / math.sqrt(embedding_size)))
    # Biases are frozen at zero (trainable=False).
    softmax_biases = tf.get_variable(
        'softmax_biases', initializer=tf.zeros([vocabulary_size]), trainable=False)
    # Look up all inputs, flatten to (batch*num_inputs, emb), then average
    # each consecutive group of num_inputs rows back to one vector per example.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    embed_reshaped = tf.reshape(embed, [batch_size * num_inputs, embedding_size])
    segments = np.arange(batch_size).repeat(num_inputs)
    averaged_embeds = tf.segment_mean(embed_reshaped, segments, name=None)
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases,
                                   inputs=averaged_embeds, labels=train_labels,
                                   num_sampled=num_sampled,
                                   num_classes=vocabulary_size))
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) #Original learning rate was 1.0
    saver = tf.train.Saver()
def zipfolder(foldername, target_dir):
    """Zip the contents of `target_dir` (recursively) into `foldername`.zip
    in the current working directory, storing members relative to target_dir.
    """
    rootlen = len(target_dir) + 1  # strip "<target_dir>/" prefix from member names
    # Use a context manager: the original never closed the ZipFile, so the
    # central directory could be left unwritten and the archive unreadable.
    with zipfile.ZipFile(foldername + '.zip', 'w', zipfile.ZIP_DEFLATED) as zipobj:
        for base, dirs, files in os.walk(target_dir):
            for file in files:
                fn = os.path.join(base, file)
                zipobj.write(fn, fn[rootlen:])
# --- Optionally download and unpack a previous checkpoint from Google Drive ---
loadModel = input("Would you like to load a checkpoint? Type y or n: ")
if loadModel == 'y':
    # Re-authenticate right before use: Colab Drive credentials expire.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    zip_id = input("Enter Gdrive file ID for tensorflow models: ")
    if not os.path.exists('checkpointsBook2VecCbowWindow1Downloaded'):
        os.makedirs('checkpointsBook2VecCbowWindow1Downloaded')
    # DOWNLOAD ZIP
    print ("Downloading zip file")
    myzip = drive.CreateFile({'id': zip_id})
    myzip.GetContentFile('model.zip')
    # UNZIP ZIP
    print ("Uncompressing zip file")
    with zipfile.ZipFile('model.zip', 'r') as zip_ref:
        zip_ref.extractall('checkpointsBook2VecCbowWindow1Downloaded/')
    print( os.getcwd() )
    print( os.listdir('./checkpointsBook2VecCbowWindow1Downloaded') )
Would you like to load a checkpoint? Type y or n: y Enter Gdrive file ID for tensorflow models: 14sVkBYW8SG9Rg9pjOE4KdO-8bwthgnFM Downloading zip file Uncompressing zip file /content ['checkpoint', 'Research2VecEmbedSize80.ckpt.data-00000-of-00001', 'Research2VecEmbedSize80.ckpt.meta', 'Research2VecEmbedSize80.ckpt.index']
# --- Training loop: run the graph, log loss, and periodically checkpoint to Drive ---
num_steps = 10000000
# Default to a fresh model if the checkpoint-download cell was never run.
if 'loadModel' not in locals() and 'loadModel' not in globals():
    loadModel = 'n'
uploadModel = drive.CreateFile() #used to upload checkpoints when graph is run
with tf.Session(graph=graph) as session:
    if loadModel == 'y':
        saver.restore(session, './checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt')
    else:
        tf.global_variables_initializer().run() #Don't initalize variables after a checkpoint has been restored
    print('Initialized')
    average_loss = 0
    saveIteration = 1
    for step in range(1, num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_inputs)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        # Report the running mean loss every 8000 steps, then reset it.
        if step % 8000 == 0:
            if step > 0:
                average_loss = average_loss / 8000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # Checkpoint + Drive upload every 50000 steps.
        if step % 50000 == 0:
            recEpoch_indexA = epoch_index - recEpoch_indexA #how much did the epoch_index since it was last checked
            # Replay the epoch delta into the persisted counter.
            for nE in range(0, recEpoch_indexA):
                session.run(update_epoch) #session run calls tend to be huge bottlenecks, keep in mind while determining the frequency
            recEpoch_indexA = epoch_index
            print('recEpoch_indexA is', recEpoch_indexA)
            print( 'epochCount.eval() is ', epochCount.eval() )
            print('epoch_index is ' , epoch_index)
            # NOTE(review): assumes ./checkpointsBook2Vec5Inputs exists -- TODO confirm.
            save_path = saver.save(session, "checkpointsBook2Vec5Inputs/Research2VecEmbedSize80.ckpt") #Save checkpoint
            auth.authenticate_user()
            gauth = GoogleAuth() #Gdrive authenticion code placed here since it expires after some time
            gauth.credentials = GoogleCredentials.get_application_default()
            drive = GoogleDrive(gauth)
            uploadModel = drive.CreateFile() #Need to also create drive object with updated authenticion
            chptName = 'Research2VecEmbedSize80' + str(saveIteration)
            zipfolder(chptName, 'checkpointsBook2Vec5Inputs')
            uploadModel.SetContentFile(chptName + ".zip")
            uploadModel.Upload()
            print("Checkpoint uploaded to Google Drive")
            saveIteration += 1
            os.remove(chptName + ".zip") #Remove checkpoint zip file after upload
INFO:tensorflow:Restoring parameters from ./checkpointsBook2VecCbowWindow1Downloaded/Research2VecEmbedSize80.ckpt Initialized Completed 1 Epochs Completed 2 Epochs Completed 3 Epochs Completed 4 Epochs Completed 5 Epochs Completed 6 Epochs Completed 7 Epochs Completed 8 Epochs Completed 9 Epochs Completed 10 Epochs Completed 11 Epochs Average loss at step 8000: 0.015743 Completed 12 Epochs Completed 13 Epochs Completed 14 Epochs Completed 15 Epochs Completed 16 Epochs Completed 17 Epochs Completed 18 Epochs Completed 19 Epochs Completed 20 Epochs Completed 21 Epochs Completed 22 Epochs Average loss at step 16000: 0.015661 Completed 23 Epochs Completed 24 Epochs Completed 25 Epochs Completed 26 Epochs Completed 27 Epochs Completed 28 Epochs Completed 29 Epochs Completed 30 Epochs Completed 31 Epochs Completed 32 Epochs Completed 33 Epochs Average loss at step 24000: 0.015636 Completed 34 Epochs Completed 35 Epochs Completed 36 Epochs Completed 37 Epochs Completed 38 Epochs Completed 39 Epochs Completed 40 Epochs Completed 41 Epochs Completed 42 Epochs Completed 43 Epochs Completed 44 Epochs Average loss at step 32000: 0.015627 Completed 45 Epochs Completed 46 Epochs Completed 47 Epochs Completed 48 Epochs Completed 49 Epochs Completed 50 Epochs Completed 51 Epochs Completed 52 Epochs Completed 53 Epochs Completed 54 Epochs Completed 55 Epochs Average loss at step 40000: 0.015602 Completed 56 Epochs Completed 57 Epochs Completed 58 Epochs Completed 59 Epochs Completed 60 Epochs Completed 61 Epochs Completed 62 Epochs Completed 63 Epochs Completed 64 Epochs Completed 65 Epochs Completed 66 Epochs Average loss at step 48000: 0.015633 Completed 67 Epochs Completed 68 Epochs Completed 69 Epochs recEpoch_indexA is 69 epochCount.eval() is 2012 epoch_index is 69 Checkpoint uploaded to Google Drive Completed 70 Epochs Completed 71 Epochs Completed 72 Epochs Completed 73 Epochs