# Note: Computing embeddings for 11,020 text pairs (11,020 x 2 texts) takes 10,822 seconds on the EML4U experiment server.
# At roughly one second per text pair, about 3,600 (60 x 60) text pairs can be processed per hour.
# Create embeddings for wikipedia texts
# Note: Computing embeddings for 11,020 x 2 texts takes 10,822 seconds on the EML4U experiment server.
# That is roughly one second per text pair, i.e. around 3,600 (= 60 * 60) text pairs per hour.
# Location of the current script (appended to sys.path further below so the local embedding module can be imported)
baseDir = "/home/eml4u/EML4U/notebooks/wikipedia-embeddings"
# Title part of the file IDs; exactly one assignment is active at a time
#title = "american-films"
#title = "british-films"
#title = "indian-films"
title = "living-people"
# The two date prefixes; an ID has the form <date>-<title>
dateA = "20100408"
dateB = "20201101"
idA = "-".join([dateA, title])
idB = "-".join([dateB, title])
# Input directories: one sub-directory per ID below the corpus directory
corpusDir = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
dataDirA = corpusDir + idA + "/"
dataDirB = corpusDir + idB + "/"
# Output files: one embeddings file per ID plus one shared file for the file names
outDir = "/home/eml4u/EML4U/data/wikipedia-embeddings/"
fileEmbeddingsA = outDir + idA + ".txt"
fileEmbeddingsB = outDir + idB + ".txt"
fileIds = outDir + title + ".txt"
# Echo the resolved configuration, one path per line
print(dataDirA, dataDirB, fileEmbeddingsA, fileEmbeddingsB, fileIds, sep="\n")
# Get file paths
import glob
# glob.glob returns matches in arbitrary order. The lists are sorted because
# later steps pair filesA[i] with filesB[i] by index; the pairing is only
# meaningful if both directories contain the same file names.
filesA = sorted(glob.glob(dataDirA + '*.txt'))
filesB = sorted(glob.glob(dataDirB + '*.txt'))
# Development
# Limit number of file paths (set the condition to True for a quick test run)
if False:
    filesA = filesA[:20]
    filesB = filesB[:20]
# Print file paths (debugging aid, disabled by default)
if False:
    print('\n'.join(map(str, filesA)))
    print()
    print('\n'.join(map(str, filesB)))
# Read files
def _read_text(path):
    """Return the complete content of the text file at *path*."""
    # The context manager guarantees the handle is closed, even if read() raises.
    with open(path, "r") as fileobject:
        return fileobject.read()


# One text per path, in the same order as the path lists
textsA = [_read_text(filename) for filename in filesA]
textsB = [_read_text(filename) for filename in filesB]
# Print text sizes / texts
print("len(textsA):", len(textsA))
print("len(textsB):", len(textsB))
# Print the first text of each list (debugging aid, disabled by default)
if False:
    print(textsA[0])
    print(textsB[0])
# Ensure similar filenames in both points of time
import ntpath
# The two lists are compared index by index, so a differing number of files
# is reported explicitly before the names are compared.
if len(filesA) != len(filesB):
    print("File count mismatch:", len(filesA), len(filesB))
# Collect the base names of filesA and print every index whose name differs in filesB
filenames = []
for x, pathA in enumerate(filesA):
    nameA = ntpath.basename(pathA)
    filenames.append(nameA)
    # Raises IndexError if filesB has fewer entries than filesA.
    nameB = ntpath.basename(filesB[x])
    if nameA != nameB:
        print(x, nameA, nameB)
print("len(filenames):", len(filenames))
# Prepare embeddings
import os
import sys
# Put baseDir on the module search path so that "embedding" can be imported from there
embeddingModuleDir = os.path.abspath(baseDir)
sys.path.append(embeddingModuleDir)
from embedding import BertHuggingface
NUM_CLASSES = 8 # irrelevant if you dont want to retrain
bert = BertHuggingface(NUM_CLASSES)
# Create embeddings for both text lists and report the total runtime
import time
print(time.asctime())  # start timestamp
startTime = time.time()
embeddingsA = bert.embed(textsA)
embeddingsB = bert.embed(textsB)
elapsedSeconds = time.time() - startTime
print("Runtime: %s seconds" % elapsedSeconds)
# Report the shape of each result
for label, embeddings in (("embeddingsA.shape:", embeddingsA),
                          ("embeddingsB.shape:", embeddingsB)):
    print(label, embeddings.shape)
# Write embeddings/arrays to files
import os
import numpy
print(fileEmbeddingsA)
print(fileEmbeddingsB)
print(fileIds)
# Create the output directory if it does not exist yet, so that the
# embeddings computed above are not lost to a missing directory.
os.makedirs(outDir, exist_ok=True)
numpy.savetxt(fileEmbeddingsA, embeddingsA)
numpy.savetxt(fileEmbeddingsB, embeddingsB)
# One file name per line, in the order of filesA.
# NOTE(review): presumably row i of each embeddings file belongs to line i;
# this depends on bert.embed preserving the input order - confirm.
with open(fileIds, "w") as outfile:
    outfile.write("\n".join(filenames))
# Check: Load arrays and file names again and compare them with the in-memory data
if True:
    loadedA = numpy.loadtxt(fileEmbeddingsA)
    loadedB = numpy.loadtxt(fileEmbeddingsB)
    with open(fileIds) as idsFile:
        loadedFilenames = idsFile.read().splitlines()
    # Each comparison prints True if the reloaded data equals the original
    roundTrips = (
        (embeddingsA, loadedA),
        (embeddingsB, loadedB),
        (filenames, loadedFilenames),
    )
    for inMemory, reloaded in roundTrips:
        print(numpy.array_equal(inMemory, reloaded))
    # Show the types before and after the round trip
    for value in (embeddingsA, loadedA, loadedFilenames):
        print(type(value))