Notebook

Computation of embeddings of Wikipedia texts

Note: Embeddings computation for 11,020x2 texts takes 10,822 seconds on the EML4U experiment server.

That is roughly one text pair per second, so in 1 hour you can process around 60 x 60 = 3,600 text pairs.

In [ ]:

# Configuration: create embeddings for Wikipedia texts
# Runtime reference: embedding 11,020x2 texts took 10,822 seconds on the
# EML4U experiment server, i.e. around 3,600 = 60*60 text-pairs per hour.

# Directory of the current script
baseDir = "/home/eml4u/EML4U/notebooks/wikipedia-embeddings"

# File IDs (for input and output); alternative titles:
#title = "american-films"
#title = "british-films"
#title = "indian-films"
title = "living-people"
dateA, dateB = "20100408", "20201101"
idA = "-".join((dateA, title))
idB = "-".join((dateB, title))

# Input directories (one sub-directory per ID below the corpus directory)
corpusDir = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
dataDirA = corpusDir + idA + "/"
dataDirB = corpusDir + idB + "/"

# Output files (one embeddings file per ID, one shared file for the text IDs)
outDir = "/home/eml4u/EML4U/data/wikipedia-embeddings/"
fileEmbeddingsA = outDir + idA + ".txt"
fileEmbeddingsB = outDir + idB + ".txt"
fileIds = outDir + title + ".txt"

# Show the resolved paths, one per line
print(dataDirA, dataDirB, fileEmbeddingsA, fileEmbeddingsB, fileIds, sep="\n")

In [ ]:

# Get file paths
# glob.glob() returns its matches in arbitrary order. Sorting makes the order
# deterministic, so that index i of filesA and of filesB refers to the same
# file name whenever both directories contain the same set of file names.
import glob
filesA = sorted(glob.glob(dataDirA + '*.txt'))
filesB = sorted(glob.glob(dataDirB + '*.txt'))

In [ ]:

# Development helpers (both switches are disabled; set to True to enable)
# Switch 1: keep only the first 20 file paths of each list
if False:
    filesA = filesA[:20]
    filesB = filesB[:20]
# Switch 2: print all file paths, one per line, with an empty line between the two lists
if False:
    print(*filesA, sep='\n')
    print()
    print(*filesB, sep='\n')

In [ ]:

# Read files
def _read_texts(paths):
    """Return the complete content of every file in *paths*, in the same order.

    Each file is opened in text mode inside a ``with`` block, so it is closed
    as soon as it has been read (the previous ``fileobject.close`` without
    parentheses never actually closed anything).
    """
    texts = []
    for path in paths:
        # NOTE(review): uses the platform default encoding, as before --
        # confirm whether the corpus files should be read as UTF-8 explicitly.
        with open(path, "r") as fileobject:
            texts.append(fileobject.read())
    return texts


# textsA[i] / textsB[i] hold the content of filesA[i] / filesB[i]
textsA = _read_texts(filesA)
textsB = _read_texts(filesB)

In [ ]:

# Report how many texts were read into each list
sizeA = len(textsA)
print("len(textsA):", sizeA)
sizeB = len(textsB)
print("len(textsB):", sizeB)

# Optional (disabled): print the first text of each list
if False:
    print(textsA[0])
    print(textsB[0])

In [ ]:

# Ensure identical filenames at both points in time
import ntpath
namesA = [ntpath.basename(path) for path in filesA]
namesB = [ntpath.basename(path) for path in filesB]

# The IDs are the base names of the files in filesA, in list order
filenames = list(namesA)

# A different number of files means the lists cannot be paired completely;
# report it instead of failing with an IndexError (filesB shorter) or
# silently ignoring the surplus files (filesB longer).
if len(namesA) != len(namesB):
    print("Different number of files:", len(namesA), len(namesB))

# Report every position at which the two lists disagree on the file name
for x, (nameA, nameB) in enumerate(zip(namesA, namesB)):
    if nameA != nameB:
        print(x, nameA, nameB)
print("len(filenames):", len(filenames))

In [ ]:

# Set up the embedding model
import os
import sys

# Extend the module search path with the script directory before importing
# (presumably that is where the `embedding` module lives -- verify)
embeddingModuleDir = os.path.abspath(baseDir)
sys.path.append(embeddingModuleDir)
from embedding import BertHuggingface

# Number of classes; irrelevant if you don't want to retrain
NUM_CLASSES = 8
bert = BertHuggingface(NUM_CLASSES)

In [ ]:

# Compute the embeddings of both text lists and measure the wall-clock time
import time
print(time.asctime())
startTime = time.time()

embeddingsA = bert.embed(textsA)
embeddingsB = bert.embed(textsB)

elapsedSeconds = time.time() - startTime
print("Runtime: %s seconds" % elapsedSeconds)

# Shapes of the two results
shapeA = embeddingsA.shape
print("embeddingsA.shape:", shapeA)
shapeB = embeddingsB.shape
print("embeddingsB.shape:", shapeB)

In [ ]:

# Write embeddings/arrays to files
print(fileEmbeddingsA)
print(fileEmbeddingsB)
print(fileIds)

import os
import numpy

# Create the output directory if it does not exist yet. Without this, a
# missing directory makes the first write fail only after the long-running
# embedding computation has already finished.
os.makedirs(outDir, exist_ok=True)

# Embeddings are stored as plain text via numpy.savetxt
numpy.savetxt(fileEmbeddingsA, embeddingsA)
numpy.savetxt(fileEmbeddingsB, embeddingsB)

# IDs file: one file name per line (no trailing newline)
# presumably line i corresponds to row i of the embeddings -- verify that
# bert.embed() preserves the input order
with open(fileIds, "w") as outfile:
    outfile.write("\n".join(filenames))

In [ ]:

# Check: read the written files back and compare them with the in-memory data
if True:
    loadedA = numpy.loadtxt(fileEmbeddingsA)
    loadedB = numpy.loadtxt(fileEmbeddingsB)
    with open(fileIds) as idFile:
        loadedFilenames = idFile.read().splitlines()

    # Prints True for each pair whose content is equal
    comparisons = (
        (embeddingsA, loadedA),
        (embeddingsB, loadedB),
        (filenames, loadedFilenames),
    )
    for expected, restored in comparisons:
        print(numpy.array_equal(expected, restored))

    # Types of the original and of the loaded objects
    for inspected in (embeddingsA, loadedA, loadedFilenames):
        print(type(inspected))

In [ ]: