# Configuration
# Directory of the source texts, see
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/
source_texts_directory = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
# Directory of the embeddings, see
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/
embeddings_directory = "/home/eml4u/EML4U/data/wikipedia-embeddings/"

# Points of time (first part of every file id)
id_a = "20100408"
id_b = "20201101"

# Category ids (second part of every file id)
id_american = "american-films"
id_british = "british-films"
id_indian = "indian-films"

# File ids, built as "<point of time>-<category id>"
id_american_a = f"{id_a}-{id_american}"
id_american_b = f"{id_b}-{id_american}"
id_british_a = f"{id_a}-{id_british}"
id_british_b = f"{id_b}-{id_british}"
id_indian_a = f"{id_a}-{id_indian}"
id_indian_b = f"{id_b}-{id_indian}"
# Imports
# The library versions are printed directly after the respective import.
import numpy
print("numpy: {}".format(numpy.version.version))
import sklearn
import sklearn.metrics
print("sklearn: {}".format(sklearn.__version__))

# Data access (wp texts, pre-computed embeddings) through one class instance
import data_access
data_accessor = data_access.DataAccess(
    source_texts_directory,
    embeddings_directory,
)
numpy: 1.19.2 sklearn: 0.23.2
# Load embeddings of the British films, first for point of time A, then for B
embeddings_british_a, embeddings_british_b = (
    data_accessor.load_embeddings(file_id)
    for file_id in (id_british_a, id_british_b)
)
print()
# Compute means
def get_mean(embeddings, note="", printinfo=True):
    """Return the mean of `embeddings` computed along axis 0.

    Args:
        embeddings: Data passed to `numpy.mean`.
        note: Text appended to the info line.
        printinfo: If true, print type and shape of the result followed
            by `note`.

    Returns:
        The result of `numpy.mean(embeddings, axis=0)`.
    """
    result = numpy.mean(embeddings, axis=0)
    if not printinfo:
        return result
    print(" ".join((str(type(result)), str(result.shape), note)))
    return result
# Mean embeddings of the British films at both points of time
mean_british_a = get_mean(embeddings_british_a, note="BritishA")
mean_british_b = get_mean(embeddings_british_b, note="BritishB")
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt (2147, 768) <class 'numpy.ndarray'> /home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt (2147, 768) <class 'numpy.ndarray'> <class 'numpy.ndarray'> (768,) BritishA <class 'numpy.ndarray'> (768,) BritishB
# Cosine similarity
def get_pairwise_cosine_similarity(a, b, note="", printinfo=True):
    """Return entry [0][0] of the pairwise cosine similarities of `a` and `b`.

    The similarities are computed with
    `sklearn.metrics.pairwise.cosine_similarity`; only the value for the
    first sample of `a` and the first sample of `b` is returned. The callers
    in this file pass single embeddings reshaped with `reshape(1, -1)`.

    Args:
        a: First input; must provide a `shape` attribute if `printinfo` is true.
        b: Second input; must provide a `shape` attribute if `printinfo` is true.
        note: Text printed after the similarity value.
        printinfo: If true, print types and shapes of the inputs before the
            computation and the similarity plus `note` afterwards.

    Returns:
        The similarity value at position [0][0].
    """
    if printinfo:
        print(
            str(type(a)) + " " + str(a.shape),
            str(type(b)) + " " + str(b.shape),
            sep="\n",
        )
    similarity_matrix = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)
    similarity = similarity_matrix[0][0]
    if printinfo:
        print(" ".join((str(similarity), note)))
    return similarity
# Cosine similarities as lists of tuples (index, similarity), plus the same
# lists sorted ascending by similarity (smallest similarity first).
#
# The loops run over the embeddings, i.e. one entry per text. The mean is
# computed along axis 0 and therefore has one entry per embedding dimension,
# so its length must not be used as the number of texts: doing so restricts
# the comparison to the first <dimension> texts.

# Single embeddings of A compared to the mean of A
similarities_a = [
    (
        i,
        get_pairwise_cosine_similarity(
            mean_british_a.reshape(1, -1), embedding.reshape(1, -1), "", False
        ),
    )
    for i, embedding in enumerate(embeddings_british_a)
]
smallest_similarities_a = sorted(similarities_a, key=lambda tup: tup[1])

# Single embeddings of B compared to the mean of B
similarities_b = [
    (
        i,
        get_pairwise_cosine_similarity(
            mean_british_b.reshape(1, -1), embedding.reshape(1, -1), "", False
        ),
    )
    for i, embedding in enumerate(embeddings_british_b)
]
smallest_similarities_b = sorted(similarities_b, key=lambda tup: tup[1])

# Embeddings of A compared directly to the embeddings of B at the same index
similarities_direct = [
    (
        i,
        get_pairwise_cosine_similarity(
            embedding_a.reshape(1, -1), embeddings_british_b[i].reshape(1, -1), "", False
        ),
    )
    for i, embedding_a in enumerate(embeddings_british_a)
]
smallest_similarities_direct = sorted(similarities_direct, key=lambda tup: tup[1])
# Differences of arrays as one value
def differenceValue(a, b):
    """Return the sum of the absolute element-wise differences of `a` and `b`.

    The computation is vectorized with NumPy instead of looping over the
    elements in Python. The sum is taken along the first axis, so for
    one-dimensional inputs the result is a single value.

    Args:
        a: Array-like of numbers.
        b: Array-like of numbers with the same shape as `a`.

    Returns:
        The summed absolute differences (0 for empty inputs).

    Raises:
        ValueError: If `a` and `b` do not have the same shape. Without this
            check surplus elements would be ignored or broadcast silently.
    """
    a = numpy.asarray(a)
    b = numpy.asarray(b)
    if a.shape != b.shape:
        raise ValueError(
            "shape mismatch: " + str(a.shape) + " vs. " + str(b.shape)
        )
    return numpy.abs(a - b).sum(axis=0)
# Test .1 + .2 + .3 + .4 = 1
if True:
    print(
        differenceValue(
            numpy.array([1, 2, 3, 0]),
            numpy.array([1.1, 2.2, 3.3, -0.4]),
        )
    )
# Compute difference values between embeddings of single texts and mean-embeddings
# Tuples:
# [0] index (to look up source texts)
# [1] difference to mean t1 (== A)
# [2] difference to mean t2 (== B)
#
# The loops run over the embeddings (one entry per text). The means are
# computed along axis 0 and have one entry per embedding dimension, so their
# length must not be used as the number of texts.
differences = [
    (
        i,
        differenceValue(mean_british_a, embedding_a),
        differenceValue(mean_british_b, embeddings_british_b[i]),
    )
    for i, embedding_a in enumerate(embeddings_british_a)
]
# Sort by largest difference
largest_differences_a = sorted(differences, key=lambda tup: tup[1], reverse=True)
largest_differences_b = sorted(differences, key=lambda tup: tup[2], reverse=True)

# Explore embeddings of two points of time directly
# Tuples: (index, difference between embedding of A and embedding of B)
differences_direct = [
    (i, differenceValue(embedding_a, embeddings_british_b[i]))
    for i, embedding_a in enumerate(embeddings_british_a)
]
# Sort
largest_differences_direct = sorted(differences_direct, key=lambda tup: tup[1], reverse=True)
# Print source texts
def print_source_text(directory, category_id, index):
    """Print category id, index and source text of one text.

    The file name is looked up with
    `data_accessor.get_embeddings_dict_filename(category_id, index)`; the
    text is then read with `data_accessor.read_source_text(directory, ...)`
    and printed, framed by empty lines.

    Args:
        directory: First argument passed on to `read_source_text`.
        category_id: Category id (string) used for the file name lookup.
        index: Index used for the file name lookup.
    """
    print()
    print("Category: " + category_id)
    print("Index: {}".format(index))
    filename = data_accessor.get_embeddings_dict_filename(category_id, index)
    print("File: ")
    source_text = data_accessor.read_source_text(directory, filename)
    print(source_text)
    print()
1.0
# Print the first three and the last two entries of every ranking.
# Each row consists of: heading, ranking, text of the closing print
# ("" results in one empty line, "\n" in two empty lines).
for heading, ranking, closing in (
    ("Smallest cosine similarity to mean of A", smallest_similarities_a, ""),
    ("Largest difference values to mean of A", largest_differences_a, "\n"),
    ("Smallest cosine similarity to mean of B", smallest_similarities_b, ""),
    ("Largest difference values to mean of B", largest_differences_b, "\n"),
    ("Smallest cosine similarity (direct)", smallest_similarities_direct, ""),
    ("Largest difference values (direct)", largest_differences_direct, ""),
):
    print(heading)
    for position in (0, 1, 2):
        print(ranking[position])
    print("...")
    for position in (-2, -1):
        print(ranking[position])
    print(closing)
Smallest cosine similarity to mean of A (721, 0.8095643688026439) (333, 0.8195543132287988) (680, 0.820176887350935) ... (391, 0.9703187666616886) (393, 0.9757224095034057) Largest difference values to mean of A (721, 127.63656949018497, 63.07215986661895) (333, 126.23330120916125, 126.63447396310393) (680, 122.11133584967376, 129.0560627009414) ... (391, 40.37050334666524, 45.36463767773769) (393, 39.05389511333612, 56.23254257812948) Smallest cosine similarity to mean of B (680, 0.7901596223863818) (333, 0.8048592466102065) (179, 0.806587230956004) ... (334, 0.973834448611832) (309, 0.9741528939831201) Largest difference values to mean of B (179, 114.6978323360855, 131.07256570494872) (680, 122.11133584967376, 129.0560627009414) (381, 88.82879530042067, 127.66585960996093) ... (334, 51.76971304423001, 39.533962588139545) (309, 56.43851055612732, 39.04979437720267) Smallest cosine similarity (direct) (1047, 0.7142031344871039) (864, 0.7232851659170279) (1442, 0.7256858182916857) ... (1604, 0.999699356760097) (1635, 0.9997250982866731) Largest difference values (direct) (721, 152.96771019252628) (610, 141.938466045307) (409, 141.68865489200107) ... (422, 5.0086785865423735) (435, 4.515663030353608)
# Explore underlying texts (Largest difference values to mean of A)
# The text is printed for both points of time; switch the conditions to
# select the entry to inspect.
if False:
    print("721: As expected")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, largest_differences_a[0][0])
if True:
    print("333: Very similar, even if difference to mean of A is large")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, largest_differences_a[1][0])
333: Very similar, even if difference to mean of A is large Category: british-films Index: 333 File: /home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20100408-british-films/The_Tide_of_Traffic.txt The Tide of Traffic is a 1972 short documentary film directed by Derek Williams. It was nominated for an Academy Award for Best Documentary Short. References External links - Category:1972 films Category:British films Category:English-language films Category:British documentary films Category:Short films Category: british-films Index: 333 File: /home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20201101-british-films/The_Tide_of_Traffic.txt The Tide of Traffic is a 1972 British short documentary film directed by Derek Williams. It was nominated for an Academy Award for Best Documentary Short. References External links - Watch The Tide of Traffic at BP Video Library - Category:1972 films Category:1972 documentary films Category:1972 short films Category:British films Category:English-language films Category:British documentary films Category:Short documentary films
# Explore underlying texts (Largest difference values to mean of B)
# The text is printed for both points of time; switch the conditions to
# select the entry to inspect.
if True:
    print("179: Very similar, even if difference to mean of B is large")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, largest_differences_b[0][0])
if False:
    print("680: Very similar, even if difference to mean of B is large")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, largest_differences_b[1][0])
179: Very similar, even if difference to mean of B is large Category: british-films Index: 179 File: /home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20100408-british-films/The_World_Is_Rich.txt The World Is Rich is a 1947 documentary film directed by Paul Rotha. It was nominated for an Academy Award for Best Documentary Feature. References External links - fr:The World Is Rich Category:1947 films Category:British films Category:English-language films Category:British documentary films Category:Black-and-white films Category:Films directed by Paul Rotha Category: british-films Index: 179 File: /home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/20201101-british-films/The_World_Is_Rich.txt The World Is Rich is a 1947 British documentary film directed by Paul Rotha. It was nominated for an Academy Award for Best Documentary Feature.. References External links - Category:1947 films Category:1947 documentary films Category:British films Category:English-language films Category:British documentary films Category:Black-and-white documentary films Category:Films directed by Paul Rotha Category:British black-and-white films
# Explore underlying texts (direct difference values)
# First entry of the ranking, i.e. the largest direct difference
if False:
    print("721: As expected, different")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, largest_differences_direct[0][0])
# Explore underlying texts (direct difference values)
# Last entry of the ranking, i.e. the smallest direct difference
if False:
    print("435: As expected, similar")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, largest_differences_direct[-1][0])
# Explore underlying texts (direct similarity values)
# First and second entry of the ranking, i.e. the smallest direct similarities
if False:
    print("1047: As expected, different")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, smallest_similarities_direct[0][0])
if False:
    print("864: As expected, different")
    for file_id in (id_british_a, id_british_b):
        print_source_text(file_id, id_british, smallest_similarities_direct[1][0])