# Configuration
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/
source_texts_directory = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/
embeddings_directory = "/home/eml4u/EML4U/data/wikipedia-embeddings/"
# points in time
id_a = "20100408"
id_b = "20201101"
# category ids
id_american = "american-films"
id_british = "british-films"
id_indian = "indian-films"
# file ids
id_american_a = id_a + "-" + id_american
id_american_b = id_b + "-" + id_american
id_british_a = id_a + "-" + id_british
id_british_b = id_b + "-" + id_british
id_indian_a = id_a + "-" + id_indian
id_indian_b = id_b + "-" + id_indian
# 11020 american-films.txt
# 2147 british-films.txt
# 3596 indian-films.txt
execute_american = True
# Imports
import numpy
print("numpy: " + numpy.version.version)
import sklearn
import sklearn.metrics
print("sklearn: " + sklearn.__version__)
# Class instance to access data (wp texts, pre-computed embeddings)
import data_access
data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)
numpy: 1.19.2
sklearn: 0.23.2
# Load embeddings
if execute_american:
    embeddings_american_a = data_accessor.load_embeddings(id_american_a)
    embeddings_american_b = data_accessor.load_embeddings(id_american_b)
embeddings_british_a = data_accessor.load_embeddings(id_british_a)
embeddings_british_b = data_accessor.load_embeddings(id_british_b)
embeddings_indian_a = data_accessor.load_embeddings(id_indian_a)
embeddings_indian_b = data_accessor.load_embeddings(id_indian_b)
print()
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-american-films.txt (11020, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-american-films.txt (11020, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt (2147, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt (2147, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-indian-films.txt (3596, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-indian-films.txt (3596, 768) <class 'numpy.ndarray'>
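# Note: data_access.DataAccess is a project-specific helper whose implementation is not shown
# in this notebook. Below is a minimal sketch of what load_embeddings might look like, assuming
# each *.txt file stores one whitespace-separated 768-dimensional vector per article; the file
# layout and the function are assumptions, not the actual code.
import os

def load_embeddings_sketch(directory, file_id):
    # Hypothetical loader: reads e.g. "20100408-american-films.txt" into a
    # (number_of_articles, 768) numpy array and prints the same info as above.
    path = os.path.join(directory, file_id + ".txt")
    embeddings = numpy.loadtxt(path)
    print(path, embeddings.shape, type(embeddings))
    return embeddings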
# Cosine similarity
def get_pairwise_cosine_similarity(a, b, note="", printinfo=True):
    if printinfo:
        print(str(type(a)) + " " + str(a.shape) + "\n" + str(type(b)) + " " + str(b.shape))
    cos_sim = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)[0][0]
    if printinfo:
        print(str(cos_sim) + " " + note)
    return cos_sim
# Sums up the pairwise cosine similarities of texts from 2 points in time and divides the sum by the number of texts
def get_mean_cosine_similarity(a, b, note="", printinfo=True):
    sum_ = 0
    for i in range(len(a)):
        sum_ += sklearn.metrics.pairwise.cosine_similarity(a[i].reshape(1, -1), b[i].reshape(1, -1), dense_output=True)[0][0]
    if printinfo:
        print(str(len(a)) + " elements " + note)
    return sum_ / len(a)
print("Arithmetic mean of pairwise cosine similarity:")
if execute_american:
    print(get_mean_cosine_similarity(embeddings_american_a, embeddings_american_b), "american")
print(get_mean_cosine_similarity(embeddings_british_a, embeddings_british_b), "british")
print(get_mean_cosine_similarity(embeddings_indian_a, embeddings_indian_b), "indian")
# Arithmetic mean of pairwise cosine similarity:
# 0.9521031637381328 american
# 0.9445474825043075 british
# 0.9354938114061401 indian
Arithmetic mean of pairwise cosine similarity:
11020 elements
0.9521031637381328 american
2147 elements
0.9445474825043075 british
3596 elements
0.9354938114061401 indian
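# The per-article loop in get_mean_cosine_similarity can also be written in a vectorized way.
# The sketch below is not part of the original notebook, but should yield the same arithmetic
# mean, assuming both matrices contain the same articles in the same row order:
def get_mean_cosine_similarity_vectorized(a, b):
    # Normalize each row to unit length, multiply corresponding rows and sum
    # to get the row-wise cosine similarity, then average over all articles.
    a_norm = a / numpy.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / numpy.linalg.norm(b, axis=1, keepdims=True)
    return float(numpy.mean(numpy.sum(a_norm * b_norm, axis=1)))

# Example (should reproduce the values above):
# print(get_mean_cosine_similarity_vectorized(embeddings_british_a, embeddings_british_b))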
Compute average embeddings for the 2 points in time. The result is one 768-dimensional mean vector per category and point in time.
→ Compare the individual texts to the average vectors.
→ Identify typical (and atypical) texts.
# Arithmetic mean
def get_mean(embeddings, note="", printinfo=True):
    mean = numpy.mean(embeddings, axis=0)
    if printinfo:
        print(str(type(mean)) + " " + str(mean.shape) + " " + note)
    return mean
print("Average embeddings for 2 points in time:")
if execute_american:
    mean_american_a = get_mean(embeddings_american_a, "american_a")
    mean_american_b = get_mean(embeddings_american_b, "american_b")
mean_british_a = get_mean(embeddings_british_a, "british_a")
mean_british_b = get_mean(embeddings_british_b, "british_b")
mean_indian_a = get_mean(embeddings_indian_a, "indian_a")
mean_indian_b = get_mean(embeddings_indian_b, "indian_b")
Average embeddings for 2 points in time:
<class 'numpy.ndarray'> (768,) american_a
<class 'numpy.ndarray'> (768,) american_b
<class 'numpy.ndarray'> (768,) british_a
<class 'numpy.ndarray'> (768,) british_b
<class 'numpy.ndarray'> (768,) indian_a
<class 'numpy.ndarray'> (768,) indian_b
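# Not part of the original notebook: the mean vectors themselves can also be compared directly
# with the helper defined above, giving one value per category that describes how far the
# category centroid moved between the two points in time (sketch):
if execute_american:
    print(get_pairwise_cosine_similarity(mean_american_a.reshape(1, -1), mean_american_b.reshape(1, -1), "", False), "american mean A vs. B")
print(get_pairwise_cosine_similarity(mean_british_a.reshape(1, -1), mean_british_b.reshape(1, -1), "", False), "british mean A vs. B")
print(get_pairwise_cosine_similarity(mean_indian_a.reshape(1, -1), mean_indian_b.reshape(1, -1), "", False), "indian mean A vs. B")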
# Texts compared to the average vectors
def get_distances(embeddings, mean_embeddings, printinfo=True):
    distances = []
    for i in range(len(embeddings)):
        assert len(mean_embeddings) == len(embeddings[i]), "length of arrays different"
        distances.append((i, get_pairwise_cosine_similarity(mean_embeddings.reshape(1, -1), embeddings[i].reshape(1, -1), "", False)))
    # Sort ascending by cosine similarity: the most atypical texts come first
    distances = sorted(distances, key=lambda tup: tup[1], reverse=False)
    if printinfo:
        print(len(distances), distances[0:3], "..", distances[len(distances)-2:])
    return distances
if execute_american:
    distances_american_a = get_distances(embeddings_american_a, mean_american_a)
    distances_american_b = get_distances(embeddings_american_b, mean_american_b)
distances_british_a = get_distances(embeddings_british_a, mean_british_a)
distances_british_b = get_distances(embeddings_british_b, mean_british_b)
distances_indian_a = get_distances(embeddings_indian_a, mean_indian_a)
distances_indian_b = get_distances(embeddings_indian_b, mean_indian_b)
11020 [(7210, 0.6063458325904018), (448, 0.6434165782476016), (3828, 0.6609785427292287)] .. [(1941, 0.9749839140376109), (8218, 0.9756742946779071)]
11020 [(7210, 0.5968999088697942), (4629, 0.6120370250147551), (1738, 0.6434375594093735)] .. [(2017, 0.9780761837998851), (9245, 0.9789322185574947)]
2147 [(961, 0.795723406144361), (1471, 0.7977011006199223), (1047, 0.7990740939165086)] .. [(1107, 0.9754497563201272), (393, 0.9757224095034057)]
2147 [(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873)] .. [(1249, 0.9763073406692253), (1993, 0.9763565781092824)]
3596 [(1816, 0.6436218415151918), (1175, 0.6523821880855325), (3116, 0.6672549870487694)] .. [(437, 0.9768417192399903), (2018, 0.9773265088174122)]
3596 [(346, 0.687495848746942), (50, 0.7013913114635741), (2945, 0.7695575497231928)] .. [(2821, 0.9819029091192182), (966, 0.9821326550032112)]
# Print source texts
def print_source_text(directory, category_id, index):
    print()
    print("Category: " + category_id)
    print("Index: " + str(index))
    file = data_accessor.get_embeddings_dict_filename(category_id, index)
    print("File: " + file)
    print(data_accessor.read_source_text(directory, file))
    print()

if False:
    print_source_text(id_british_b, id_british, distances_british_b[0][0])
    print_source_text(id_british_b, id_british, 680)
e.g. Integrated Gradients for text https://github.com/SeldonIO/alibi
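# The note above points to attribution methods. Below is a hypothetical, heavily simplified
# sketch of token-level attributions with alibi's IntegratedGradients; the toy Keras classifier
# and the random token ids are placeholder assumptions, not part of this notebook, and a real
# setup would use a classifier trained on the Wikipedia texts:
if False:
    import tensorflow as tf
    from alibi.explainers import IntegratedGradients

    vocab_size, max_len = 1000, 50
    inputs = tf.keras.Input(shape=(max_len,), dtype="int32")
    hidden = tf.keras.layers.Embedding(vocab_size, 32)(inputs)
    hidden = tf.keras.layers.GlobalAveragePooling1D()(hidden)
    outputs = tf.keras.layers.Dense(2, activation="softmax")(hidden)
    model = tf.keras.Model(inputs, outputs)

    tokens = numpy.random.randint(0, vocab_size, size=(4, max_len)).astype("int32")  # dummy token ids
    predictions = model(tokens).numpy().argmax(axis=1)
    ig = IntegratedGradients(model, layer=model.layers[1], n_steps=25)
    explanation = ig.explain(tokens, baselines=None, target=predictions)
    token_attributions = explanation.attributions[0].sum(axis=-1)  # per-token relevance scores
    print(token_attributions.shape)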
# Get articles with the largest distance to v_t2
# Distance: smallest cosine similarity
# -> See distances_british_b
# 100 articles with largest distance to mean vector B
distances_british_b = distances_british_b[0:100]
print(distances_british_b)
[(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873), (1605, 0.7984441078760017), (333, 0.8048592466102065), (179, 0.806587230956004), (1202, 0.8096051288887628), (381, 0.8126745126594594), (245, 0.8173266825145751), (1925, 0.821152862297925), (255, 0.8246783158761353), (1514, 0.8287613672087364), (1811, 0.829478593797842), (2087, 0.8348875860421145), (1520, 0.8365918721037443), (886, 0.8366306915455757), (902, 0.8389038535147046), (853, 0.8407825229891188), (1015, 0.84217543288988), (1501, 0.8439344648809313), (406, 0.8461331874787608), (1554, 0.8485776775421392), (663, 0.8509946040235267), (1286, 0.8513454108754721), (213, 0.8514522727454383), (1149, 0.852359455090014), (1637, 0.8539473941773443), (778, 0.8540251801467501), (2009, 0.8576742923186815), (1435, 0.8607933663390277), (483, 0.862620813656519), (526, 0.8629482931038279), (101, 0.8641558058376334), (720, 0.8643496013779346), (1666, 0.8649259321141527), (1674, 0.865616164520292), (789, 0.866252396217221), (1156, 0.8664125570172904), (824, 0.8664865933138797), (332, 0.8671766380114813), (605, 0.8679861756571845), (574, 0.8692133375218205), (385, 0.8703120437107627), (400, 0.8710870542779425), (884, 0.8737298301327405), (1762, 0.8740842715701184), (272, 0.8747334865889536), (591, 0.8748916853566279), (1780, 0.8755334280631477), (295, 0.8764491348254336), (1858, 0.8778708375071165), (1599, 0.877924260244563), (1342, 0.8785883554894383), (2000, 0.8794430006512584), (1757, 0.8795602473035575), (1579, 0.8796144442283066), (1635, 0.8804493627515715), (674, 0.880598769307674), (747, 0.8808114412749424), (2128, 0.8817070054497067), (875, 0.8827556029215672), (1549, 0.8832425625074518), (1258, 0.8833110976547145), (174, 0.8836400509311044), (918, 0.8844534830455353), (1981, 0.8848847113626503), (2053, 0.8852315112685786), (259, 0.8854263498204921), (355, 0.8855939571431497), (1691, 0.8859139831616893), (1247, 0.8865212579727186), (304, 0.8872719351437507), (764, 0.8881581364107067), (216, 0.8881797777193562), (1321, 0.8884719784419831), (1432, 0.8887640378662863), (23, 0.8894833788108552), (839, 0.8899124912192335), (290, 0.890238506109883), (183, 0.8910271068032816), (522, 0.8910528699982174), (1295, 0.8916884399043221), (1462, 0.8917890957542246), (1883, 0.8920079172980555), (819, 0.8925052759266741), (240, 0.8925691265135722), (1835, 0.8927933950167327), (727, 0.8929515843282431), (1191, 0.8931548126744745), (1593, 0.893289718025112), (1323, 0.8934086582551485), (805, 0.8941019599905784), (1231, 0.8943603343110167), (164, 0.8945786418679671), (1615, 0.8950430990051336), (1675, 0.8953028183688045), (17, 0.8953558750795475), (2036, 0.8954911725690698), (1941, 0.8957662081755633), (545, 0.895993783162472)]
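# To inspect the most atypical articles themselves, the indices in distances_british_b can be
# passed back into print_source_text defined above; a small sketch using only the helpers from
# this notebook (guarded like the example above, so it is not executed by default):
if False:
    for index, similarity in distances_british_b[0:10]:
        print("Cosine similarity to mean vector B: " + str(similarity))
        print_source_text(id_british_b, id_british, index)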