# Configuration
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-02-10-Wikipedia-Texts/
source_texts_directory = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/"
# https://hobbitdata.informatik.uni-leipzig.de/EML4U/2021-04-07-Wikipedia-Embeddings/
embeddings_directory = "/home/eml4u/EML4U/data/wikipedia-embeddings/"
# points in time
id_a = "20100408"
id_b = "20201101"
# category ids
id_american = "american-films"
id_british = "british-films"
id_indian = "indian-films"
# file ids
id_american_a = id_a + "-" + id_american
id_american_b = id_b + "-" + id_american
id_british_a = id_a + "-" + id_british
id_british_b = id_b + "-" + id_british
id_indian_a = id_a + "-" + id_indian
id_indian_b = id_b + "-" + id_indian
# 11020 american-films.txt
# 2147 british-films.txt
# 3596 indian-films.txt
execute_american = True
# Imports
import numpy
print("numpy: " + numpy.version.version)
import sklearn
import sklearn.metrics
print("sklearn: " + sklearn.__version__)
# Class instance to access data (wp texts, pre-computed embeddings)
import data_access
data_accessor = data_access.DataAccess(source_texts_directory, embeddings_directory)
numpy: 1.19.2
sklearn: 0.23.2
# Load embeddings
if execute_american:
    embeddings_american_a = data_accessor.load_embeddings(id_american_a)
    embeddings_american_b = data_accessor.load_embeddings(id_american_b)
embeddings_british_a = data_accessor.load_embeddings(id_british_a)
embeddings_british_b = data_accessor.load_embeddings(id_british_b)
embeddings_indian_a = data_accessor.load_embeddings(id_indian_a)
embeddings_indian_b = data_accessor.load_embeddings(id_indian_b)
print()
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-american-films.txt (11020, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-american-films.txt (11020, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-british-films.txt (2147, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-british-films.txt (2147, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20100408-indian-films.txt (3596, 768) <class 'numpy.ndarray'>
/home/eml4u/EML4U/data/wikipedia-embeddings/20201101-indian-films.txt (3596, 768) <class 'numpy.ndarray'>
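# Note: data_access.DataAccess is a project-specific helper whose implementation is not shown
# in this notebook. Below is a minimal sketch of what load_embeddings might look like, assuming
# each *.txt file stores one whitespace-separated 768-dimensional vector per article; the file
# layout and the function are assumptions, not the actual code.
import os

def load_embeddings_sketch(directory, file_id):
    # Hypothetical loader: reads e.g. "20100408-american-films.txt" into a
    # (number_of_articles, 768) numpy array and prints the same info as above.
    path = os.path.join(directory, file_id + ".txt")
    embeddings = numpy.loadtxt(path)
    print(path, embeddings.shape, type(embeddings))
    return embeddings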
# Cosine similarity
def get_pairwise_cosine_similarity(a, b, note="", printinfo=True):
    if printinfo:
        print(str(type(a)) + " " + str(a.shape) + "\n" + str(type(b)) + " " + str(b.shape))
    cos_sim = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)[0][0]
    if printinfo:
        print(str(cos_sim) + " " + note)
    return cos_sim
# Sums up the pairwise cosine similarities of texts from 2 points in time and divides the sum by the number of texts
def get_mean_cosine_similarity(a, b, note="", printinfo=True):
    sum_ = 0
    for i in range(len(a)):
        sum_ += sklearn.metrics.pairwise.cosine_similarity(a[i].reshape(1, -1), b[i].reshape(1, -1), dense_output=True)[0][0]
    if printinfo:
        print(str(len(a)) + " elements " + note)
    return sum_ / len(a)
print("Arithmetic mean of pairwise cosine similarity:")
if execute_american:
    print(get_mean_cosine_similarity(embeddings_american_a, embeddings_american_b), "american")
print(get_mean_cosine_similarity(embeddings_british_a, embeddings_british_b), "british")
print(get_mean_cosine_similarity(embeddings_indian_a, embeddings_indian_b), "indian")
# Arithmetic mean of pairwise cosine similarity:
# 0.9521031637381328 american
# 0.9445474825043075 british
# 0.9354938114061401 indian
Arithmetic mean of pairwise cosine similarity:
11020 elements
0.9521031637381328 american
2147 elements
0.9445474825043075 british
3596 elements
0.9354938114061401 indian
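# The per-article loop in get_mean_cosine_similarity can also be written in a vectorized way.
# The sketch below is not part of the original notebook, but should yield the same arithmetic
# mean, assuming both matrices contain the same articles in the same row order:
def get_mean_cosine_similarity_vectorized(a, b):
    # Normalize each row to unit length, multiply corresponding rows and sum
    # to get the row-wise cosine similarity, then average over all articles.
    a_norm = a / numpy.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / numpy.linalg.norm(b, axis=1, keepdims=True)
    return float(numpy.mean(numpy.sum(a_norm * b_norm, axis=1)))

# Example (should reproduce the values above):
# print(get_mean_cosine_similarity_vectorized(embeddings_british_a, embeddings_british_b))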
Compute average embeddings for the 2 points in time. The result is one 768-dimensional mean vector per category and point in time.
→ Compare the individual texts to the average vectors.
→ Identify typical (and atypical) texts.
# Arithmetic mean
def get_mean(embeddings, note="", printinfo=True):
    mean = numpy.mean(embeddings, axis=0)
    if printinfo:
        print(str(type(mean)) + " " + str(mean.shape) + " " + note)
    return mean
print("Average embeddings for 2 points in time:")
if execute_american:
    mean_american_a = get_mean(embeddings_american_a, "american_a")
    mean_american_b = get_mean(embeddings_american_b, "american_b")
mean_british_a = get_mean(embeddings_british_a, "british_a")
mean_british_b = get_mean(embeddings_british_b, "british_b")
mean_indian_a = get_mean(embeddings_indian_a, "indian_a")
mean_indian_b = get_mean(embeddings_indian_b, "indian_b")
Average embeddings for 2 points in time:
<class 'numpy.ndarray'> (768,) american_a
<class 'numpy.ndarray'> (768,) american_b
<class 'numpy.ndarray'> (768,) british_a
<class 'numpy.ndarray'> (768,) british_b
<class 'numpy.ndarray'> (768,) indian_a
<class 'numpy.ndarray'> (768,) indian_b
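# Not part of the original notebook: the mean vectors themselves can also be compared directly
# with the helper defined above, giving one value per category that describes how far the
# category centroid moved between the two points in time (sketch):
if execute_american:
    print(get_pairwise_cosine_similarity(mean_american_a.reshape(1, -1), mean_american_b.reshape(1, -1), "", False), "american mean A vs. B")
print(get_pairwise_cosine_similarity(mean_british_a.reshape(1, -1), mean_british_b.reshape(1, -1), "", False), "british mean A vs. B")
print(get_pairwise_cosine_similarity(mean_indian_a.reshape(1, -1), mean_indian_b.reshape(1, -1), "", False), "indian mean A vs. B")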
# Texts compared to the average vectors
def get_distances(embeddings, mean_embeddings, printinfo=True):
    distances = []
    for i in range(len(embeddings)):
        assert len(mean_embeddings) == len(embeddings[i]), "length of arrays different"
        distances.append((i, get_pairwise_cosine_similarity(mean_embeddings.reshape(1, -1), embeddings[i].reshape(1, -1), "", False)))
    # Sort ascending by cosine similarity: the most atypical texts come first
    distances = sorted(distances, key=lambda tup: tup[1], reverse=False)
    if printinfo:
        print(len(distances), distances[0:3], "..", distances[len(distances)-2:])
    return distances
if execute_american:
    distances_american_a = get_distances(embeddings_american_a, mean_american_a)
    distances_american_b = get_distances(embeddings_american_b, mean_american_b)
distances_british_a = get_distances(embeddings_british_a, mean_british_a)
distances_british_b = get_distances(embeddings_british_b, mean_british_b)
distances_indian_a = get_distances(embeddings_indian_a, mean_indian_a)
distances_indian_b = get_distances(embeddings_indian_b, mean_indian_b)
11020 [(7210, 0.6063458325904018), (448, 0.6434165782476016), (3828, 0.6609785427292287)] .. [(1941, 0.9749839140376109), (8218, 0.9756742946779071)]
11020 [(7210, 0.5968999088697942), (4629, 0.6120370250147551), (1738, 0.6434375594093735)] .. [(2017, 0.9780761837998851), (9245, 0.9789322185574947)]
2147 [(961, 0.795723406144361), (1471, 0.7977011006199223), (1047, 0.7990740939165086)] .. [(1107, 0.9754497563201272), (393, 0.9757224095034057)]
2147 [(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873)] .. [(1249, 0.9763073406692253), (1993, 0.9763565781092824)]
3596 [(1816, 0.6436218415151918), (1175, 0.6523821880855325), (3116, 0.6672549870487694)] .. [(437, 0.9768417192399903), (2018, 0.9773265088174122)]
3596 [(346, 0.687495848746942), (50, 0.7013913114635741), (2945, 0.7695575497231928)] .. [(2821, 0.9819029091192182), (966, 0.9821326550032112)]
# Print source texts
def print_source_text(directory, category_id, index):
    print()
    print("Category: " + category_id)
    print("Index: " + str(index))
    file = data_accessor.get_embeddings_dict_filename(category_id, index)
    print("File: " + file)
    print(data_accessor.read_source_text(directory, file))
    print()

if False:
    print_source_text(id_british_b, id_british, distances_british_b[0][0])
    print_source_text(id_british_b, id_british, 680)
e.g. Integrated Gradients for text https://github.com/SeldonIO/alibi
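# The note above points to attribution methods. Below is a hypothetical, heavily simplified
# sketch of token-level attributions with alibi's IntegratedGradients; the toy Keras classifier
# and the random token ids are placeholder assumptions, not part of this notebook, and a real
# setup would use a classifier trained on the Wikipedia texts:
if False:
    import tensorflow as tf
    from alibi.explainers import IntegratedGradients

    vocab_size, max_len = 1000, 50
    inputs = tf.keras.Input(shape=(max_len,), dtype="int32")
    hidden = tf.keras.layers.Embedding(vocab_size, 32)(inputs)
    hidden = tf.keras.layers.GlobalAveragePooling1D()(hidden)
    outputs = tf.keras.layers.Dense(2, activation="softmax")(hidden)
    model = tf.keras.Model(inputs, outputs)

    tokens = numpy.random.randint(0, vocab_size, size=(4, max_len)).astype("int32")  # dummy token ids
    predictions = model(tokens).numpy().argmax(axis=1)
    ig = IntegratedGradients(model, layer=model.layers[1], n_steps=25)
    explanation = ig.explain(tokens, baselines=None, target=predictions)
    token_attributions = explanation.attributions[0].sum(axis=-1)  # per-token relevance scores
    print(token_attributions.shape)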
# Get articles with the largest distance to v_t2
# Distance: smallest cosine similarity
# -> See distances_british_b
# 100 articles with largest distance to mean vector B
distances_british_b = distances_british_b[0:100]
print(distances_british_b)
[(680, 0.7901596223863818), (980, 0.7970756246003751), (966, 0.7974886203028873), (1605, 0.7984441078760017), (333, 0.8048592466102065), (179, 0.806587230956004), (1202, 0.8096051288887628), (381, 0.8126745126594594), (245, 0.8173266825145751), (1925, 0.821152862297925), (255, 0.8246783158761353), (1514, 0.8287613672087364), (1811, 0.829478593797842), (2087, 0.8348875860421145), (1520, 0.8365918721037443), (886, 0.8366306915455757), (902, 0.8389038535147046), (853, 0.8407825229891188), (1015, 0.84217543288988), (1501, 0.8439344648809313), (406, 0.8461331874787608), (1554, 0.8485776775421392), (663, 0.8509946040235267), (1286, 0.8513454108754721), (213, 0.8514522727454383), (1149, 0.852359455090014), (1637, 0.8539473941773443), (778, 0.8540251801467501), (2009, 0.8576742923186815), (1435, 0.8607933663390277), (483, 0.862620813656519), (526, 0.8629482931038279), (101, 0.8641558058376334), (720, 0.8643496013779346), (1666, 0.8649259321141527), (1674, 0.865616164520292), (789, 0.866252396217221), (1156, 0.8664125570172904), (824, 0.8664865933138797), (332, 0.8671766380114813), (605, 0.8679861756571845), (574, 0.8692133375218205), (385, 0.8703120437107627), (400, 0.8710870542779425), (884, 0.8737298301327405), (1762, 0.8740842715701184), (272, 0.8747334865889536), (591, 0.8748916853566279), (1780, 0.8755334280631477), (295, 0.8764491348254336), (1858, 0.8778708375071165), (1599, 0.877924260244563), (1342, 0.8785883554894383), (2000, 0.8794430006512584), (1757, 0.8795602473035575), (1579, 0.8796144442283066), (1635, 0.8804493627515715), (674, 0.880598769307674), (747, 0.8808114412749424), (2128, 0.8817070054497067), (875, 0.8827556029215672), (1549, 0.8832425625074518), (1258, 0.8833110976547145), (174, 0.8836400509311044), (918, 0.8844534830455353), (1981, 0.8848847113626503), (2053, 0.8852315112685786), (259, 0.8854263498204921), (355, 0.8855939571431497), (1691, 0.8859139831616893), (1247, 0.8865212579727186), (304, 0.8872719351437507), (764, 0.8881581364107067), (216, 0.8881797777193562), (1321, 0.8884719784419831), (1432, 0.8887640378662863), (23, 0.8894833788108552), (839, 0.8899124912192335), (290, 0.890238506109883), (183, 0.8910271068032816), (522, 0.8910528699982174), (1295, 0.8916884399043221), (1462, 0.8917890957542246), (1883, 0.8920079172980555), (819, 0.8925052759266741), (240, 0.8925691265135722), (1835, 0.8927933950167327), (727, 0.8929515843282431), (1191, 0.8931548126744745), (1593, 0.893289718025112), (1323, 0.8934086582551485), (805, 0.8941019599905784), (1231, 0.8943603343110167), (164, 0.8945786418679671), (1615, 0.8950430990051336), (1675, 0.8953028183688045), (17, 0.8953558750795475), (2036, 0.8954911725690698), (1941, 0.8957662081755633), (545, 0.895993783162472)]
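# To inspect the most atypical articles themselves, the indices in distances_british_b can be
# passed back into print_source_text defined above; a small sketch using only the helpers from
# this notebook (guarded like the example above, so it is not executed by default):
if False:
    for index, similarity in distances_british_b[0:10]:
        print("Cosine similarity to mean vector B: " + str(similarity))
        print_source_text(id_british_b, id_british, index)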