Evaluate Word Representations

Word vectors are often evaluated using benchmarks such as WS353-Sim.txt. The data is a list of word pairs, along with their similarity scores. For example, the first few lines of the file look like this:

(base) Jianguos-Air:ws jianguolu$ more WS353-Sim.txt
tiger cat 7.35
tiger tiger 10.00
plane car 5.77
train car 6.31

The performance of vector representations is often evaluated using the correlation between this ranking and the ranking produced by the model. One example is the Spearman correlation.

In [93]:
from gensim.models import Word2Vec
import gensim
# Train a Word2Vec model on the Reuters corpus, treating each line of the
# file as one sentence.
# The context manager guarantees the corpus file is closed after reading;
# previously the handle was opened and never closed.
with open('../data/reuters2.txt', 'r') as txtfile:
    # Lower-case every line and split it on single spaces into a token list.
    # NOTE(review): split(' ') yields empty-string tokens when a line contains
    # consecutive spaces; kept as-is so the trained vocabulary is unchanged.
    sentences = [line.lower().strip().split(' ') for line in txtfile]
# sg=1 selects the skip-gram architecture; min_count=2 ignores words that
# occur fewer than two times (see the gensim Word2Vec documentation).
model = gensim.models.Word2Vec(sentences, min_count=2, sg=1, iter=5)
# Vocabulary of the trained model, used below to filter benchmark pairs.
words = list(model.wv.vocab)
In [94]:
# Query word whose nearest neighbours in the embedding space are printed.
test = 'news'
neighbours = model.wv.most_similar(test)
print("words similar to '{}':\t{}".format(test, neighbours))
words similar to 'news':	[('conference', 0.9213701486587524), ('press', 0.8843624591827393), ('reuters', 0.8795915246009827), ('denied', 0.8423165082931519), ('journal', 0.8308773040771484), ('bil', 0.8253540992736816), ('newspaper', 0.8204785585403442), ('richard', 0.8181889057159424), ('reporters', 0.8134483098983765), ('danforth', 0.8100142478942871)]

Read the test data

In [95]:
# Load the WS353 similarity benchmark as a list of ((word1, word2), score)
# tuples. Words are lower-cased; the score is kept as the string read from
# the file.
data = []
with open("../data/ws/WS353-Sim.txt") as f:
    for line in f:
        fields = line.strip().lower().split()
        # Skip blank lines (e.g. a trailing empty line), which would otherwise
        # make the three-way unpacking below raise ValueError.
        if not fields:
            continue
        x, y, sim = fields
        data.append(((x, y), sim))
# Bare expression: shows the parsed pairs as the notebook cell's output.
data
Out[95]:
[(('tiger', 'cat'), '7.35'),
 (('tiger', 'tiger'), '10.00'),
 (('plane', 'car'), '5.77'),
 (('train', 'car'), '6.31'),
 (('television', 'radio'), '6.77'),
 (('media', 'radio'), '7.42'),
 (('bread', 'butter'), '6.19'),
 (('cucumber', 'potato'), '5.92'),
 (('doctor', 'nurse'), '7.00'),
 (('professor', 'doctor'), '6.62'),
 (('student', 'professor'), '6.81'),
 (('smart', 'stupid'), '5.81'),
 (('wood', 'forest'), '7.73'),
 (('money', 'cash'), '9.15'),
 (('king', 'queen'), '8.58'),
 (('king', 'rook'), '5.92'),
 (('bishop', 'rabbi'), '6.69'),
 (('fuck', 'sex'), '9.44'),
 (('football', 'soccer'), '9.03'),
 (('football', 'basketball'), '6.81'),
 (('football', 'tennis'), '6.63'),
 (('arafat', 'jackson'), '2.50'),
 (('physics', 'chemistry'), '7.35'),
 (('vodka', 'gin'), '8.46'),
 (('vodka', 'brandy'), '8.13'),
 (('drink', 'eat'), '6.87'),
 (('car', 'automobile'), '8.94'),
 (('gem', 'jewel'), '8.96'),
 (('journey', 'voyage'), '9.29'),
 (('boy', 'lad'), '8.83'),
 (('coast', 'shore'), '9.10'),
 (('asylum', 'madhouse'), '8.87'),
 (('magician', 'wizard'), '9.02'),
 (('midday', 'noon'), '9.29'),
 (('furnace', 'stove'), '8.79'),
 (('food', 'fruit'), '7.52'),
 (('bird', 'cock'), '7.10'),
 (('bird', 'crane'), '7.38'),
 (('food', 'rooster'), '4.42'),
 (('money', 'dollar'), '8.42'),
 (('money', 'currency'), '9.04'),
 (('tiger', 'jaguar'), '8.00'),
 (('tiger', 'feline'), '8.00'),
 (('tiger', 'carnivore'), '7.08'),
 (('tiger', 'mammal'), '6.85'),
 (('tiger', 'animal'), '7.00'),
 (('tiger', 'organism'), '4.77'),
 (('tiger', 'fauna'), '5.62'),
 (('psychology', 'psychiatry'), '8.08'),
 (('psychology', 'science'), '6.71'),
 (('psychology', 'discipline'), '5.58'),
 (('planet', 'star'), '8.45'),
 (('planet', 'moon'), '8.08'),
 (('planet', 'sun'), '8.02'),
 (('precedent', 'example'), '5.85'),
 (('precedent', 'antecedent'), '6.04'),
 (('cup', 'tableware'), '6.85'),
 (('cup', 'artifact'), '2.92'),
 (('cup', 'object'), '3.69'),
 (('cup', 'entity'), '2.15'),
 (('jaguar', 'cat'), '7.42'),
 (('jaguar', 'car'), '7.27'),
 (('mile', 'kilometer'), '8.66'),
 (('skin', 'eye'), '6.22'),
 (('japanese', 'american'), '6.50'),
 (('century', 'year'), '7.59'),
 (('announcement', 'news'), '7.56'),
 (('doctor', 'personnel'), '5.00'),
 (('harvard', 'yale'), '8.13'),
 (('hospital', 'infrastructure'), '4.63'),
 (('life', 'death'), '7.88'),
 (('travel', 'activity'), '5.00'),
 (('type', 'kind'), '8.97'),
 (('street', 'place'), '6.44'),
 (('street', 'avenue'), '8.88'),
 (('street', 'block'), '6.88'),
 (('cell', 'phone'), '7.81'),
 (('dividend', 'payment'), '7.63'),
 (('calculation', 'computation'), '8.44'),
 (('profit', 'loss'), '7.63'),
 (('dollar', 'yen'), '7.78'),
 (('dollar', 'buck'), '9.22'),
 (('phone', 'equipment'), '7.13'),
 (('liquid', 'water'), '7.89'),
 (('marathon', 'sprint'), '7.47'),
 (('seafood', 'food'), '8.34'),
 (('seafood', 'lobster'), '8.70'),
 (('lobster', 'food'), '7.81'),
 (('lobster', 'wine'), '5.70'),
 (('championship', 'tournament'), '8.36'),
 (('man', 'woman'), '8.30'),
 (('man', 'governor'), '5.25'),
 (('murder', 'manslaughter'), '8.53'),
 (('opera', 'performance'), '6.88'),
 (('mexico', 'brazil'), '7.44'),
 (('glass', 'metal'), '5.56'),
 (('aluminum', 'metal'), '7.83'),
 (('rock', 'jazz'), '7.59'),
 (('museum', 'theater'), '7.19'),
 (('shower', 'thunderstorm'), '6.31'),
 (('monk', 'oracle'), '5.00'),
 (('cup', 'food'), '5.00'),
 (('journal', 'association'), '4.97'),
 (('street', 'children'), '4.94'),
 (('car', 'flight'), '4.94'),
 (('space', 'chemistry'), '4.88'),
 (('situation', 'conclusion'), '4.81'),
 (('word', 'similarity'), '4.75'),
 (('peace', 'plan'), '4.75'),
 (('consumer', 'energy'), '4.75'),
 (('ministry', 'culture'), '4.69'),
 (('smart', 'student'), '4.62'),
 (('investigation', 'effort'), '4.59'),
 (('image', 'surface'), '4.56'),
 (('life', 'term'), '4.50'),
 (('start', 'match'), '4.47'),
 (('computer', 'news'), '4.47'),
 (('board', 'recommendation'), '4.47'),
 (('lad', 'brother'), '4.46'),
 (('observation', 'architecture'), '4.38'),
 (('coast', 'hill'), '4.38'),
 (('deployment', 'departure'), '4.25'),
 (('benchmark', 'index'), '4.25'),
 (('attempt', 'peace'), '4.25'),
 (('consumer', 'confidence'), '4.13'),
 (('start', 'year'), '4.06'),
 (('focus', 'life'), '4.06'),
 (('development', 'issue'), '3.97'),
 (('theater', 'history'), '3.91'),
 (('situation', 'isolation'), '3.88'),
 (('profit', 'warning'), '3.88'),
 (('media', 'trading'), '3.88'),
 (('chance', 'credibility'), '3.88'),
 (('precedent', 'information'), '3.85'),
 (('architecture', 'century'), '3.78'),
 (('population', 'development'), '3.75'),
 (('stock', 'live'), '3.73'),
 (('peace', 'atmosphere'), '3.69'),
 (('morality', 'marriage'), '3.69'),
 (('minority', 'peace'), '3.69'),
 (('atmosphere', 'landscape'), '3.69'),
 (('report', 'gain'), '3.63'),
 (('music', 'project'), '3.63'),
 (('seven', 'series'), '3.56'),
 (('experience', 'music'), '3.47'),
 (('school', 'center'), '3.44'),
 (('five', 'month'), '3.38'),
 (('announcement', 'production'), '3.38'),
 (('morality', 'importance'), '3.31'),
 (('money', 'operation'), '3.31'),
 (('delay', 'news'), '3.31'),
 (('governor', 'interview'), '3.25'),
 (('practice', 'institution'), '3.19'),
 (('century', 'nation'), '3.16'),
 (('coast', 'forest'), '3.15'),
 (('shore', 'woodland'), '3.08'),
 (('drink', 'car'), '3.04'),
 (('president', 'medal'), '3.00'),
 (('prejudice', 'recognition'), '3.00'),
 (('viewer', 'serial'), '2.97'),
 (('peace', 'insurance'), '2.94'),
 (('mars', 'water'), '2.94'),
 (('media', 'gain'), '2.88'),
 (('precedent', 'cognition'), '2.81'),
 (('announcement', 'effort'), '2.75'),
 (('line', 'insurance'), '2.69'),
 (('crane', 'implement'), '2.69'),
 (('drink', 'mother'), '2.65'),
 (('opera', 'industry'), '2.63'),
 (('volunteer', 'motto'), '2.56'),
 (('listing', 'proximity'), '2.56'),
 (('precedent', 'collection'), '2.50'),
 (('cup', 'article'), '2.40'),
 (('sign', 'recess'), '2.38'),
 (('problem', 'airport'), '2.38'),
 (('reason', 'hypertension'), '2.31'),
 (('direction', 'combination'), '2.25'),
 (('wednesday', 'news'), '2.22'),
 (('glass', 'magician'), '2.08'),
 (('cemetery', 'woodland'), '2.08'),
 (('possibility', 'girl'), '1.94'),
 (('cup', 'substance'), '1.92'),
 (('forest', 'graveyard'), '1.85'),
 (('stock', 'egg'), '1.81'),
 (('month', 'hotel'), '1.81'),
 (('energy', 'secretary'), '1.81'),
 (('precedent', 'group'), '1.77'),
 (('production', 'hike'), '1.75'),
 (('stock', 'phone'), '1.62'),
 (('holy', 'sex'), '1.62'),
 (('stock', 'cd'), '1.31'),
 (('drink', 'ear'), '1.31'),
 (('delay', 'racism'), '1.19'),
 (('stock', 'life'), '0.92'),
 (('stock', 'jaguar'), '0.92'),
 (('monk', 'slave'), '0.92'),
 (('lad', 'wizard'), '0.92'),
 (('sugar', 'approach'), '0.88'),
 (('rooster', 'voyage'), '0.62'),
 (('noon', 'string'), '0.54'),
 (('chord', 'smile'), '0.54'),
 (('professor', 'cucumber'), '0.31'),
 (('king', 'cabbage'), '0.23')]
In [96]:
# Score every benchmark pair the model knows and compare the model's
# similarities with the benchmark scores using Spearman rank correlation.
# Imported here so the cell does not depend on imports made elsewhere.
import numpy as np
from scipy.stats import spearmanr

# A set gives constant-time membership tests; `words` is a list.
vocab = set(words)
results = []
for (x, y), sim in data:
    # Only pairs whose words are both in the model vocabulary can be scored.
    if x in vocab and y in vocab:
        # Use the KeyedVectors API, matching the other model.wv.* calls.
        s = model.wv.similarity(x, y)
        results.append((s, sim))
        print(x, y, s, sim, sep="\t")

# zip(*results) on an empty list fails with an unhelpful unpacking error,
# so report the real cause instead.
if not results:
    raise ValueError("none of the benchmark word pairs are in the model vocabulary")

# Split into model similarities and benchmark scores; dtype=float also
# converts the benchmark scores, which were read from the file as strings.
actual, expected = zip(*results)
actual = np.array(actual, dtype=float)
expected = np.array(expected, dtype=float)

cor = spearmanr(actual, expected)
plane	car	0.845894	5.77
train	car	0.8253301	6.31
television	radio	0.8854684	6.77
media	radio	0.78267133	7.42
wood	forest	0.9491677	7.73
money	cash	0.54670787	9.15
vodka	gin	0.9932268	8.46
car	automobile	0.86125314	8.94
coast	shore	0.79238176	9.10
food	fruit	0.70788664	7.52
money	dollar	0.812971	8.42
money	currency	0.83917785	9.04
skin	eye	0.9569119	6.22
japanese	american	0.32299042	6.50
century	year	0.40324146	7.59
announcement	news	0.735155	7.56
harvard	yale	0.9207133	8.13
travel	activity	0.7749698	5.00
type	kind	0.9445182	8.97
street	place	0.56938833	6.44
street	block	0.67979705	6.88
cell	phone	0.8618718	7.81
dividend	payment	0.7390964	7.63
profit	loss	0.8865849	7.63
dollar	yen	0.83618474	7.78
phone	equipment	0.5708454	7.13
liquid	water	0.81790745	7.89
marathon	sprint	0.89464813	7.47
man	governor	0.7621277	5.25
mexico	brazil	0.7504493	7.44
glass	metal	0.842183	5.56
aluminum	metal	0.9597205	7.83
journal	association	0.38928396	4.97
street	children	0.5339896	4.94
car	flight	0.7774404	4.94
situation	conclusion	0.8985571	4.81
consumer	energy	0.7234855	4.75
investigation	effort	0.7672967	4.59
image	surface	0.8605318	4.56
life	term	0.4206339	4.50
start	match	0.78489614	4.47
computer	news	0.22836396	4.47
board	recommendation	0.6471271	4.47
coast	hill	0.85905534	4.38
consumer	confidence	0.6790625	4.13
start	year	0.62758374	4.06
focus	life	0.6285585	4.06
development	issue	0.38354498	3.97
theater	history	0.7866639	3.91
profit	warning	0.4572981	3.88
media	trading	0.41036922	3.88
report	gain	0.4197008	3.63
music	project	0.7485242	3.63
seven	series	0.6105926	3.56
experience	music	0.81340456	3.47
five	month	0.60924137	3.38
announcement	production	0.41704503	3.38
money	operation	0.6590473	3.31
delay	news	0.6802064	3.31
governor	interview	0.8907014	3.25
practice	institution	0.908399	3.19
century	nation	0.73726445	3.16
coast	forest	0.9076781	3.15
drink	car	0.842479	3.04
media	gain	0.52620554	2.88
announcement	effort	0.6258839	2.75
line	insurance	0.80263776	2.69
crane	implement	0.8525954	2.69
problem	airport	0.81871426	2.38
direction	combination	0.781883	2.25
wednesday	news	0.7870244	2.22
stock	egg	0.5302174	1.81
month	hotel	0.53558105	1.81
energy	secretary	0.29264736	1.81
production	hike	0.47090003	1.75
stock	phone	0.4107827	1.62
stock	life	0.3888274	0.92
sugar	approach	0.76375794	0.88
In [97]:
# Show `cor`, the Spearman correlation result between the model's
# similarities and the benchmark scores.
print(cor)
SpearmanrResult(correlation=0.36797308936169465, pvalue=0.0009180986798817254)

Other_Evaluations

There are about 10 other evaluation data sets. Some focus on rare words, some on verbs. Which method is favored by which data set?

In [ ]: