In [1]:
"""Political Vector Projector

Given word vectors trained by word2vec (Mikolov et al. 2013)
or fastText (Bojanowski et al. 2016), project the vectors of
U.S. senators onto a 'conservative' to 'liberal' axis.
The scalar components of such projections may be interpreted
as a valid metric of political ideology.

https://empirical.coffee/blog/2017/political-vector-projector
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import csv
import re
import os

%matplotlib inline
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and removed in 3.8, so pick whichever name this matplotlib
# installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available
              else 'seaborn-v0_8')
In [2]:
class Word2VecProjector():
"""Read a vector file trained by word2vec"""

def __init__(self, vec_file, eval_file):
self._vec_file = vec_file
self._eval_file = eval_file
self._axes = ('conservative', 'liberal', 'good', 'bad')

def _parse_vector(self, line):
"""Return: vector as a list, label"""
line = line.strip().split()
return line[1:], line[0]

def _validate_shape(self, shape, vocab_size, embed_dim):
"""Verify the shape of tensor as specified by the first line in .vec"""
try:
assert shape == (int(vocab_size), int(embed_dim))
except AssertionError:
print('Warning: Something went wrong with file IO!')
print('The .vec file header line specifies that vocab_size = {}, '
'embed_dim = {}'.format(vocab_size, embed_dim))
print('But actually reading the file yielded a tensor of shape {}'
.format(shape))

"""return dict {word: vector}"""
# NOTE(review): the `def` line that should precede the docstring above is not
# present in this view, and `vocab_size` / `embed_dim` are never assigned in
# the visible lines -- presumably they are parsed from the header line of the
# .vec file before the loop below; confirm against the original notebook.
vec_dict = {}
with open(self._vec_file) as f:
for line in f:
# The first token of the line is the word, the remaining tokens its vector.
vector, label = self._parse_vector(line)
# Convert the string components to a float32 array, keyed by word.
vec_dict[label] = np.array(vector, np.float32)
# This line would totally fail if 'coffee' is not in the vocab.
# ('coffee' is only a sample key used to measure the vector length; a
# vocabulary without it raises KeyError here.)
self._validate_shape((len(vec_dict), len(vec_dict['coffee'])),
vocab_size, embed_dim)
return vec_dict

"""Read a csv file of DW-NOMINATE data, avaliable at voteview.com"""
# NOTE(review): the `def` line that should precede the docstring above is not
# present in this view, and neither `csv_file` nor `row` is bound in the
# visible lines -- presumably `csv_file` is a parameter and `row` comes from
# a dict-style csv reader loop inside the `with` block; confirm against the
# original notebook.
names = []
ideology_scores = []
missing = []
with open(csv_file) as f:
if (row['dim1']): # use the first dimension of DW-NOMINATE
ideology_scores.append(row['dim1'])
names.append(row['bioname'])
else:
# Empty 'dim1': remember the member as missing, but still append a
# placeholder score of '0' and a tagged name so that ideology_scores and
# names stay the same length.
missing.append(row['bioname'])
ideology_scores.append('0')
names.append(row['bioname'] + '(missing score)')
if missing:
print("These members' ideology scores are missing from the csv: {}"
.format(missing))
# Scores were collected as strings; convert them to a float32 array.
ideology_scores = np.array(ideology_scores, np.float32)
# Re-format names to only contain last name,
# e.g. 'MOYNIHAN, Daniel Patrick' -> 'moynihan'
# The substitution drops everything from the first comma onward, so the
# '(missing score)' tag appended above is also removed whenever the name
# contains a comma.
names = [re.sub(',.*', '', i).lower() for i in names]
return ideology_scores, names

def _scalar_projection(self, vectors, axis_vectors):
"""Compute the scarlar components of vector projections.

Assuming the order of axis_vectors is:
[positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]"""
x_axis = axis_vectors[0] - axis_vectors[1]
x_axis = x_axis / np.linalg.norm(x_axis)
y_axis = axis_vectors[2] - axis_vectors[3]
y_axis = y_axis / np.linalg.norm(y_axis)
x_proj = np.dot(vectors, x_axis)
y_proj = np.dot(vectors, y_axis)
return x_proj, y_proj

def _scatter_plot(self, x_proj, y_proj, labels, title=None, x_label=None,
                  y_label=None, font_size=7, save_dir=None):
    """Draw an annotated scatter plot on a new figure.

    Every point (x_proj[i], y_proj[i]) is annotated with labels[i]. A thin
    horizontal and a thin vertical reference line are drawn with
    matplotlib's default positions.

    If save_dir is passed, smaller markers, `font_size` annotations and
    smaller tick labels are used, and the figure is written to save_dir
    at 500 dpi. font_size has no effect otherwise. The figure is not shown
    here; that is left to the caller.
    """
    _fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    for draw_reference_line in (ax.axhline, ax.axvline):
        draw_reference_line(linewidth=0.5, color='#1f77b4')

    # Only the save-to-disk variant customises marker and annotation size.
    to_disk = bool(save_dir)
    marker_opts = {'s': 10} if to_disk else {}
    text_opts = {'fontsize': font_size} if to_disk else {}

    ax.scatter(x_proj, y_proj, **marker_opts)
    for idx, text in enumerate(labels):
        ax.annotate(text, (x_proj[idx], y_proj[idx]), **text_opts)

    if to_disk:
        ax.tick_params(axis='both', which='major', labelsize=6)
        plt.savefig(save_dir, dpi=500)

def _get_in_vocab_vec(self, ideology, names):
"""filter out-of-vocabulary vectors """
in_vocab_names = []
in_vocab_ideology = []
in_vocab_vec = []
for i, name in enumerate(names):
if (self.vec_dict.get(name) is not None):
in_vocab_names.append(name)
in_vocab_ideology.append(ideology[i])
in_vocab_vec.append(self.vec_dict[name])
else:
print('model vocabulary is missing {}'.format(name))
return in_vocab_vec, in_vocab_ideology, in_vocab_names

def evaluate_ideology_projection(self, cgrs_sess):
"""Compare vector projected ideology against DW-NOMINATE."""
# Stack the vectors of the four axis words, in the order given by self._axes.
axis_vectors = np.array([self.vec_dict[a] for a in self._axes],
np.float32)
# NOTE(review): the next line is the tail of a call whose opening line is
# missing from this view; `ideology` and `names` used below are presumably
# its results (scores and member names for session `cgrs_sess`) -- confirm
# against the original notebook.
self._eval_file.format(cgrs_sess))
# Keep only the members whose name has a vector in the model vocabulary.
vectors, ideology, names = self._get_in_vocab_vec(ideology, names)
# Only the x-axis projection is used here; the y-axis one is discarded.
x_proj, _ = self._scalar_projection(vectors, axis_vectors)
# NOTE(review): with self._axes ordered ('conservative', 'liberal', ...),
# _scalar_projection computes the x axis as conservative minus liberal;
# confirm the x_label wording below matches the intended reading.
self._scatter_plot(x_proj, ideology, labels=names,
title='{}th U.S. Senate'.format(cgrs_sess),
x_label='vector projected ideology (liberal - conservative)',
y_label='DW-NOMINATE')
# Correlate the projected scores with DW-NOMINATE; the second element
# returned by each call is discarded.
pearson, _ = stats.pearsonr(x_proj, ideology)
spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
plt.show()
print("Pearson's r = {}".format(pearson))
print("Spearman's rho = {}\n\n".format(spearman))
return pearson, spearman

def multiyear_evaluation(self, cgrs_sess):
    """Invoke evaluate_ideology_projection over multiple sessions.

    cgrs_sess: any iterable of session numbers (range, list, generator...).
        Each element is passed to evaluate_ideology_projection as its
        `cgrs_sess` argument.

    Prints the average of the Pearson and Spearman coefficients returned
    for the individual sessions. Returns None. An empty iterable prints a
    short notice instead of failing with a division by zero.
    """
    # Materialise once so that one-shot iterables can be both iterated and
    # counted (len() is not defined for generators).
    sessions = list(cgrs_sess)
    if not sessions:
        print('No congressional sessions given; nothing to evaluate.')
        return

    sum_pearson = 0
    sum_spearman = 0
    for session in sessions:
        pearson, spearman = self.evaluate_ideology_projection(
            cgrs_sess=session)
        sum_pearson += pearson
        sum_spearman += spearman

    print('============\n')
    print("Average Pearson's r = {}".format(sum_pearson / len(sessions)))
    print("Average Spearman's rho = {}".format(sum_spearman / len(sessions)))
In [3]:
class FastTextProjector(Word2VecProjector):
"""Load a .vec file generated by fastText's print-word-vectors command.

The first 4 vectors must be those that define the axes, in the order of
[positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

This is taken care of by the gen_queries.py script included with
to the beginning of each queries to fastText's print-word-vectors.
"""

def __init__(self, vec_file, eval_file):
self._vec_file = vec_file
self._eval_file = eval_file

# NOTE(review): the `def` line of this loader is not present in this view.
# `file` is presumably its path parameter, and `vector` inside the first
# loop is never assigned in the visible lines -- presumably a line that
# reads and parses one line of `f` per iteration is missing; confirm against
# the original notebook.
vocab = []
vectors = []
axis_vectors = []
with open(file) as f:
for i in range(4): # use the first 4 vectors to define axes
axis_vectors.append(vector)
# Every remaining line of the file is parsed into a (vector, word) pair.
for line in f:
vector, word = self._parse_vector(line)
vocab.append(word)
vectors.append(vector)
# String components are converted to float32 arrays here.
vectors = np.array(vectors, np.float32)
axis_vectors = np.array(axis_vectors, np.float32)
# `vocab` is collected above but is not part of the return value.
return vectors, axis_vectors

def evaluate_ideology_projection(self, cgrs_sess,
title=None, save_dir=None):
"""Compare vector projected ideology against DW-NOMINATE."""
# NOTE(review): the next two lines are the tails of calls whose opening
# lines are missing from this view; `ideology`, `names`, `vectors` and
# `axis_vectors` used below are presumably their results -- confirm against
# the original notebook.
self._eval_file.format(cgrs_sess))
self._vec_file.format(cgrs_sess))
# Only the x-axis projection is used here; the y-axis one is discarded.
x_proj, _ = self._scalar_projection(vectors, axis_vectors)
# NOTE(review): the `title` parameter is not referenced in the visible body;
# the plot title below is always built from cgrs_sess.
try:
self._scatter_plot(x_proj, ideology, labels=names,
title='{}th U.S. Senate'.format(cgrs_sess),
x_label='vector projected ideology (liberal - conservative)',
y_label='DW-NOMINATE',
save_dir=save_dir)
except ValueError:
print('Failed to graph S{} \n'
'mismatch in length of x_proj and ideology'
.format(cgrs_sess))
# NOTE(review): only the plotting call is guarded by the try block; the
# correlation calls below receive the same x_proj / ideology and are not
# protected -- presumably a length mismatch fails here as well; verify.
pearson, _ = stats.pearsonr(x_proj, ideology)
spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
plt.show()
print("Pearson's r = {}".format(pearson))
print("Spearman's rho = {}\n\n".format(spearman))
return pearson, spearman

# Results

In [4]:
# DW-NOMINATE data is included in this repo.
# '{}' is a placeholder for the congressional session number; it is filled in
# with str.format() when a session is evaluated.
EVAL_FILE = './dw-nominate/S{}_members.csv'
In [5]:
# Vector files are included in this repo at ./queried_vectors
# ('{}' in the path is replaced by the session number at evaluation time.)
# Single year evaluation:
nyt_model = FastTextProjector(
vec_file='./queried_vectors/nyt_97-114/S{}.vec', eval_file=EVAL_FILE)
nyt_model.evaluate_ideology_projection(cgrs_sess=109)
Pearson's r = 0.8171525597572327
Spearman's rho = 0.7930787826461453

Out[5]:
(0.81715256, 0.79307878264614529)
In [6]:
# Multiyear evaluation:
# range(97, 115) covers sessions 97 through 114 inclusive. This reuses the
# nyt_model instance created in the previous cell.
nyt_model.multiyear_evaluation(cgrs_sess=range(97,115))
Pearson's r = 0.722510814666748
Spearman's rho = 0.7702277083963022

Pearson's r = 0.7646467685699463
Spearman's rho = 0.814777536861473

Pearson's r = 0.7757857441902161
Spearman's rho = 0.8175968334877668

Pearson's r = 0.737393856048584
Spearman's rho = 0.7834910475007509

Pearson's r = 0.7367940545082092
Spearman's rho = 0.7744353470405559

Pearson's r = 0.7085356712341309
Spearman's rho = 0.7411642925505213

Pearson's r = 0.7511775493621826
Spearman's rho = 0.7728208761092454

Pearson's r = 0.7658203840255737
Spearman's rho = 0.7686895197967026

Pearson's r = 0.7578223943710327
Spearman's rho = 0.7374518049257855

Pearson's r = 0.7666221261024475
Spearman's rho = 0.7514744483860694

These members' ideology scores are missing from the csv: ['BARKLEY, Dean']
Pearson's r = 0.7966147065162659
Spearman's rho = 0.7789447094095034