"""Political Vector Projector
Given word vectors trained by word2vec (Mikolov et al. 2013)
or fastText (Bojanowski et al. 2016), project the vectors of
U.S. senators onto a 'conservative' to 'liberal' axis.
The scalar components of such projections may be interpreted
as a valid metric of political ideology.
Learn more about this project at
https://empirical.coffee/blog/2017/political-vector-projector
Author: Albert Webson. MIT License.
"""
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import csv
import re
import os
%matplotlib inline
# Matplotlib 3.6 deprecated the bundled 'seaborn' style name and later
# releases dropped it in favour of 'seaborn-v0_8'. Try the original name
# first so older installations keep the exact same look, then fall back.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
class Word2VecProjector():
    """Project word2vec vectors of U.S. senators onto an ideology axis.

    The vector file is read once at construction time. It is expected to
    start with a ``vocab_size embed_dim`` header line, followed by one
    ``word v1 v2 ...`` line per vocabulary entry.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, vec_file, eval_file):
        """Store the file locations and load every vector into memory.

        Args:
            vec_file: path of the text-format vector file.
            eval_file: path template of the DW-NOMINATE csv; ``{}`` is
                replaced by the congress session number.
        """
        self._vec_file = vec_file
        self._eval_file = eval_file
        # Order matters: [positive_x, negative_x, positive_y, negative_y].
        self._axes = ('conservative', 'liberal', 'good', 'bad')
        # {word: np.float32 vector}
        self.vec_dict = self._read_vec_file()

    def _parse_vector(self, line):
        """Split one line of a vector file.

        Returns:
            (vector, label): the vector as a list of strings and the word
            that starts the line.
        """
        tokens = line.strip().split()
        return tokens[1:], tokens[0]

    def _validate_shape(self, shape, vocab_size, embed_dim):
        """Verify the tensor shape against the first line of the .vec file.

        Prints a warning instead of raising so that a slightly inconsistent
        file can still be used.

        Args:
            shape: (number of vectors read, length of one vector).
            vocab_size: vocabulary size announced by the header.
            embed_dim: embedding dimension announced by the header.

        Returns:
            True if the shapes agree, False otherwise.
        """
        # An explicit comparison is used rather than `assert`, because
        # assertions are removed when Python runs with -O.
        if shape == (int(vocab_size), int(embed_dim)):
            return True
        print('Warning: Something went wrong with file IO!')
        print('The .vec file header line specifies that vocab_size = {}, '
              'embed_dim = {}'.format(vocab_size, embed_dim))
        print('But actually reading the file yielded a tensor of shape {}'
              .format(shape))
        return False

    def _read_vec_file(self):
        """Read self._vec_file and return dict {word: vector}."""
        vec_dict = {}
        with open(self._vec_file) as f:
            vocab_size, embed_dim = f.readline().split()
            for line in f:
                if not line.strip():
                    continue  # tolerate blank (e.g. trailing) lines
                vector, label = self._parse_vector(line)
                vec_dict[label] = np.array(vector, np.float32)
        # Measure the embedding dimension on whichever vector comes first
        # instead of relying on one particular word being in the vocabulary.
        actual_dim = len(next(iter(vec_dict.values()))) if vec_dict else 0
        self._validate_shape((len(vec_dict), actual_dim),
                             vocab_size, embed_dim)
        return vec_dict

    def _read_dwnominate_csv(self, csv_file):
        """Read a csv file of DW-NOMINATE data, available at voteview.com.

        Members without a first-dimension score are kept with a score of 0
        and reported on stdout.

        Returns:
            (ideology_scores, names): a float32 array of 'dim1' scores and
            a parallel list of lower-cased names truncated at the first
            comma.
        """
        names = []
        ideology_scores = []
        missing = []
        # newline='' is what the csv module asks for when opening files.
        with open(csv_file, newline='') as f:
            for row in csv.DictReader(f):
                bioname = row['bioname']
                if row['dim1']:  # use the first dimension of DW-NOMINATE
                    ideology_scores.append(row['dim1'])
                    names.append(bioname)
                else:
                    missing.append(bioname)
                    ideology_scores.append('0')
                    names.append(bioname + '(missing score)')
        if missing:
            print("These members' ideology scores are missing from the csv: {}"
                  .format(missing))
        ideology_scores = np.array(ideology_scores, np.float32)
        # Re-format names to only contain last name,
        # e.g. 'MOYNIHAN, Daniel Patrick' -> 'moynihan'
        names = [re.sub(r',.*', '', name).lower() for name in names]
        return ideology_scores, names

    def _scalar_projection(self, vectors, axis_vectors):
        """Compute the scalar components of vector projections.

        Assuming the order of axis_vectors is:
        [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

        Returns:
            (x_proj, y_proj): dot products of `vectors` with the unit
            x axis and the unit y axis.
        """
        x_axis = axis_vectors[0] - axis_vectors[1]
        x_axis = x_axis / np.linalg.norm(x_axis)
        y_axis = axis_vectors[2] - axis_vectors[3]
        y_axis = y_axis / np.linalg.norm(y_axis)
        return np.dot(vectors, x_axis), np.dot(vectors, y_axis)

    def _scatter_plot(self, x_proj, y_proj, labels, title=None, x_label=None,
                      y_label=None, font_size=7, save_dir=None):
        """Draw a labelled scatter plot.

        If save_dir is passed, the graph uses small markers, `font_size`
        labels and small tick labels, and is saved to disk at 500 dpi.
        Otherwise Matplotlib defaults are used and nothing is saved.
        """
        fig, ax = plt.subplots()
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.axhline(linewidth=0.5, color='#1f77b4')
        ax.axvline(linewidth=0.5, color='#1f77b4')
        # The two modes only differ in styling, so pick the keyword
        # arguments once and share the drawing code.
        scatter_kwargs = {'s': 10} if save_dir else {}
        annotate_kwargs = {'fontsize': font_size} if save_dir else {}
        ax.scatter(x_proj, y_proj, **scatter_kwargs)
        for i, label in enumerate(labels):
            ax.annotate(label, (x_proj[i], y_proj[i]), **annotate_kwargs)
        if save_dir:
            ax.tick_params(axis='both', which='major', labelsize=6)
            plt.savefig(save_dir, dpi=500)

    def _get_in_vocab_vec(self, ideology, names):
        """Filter out names that have no vector in the model vocabulary.

        Returns:
            (vectors, ideology, names): three parallel lists restricted to
            the names found in self.vec_dict. Each dropped name is
            reported on stdout.
        """
        in_vocab_names = []
        in_vocab_ideology = []
        in_vocab_vec = []
        for score, name in zip(ideology, names):
            vector = self.vec_dict.get(name)
            if vector is None:
                print('model vocabulary is missing {}'.format(name))
                continue
            in_vocab_names.append(name)
            in_vocab_ideology.append(score)
            in_vocab_vec.append(vector)
        return in_vocab_vec, in_vocab_ideology, in_vocab_names

    def evaluate_ideology_projection(self, cgrs_sess):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congress session number, substituted into the
                eval_file template.

        Returns:
            (pearson, spearman) correlation coefficients between the
            projected ideology and the DW-NOMINATE scores.
        """
        axis_vectors = np.array([self.vec_dict[a] for a in self._axes],
                                np.float32)
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, ideology, names = self._get_in_vocab_vec(ideology, names)
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        self._scatter_plot(
            x_proj, ideology, labels=names,
            title='{}th U.S. Senate'.format(cgrs_sess),
            x_label='vector projected ideology (liberal - conservative)',
            y_label='DW-NOMINATE')
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman

    def multiyear_evaluation(self, cgrs_sess):
        """Invoke evaluate_ideology_projection over multiple sessions.

        Prints the average Pearson and Spearman coefficients at the end.

        Args:
            cgrs_sess: iterable of congress session numbers.
        """
        # Materialise once so that generators work and the count is known.
        sessions = list(cgrs_sess)
        if not sessions:
            print('No congressional sessions given; nothing to evaluate.')
            return
        sum_pearson = 0
        sum_spearman = 0
        for session in sessions:
            pearson, spearman = self.evaluate_ideology_projection(
                cgrs_sess=session)
            sum_pearson += pearson
            sum_spearman += spearman
        print('============\n')
        print("Average Pearson's r = {}".format(
            sum_pearson / len(sessions)))
        print("Average Spearman's rho = {}".format(
            sum_spearman / len(sessions)))
class FastTextProjector(Word2VecProjector):
    """Load a .vec file generated by fastText's print-word-vectors command.

    The first 4 vectors must be those that define the axes, in the order of
    [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]
    This is taken care of by the gen_queries.py script included with
    this project, which adds ['conservative', 'liberal', 'good', 'bad']
    to the beginning of each query sent to fastText's print-word-vectors.
    """

    def __init__(self, vec_file, eval_file):
        """Store the two path templates; nothing is read yet.

        The base-class constructor is intentionally not called: it loads
        the vector file immediately through a zero-argument
        _read_vec_file(), whereas this class reads one file per session
        on demand.

        Args:
            vec_file: path template of the queried vectors; ``{}`` is
                replaced by the congress session number.
            eval_file: path template of the DW-NOMINATE csv, formatted
                the same way.
        """
        self._vec_file = vec_file
        self._eval_file = eval_file

    def _read_vec_file(self, file):
        """Read one queried-vector file.

        Args:
            file: path of the file to read.

        Returns:
            (vectors, axis_vectors): float32 arrays holding the vectors of
            every line after the fourth, and the 4 axis-defining vectors.
        """
        vectors = []
        axis_vectors = []
        with open(file) as f:
            for _ in range(4):  # use the first 4 vectors to define axes
                vector, _label = self._parse_vector(f.readline())
                axis_vectors.append(vector)
            for line in f:
                if not line.strip():
                    continue  # tolerate blank (e.g. trailing) lines
                vector, _word = self._parse_vector(line)
                vectors.append(vector)
        vectors = np.array(vectors, np.float32)
        axis_vectors = np.array(axis_vectors, np.float32)
        return vectors, axis_vectors

    def evaluate_ideology_projection(self, cgrs_sess,
                                     title=None, save_dir=None):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congress session number, substituted into both
                path templates.
            title: plot title; defaults to '<cgrs_sess>th U.S. Senate'.
            save_dir: if given, the plot is also saved to this path.

        Returns:
            (pearson, spearman) correlation coefficients between the
            projected ideology and the DW-NOMINATE scores.
        """
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, axis_vectors = self._read_vec_file(
            self._vec_file.format(cgrs_sess))
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        # Honour a caller-supplied title instead of silently ignoring it.
        if title is None:
            title = '{}th U.S. Senate'.format(cgrs_sess)
        try:
            self._scatter_plot(
                x_proj, ideology, labels=names,
                title=title,
                x_label='vector projected ideology (liberal - conservative)',
                y_label='DW-NOMINATE',
                save_dir=save_dir)
        except ValueError:
            # Vectors and csv rows are matched by position only.
            # NOTE(review): with mismatched lengths the correlations below
            # are expected to fail as well - confirm.
            print('Failed to graph S{} \n'
                  'mismatch in length of x_proj and ideology'
                  .format(cgrs_sess))
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman
# DW-NOMINATE data is included in this repo
# The '{}' placeholder is filled with the congress session number via
# str.format() inside the projector classes.
EVAL_FILE = './dw-nominate/S{}_members.csv'
# An example of loading pre-generated vectors from fastText
# included in this repo at ./queried_vectors
# Single year evaluation:
nyt_model = FastTextProjector(
vec_file='./queried_vectors/nyt_97-114/S{}.vec', eval_file=EVAL_FILE)
nyt_model.evaluate_ideology_projection(cgrs_sess=109)
Pearson's r = 0.8171525597572327 Spearman's rho = 0.7930787826461453
(0.81715256, 0.79307878264614529)
# Multiyear evaluation:
# range(97, 115) yields sessions 97 through 114, the same span as the
# 'nyt_97-114' directory in the model's vec_file template.
nyt_model.multiyear_evaluation(cgrs_sess=range(97,115))
Pearson's r = 0.722510814666748 Spearman's rho = 0.7702277083963022
Pearson's r = 0.7646467685699463 Spearman's rho = 0.814777536861473
Pearson's r = 0.7757857441902161 Spearman's rho = 0.8175968334877668
Pearson's r = 0.737393856048584 Spearman's rho = 0.7834910475007509
Pearson's r = 0.7367940545082092 Spearman's rho = 0.7744353470405559
Pearson's r = 0.7085356712341309 Spearman's rho = 0.7411642925505213
Pearson's r = 0.7511775493621826 Spearman's rho = 0.7728208761092454
Pearson's r = 0.7658203840255737 Spearman's rho = 0.7686895197967026
Pearson's r = 0.7578223943710327 Spearman's rho = 0.7374518049257855
Pearson's r = 0.7666221261024475 Spearman's rho = 0.7514744483860694 These members' ideology scores are missing from the csv: ['BARKLEY, Dean']
Pearson's r = 0.7966147065162659 Spearman's rho = 0.7789447094095034
Pearson's r = 0.7895196676254272 Spearman's rho = 0.7805536465188878
Pearson's r = 0.8171525597572327 Spearman's rho = 0.7930787826461453
Pearson's r = 0.7891560196876526 Spearman's rho = 0.7668504464646492
Pearson's r = 0.7518661022186279 Spearman's rho = 0.7215084717529929
Pearson's r = 0.7333480715751648 Spearman's rho = 0.7122471842347057
Pearson's r = 0.7240954637527466 Spearman's rho = 0.7185254041513824
Pearson's r = 0.7178351283073425 Spearman's rho = 0.6795264928864299 ============ Average Pearson's r = 0.7559276156955295 Average Spearman's rho = 0.7601869196177704
# Same multiyear evaluation on the vectors stored under 'wapo_95-109';
# range(95, 110) yields sessions 95 through 109.
wapo_model = FastTextProjector(
vec_file='./queried_vectors/wapo_95-109/S{}.vec', eval_file=EVAL_FILE)
wapo_model.multiyear_evaluation(cgrs_sess=range(95,110))
Pearson's r = 0.7261107563972473 Spearman's rho = 0.7057019931152531
Pearson's r = 0.7600694894790649 Spearman's rho = 0.7726804940329455
Pearson's r = 0.7895463109016418 Spearman's rho = 0.8098830075504513
Pearson's r = 0.8181456327438354 Spearman's rho = 0.8451310295215771
Pearson's r = 0.8257299661636353 Spearman's rho = 0.8510432570110609
Pearson's r = 0.7889075875282288 Spearman's rho = 0.8178535987107831
Pearson's r = 0.7826847434043884 Spearman's rho = 0.8057967194511483
Pearson's r = 0.7837004661560059 Spearman's rho = 0.8053865416206926
Pearson's r = 0.7878368496894836 Spearman's rho = 0.8148739998914757
Pearson's r = 0.7836102843284607 Spearman's rho = 0.7944232841602862
Pearson's r = 0.786763072013855 Spearman's rho = 0.7762408414906434
Pearson's r = 0.7884320020675659 Spearman's rho = 0.7763835652546375 These members' ideology scores are missing from the csv: ['BARKLEY, Dean']
Pearson's r = 0.8037465810775757 Spearman's rho = 0.8085161297438791
Pearson's r = 0.8080106973648071 Spearman's rho = 0.8121148761196986
Pearson's r = 0.819696843624115 Spearman's rho = 0.8080351716268449 ============ Average Pearson's r = 0.7901994188626608 Average Spearman's rho = 0.8002709672867585
# An example of loading a word2vec model. Please provide your own
# pre-trained word2vec models and specify MODEL_DIR.
MODEL_DIR = '../word2vec_models/'
try:
    # Only the constructor is guarded: it is the step that opens the model
    # file. Keeping the evaluation out of the `try` prevents an unrelated
    # IOError (e.g. a missing DW-NOMINATE csv) from being reported as a
    # missing model.
    nyt_w2v_model = Word2VecProjector(
        vec_file=os.path.join(MODEL_DIR, 'w2v_nyt.txt'), eval_file=EVAL_FILE)
except IOError:
    print('Pre-trained model not found. Please check MODEL_DIR.')
else:
    nyt_w2v_model.multiyear_evaluation(cgrs_sess=range(97,115))
model vocabulary is missing d'amato
Pearson's r = 0.7685012817382812 Spearman's rho = 0.7951344803219345 model vocabulary is missing d'amato
Pearson's r = 0.7917909622192383 Spearman's rho = 0.8188132103970914 model vocabulary is missing d'amato
Pearson's r = 0.8123881816864014 Spearman's rho = 0.8364929324133276 model vocabulary is missing d'amato
Pearson's r = 0.7887059450149536 Spearman's rho = 0.8162098096683893 model vocabulary is missing d'amato
Pearson's r = 0.7857689261436462 Spearman's rho = 0.8037376307733807 model vocabulary is missing d'amato
Pearson's r = 0.7683195471763611 Spearman's rho = 0.7847287799321377 model vocabulary is missing moseley braun model vocabulary is missing d'amato
Pearson's r = 0.7781537175178528 Spearman's rho = 0.7960791850990293 model vocabulary is missing moseley braun model vocabulary is missing d'amato
Pearson's r = 0.794052243232727 Spearman's rho = 0.7909516027924174 model vocabulary is missing moseley braun model vocabulary is missing d'amato
Pearson's r = 0.7774635553359985 Spearman's rho = 0.7660391594352975
Pearson's r = 0.7746751308441162 Spearman's rho = 0.7619246356437601 These members' ideology scores are missing from the csv: ['BARKLEY, Dean']
Pearson's r = 0.7974948883056641 Spearman's rho = 0.7815970763587149
Pearson's r = 0.8008320927619934 Spearman's rho = 0.7872398047734976
Pearson's r = 0.8238189816474915 Spearman's rho = 0.7941588092228233
Pearson's r = 0.7869190573692322 Spearman's rho = 0.7468012432705324
Pearson's r = 0.7542150020599365 Spearman's rho = 0.6994866121957366
Pearson's r = 0.7406333088874817 Spearman's rho = 0.7160967385980497
Pearson's r = 0.699846088886261 Spearman's rho = 0.6843862022600551
Pearson's r = 0.7146888375282288 Spearman's rho = 0.6872027840388328 ============ Average Pearson's r = 0.7754593193531036 Average Spearman's rho = 0.7703933720663892
Results from this method are quite amusing but still highly experimental. For a detailed account, please refer to https://empirical.coffee/blog/2017/political-vector-projector
from gensim.models.wrappers import FastText
# Rebinds MODEL_DIR (earlier set to the word2vec model directory) to the
# directory holding the fastText models used below.
MODEL_DIR = '../fastText/models/'
class ExperimentalFastTextProjector(FastTextProjector):
    """Project arbitrary queries using a fastText model loaded via gensim.

    Still a work in progress.
    """

    def __init__(self, pretrained_model):
        """Load the fastText model.

        Args:
            pretrained_model: location of the model, passed unchanged to
                gensim's FastText.load_fasttext_format.
        """
        self.model = FastText.load_fasttext_format(pretrained_model)
        # Order matters: [positive_x, negative_x, positive_y, negative_y].
        self._axes = ('conservative', 'liberal', 'good', 'bad')

    def _query_fasttext_wrapper(self, queries):
        """Return the vectors of the given queries from the loaded model.

        It's not very fast, but it's faster if you want to
        query vectors on-the-fly, and you don't want to
        manually query C++ fastText via the command line.

        Returns:
            (vectors, axis_vectors): float32 arrays for the queries and
            for the 4 axis-defining words in self._axes.
        """
        vectors = np.array([self.model[q] for q in queries], np.float32)
        axis_vectors = np.array([self.model[a] for a in self._axes],
                                np.float32)
        return vectors, axis_vectors

    def project_queries(self, query_file, title=None, save_dir=None):
        """Plot every query of `query_file` on the two ideology axes.

        Args:
            query_file: text file with one query per line. Blank lines
                and lines containing '#' are skipped.
            title: plot title.
            save_dir: if given, the plot is also saved to this path.
        """
        queries = []
        with open(query_file) as f:
            for raw_line in f:
                query = raw_line.strip()
                # Skip comments, and blank lines even when they still
                # carry whitespace; an empty string is not a usable query.
                if not query or '#' in query:
                    continue
                queries.append(query)
        # The file is closed before the (slow) model lookups start.
        vectors, axis_vectors = self._query_fasttext_wrapper(queries)
        x_proj, y_proj = self._scalar_projection(vectors, axis_vectors)
        self._scatter_plot(x_proj, y_proj, labels=queries, title=title,
                           x_label='liberal - conservative',
                           y_label='bad - good',
                           font_size=7, save_dir=save_dir)
# Project the same query file with two different fastText models.
# Because save_dir is passed, each figure is also written to disk
# at 500 dpi by _scatter_plot.
# NOTE(review): ./graphs/ presumably has to exist beforehand - confirm.
nyt = ExperimentalFastTextProjector(os.path.join(MODEL_DIR, 'nyt_1981-2016'))
nyt.project_queries(query_file='./queries/salient_policies.md',
title='Salient Policies and Talking Points (NYT)',
save_dir='./graphs/NYT policies.png')
wsj = ExperimentalFastTextProjector(os.path.join(MODEL_DIR, 'wsj_1997-2017'))
wsj.project_queries(query_file='./queries/salient_policies.md',
title='Salient Policies and Talking Points (WSJ)',
save_dir='./graphs/WSJ policies.png')