#!/usr/bin/env python
# coding: utf-8

# In[1]:

"""Political Vector Projector

Given word vectors trained by word2vec (Mikolov et al. 2013) or fastText
(Bojanowski et al. 2016), project the vectors of U.S. senators onto a
'conservative' to 'liberal' axis. The scalar components of such projections
may be interpreted as a valid metric of political ideology.

Learn more about this project at
https://empirical.coffee/blog/2017/political-vector-projector

Author: Albert Webson. MIT License.
"""

import csv
import os
import re

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter: keep matplotlib's default backend.
    pass

try:
    plt.style.use('seaborn')
except OSError:
    # Newer matplotlib releases ship the seaborn styles under a new name.
    plt.style.use('seaborn-v0_8')


def _ordinal(number):
    """Return *number* followed by its English ordinal suffix.

    For example 101 -> '101st', 102 -> '102nd', 112 -> '112th'. Values that
    cannot be converted to int fall back to a plain 'th' suffix.
    """
    try:
        value = int(number)
    except (TypeError, ValueError):
        return '{}th'.format(number)
    if 10 <= abs(value) % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(abs(value) % 10, 'th')
    return '{}{}'.format(value, suffix)


# In[2]:

class Word2VecProjector():
    """Read a vector file trained by word2vec and project it onto axes.

    Args:
        vec_file: path of a text vector file whose first line is
            '<vocab_size> <embed_dim>' and whose remaining lines are
            '<word> <v1> <v2> ...'.
        eval_file: path template of the DW-NOMINATE csv files; it is
            formatted with the congress session number.
    """

    def __init__(self, vec_file, eval_file):
        self._vec_file = vec_file
        self._eval_file = eval_file
        # Order: positive x, negative x, positive y, negative y.
        self._axes = ('conservative', 'liberal', 'good', 'bad')
        self.vec_dict = self._read_vec_file()

    def _parse_vector(self, line):
        """Split one line of a vector file.

        Returns:
            (vector, label): the components as a list of strings, and the
            word that the line starts with.
        """
        fields = line.strip().split()
        return fields[1:], fields[0]

    def _validate_shape(self, shape, vocab_size, embed_dim):
        """Warn if `shape` disagrees with the header line of the .vec file.

        Args:
            shape: (number of vectors read, length of a vector).
            vocab_size: vocabulary size announced by the header.
            embed_dim: embedding dimension announced by the header.
        """
        # An explicit comparison instead of `assert`, so that the check
        # still runs when Python is started with -O.
        if tuple(shape) != (int(vocab_size), int(embed_dim)):
            print('Warning: Something went wrong with file IO!')
            print('The .vec file header line specifies that vocab_size = {}, '
                  'embed_dim = {}'.format(vocab_size, embed_dim))
            print('But actually reading the file yielded a tensor of shape {}'
                  .format(shape))

    def _read_vec_file(self):
        """Load self._vec_file.

        Returns:
            dict {word: vector}, each vector being a float32 numpy array.
        """
        vec_dict = {}
        with open(self._vec_file) as f:
            vocab_size, embed_dim = f.readline().strip().split()
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. at the end
                vector, label = self._parse_vector(line)
                vec_dict[label] = np.array(vector, np.float32)
        # Measure the embedding size on whichever vector comes first rather
        # than on a hard-coded word that may be absent from the vocabulary.
        first_vector = next(iter(vec_dict.values()), None)
        actual_dim = 0 if first_vector is None else len(first_vector)
        self._validate_shape((len(vec_dict), actual_dim),
                             vocab_size, embed_dim)
        return vec_dict

    def _read_dwnominate_csv(self, csv_file):
        """Read a csv file of DW-NOMINATE data, available at voteview.com.

        Rows with an empty 'dim1' get a score of 0 and are reported on
        stdout. The '(missing score)' marker is appended to their name
        before the comma-stripping step below, so it only survives for
        names that contain no comma.

        Returns:
            (ideology_scores, names): a float32 numpy array of the 'dim1'
            column, and the list of lower-cased last names.
        """
        names = []
        ideology_scores = []
        missing = []
        # newline='' is what the csv module asks for on files it reads.
        with open(csv_file, newline='') as f:
            for row in csv.DictReader(f):
                if row['dim1']:
                    # use the first dimension of DW-NOMINATE
                    ideology_scores.append(row['dim1'])
                    names.append(row['bioname'])
                else:
                    missing.append(row['bioname'])
                    ideology_scores.append('0')
                    names.append(row['bioname'] + '(missing score)')
        if missing:
            print("These members' ideology scores are missing from the csv: {}"
                  .format(missing))
        ideology_scores = np.array(ideology_scores, np.float32)
        # Re-format names to only contain last name,
        # e.g. 'MOYNIHAN, Daniel Patrick' -> 'moynihan'
        names = [re.sub(',.*', '', i).lower() for i in names]
        return ideology_scores, names

    def _scalar_projection(self, vectors, axis_vectors):
        """Compute the scalar components of vector projections.

        Assuming the order of axis_vectors is:
        [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

        Returns:
            (x_proj, y_proj): dot products of `vectors` with the unit
            x axis and with the unit y axis.
        """
        x_axis = axis_vectors[0] - axis_vectors[1]
        x_axis = x_axis / np.linalg.norm(x_axis)
        y_axis = axis_vectors[2] - axis_vectors[3]
        y_axis = y_axis / np.linalg.norm(y_axis)
        x_proj = np.dot(vectors, x_axis)
        y_proj = np.dot(vectors, y_axis)
        return x_proj, y_proj

    def _scatter_plot(self, x_proj, y_proj, labels, title=None,
                      x_label=None, y_label=None, font_size=7, save_dir=None):
        """Draw a scatter plot with one text label per point.

        If save_dir is passed, the graph uses smaller markers and fonts
        and is saved to disk at 500 dpi; `font_size` only applies in that
        case.
        """
        _, ax = plt.subplots()
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.axhline(linewidth=0.5, color='#1f77b4')
        ax.axvline(linewidth=0.5, color='#1f77b4')
        if save_dir:
            scatter_kwargs = {'s': 10}
            annotate_kwargs = {'fontsize': font_size}
        else:
            scatter_kwargs = {}
            annotate_kwargs = {}
        ax.scatter(x_proj, y_proj, **scatter_kwargs)
        for label, x, y in zip(labels, x_proj, y_proj):
            ax.annotate(label, (x, y), **annotate_kwargs)
        if save_dir:
            ax.tick_params(axis='both', which='major', labelsize=6)
            plt.savefig(save_dir, dpi=500)

    def _get_in_vocab_vec(self, ideology, names):
        """Filter out the names that have no vector in self.vec_dict.

        Returns:
            (vectors, ideology, names) restricted to in-vocabulary names,
            as three parallel lists.
        """
        in_vocab_names = []
        in_vocab_ideology = []
        in_vocab_vec = []
        for score, name in zip(ideology, names):
            if name in self.vec_dict:
                in_vocab_names.append(name)
                in_vocab_ideology.append(score)
                in_vocab_vec.append(self.vec_dict[name])
            else:
                print('model vocabulary is missing {}'.format(name))
        return in_vocab_vec, in_vocab_ideology, in_vocab_names

    def _correlate_and_report(self, x_proj, ideology):
        """Print and return Pearson's r and Spearman's rho, showing the plot."""
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman

    def evaluate_ideology_projection(self, cgrs_sess, title=None,
                                     save_dir=None):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congress session number, used to format the
                DW-NOMINATE file name and the default plot title.
            title: plot title; defaults to '<ordinal> U.S. Senate'.
            save_dir: if given, path the plot is saved to.

        Returns:
            (pearson, spearman) correlation coefficients.
        """
        axis_vectors = np.array([self.vec_dict[a] for a in self._axes],
                                np.float32)
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, ideology, names = self._get_in_vocab_vec(ideology, names)
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        if title is None:
            title = '{} U.S. Senate'.format(_ordinal(cgrs_sess))
        self._scatter_plot(
            x_proj, ideology, labels=names, title=title,
            x_label='vector projected ideology (liberal - conservative)',
            y_label='DW-NOMINATE', save_dir=save_dir)
        return self._correlate_and_report(x_proj, ideology)

    def multiyear_evaluation(self, cgrs_sess):
        """Invoke evaluate_ideology_projection over multiple years.

        Args:
            cgrs_sess: iterable of congress session numbers.

        Prints the average of both correlation coefficients.
        """
        # Materialize once so that generators (which have no len()) work too.
        sessions = list(cgrs_sess)
        pearsons = []
        spearmans = []
        for session in sessions:
            pearson, spearman = self.evaluate_ideology_projection(
                cgrs_sess=session)
            pearsons.append(pearson)
            spearmans.append(spearman)
        print('============\n')
        if not sessions:
            # Nothing to average; avoid dividing by zero.
            print('No sessions were evaluated.')
            return
        print("Average Pearson's r = {}".format(
            sum(pearsons) / len(sessions)))
        print("Average Spearman's rho = {}".format(
            sum(spearmans) / len(sessions)))


# In[3]:

class FastTextProjector(Word2VecProjector):
    """Load a .vec file generated by fastText's print-word-vectors command.

    The first 4 vectors must be those that define the axes, in the order of
    [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]
    This is taken care of by the gen_queries.py script included with this
    project, which adds ['conservative', 'liberal', 'good', 'bad'] to the
    beginning of each query to fastText's print-word-vectors.
    """

    def __init__(self, vec_file, eval_file):
        # The parent constructor is deliberately not called: it would load
        # one vector file right away, whereas here `vec_file` is a template
        # formatted with the session number at evaluation time.
        self._vec_file = vec_file
        self._eval_file = eval_file

    def _read_vec_file(self, file):
        """Read the vectors queried for one session.

        Args:
            file: path of the .vec file; its first 4 lines are the axes.

        Returns:
            (vectors, axis_vectors) as float32 numpy arrays.
        """
        vectors = []
        axis_vectors = []
        with open(file) as f:
            for _ in range(4):  # use the first 4 vectors to define axes
                vector, _word = self._parse_vector(f.readline())
                axis_vectors.append(vector)
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines, e.g. at the end
                vector, _word = self._parse_vector(line)
                vectors.append(vector)
        vectors = np.array(vectors, np.float32)
        axis_vectors = np.array(axis_vectors, np.float32)
        return vectors, axis_vectors

    def evaluate_ideology_projection(self, cgrs_sess, title=None,
                                     save_dir=None):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congress session number, used to format both file
                name templates and the default plot title.
            title: plot title; defaults to '<ordinal> U.S. Senate'.
            save_dir: if given, path the plot is saved to.

        Returns:
            (pearson, spearman) correlation coefficients.
        """
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, axis_vectors = self._read_vec_file(
            self._vec_file.format(cgrs_sess))
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        if title is None:
            title = '{} U.S. Senate'.format(_ordinal(cgrs_sess))
        try:
            self._scatter_plot(
                x_proj, ideology, labels=names, title=title,
                x_label='vector projected ideology (liberal - conservative)',
                y_label='DW-NOMINATE', save_dir=save_dir)
        except ValueError:
            print('Failed to graph S{} \n'
                  'mismatch in length of x_proj and ideology'
                  .format(cgrs_sess))
        return self._correlate_and_report(x_proj, ideology)


# # Results

# In[4]:

# DW-NOMINATE data is included in this repo
EVAL_FILE = './dw-nominate/S{}_members.csv'


# In[5]:

# An example of loading pre-generated vectors from fastText
# included in this repo at ./queried_vectors
# Single year evaluation:
nyt_model = FastTextProjector(
    vec_file='./queried_vectors/nyt_97-114/S{}.vec',
    eval_file=EVAL_FILE)
nyt_model.evaluate_ideology_projection(cgrs_sess=109)


# In[6]:

# Multiyear evaluation:
nyt_model.multiyear_evaluation(cgrs_sess=range(97,115))


# In[7]:

wapo_model = FastTextProjector(
    vec_file='./queried_vectors/wapo_95-109/S{}.vec',
    eval_file=EVAL_FILE)
wapo_model.multiyear_evaluation(cgrs_sess=range(95,110))


# In[8]:

# An example of loading a word2vec model. Please provide your own
# pre-trained word2vec models and specify MODEL_DIR.
MODEL_DIR = '../word2vec_models/'
try:
    nyt_w2v_model = Word2VecProjector(
        vec_file=os.path.join(MODEL_DIR, 'w2v_nyt.txt'),
        eval_file=EVAL_FILE)
    nyt_w2v_model.multiyear_evaluation(cgrs_sess=range(97,115))
except IOError:
    print('Pre-trained model not found. Please check MODEL_DIR.')


# ## Projection of Public Policies
# Results from this method are quite amusing but still highly experimental.
# For a detailed account, please refer to
# https://empirical.coffee/blog/2017/political-vector-projector

# In[9]:

# NOTE(review): gensim.models.wrappers is believed to be gone from
# gensim 4.0 onwards -- confirm which gensim version this project pins.
from gensim.models.wrappers import FastText

MODEL_DIR = '../fastText/models/'


class ExperimentalFastTextProjector(FastTextProjector):
    """still a work in progress

    Queries a fastText model loaded through gensim instead of reading
    pre-generated .vec files.
    """

    def __init__(self, pretrained_model):
        self.model = FastText.load_fasttext_format(pretrained_model)
        self._axes = ('conservative', 'liberal', 'good', 'bad')

    def _query_fasttext_wrapper(self, queries):
        """Look up the vectors of the given queries and of the axis words.

        It's not very fast, but it's faster if you want to query vectors
        on-the-fly, and you don't want to manually query C++ fastText via
        the command line.

        Returns:
            (vectors, axis_vectors) as float32 numpy arrays.
        """
        vectors = [self.model[i] for i in queries]
        vectors = np.array(vectors, np.float32)
        axis_vectors = [self.model[i] for i in self._axes]
        axis_vectors = np.array(axis_vectors, np.float32)
        return vectors, axis_vectors

    def project_queries(self, query_file, title=None, save_dir=None):
        """Plot every query of `query_file` on the two ideology axes.

        Args:
            query_file: text file with one query per line; blank lines and
                lines containing '#' are skipped.
            title: plot title.
            save_dir: if given, path the plot is saved to.
        """
        with open(query_file) as f:
            stripped = (line.strip() for line in f)
            # Strip before testing, so that whitespace-only lines are
            # skipped instead of becoming empty queries.
            queries = [q for q in stripped if q and '#' not in q]
        vectors, axis_vectors = self._query_fasttext_wrapper(queries)
        x_proj, y_proj = self._scalar_projection(vectors, axis_vectors)
        self._scatter_plot(x_proj, y_proj, labels=queries, title=title,
                           x_label='liberal - conservative',
                           y_label='bad - good',
                           font_size=7, save_dir=save_dir)


# In[10]:

nyt = ExperimentalFastTextProjector(os.path.join(MODEL_DIR, 'nyt_1981-2016'))


# In[11]:

nyt.project_queries(query_file='./queries/salient_policies.md',
                    title='Salient Policies and Talking Points (NYT)',
                    save_dir='./graphs/NYT policies.png')


# In[12]:

wsj = ExperimentalFastTextProjector(os.path.join(MODEL_DIR, 'wsj_1997-2017'))


# In[13]:

wsj.project_queries(query_file='./queries/salient_policies.md',
                    title='Salient Policies and Talking Points (WSJ)',
                    save_dir='./graphs/WSJ policies.png')