In [1]:
"""Political Vector Projector

Given word vectors trained by word2vec (Mikolov et al. 2013) 
or fastText (Bojanowski et al. 2016), project the vectors of 
U.S. senators onto a 'conservative' to 'liberal' axis. 
The scalar components of such projections may be interpreted 
as a valid metric of political ideology.

Learn more about this project at
https://empirical.coffee/blog/2017/political-vector-projector
Author: Albert Webson. MIT License.
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import csv
import re
import os

%matplotlib inline
# The bundled seaborn style was renamed to 'seaborn-v0_8' in matplotlib 3.6 and
# the old 'seaborn' alias was removed in 3.8. Pick whichever name this
# installation actually provides so the notebook runs on old and new versions.
plt.style.use(
    'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
In [2]:
class Word2VecProjector():
    """Project word2vec word vectors onto ideology axes and evaluate them.

    The vector file is read fully into memory when the object is created.
    Senators' last names are then looked up in that vocabulary, projected
    onto the axis running from 'liberal' to 'conservative', and compared
    against the first dimension of DW-NOMINATE.
    """

    def __init__(self, vec_file, eval_file):
        """Load the vector file and remember where the evaluation data lives.

        Args:
            vec_file: path to a text vector file whose first line is
                '<vocab_size> <embed_dim>' and whose remaining lines are
                '<word> <float> <float> ...'.
            eval_file: path template of the DW-NOMINATE csv files; it is
                filled in with the congressional session number through
                str.format() at evaluation time.
        """
        self._vec_file = vec_file
        self._eval_file = eval_file
        # Order matters: (positive x, negative x, positive y, negative y).
        self._axes = ('conservative', 'liberal', 'good', 'bad')
        self.vec_dict = self._read_vec_file()

    def _parse_vector(self, line):
        """Split one whitespace-separated line of a vector file.

        Return: (vector, label) where vector is the list of the remaining
        tokens (still strings) and label is the first token.
        """
        line = line.strip().split()
        return line[1:], line[0]

    def _validate_shape(self, shape, vocab_size, embed_dim):
        """Warn when the data read disagrees with the .vec header line.

        Args:
            shape: (number_of_vectors, vector_length) actually read.
            vocab_size, embed_dim: the two header tokens (strings or ints).

        Only prints a warning; it never raises on a mismatch.
        """
        # Explicit comparison instead of `assert`: assertions are removed
        # when Python runs with -O, which would silently disable the check.
        if shape == (int(vocab_size), int(embed_dim)):
            return
        print('Warning: Something went wrong with file IO!')
        print('The .vec file header line specifies that vocab_size = {}, '
            'embed_dim = {}'.format(vocab_size, embed_dim))
        print('But actually reading the file yielded a tensor of shape {}'
            .format(shape))

    def _read_vec_file(self):
        """Read self._vec_file and return dict {word: float32 vector}."""
        vec_dict = {}
        with open(self._vec_file) as f:
            vocab_size, embed_dim = f.readline().strip().split()
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                vector, label = self._parse_vector(line)
                vec_dict[label] = np.array(vector, np.float32)
        # Measure the embedding width on whichever vector was read first,
        # rather than on a specific word that may be absent from the vocab.
        first_vector = next(iter(vec_dict.values()), None)
        actual_dim = 0 if first_vector is None else len(first_vector)
        self._validate_shape((len(vec_dict), actual_dim),
                       vocab_size, embed_dim)
        return vec_dict

    def _read_dwnominate_csv(self, csv_file):
        """Read a csv file of DW-NOMINATE data, available at voteview.com.

        Uses the 'bioname' and 'dim1' columns. Members whose 'dim1' is
        empty are reported on stdout and given a score of 0.

        Return: (ideology_scores, names) where ideology_scores is a float32
        array and names is a list of lower-cased last names in row order.
        """
        names = []
        ideology_scores = []
        missing = []
        # newline='' is what the csv module documentation asks for so that
        # newlines embedded in quoted fields are interpreted correctly.
        with open(csv_file, newline='') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if (row['dim1']): # use the first dimension of DW-NOMINATE
                    ideology_scores.append(row['dim1'])
                    names.append(row['bioname'])
                else:
                    missing.append(row['bioname'])
                    ideology_scores.append('0')
                    # NOTE: for names containing a comma this suffix is
                    # dropped again by the last-name reduction below.
                    names.append(row['bioname'] + '(missing score)')
        if missing:
            print("These members' ideology scores are missing from the csv: {}"
                .format(missing))
        ideology_scores = np.array(ideology_scores, np.float32)
        # Re-format names to only contain last name,
        # e.g. 'MOYNIHAN, Daniel Patrick' -> 'moynihan'
        names = [re.sub(',.*', '', i).lower() for i in names]
        return ideology_scores, names

    def _scalar_projection(self, vectors, axis_vectors):
        """Compute the scalar components of vector projections.

        Assuming the order of axis_vectors is:
        [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

        Each axis is the normalized difference (positive - negative).

        Return: (x_proj, y_proj), the dot products of `vectors` with the
        unit x axis and the unit y axis.
        """
        x_axis = axis_vectors[0] - axis_vectors[1]
        x_axis = x_axis / np.linalg.norm(x_axis)
        y_axis = axis_vectors[2] - axis_vectors[3]
        y_axis = y_axis / np.linalg.norm(y_axis)
        x_proj = np.dot(vectors, x_axis)
        y_proj = np.dot(vectors, y_axis)
        return x_proj, y_proj

    def _scatter_plot(self, x_proj, y_proj, labels, title=None, x_label=None,
                     y_label=None, font_size=7, save_dir=None):
        """Draw a labelled scatter plot of y_proj against x_proj.

        If save_dir is passed, the points and tick labels are drawn smaller,
        annotations use font_size, and the figure is saved to save_dir at
        500 dpi. Otherwise matplotlib defaults are used and nothing is
        written to disk (font_size is ignored in that case).
        """
        fig, ax = plt.subplots()
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.axhline(linewidth=0.5, color='#1f77b4')
        ax.axvline(linewidth=0.5, color='#1f77b4')
        # Only the styling differs between the two modes, so select the
        # keyword arguments once and share the drawing code.
        if save_dir:
            scatter_kwargs = {'s': 10}
            annotate_kwargs = {'fontsize': font_size}
        else:
            scatter_kwargs = {}
            annotate_kwargs = {}
        ax.scatter(x_proj, y_proj, **scatter_kwargs)
        for i, label in enumerate(labels):
            ax.annotate(label, (x_proj[i], y_proj[i]), **annotate_kwargs)
        if save_dir:
            ax.tick_params(axis='both', which='major', labelsize=6)
            fig.savefig(save_dir, dpi=500)

    def _get_in_vocab_vec(self, ideology, names):
        """Keep only the members whose name is in the model vocabulary.

        Args:
            ideology: scores, index-aligned with names.
            names: vocabulary keys to look up in self.vec_dict.

        Return: (vectors, ideology, names) as three index-aligned lists.
        Every name not found in the vocabulary is reported on stdout.
        """
        in_vocab_names = []
        in_vocab_ideology = []
        in_vocab_vec = []
        for i, name in enumerate(names):
            if name in self.vec_dict:
                in_vocab_names.append(name)
                in_vocab_ideology.append(ideology[i])
                in_vocab_vec.append(self.vec_dict[name])
            else:
                print('model vocabulary is missing {}'.format(name))
        return in_vocab_vec, in_vocab_ideology, in_vocab_names

    def evaluate_ideology_projection(self, cgrs_sess):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congressional session number, substituted into the
                eval_file template and used in the plot title.

        Plots the projection against DW-NOMINATE, prints both correlation
        coefficients and returns them.

        Return: (pearson_r, spearman_rho)
        """
        axis_vectors = np.array([self.vec_dict[a] for a in self._axes],
            np.float32)
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, ideology, names = self._get_in_vocab_vec(ideology, names)
        # The x axis is ('conservative' - 'liberal'), so larger projected
        # values point towards 'conservative'.
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        self._scatter_plot(x_proj, ideology, labels=names,
            title='{}th U.S. Senate'.format(cgrs_sess),
            x_label='vector projected ideology (liberal - conservative)',
            y_label='DW-NOMINATE')
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman

    def multiyear_evaluation(self, cgrs_sess):
        """Invoke evaluate_ideology_projection over multiple years.

        Args:
            cgrs_sess: any iterable of congressional session numbers
                (list, range, generator, ...).

        Prints the per-session results followed by the average Pearson's r
        and Spearman's rho. Returns None.
        """
        # Materialize once so one-shot iterables can be both iterated and
        # counted.
        sessions = list(cgrs_sess)
        if not sessions:
            # Nothing to average; avoid dividing by zero below.
            print('No congressional sessions were given; nothing to evaluate.')
            return
        sum_pearson = 0
        sum_spearman = 0
        for i in sessions:
            pearson, spearman = self.evaluate_ideology_projection(cgrs_sess=i)
            sum_pearson += pearson
            sum_spearman += spearman
        print('============\n')
        print("Average Pearson's r = {}".format(
            sum_pearson / (len(sessions))))
        print("Average Spearman's rho = {}".format(
            sum_spearman / (len(sessions))))
In [3]:
class FastTextProjector(Word2VecProjector):
    """Load a .vec file generated by fastText's print-word-vectors command.

    The first 4 vectors must be those that define the axes, in the order of
    [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

    This is taken care of by the gen_queries.py script included with
    this project, which adds ['conservative', 'liberal', 'good', 'bad']
    to the beginning of each query to fastText's print-word-vectors.

    The vectors after the first four are paired with the rows of the
    DW-NOMINATE csv purely by position.
    """

    def __init__(self, vec_file, eval_file):
        """Store the two path templates; no file is read here.

        Args:
            vec_file: path template of the per-session .vec files.
            eval_file: path template of the DW-NOMINATE csv files.

        Both templates are filled in with the congressional session number
        through str.format() when a session is evaluated. The parent
        constructor is deliberately not called, because it would try to
        read vec_file as a single file immediately.
        """
        self._vec_file = vec_file
        self._eval_file = eval_file

    def _read_vec_file(self, file):
        """Read one per-session .vec file.

        Args:
            file: path of the .vec file to read.

        Return: (vectors, axis_vectors) as float32 arrays, where
        axis_vectors holds the first 4 vectors of the file and vectors
        holds all the remaining ones in file order.
        """
        vectors = []
        axis_vectors = []
        with open(file) as f:
            for i in range(4): # use the first 4 vectors to define axes
                vector, _ = self._parse_vector(f.readline())
                axis_vectors.append(vector)
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # The word itself is not needed: labels come from the csv.
                vector, _ = self._parse_vector(line)
                vectors.append(vector)
        vectors = np.array(vectors, np.float32)
        axis_vectors = np.array(axis_vectors, np.float32)
        return vectors, axis_vectors

    def evaluate_ideology_projection(self, cgrs_sess,
        title=None, save_dir=None):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congressional session number, substituted into both
                the vec_file and the eval_file templates.
            title: plot title; when None the title is
                '<cgrs_sess>th U.S. Senate'.
            save_dir: if given, the plot is also saved to this path.

        Return: (pearson_r, spearman_rho)
        """
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, axis_vectors = self._read_vec_file(
            self._vec_file.format(cgrs_sess))
        # The x axis is (first axis vector - second axis vector), so larger
        # projected values point towards the first axis word.
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        if title is None:
            title = '{}th U.S. Senate'.format(cgrs_sess)
        try:
            self._scatter_plot(x_proj, ideology, labels=names,
                title=title,
                x_label='vector projected ideology (liberal - conservative)',
                y_label='DW-NOMINATE',
                save_dir=save_dir)
        except ValueError:
            # Only the plotting failure is reported here; the correlation
            # calls below are not guarded and may still raise.
            print('Failed to graph S{} \n'
                  'mismatch in length of x_proj and ideology'
                  .format(cgrs_sess))
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman

Results

In [4]:
# DW-NOMINATE data is included in this repo.
# The '{}' placeholder is filled in with the congressional session number
# (via str.format) when a session is evaluated.
EVAL_FILE = './dw-nominate/S{}_members.csv'
In [5]:
# An example of loading pre-generated vectors from fastText
# included in this repo at ./queried_vectors
# The '{}' in vec_file is replaced by the session number at evaluation time.
# Single year evaluation:
nyt_model = FastTextProjector(
    vec_file='./queried_vectors/nyt_97-114/S{}.vec', eval_file=EVAL_FILE)
nyt_model.evaluate_ideology_projection(cgrs_sess=109)
Pearson's r = 0.8171525597572327
Spearman's rho = 0.7930787826461453


Out[5]:
(0.81715256, 0.79307878264614529)
In [6]:
# Multiyear evaluation over sessions 97 through 114 (the range end, 115,
# is exclusive); prints per-session results and then the averages.
nyt_model.multiyear_evaluation(cgrs_sess=range(97,115))
Pearson's r = 0.722510814666748
Spearman's rho = 0.7702277083963022


Pearson's r = 0.7646467685699463
Spearman's rho = 0.814777536861473


Pearson's r = 0.7757857441902161
Spearman's rho = 0.8175968334877668


Pearson's r = 0.737393856048584
Spearman's rho = 0.7834910475007509


Pearson's r = 0.7367940545082092
Spearman's rho = 0.7744353470405559


Pearson's r = 0.7085356712341309
Spearman's rho = 0.7411642925505213


Pearson's r = 0.7511775493621826
Spearman's rho = 0.7728208761092454


Pearson's r = 0.7658203840255737
Spearman's rho = 0.7686895197967026


Pearson's r = 0.7578223943710327
Spearman's rho = 0.7374518049257855


Pearson's r = 0.7666221261024475
Spearman's rho = 0.7514744483860694


These members' ideology scores are missing from the csv: ['BARKLEY, Dean']
Pearson's r = 0.7966147065162659
Spearman's rho = 0.7789447094095034