#!/usr/bin/env python
# coding: utf-8

# In[1]:


"""Political Vector Projector

Given word vectors trained by word2vec (Mikolov et al. 2013) 
or fastText (Bojanowski et al. 2016), project the vectors of 
U.S. senators onto a 'conservative' to 'liberal' axis. 
The scalar components of such projections may be interpreted 
as a valid metric of political ideology.

Learn more about this project at
https://empirical.coffee/blog/2017/political-vector-projector
Author: Albert Webson. MIT License.
"""

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import csv
import re
import os

# Enable inline plots when running under IPython/Jupyter. `get_ipython` is
# only defined there, so skip the magic when this export runs as a script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in newer matplotlib
# releases; fall back to the new name when the old one is no longer available.
try:
    plt.style.use('seaborn')
except (OSError, ValueError):
    plt.style.use('seaborn-v0_8')


# In[2]:


class Word2VecProjector():
    """Project word vectors read from a word2vec text file onto axes.

    The vector file is expected to start with a header line
    '<vocab_size> <embed_dim>' followed by one '<word> <v1> ... <vn>' line
    per word. The whole file is loaded into memory on construction.

    Args:
        vec_file: path of the word2vec text file.
        eval_file: path template of the DW-NOMINATE csv files. It is
            passed through str.format() with the congressional session
            number before being opened.
    """

    def __init__(self, vec_file, eval_file):
        self._vec_file = vec_file
        self._eval_file = eval_file
        # Order matters: [positive_x, negative_x, positive_y, negative_y].
        self._axes = ('conservative', 'liberal', 'good', 'bad')
        self.vec_dict = self._read_vec_file()

    def _parse_vector(self, line):
        """Split one whitespace-separated line of a vector file.

        Return: vector as a list of strings, label
        """
        fields = line.strip().split()
        return fields[1:], fields[0]

    def _validate_shape(self, shape, vocab_size, embed_dim):
        """Verify the shape of tensor as specified by the first line in .vec

        Only prints a warning on mismatch; it never raises for a mismatch.
        An explicit comparison is used instead of `assert` so the check
        still runs when Python is started with -O.
        """
        if tuple(shape) == (int(vocab_size), int(embed_dim)):
            return
        print('Warning: Something went wrong with file IO!')
        print('The .vec file header line specifies that vocab_size = {}, '
            'embed_dim = {}'.format(vocab_size, embed_dim))
        print('But actually reading the file yielded a tensor of shape {}'
            .format(shape))

    def _read_vec_file(self):
        """Load self._vec_file and return dict {word: vector}.

        Blank lines are skipped. The observed shape is compared with the
        header line and a warning is printed if they disagree.
        """
        vec_dict = {}
        with open(self._vec_file) as f:
            vocab_size, embed_dim = f.readline().strip().split()
            for line in f:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                vector, label = self._parse_vector(line)
                vec_dict[label] = np.array(vector, np.float32)
        # Probe the dimension with whichever vector is available rather
        # than a fixed word that may be absent from the vocabulary.
        if vec_dict:
            actual_dim = len(next(iter(vec_dict.values())))
        else:
            actual_dim = 0
        self._validate_shape((len(vec_dict), actual_dim),
                             vocab_size, embed_dim)
        return vec_dict

    def _read_dwnominate_csv(self, csv_file):
        """Read a csv file of DW-NOMINATE data, available at voteview.com

        Uses the 'bioname' and 'dim1' columns. Rows with an empty 'dim1'
        are kept with a score of 0 and reported on stdout.

        Return: (float32 array of scores, list of lower-cased last names)
        """
        names = []
        ideology_scores = []
        missing = []
        with open(csv_file) as f:
            for row in csv.DictReader(f):
                score = row['dim1']  # first dimension of DW-NOMINATE
                name = row['bioname']
                if score:
                    ideology_scores.append(score)
                    names.append(name)
                else:
                    missing.append(name)
                    ideology_scores.append('0')
                    names.append(name + '(missing score)')
        if missing:
            print("These members' ideology scores are missing from the csv: {}"
                .format(missing))
        ideology_scores = np.array(ideology_scores, np.float32)
        # Re-format names to only contain last name,
        # e.g. 'MOYNIHAN, Daniel Patrick' -> 'moynihan'
        names = [re.sub(',.*', '', i).lower() for i in names]
        return ideology_scores, names

    def _scalar_projection(self, vectors, axis_vectors):
        """Compute the scalar components of vector projections.

        Assuming the order of axis_vectors is:
        [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

        Each axis is the normalized difference (positive - negative), and
        the projection is the dot product of `vectors` with that axis.

        Return: x_proj, y_proj
        """
        x_axis = axis_vectors[0] - axis_vectors[1]
        x_axis = x_axis / np.linalg.norm(x_axis)
        y_axis = axis_vectors[2] - axis_vectors[3]
        y_axis = y_axis / np.linalg.norm(y_axis)
        x_proj = np.dot(vectors, x_axis)
        y_proj = np.dot(vectors, y_axis)
        return x_proj, y_proj

    def _scatter_plot(self, x_proj, y_proj, labels, title=None, x_label=None,
                     y_label=None, font_size=7, save_dir=None):
        """Draw a scatter plot with one text label per point.

        If save_dir is passed, the graph uses smaller markers and fonts
        and is saved to that path at dpi=500.
        """
        _, ax = plt.subplots()
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        # Thin lines through the origin for orientation.
        ax.axhline(linewidth=0.5, color='#1f77b4')
        ax.axvline(linewidth=0.5, color='#1f77b4')
        if save_dir:
            ax.scatter(x_proj, y_proj, s=10)
            annotate_kwargs = {'fontsize': font_size}
        else:
            ax.scatter(x_proj, y_proj)
            annotate_kwargs = {}
        for i, label in enumerate(labels):
            ax.annotate(label, (x_proj[i], y_proj[i]), **annotate_kwargs)
        if save_dir:
            ax.tick_params(axis='both', which='major', labelsize=6)
            plt.savefig(save_dir, dpi=500)

    def _get_in_vocab_vec(self, ideology, names):
        """Filter out names that have no vector in self.vec_dict.

        Out-of-vocabulary names are reported on stdout and dropped along
        with their ideology score.

        Return: (list of vectors, list of scores, list of names)
        """
        in_vocab_names = []
        in_vocab_ideology = []
        in_vocab_vec = []
        for score, name in zip(ideology, names):
            if name in self.vec_dict:
                in_vocab_names.append(name)
                in_vocab_ideology.append(score)
                in_vocab_vec.append(self.vec_dict[name])
            else:
                print('model vocabulary is missing {}'.format(name))
        return in_vocab_vec, in_vocab_ideology, in_vocab_names

    def evaluate_ideology_projection(self, cgrs_sess):
        """Compare vector projected ideology against DW-NOMINATE.

        Plots the two measures against each other for the given
        congressional session and prints both correlation coefficients.

        Return: (Pearson's r, Spearman's rho)
        """
        axis_vectors = np.array([self.vec_dict[a] for a in self._axes],
            np.float32)
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, ideology, names = self._get_in_vocab_vec(ideology, names)
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        self._scatter_plot(x_proj, ideology, labels=names,
            title='{}th U.S. Senate'.format(cgrs_sess),
            x_label='vector projected ideology (liberal - conservative)',
            y_label='DW-NOMINATE')
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman

    def multiyear_evaluation(self, cgrs_sess):
        """Invoke evaluate_ideology_projection over multiple sessions.

        Args:
            cgrs_sess: iterable of congressional session numbers.

        Prints the average Pearson's r and Spearman's rho. Nothing is
        evaluated (and a notice is printed) when no sessions are given.
        """
        # Materialize once so iterators/generators work and can be counted.
        sessions = list(cgrs_sess)
        if not sessions:
            print('No congressional sessions given; nothing to evaluate.')
            return
        sum_pearson = 0
        sum_spearman = 0
        for sess in sessions:
            pearson, spearman = self.evaluate_ideology_projection(
                cgrs_sess=sess)
            sum_pearson += pearson
            sum_spearman += spearman
        print('============\n')
        print("Average Pearson's r = {}".format(
            sum_pearson / len(sessions)))
        print("Average Spearman's rho = {}".format(
            sum_spearman / len(sessions)))


# In[3]:


class FastTextProjector(Word2VecProjector):
    """Load a .vec file generated by fastText's print-word-vectors command.

    The first 4 vectors must be those that define the axes, in the order of
    [positive_x_axis, negative_x_axis, positive_y_axis, negative_y_axis]

    This is taken care of by the gen_queries.py script included with
    this project, which adds ['conservative', 'liberal', 'good', 'bad']
    to the beginning of each query to fastText's print-word-vectors.

    Args:
        vec_file: path template of the queried vector files; formatted
            with the congressional session number before being opened.
        eval_file: path template of the DW-NOMINATE csv files; formatted
            the same way.
    """

    def __init__(self, vec_file, eval_file):
        # The parent initializer is intentionally not invoked: it calls
        # _read_vec_file() with no argument, whereas this class reads one
        # vector file per session and therefore requires a path.
        self._vec_file = vec_file
        self._eval_file = eval_file

    def _read_vec_file(self, file):
        """Read one queried .vec file.

        The first 4 lines are taken as the axis vectors; every remaining
        line contributes one vector, in file order.

        Return: (float32 array of vectors, float32 array of axis vectors)
        """
        vectors = []
        axis_vectors = []
        with open(file) as f:
            for _ in range(4): # use the first 4 vectors to define axes
                vector, _label = self._parse_vector(f.readline())
                axis_vectors.append(vector)
            for line in f:
                vector, _label = self._parse_vector(line)
                vectors.append(vector)
        vectors = np.array(vectors, np.float32)
        axis_vectors = np.array(axis_vectors, np.float32)
        return vectors, axis_vectors

    def evaluate_ideology_projection(self, cgrs_sess,
        title=None, save_dir=None):
        """Compare vector projected ideology against DW-NOMINATE.

        Args:
            cgrs_sess: congressional session number, substituted into both
                path templates.
            title: plot title; defaults to '<cgrs_sess>th U.S. Senate'.
            save_dir: if given, the plot is also saved to this path.

        Return: (Pearson's r, Spearman's rho)
        """
        if title is None:
            title = '{}th U.S. Senate'.format(cgrs_sess)
        ideology, names = self._read_dwnominate_csv(
            self._eval_file.format(cgrs_sess))
        vectors, axis_vectors = self._read_vec_file(
            self._vec_file.format(cgrs_sess))
        x_proj, _ = self._scalar_projection(vectors, axis_vectors)
        try:
            self._scatter_plot(x_proj, ideology, labels=names,
                title=title,
                x_label='vector projected ideology (liberal - conservative)',
                y_label='DW-NOMINATE',
                save_dir=save_dir)
        except ValueError:
            print('Failed to graph S{} \n'
                  'mismatch in length of x_proj and ideology'
                  .format(cgrs_sess))
        # NOTE(review): if the lengths really do differ, the correlation
        # calls below are expected to fail as well -- confirm whether the
        # vector file and the csv are guaranteed to list the same members.
        pearson, _ = stats.pearsonr(x_proj, ideology)
        spearman, _ = stats.mstats.spearmanr(x_proj, ideology)
        plt.show()
        print("Pearson's r = {}".format(pearson))
        print("Spearman's rho = {}\n\n".format(spearman))
        return pearson, spearman


# # Results

# In[4]:


# DW-NOMINATE data is included in this repo.
# The '{}' placeholder is filled with the congressional session number
# (via str.format) by the projector classes before the file is opened.
EVAL_FILE = './dw-nominate/S{}_members.csv'


# In[5]:


# An example of loading pre-generated vectors from fastText,
# included in this repo at ./queried_vectors
# ('{}' in vec_file is replaced by the session number at evaluation time).
# Single-session evaluation (session 109):
nyt_model = FastTextProjector(
    vec_file='./queried_vectors/nyt_97-114/S{}.vec', eval_file=EVAL_FILE)
nyt_model.evaluate_ideology_projection(cgrs_sess=109)


# In[6]:


# Multiyear evaluation: sessions 97 through 114 (the range end is exclusive).
# Prints per-session correlations followed by their averages.
nyt_model.multiyear_evaluation(cgrs_sess=range(97,115))


# In[7]:


# Same multiyear evaluation using the pre-generated vectors under
# ./queried_vectors/wapo_95-109, sessions 95 through 109.
wapo_model = FastTextProjector(
    vec_file='./queried_vectors/wapo_95-109/S{}.vec', eval_file=EVAL_FILE)
wapo_model.multiyear_evaluation(cgrs_sess=range(95,110))


# In[8]:


# An example of loading a word2vec model. Please provide your own
# pre-trained word2vec models and specify MODEL_DIR.
# Constructing the projector reads the whole vector file, so a missing
# file surfaces here as IOError and is reported instead of aborting.
MODEL_DIR = '../word2vec_models/'
try:
    nyt_w2v_model = Word2VecProjector(
        vec_file=os.path.join(MODEL_DIR, 'w2v_nyt.txt'), eval_file=EVAL_FILE)
    nyt_w2v_model.multiyear_evaluation(cgrs_sess=range(97,115))
except IOError:
    print('Pre-trained model not found. Please check MODEL_DIR.')


# ## Projection of Public Policies
# Results from this method are quite amusing but still highly experimental. For a detailed account, please refer to https://empirical.coffee/blog/2017/political-vector-projector

# In[9]:


from gensim.models.wrappers import FastText

# Directory holding the pre-trained fastText models loaded below.
# NOTE: this rebinds the MODEL_DIR used by the word2vec example above.
MODEL_DIR = '../fastText/models/'

class ExperimentalFastTextProjector(FastTextProjector):
    """Project arbitrary queries using a pre-trained fastText model.

    Still a work in progress. Unlike the parent classes, vectors are
    looked up on-the-fly from a model loaded through gensim's FastText
    wrapper instead of being read from a .vec file.

    Args:
        pretrained_model: path handed to FastText.load_fasttext_format.
    """

    def __init__(self, pretrained_model):
        self.model = FastText.load_fasttext_format(pretrained_model)
        # Order matters: [positive_x, negative_x, positive_y, negative_y].
        self._axes = ('conservative', 'liberal', 'good', 'bad')

    def _query_fasttext_wrapper(self, queries):
        """Look up the vectors of the given queries and of the axis words.

        It's not very fast, but it's faster if you want to
        query vectors on-the-fly, and you don't want to
        manually query C++ fastText via the command line.

        Return: (float32 array of query vectors, float32 array of axes)
        """
        vectors = np.array([self.model[q] for q in queries], np.float32)
        axis_vectors = np.array([self.model[a] for a in self._axes],
                                np.float32)
        return vectors, axis_vectors

    def project_queries(self, query_file, title=None, save_dir=None):
        """Plot every query in query_file on the two projection axes.

        Args:
            query_file: text file with one query per line. Blank or
                whitespace-only lines and any line containing '#' are
                skipped.
            title: plot title.
            save_dir: if given, the plot is also saved to this path.
        """
        with open(query_file) as f:
            # Skip blank/whitespace-only lines (which would otherwise turn
            # into empty-string queries) and comment lines containing '#'.
            queries = [w.strip() for w in f if w.strip() and '#' not in w]
        vectors, axis_vectors = self._query_fasttext_wrapper(queries)
        x_proj, y_proj = self._scalar_projection(vectors, axis_vectors)
        self._scatter_plot(x_proj, y_proj, labels=queries, title=title,
                         x_label='liberal - conservative',
                         y_label='bad - good',
                         font_size=7, save_dir=save_dir)


# In[10]:


# Load the pre-trained fastText model named 'nyt_1981-2016' from MODEL_DIR.
nyt = ExperimentalFastTextProjector(os.path.join(MODEL_DIR, 'nyt_1981-2016'))


# In[11]:


# Project the queries listed in salient_policies.md and save the plot.
# NOTE(review): presumably ./graphs must already exist -- confirm.
nyt.project_queries(query_file='./queries/salient_policies.md',
                title='Salient Policies and Talking Points (NYT)',
                save_dir='./graphs/NYT policies.png')


# In[12]:


# Load the pre-trained fastText model named 'wsj_1997-2017' from MODEL_DIR.
wsj = ExperimentalFastTextProjector(os.path.join(MODEL_DIR, 'wsj_1997-2017'))


# In[13]:


# Project the same query file with the WSJ model and save the plot.
# NOTE(review): presumably ./graphs must already exist -- confirm.
wsj.project_queries(query_file='./queries/salient_policies.md',
                title='Salient Policies and Talking Points (WSJ)',
                save_dir='./graphs/WSJ policies.png')