#!/usr/bin/env python
# coding: utf-8

# # US Judge Ideology
# ## By George Radner and Ian Sapollnik
# ### ECON 407 Final Assignment
# ### April 21, 2019

# In this project, we assess the following questions:
# - How has the ideological position of US Federal District Court Judges changed over time?
# - What has driven these changes?
# - What influences how judges vote?
# 
# This notebook is divided into various parts. Part 1 outlines the data used and cleans the data. Part 2 presents an overview of the data. Part 3 shows overall trends in ideology. Part 4 isolates specific effects. Part 5 looks at how judges make decisions.

# # Part 0 - Import Packages

# In[10]:


# All packages used in this notebook are imported here.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn import (
    linear_model, metrics, neural_network, pipeline, preprocessing, model_selection, tree
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
import plotly
import chart_studio.plotly as py
import plotly.graph_objs as go
from scipy import stats
import statsmodels as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from patsy.builtins import *
from patsy import dmatrices
import statistics as st

#Jellyfish is not included in syzygy. Uncomment if necessary.
#!pip install jellyfish
import jellyfish as jf


# # Part 1 - Data Collection and Cleaning

# ## Attributes of Federal Judges Data

# The main dataset for this analysis is the [Attributes of U.S. Federal Judges Database](http://artsandsciences.sc.edu/poli/juri/attributes.htm), from The Judicial Research Initiative (JuRI) at the University of South Carolina. The data was downloaded as a .dta file from the website, but had no data labels or encoding. The labeling file was written only for SAS. Before starting to clean the data in Python, we took the SAS cleaning code and manually changed it into Stata code in order to label the values and clean other portions of the data. After running this through Stata, the data was exported as a .csv file, and we do the final cleaning (everything that could possibly be done in Python) here.

# In[12]:


# Load raw data
judge_att_data = pd.read_csv('Judge Attribute Data.csv')

# Here is what it looks like right now
judge_att_data.head()


# In[13]:


# Drop unnecessary columns, rename necessary columns 
judge_att_data = judge_att_data.drop(columns = ['name_original','___l','___j','___char','elevate','dcother',
                                                'liable', 'dummy','religion','circuit',
                                                'songer_code','amon','crossl','pred','appt','temp',
                                                'trans','liable','abamin','dsenate','rsenate','dhouse',
                                                'rhouse','fhouse','fsenate','drhouse','drsenate',
                                                'whouse','wsenate','nrhouse','nrsenate','dsens','rsens',
                                                'yeari','yearc','e1','e2','e3','e4','e5','e6','congresi',
                                                'unity','e7','e8','yearo','congreso','unityo','cityb',
                                                'badeg','bastate','bastatus','jddeg','jdstate','jdstatus',
                                                'grad1','grad2','tperm','fsens','drsens','wsens','nrsens',
                                                'osens','agego','service','csb','ba','bast','bapp','ls',
                                                'lsst','jdpp','graddeg1','graddeg2','statecab','state2',
                                                'recdate','ageon'])
judge_att_data = judge_att_data.rename(columns = {'name':'Name','circuit_original':'Circuit','id':'ID',
                                        'pres':'Appointing President','yearl':'Year of Departure',
                                        'yearb':'Year of Birth','yeard':'Year of Death',
                                        'pleft':'President when Departed','left':'Reason for Departing',
                                        'party':'Judge Party','district':'District','state':'State',
                                        'city':'City','gender':'Gender','race':'Race',
                                        'ayear':'Year of Appointment','crossa':'Cross Appointment',
                                        'recess':'Recess Appointment','aba':'ABA Rating',
                                        'assets':'Assets','congress':'Congress','unityi':'Unity',
                                        'hdem':'House Democrats','hrep':'House Republicans',
                                        'sdem':'Senate Democrats','srep':'Senate Republicans',
                                        'hother':'House Independents','sother':'Senate Independents',
                                        'networth':'Net Worth','appres':'Appointing President Party'})

# Replace zero values with missing for net worth and assets
def replace_zero_with_na(x):
    if x == 0:
        return np.nan
    else:
        return x
judge_att_data['Assets'] = judge_att_data['Assets'].apply(replace_zero_with_na)
judge_att_data['Net Worth'] = judge_att_data['Net Worth'].apply(replace_zero_with_na)

# Turn the position indicator columns into dummies and rename (these all start with 'p')
def turn_into_dummy(val):
    if np.isnan(val):
        return 0
    else:
        return 1

position_columns = list(filter(lambda col: col[0] == 'p', list(judge_att_data.columns)))
for col in position_columns:
    judge_att_data[col] = judge_att_data[col].apply(turn_into_dummy)
    judge_att_data = judge_att_data.rename(columns = {col:'Previous Position - ' + col[1:]})
    
# Creating new variable for whether judge held any of the elected positions
# These are the variables for the judge holding elected office of some kind     
political_positions =  ['Previous Position - house', 'Previous Position - senate',
                        'Previous Position - gov','Previous Position - ssenate',
                        'Previous Position - shouse','Previous Position - mayor','Previous Position - ccoun']

# Creating column of 0's which we will then fill
judge_att_data["Politician"] = 0*judge_att_data['Previous Position - house']
for position in political_positions:
    judge_att_data["Politician"] = np.maximum(judge_att_data["Politician"],judge_att_data[position])
    
# Creating new variable for judge's age at the time of appointment
judge_att_data["Age When Appointed"] = judge_att_data["Year of Appointment"] - judge_att_data["Year of Birth"]

# Here is the data now
judge_att_data.head()


# ## Judge Ideology Data

# We also have data on individual judge ideology scores, which comes from [Christina L Boyd](http://clboyd.net/ideology.html) at the University of Georgia. In this dataset, a negative ideology score is more liberal, while a positive ideology score is more conservative. The scores range from -1 to 1.
# 
# The judge names in the two datasets do not match perfectly. As a result, we must fuzzy match the names to obtain a high number matches between the two datasets. To do this, we calculate the [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) between each string in the Judge Attribute Data and the Ideology Data, and take a match for the highest scoring match for each name in the Attribute Data. The Jaro-Winkler distance assigns stronger weights to characters at the beginning of the string. Since the names are in the Last, First M. format, this makes last names more important than first names for the matching, which gives more accurate results. With some manual inspection, we chose 0.89 as the minimum score for an accurate name match.

# In[14]:


# Load ideology data
judge_ideo_score = pd.read_excel('Judge Ideology Scores.xlsx')
judge_ideo_score = judge_ideo_score[['judgename','ideology_score']]
judge_ideo_score = judge_ideo_score.rename(columns = {'judgename':'Name','ideology_score':'Ideology Score'})

# Here is what it looks like
judge_ideo_score.head()


# In[15]:


# Define function for getting the best matching from a given list. We will use this again later on.
def get_best_name_match_from_list(name, data_list):
    best_match = ""
    highest_jw = 0
    
    for potential_match in data_list:
        # This gives the Jaro-Winkler score which we use for matching
        current_score = jf.jaro_winkler(potential_match, name)
        if ((current_score > highest_jw) and (current_score > 0.89)):
            highest_jw = current_score
            best_match = potential_match
    
    return best_match

# Create column of closest name
judge_att_data['Closest Name'] = judge_att_data['Name'].apply(lambda x : get_best_name_match_from_list(x,judge_ideo_score['Name']))
# Here is what some results look like. Note that blanks exist where no match was found.
judge_att_data[['Name','Closest Name']].sample(10)


# In[16]:


# Merge ideology data into attribute data using the closest match found
judge_att_data = judge_att_data.merge(judge_ideo_score, left_on = 'Closest Name', right_on = 'Name', how = 'left')
judge_att_data = judge_att_data.drop(columns = ['Name_y','Closest Name'])
judge_att_data = judge_att_data.rename(columns = {'Name_x':'Name'})


# In[17]:


judge_att_data.info()


# ## Judge Decision Data

# Finally, we have data on decisions made by US Federal District Court judges. This comes from [The Carp-Manning U.S. District Court Case Database](https://www.umassd.edu/cas/polisci/resources/us-district-court-database/). For every decision, the dataset assigns a 'liberal' or 'conservative' designation to the decision that was made, and states which judge authored the decision. The dataset also has information on the court that made the decision, when the decision was made, and what legal category the decision falls under. We take this data and merge the Judge Attribute Data into it, again using Jaro-Winkler fuzzy matching. Thus we can also explore what influences specific decisions that judges have made.

# In[19]:


# Load data and rename columns
decision_data = pd.read_csv('Carp-Manning.csv')
decision_data = decision_data.rename(columns
                                     = {'judge':'Authoring Judge','crtpoint':'Court Location',
                                        'numjudge':'Number of Judges','circuit':'Circuit',
                                        'state':'State','statdist':'District','month':'Month',
                                        'year':'Year','libcon':'Decision Ideology',
                                        'casetype':'Case Type','category':'Case Category',
                                        'casnum':'Case Number','apyear':'Year of Appointment',
                                        'appres':'Appointing President','party':'Judge Party',
                                        'gender':'Gender','race':'Race'})

# Here is what the data looks like
decision_data.head()


# In[ ]:


# Get best match in Attributes Data
decision_data['Closest Name'] = decision_data['Authoring Judge'].apply(lambda x : get_best_name_match_from_list(x,judge_att_data['Name']))
# Here are some matches
decision_data[['Authoring Judge','Closest Name']].sample(10)


# In[ ]:


# Now merge based on these matches
decision_data = decision_data.merge(judge_att_data, left_on = 'Closest Name', right_on = 'Name', how = 'left')

# Get rid of duplicate columns
duplicate_cols = list(filter(lambda col: col[-2:] == '_y', list(decision_data.columns)))
decision_data = decision_data.drop(columns = duplicate_cols)
decision_data = decision_data.drop(columns = ['Closest Name','Name'])
duplicate_cols = [col[:-2] for col in duplicate_cols]
for col in duplicate_cols:
    decision_data = decision_data.rename(columns = {col + '_x': col})
    
# Here is what it looks like now
decision_data.head()


# In[ ]:


decision_data.info()


# # Part 2 - Data Overview and Charts

# We now present some graphs, charts and map that highlight key attributes of the Judge Attribute and Ideology Data. Ideology data are not consistent before 1956, so we eliminate years before this.

# In[ ]:


# Ideology data are not consistent before 1956, so we eliminate years before this.
judges = judge_att_data[judge_att_data["Year of Appointment"] > 1956].copy()


# Rescaling the ideology variable for presentation
judges["Ideology Score"] = judges["Ideology Score"].apply(lambda x: x*100)

# The absolute value of the ideology score measures a judge's distance from the ideological centre
judges["Absolute Ideology"] = judges["Ideology Score"].apply(abs)

# The dataset includes attributes of judges including:
# Gender, race, age, politican party, past experience (including in politics)
judges.head()


# ## Judge Ideology Over Time

# To provide motivation for our analysis, we will create a graph showing how the average ideology of judges shifts over time 

# In[ ]:


# Dataframe with mean ideology and absolute ideology by year
year_ideology = judges.groupby("Year of Appointment")[["Absolute Ideology","Ideology Score"]].mean()
year_ideology.reset_index(inplace = True)

plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(2,1,figsize=(11,8.5))

# Hide grid lines, set a white face color, and set yaxis label
for counter, value in enumerate(["Mean Ideology Score","Mean Absolute Ideology Score"]):
    ax[counter].set_facecolor('white')
    ax[counter].grid(False)
    ax[counter].set_ylabel(value)

# Plot mean ideology, absolute ideology over time 
for counter, value in enumerate(["Ideology Score","Absolute Ideology"]):
    ax[counter].plot(year_ideology["Year of Appointment"],
                    year_ideology[value],"-o")
      
ax[0].set_title("Judge Ideology Varies by President...")   
ax[1].set_title("...But Absolute Ideology is on the Rise")

ax[1].set_xlabel("Year of Judge Appointment")


fig.tight_layout()


# ## Judge Ideology By Age

# The graph above showed judges have become more ideological. Our next question is whether this trend occurred in Democratic judges, Republican judges, or both? Since ideology varies so much based on the president in power, we'll look at the judge's year of birth, rather than year of appointment to see whether newer judges are more ideoligical than older ones.

# In[ ]:


age_ideology = judges.groupby(["Year of Birth","Judge Party"])[["Ideology Score","Absolute Ideology"]].mean()
age_ideology.reset_index(inplace = True)

fig, ax = plt.subplots(figsize=(11,8.5))

ax.set_facecolor('white')
ax.grid(False)

# Data is sparse for judges born before 1900 
initial_age = 1900

recent = age_ideology["Year of Birth"] >= initial_age

democrats = age_ideology["Judge Party"] == "Democrat"
republicans = age_ideology["Judge Party"] == "Republican"

ax.plot(age_ideology[democrats & recent]["Year of Birth"],
        age_ideology[democrats & recent]["Ideology Score"],"-o",label="Democrats",color="#0e44f5")

ax.plot(age_ideology[republicans & recent]["Year of Birth"],
        age_ideology[republicans & recent]["Ideology Score"],"-o",label="Republicans",color="#f23417")


ax.legend()
ax.set_title("Divergence of Judge Ideology By Party")

ax.set_xlabel("Year of Birth")
ax.set_ylabel("Mean Ideology Score")


# ## Judge Ideology Over Time and States

# Finally, we explore how ideology, both in real and absolute terms, has varied across states and time. We note that this appears to be very dependent on who is president.

# In[ ]:


# Load crosswalk between state names and state codes
state_name_to_code = pd.read_csv('State Name to Code.csv')
state_name_to_code.sample(5)


# In[ ]:


# Clean judge ideology data and take only necessary columns
map_data = judge_att_data[['Name','State','Year of Appointment','Year of Departure','Ideology Score']].copy()
map_data = map_data[map_data['State'] != 'Puerto Rico']
map_data = map_data.merge(state_name_to_code, how = 'left')
map_data = map_data[map_data['Ideology Score'].apply(lambda x : ~np.isnan(x))]

# Create empty dataframe to be filled by each unique judge-year pair
# A judge-year pair exists if the judge was active in the year
judge_ideo_by_year = pd.DataFrame(columns = ['Name','State','StateCode','Year','Ideology Score'])

# Fill dataframe
for index, row in map_data.iterrows():
    name = row['Name']
    state = row['State']
    statecode = row['StateCode']
    app_year = int(row['Year of Appointment'])
    # If the judge is still active in 2004, the year of departure is '9999'
    dep_year = np.min([2004, int(row['Year of Departure'])])
    ideo = row['Ideology Score']
    for year in range(app_year, dep_year + 1):
        judge_ideo_by_year = judge_ideo_by_year.append({'Name':name,'State':state,'StateCode':statecode,
                                                        'Year':year,'Ideology Score':ideo},
                                                        ignore_index = True)
# Add absolute ideology score
judge_ideo_by_year['Absolute Ideology Score'] = judge_ideo_by_year['Ideology Score'].apply(np.abs)
# Now group by year and state, and take means
state_ideo_by_year = judge_ideo_by_year.groupby(['Year','State','StateCode']).mean().reset_index()
# Scale ideology score for readability
state_ideo_by_year['Ideology Score'] = state_ideo_by_year['Ideology Score'].apply(lambda x: 100*x)
state_ideo_by_year['Absolute Ideology Score'] = state_ideo_by_year['Absolute Ideology Score'].apply(lambda x: 100*x)
# Here is what a random sample of the data looks like
state_ideo_by_year.sample(10)


# Now we make interactive map of average state judge ideology by year. For the two maps presented, use the slider to see ideology changing by year.

# In[ ]:


# Create dict of US presidents by year
president_by_year = dict(
                    [(n, 'George W. Bush')
                        for n in range(2001, 2005)] +
                    [(n, 'Bill Clinton')
                        for n in range(1993, 2001)] +
                    [(n, 'George H. W. Bush')
                        for n in range(1989, 1993)] +
                    [(n, 'Ronald Reagan')
                        for n in range(1981, 1989)] +
                    [(n, 'Jimmy Carter')
                        for n in range(1977, 1981)] +
                    [(n, 'Gerald Ford')
                        for n in range(1975, 1977)] +
                    [(n, 'Richard Nixon')
                        for n in range(1969, 1975)] +
                    [(n, 'Lyndon B. Johnson')
                        for n in range(1964, 1969)] +
                    [(n, 'John F. Kennedy')
                        for n in range(1961, 1964)] +
                    [(n, 'Dwight D. Eisenhower')
                        for n in range(1956, 1961)]
                    )

# Define plotting function.
def plot_ideology_by_year(beginyear = 1956, scl = None, absolute = False):
    
    plotly.offline.init_notebook_mode()
    
    if absolute:
        ideo = 'Absolute Ideology Score'
        zmin = 0
        text = 'Average Absolute Judge Ideology Score by State and Year'
    else:
        ideo = 'Ideology Score'
        zmin = -60
        text = 'Average Judge Ideology Score by State and Year'

    # Create dict of data to feed into plotly. 
    data = [dict(type='choropleth',
                 marker = go.choropleth.Marker(
                     line = go.choropleth.marker.Line(
                         color = 'rgb(255,255,255)',
                         width = 1.5
                     )),
                 hoverinfo = 'z+text',
                 colorbar = go.choropleth.ColorBar(
                     title = ideo,
                     thickness = 40
                 ),
                 colorscale = scl,
                 zmin = zmin,
                 zmax = 60,
                 autocolorscale = False,
                 locations = state_ideo_by_year[state_ideo_by_year['Year'] == year]['StateCode'],
                 z = state_ideo_by_year[state_ideo_by_year['Year'] == year][ideo].astype(float),
                 text = state_ideo_by_year[state_ideo_by_year['Year'] == year]['State'],
                 locationmode='USA-states') 
            for year in range(beginyear,2005)]

    # Create slider for the map
    steps = []
    for i in range(len(data)):
        step = dict(method='update',
                    args = [
                        # Make the ith trace visible
                        {'visible': [False for t in range(len(data))]},

                        # Set the title for the ith trace
                        {'title.text': text + "<br />" + f"President is {president_by_year[i+beginyear]}"}],
                    label='Year {}'.format(i + beginyear))
        step['args'][0]['visible'][i] = True
        steps.append(step)
    sliders = [dict(active=0,
                    pad={"t": 1},
                    steps=steps)]    

    # Define layout
    layout = dict(geo = go.layout.Geo(
                    scope = 'usa',
                    projection = go.layout.geo.Projection(type = 'albers usa')),
                  sliders=sliders,
                  title = {'text': text + "<br />" + f"President is {president_by_year[i+beginyear]}"}
                 )

    # Create map with plotly
    fig = dict(data=data, layout=layout)
    return plotly.offline.iplot(fig)


# This first map presents judge ideology in absolute terms. Unsurprisingly, as the presidency changes from one party to another, we see judge ideology follow the President's ideology.

# In[ ]:


# Choose a colour scale from blue (democrat) to red (republican)
demrep_scl = [
    [0.0, 'rgb(0,24,229)'],
    [0.2, 'rgb(38,19,186)'],
    [0.4, 'rgb(76,14,144)'],
    [0.6, 'rgb(114,9,102)'],
    [0.8, 'rgb(152,4,60)'],
    [1.0, 'rgb(191,0,18)']
]

plot_ideology_by_year(scl = demrep_scl, absolute = False)


# Next, we show the same map with absolute ideology. We see absolute ideology increasing over time.

# In[ ]:


# Choose a colour scale from light orange (low) to dark orange (high)
hl_scl = [
    [0.0, 'rgb(247,232,206)'],
    [0.2, 'rgb(248,219,171)'],
    [0.4, 'rgb(249,207,137)'],
    [0.6, 'rgb(250,194,102)'],
    [0.8, 'rgb(251,182,68)'],
    [1.0, 'rgb(252,170,34)']
]

plot_ideology_by_year(scl = hl_scl, absolute = True)


# # Part 3 - Judge Characteristics

# We now move on to an analysis of the judge attribute and ideology data. Specifically, we attempt to determine how ideology is impacted by characteristics of the judges. We begin by preparing the data for prediction.

# In[20]:


# Prep data for prediction
def prep_data(df, continuous_variables, categories, y_var, test_size=0.15):

    ohe = preprocessing.OneHotEncoder(sparse=False)

    y = df[y_var].values
    X = np.zeros((y.size, 0))

    # Add continuous variables if exist
    if len(continuous_variables) > 0:
        X = np.hstack([X, df[continuous_variables].values])

    if len(categories) > 0:
        X = np.hstack([X, ohe.fit_transform(df[categories])])

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    return X_train, X_test, y_train, y_test

# This function will allow us to compare the MSE's of each model
def fit_and_report_mses(mod, X_train, X_test, y_train, y_test):
    mod.fit(X_train, y_train)
    return dict(
        mse_train=metrics.mean_squared_error(y_train, mod.predict(X_train)),
        mse_test=metrics.mean_squared_error(y_test, mod.predict(X_test))
    )

# Dropping everything but the regressors and the outcome variable
# Net Worth and Assets have NA/missinng values for 45% of entries so we drop them

judges_ideology = judges.drop([ "Name","ID","Year of Death",
                               "Net Worth","Assets","Congress"],1)

# Removing rows with NAs
judges_ideology = judges_ideology.dropna()

# Continuous variables or variables that are already indicators
continuous_variables = ['House Democrats', 'House Republicans', 'Senate Democrats', 'Senate Republicans',
                        'House Independents', 'Senate Independents', 'Previous Position - ssc', 'Previous Position - slc',
                        'Previous Position - locct', 'Previous Position - sjdget', 'Previous Position - ausa',
                        'Previous Position - usa', 'Previous Position - sgo', 'Previous Position - sg', 
                        'Previous Position - ago', 'Previous Position - ag', 'Previous Position - cc', 
                        'Previous Position - sp', 'Previous Position - mag', 'Previous Position - bank',
                        'Previous Position - terr', 'Previous Position - cab', 'Previous Position - asatty',
                        'Previous Position - satty', 'Previous Position - cabdept', 'Previous Position - scab',
                        'Previous Position - scabdpt', 'Previous Position - aag', 'Previous Position - indreg1',
                        'Previous Position - reg1', 'Previous Position - reg2', 'Previous Position - reg3', 
                        'Previous Position - house', 'Previous Position - senate', 'Previous Position - gov',
                        'Previous Position - ssenate', 'Previous Position - shouse', 'Previous Position - mayor',
                        'Previous Position - ccoun', 'Previous Position - ccom', 'Previous Position - ada',
                        'Previous Position - da', 'Previous Position - lother', 'Previous Position - lotherl',
                        'Previous Position - lawprof', 'Previous Position - private']

# Categorical variables 
categories = ['Year of Appointment', 'Year of Birth', 'Year of Departure', 'Cross Appointment', 'Recess Appointment',
              'Unity', 'Circuit', 'Appointing President', 'Appointing President Party', 'President when Departed',
              'Reason for Departing', 'Judge Party', 'District', 'State', 'City', 'Gender', 'Race', 'ABA Rating']

# Creating test and training data for the two outcomes: ideology and absolute ideology
X_train, X_test, ideo_train, ideo_test = prep_data(
    judges_ideology,continuous_variables , categories, "Ideology Score"
)

X_train, X_test, abs_ideo_train, abs_ideo_test = prep_data(
    judges_ideology,continuous_variables , categories, "Absolute Ideology"
)

# After some experimentation, we've selected some model parameters for lasso, random forest, and a neural network model
alphas = np.exp(np.linspace(-2., -12., 25))

lr_model = linear_model.LinearRegression()
lasso_model = linear_model.LassoCV(cv=6, alphas = alphas,  max_iter=500)
forest_model = RandomForestRegressor(n_estimators = 100)
nn_scaled_model = pipeline.make_pipeline(
    preprocessing.StandardScaler(),  # this will do the input scaling
    neural_network.MLPRegressor((150,),activation = "logistic",
                                solver="adam",alpha=0.005)) # we tried a few alphas chose this one 
                                                            # based on minimizing mse_test ... but it was somewhat arbitrary

models = { "OLS": lr_model, "Lasso": lasso_model,
          "Random Forest": forest_model, "Neural Network": nn_scaled_model}

# Creating an empty dataframe (with hierarchical columns) for the test and training MSE of each model 
MSE_by_model = pd.DataFrame(index = models.keys(),
                            columns= pd.MultiIndex.from_arrays((["Ideology","Ideology","Absolute Ideology","Absolute Ideology"],
                                                                     ["Train MSE","Test MSE","Train MSE","Test MSE"])))
# This is what that dataframe looks like
MSE_by_model


# In[21]:


# Takes a few minutes to run
ideo_mse_list = [fit_and_report_mses(model,X_train,X_test, ideo_train,ideo_test) for model in models.values()]
abs_ideo_mse_list = [fit_and_report_mses(model,X_train,X_test, abs_ideo_train,abs_ideo_test) for model in models.values()]
MSE_by_model[("Ideology","Train MSE")] = [result["mse_train"] for result in ideo_mse_list]
MSE_by_model[("Ideology","Test MSE")] = [result["mse_test"] for result in ideo_mse_list]
MSE_by_model[("Absolute Ideology","Train MSE")] = [result["mse_train"] for result in abs_ideo_mse_list]
MSE_by_model[("Absolute Ideology","Test MSE")] = [result["mse_test"] for result in abs_ideo_mse_list]
MSE_by_model


# With our parameters (and computing time limitations), the Random Forest clearly does the best job predicting ideology and absolute ideology.
# 
# We'll now use the Random Forest to see how much of the growth in absolute ideology over time is explained by the judge attributes.

# ## How much do changing attributes explain rising absolute ideology?
# 
# We will generate fitted values and residuals for the absolute ideology using the Random Forest (because it performed the best above) and the entire sample (not just training data). Then we'll plot the average change in residuls and fitted values in a graph similar to one above that we used to motivate analysis.

# In[22]:


# Using the fulldataset is equivalent to setting test_size = 0 
X_full, X_empty_test, abs_ideo_full, abs_ideo_empty_train = prep_data(
    judges_ideology,continuous_variables , categories, "Absolute Ideology",test_size=0
)

# Fitted values
abs_ideo_hat = forest_model.fit(X_full,abs_ideo_full).predict(X_full)

# Residuals
abs_ideo_resid = abs_ideo_full - abs_ideo_hat


# In[23]:


# Creating dataframe to ultimately generate mean observed, predicted, and residual values by year 
judges_decomposition = judges_ideology[["Year of Appointment","State","Absolute Ideology"]].copy()
judges_decomposition["Residuals"] = abs_ideo_resid
judges_decomposition["Fitted"] = abs_ideo_hat
year_decomposition =  judges_decomposition.groupby("Year of Appointment")[["Absolute Ideology","Residuals","Fitted"]].mean()
year_decomposition.reset_index(inplace = True)

plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(1,2,figsize=(22,8.5))

colors = ["#4135ed" ,"#5cb6cf"]
    
for counter,value in enumerate(["Absolute Ideology","Fitted"]):
    ax[0].plot(year_decomposition["Year of Appointment"],
                    year_decomposition[value],"-o", color = colors[counter],label=value)
    ax[1].scatter(year_decomposition["Year of Appointment"],
                    year_decomposition[value],  color = colors[counter],label=value)
    slope, intercept, r_value, p_value, std_err = stats.linregress(year_decomposition["Year of Appointment"],year_decomposition[value])
    line = slope*year_decomposition["Year of Appointment"]+intercept
    ax[1].plot(year_decomposition["Year of Appointment"], line,color=colors[counter],label="_")
      
ax[0].set_title("Are Judge Attributes Driving Rising Absolute Ideology?")   
ax[1].set_title("...They Do Not Appear to Explain Increasing Absolute Ideology")   

for n in [0,1]:
    ax[n].set_facecolor('white')
    ax[n].grid(False)
    ax[n].set_ylabel("Yearly Mean")
    ax[n].set_xlabel("Year of Judge Appointment")
    ax[n].legend()

fig.tight_layout()


# The fitted values do not show an increase in predicted absolute ideology over time. But part of the reason is structural: we are controlling for year of appointment and other variables that vary over time such as appointed president. Next, we'll look at whether the judge's static _personal_ attributes (excluding age) predict the rise in absolute over time. We'll use the following variables: race, gender, previous job experience, judge's political party, and American Bar Association Rating (level of qualification).
# 
# First, we'll look at how judge personal attributes have changed over time. 
# 
# ## Attributes Over Time

# In[24]:


# Subset of full dataframe with only key judge personal attributes, as well as year of appointment
judges_personal = judges_ideology[["Age When Appointed","Race","Gender","Judge Party","Politician","ABA Rating","Year of Appointment"]]

# Manual "one-hot encoding" or in other words, creating columns which serve as indicator variables for key attributes
for race_name in ["White","African American"]:
    judges_personal[race_name] = [1 if race == race_name else 0 for race in judges_personal["Race"]]

judges_personal["Female"] = [1 if gender == "Female" else 0 for gender in judges_personal["Gender"]]
judges_personal["Republican"] = [1 if party == "Republican" else 0 for party in judges_personal["Judge Party"]]
judges_personal["Well Qualified"] = [1 if rating == "Well-Qual." else 0 for rating in judges_personal["ABA Rating"]]

mean_age = np.mean(judges["Age When Appointed"])
judges_personal["Young"] = [1 if age < mean_age else 0 for age in judges_personal["Age When Appointed"]]

judges_personal.head()


# In[25]:


# Dataframe with the average fraction of judges with each personal attribute by year
year_personal = judges_personal.groupby("Year of Appointment")[["White","African American",
                                                               "Female","Republican","Well Qualified","Young"]].mean()

# Multiplying by 100 to provide percentage term interpretation 
for name in list(year_personal):
    year_personal[name] = 100*year_personal[name]
    
year_personal.head(n=15)  


# This plot shows considerable variation over time in the proportion of judges with each of the personal attributes 

# In[26]:


fig, ax = plt.subplots(7,1,figsize=(11,8.5*7))

for counter,attribute in enumerate(list(year_personal)):
    ax[counter].set_facecolor('white')
    ax[counter].grid(False)
    ax[counter].set_ylabel("Percentage")
    ax[counter].set_xlabel("Year of Appointment")
    ax[counter].set_yticks(range(0,100,10))
    ax[counter].plot(list(year_personal.index),year_personal[attribute],"-o",label=attribute)
    ax[counter].set_title(attribute)
    

# All the attributes in one plot (a bit overwhelming, but makes for comparison)
for attribute in list(year_personal):
    ax[6].set_facecolor('white')
    ax[6].grid(False)
    ax[6].set_ylabel("Percentage")
    ax[6].set_xlabel("Year of Appointment")
    ax[6].set_yticks(range(0,100,10))
    ax[6].plot(list(year_personal.index),year_personal[attribute],"-o",label=attribute)
    ax[6].set_title("All Attributes")
    ax[6].legend()
    
fig.tight_layout()


# As the plot above shows, there variation in judge attributes over time. But do the changing attributes predict changing ideology?
# 
# Now we'll determine whether static personal attributes help explain rising judge absolute ideology.
# 
# ## Static Attributes and Ideology

# In[27]:


# We'll add "P" to the varialbe names to refer to "personal" and distinguish them from the variables above
continuous_variablesP = ['Age When Appointed', 'Previous Position - slc', 'Previous Position - locct', 
                         'Previous Position - sjdget', 'Previous Position - ausa', 'Previous Position - usa',
                         'Previous Position - sgo', 'Previous Position - sg', 'Previous Position - ago',
                         'Previous Position - ag', 'Previous Position - cc', 'Previous Position - sp', 
                         'Previous Position - mag', 'Previous Position - bank', 'Previous Position - terr',
                         'Previous Position - cab', 'Previous Position - asatty', 'Previous Position - satty',
                         'Previous Position - cabdept', 'Previous Position - scab', 'Previous Position - scabdpt',
                         'Previous Position - aag', 'Previous Position - indreg1', 'Previous Position - reg1',
                         'Previous Position - reg2', 'Previous Position - reg3', 'Previous Position - house',
                         'Previous Position - senate', 'Previous Position - gov', 'Previous Position - ssenate',
                         'Previous Position - shouse', 'Previous Position - mayor', 'Previous Position - ccoun',
                         'Previous Position - ccom', 'Previous Position - ada', 'Previous Position - da',
                         'Previous Position - lother', 'Previous Position - lotherl', 'Previous Position - lawprof',
                         'Previous Position - private']

# Categorical variables 
categoriesP = ['Judge Party','Gender','Race','ABA Rating']

# Using the fulldataset is equivalent to setting test_size = 0 
X_fullP, X_empty_test, abs_ideo_fullP, abs_ideo_empty_train = prep_data(
    judges_ideology,continuous_variablesP , categoriesP, "Absolute Ideology",test_size=0
)

# Fitted values
abs_ideo_hatP = forest_model.fit(X_fullP,abs_ideo_fullP).predict(X_fullP)

# Residuals
abs_ideo_residP = abs_ideo_fullP - abs_ideo_hatP

# Similar to above, creating dataframe with mean observed, residual and fitted values by year
judges_decompositionP = judges_ideology[["Year of Appointment","State","Absolute Ideology"]].copy()
judges_decompositionP["Residuals"] = abs_ideo_residP
judges_decompositionP["Fitted"] = abs_ideo_hatP
year_decompositionP = judges_decompositionP.groupby("Year of Appointment")[["Absolute Ideology","Residuals","Fitted"]].mean()
year_decompositionP.reset_index(inplace = True)

plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(1,2,figsize=(22,8.5))
colors = ["#4135ed" ,"#5cb6cf"]

# Plotting trends in observed and fitted values
for counter,value in enumerate(["Absolute Ideology","Fitted"]):
    ax[0].plot(year_decompositionP["Year of Appointment"],
                    year_decompositionP[value],"-o", color = colors[counter],label=value)
    ax[1].scatter(year_decompositionP["Year of Appointment"],
                    year_decompositionP[value],  color = colors[counter],label=value)
    slope, intercept, r_value, p_value, std_err = stats.linregress(year_decompositionP["Year of Appointment"],year_decompositionP[value])
    line = slope*year_decompositionP["Year of Appointment"]+intercept
    ax[1].plot(year_decompositionP["Year of Appointment"], line,color=colors[counter],label="_")
      
ax[0].set_title("Are Judge Personal Attributes Driving Rising Absolute Ideology?")   
ax[1].set_title("...Personal Attributes Do not Predict Rising Absolute Ideology")
for n in [0,1]:
    ax[n].set_facecolor('white')
    ax[n].grid(False)
    ax[n].set_ylabel("Yearly Mean")
    ax[n].set_xlabel("Year of Judge Appointment")
    ax[n].legend()
    #ax[n].set_yticks(range(0,40,5))

fig.tight_layout()


# The meaning behind this chart is that the shifting of judges' static personal attributes over time does not explain the rise in absolute ideology. It is interest how similar this chart is to the the chart above, which included dynamic attributes of judges (including year of appointment, appointing president, etc.). The chart may beg the question: how much does adding dynamic attributes really improve the predictive power of the random forest model? Comparing the the MSE's of the random forest model with and without dynamic attributes (see below), it is clear the dynamic attributes do add predictive power. That said, the charts, taken together, suggest there are unobserved forces behind the rising absolute ideology of US judges

# In[28]:


# All judge attributes
full_mse = metrics.mean_squared_error(abs_ideo_full,
                                      forest_model.fit(X_full,abs_ideo_full).predict(X_full))


# Only including static personal attributes
full_mseP = metrics.mean_squared_error(abs_ideo_fullP,
                                      forest_model.fit(X_fullP,abs_ideo_fullP).predict(X_fullP))

print(f"MSE with all Attributes: {full_mse}")
print(f"MSE with only Static Personal Attributes: {full_mseP}")


# # Part 4 - Estimating Nuisance Function
# 
# We'll estimate the influence of judge political experience on judge ideology
# 
# Are judges with past experience as elected officials more ideological?
# 
# We'll estimate regressions of the form 
# 
# $$Y = \beta Politician + f(x) + \epsilon $$
# 
# where $Y$ is ideology or absolute ideology, $Politician$ is an indicator for past politican experience, and $f(x)$ is the nuisance function for the controls (judge attributes) 

# ## Preparing Data

# In[29]:


# Creating a dataframe which is ready for regressions/partial regressions

# Setting continuous variables for this analysis (the categories are the same as the ones above)

continuous_variables2 = ["Ideology Score","Absolute Ideology","Politician",
                          'Previous Position - house', 'Previous Position - senate',
                         'Previous Position - gov','Previous Position - ssenate',
                         'Previous Position - shouse','Previous Position - mayor',
                         'Previous Position - ccoun','House Democrats','House Republicans',
                        'Senate Democrats', 'Senate Republicans','House Independents','Senate Independents']

political_judges = judges_ideology[continuous_variables2]
political_judges.reset_index(inplace = True)
political_judges = political_judges.drop(["index"],axis=1)
political_judges.head()


# In[30]:


#Creating indicator variables for all categorical variables

ohe = preprocessing.OneHotEncoder(sparse=False)


#OneHotEncoder creates a numpy array which we have to convert to a dataframe. 
political_judges_dummies = pd.DataFrame.from_records(ohe.fit_transform(judges_ideology[categories]))

#From the array, we have no meaningful variable/column names.
#Nonetheless, we are not particularly interested in which of these indicator variables
#have predictive power, so we'll just name them numerically
dummy_columns = ["dummy" + str(i) for i in range(654)]

political_judges_dummies.columns = dummy_columns
political_judges_dummies.head(n=10)


# In[31]:


#Combining the dataframes with the continuous and indicator variables
political_judges = pd.concat([political_judges, political_judges_dummies],axis=1)
political_judges.head()


# ## OLS 
# 
# First, we'll run the OLS regression. We have hundreds of dummy variables, and including them manually in the regression would be tedious.

# In[32]:


def list_to_formula(var_list):
    """ 
    Given a list of variable names: var1, var2, ... varN,
    Output single string: + var1 + var2 + ... + varN
    """
    output = ""
    for var in var_list:
        output = output + " + " + var
    return output


# To include variables with spaces in their name in the regression, we need to rely on patsy and put them in the form Q('variable name')

# In[33]:


congress_composition = ['House Democrats','House Republicans','Senate Democrats',
                        'Senate Republicans','House Independents','Senate Independents',]

congress_composition_Q = ["Q('" + val + "')" for val in congress_composition]

composition_formula = list_to_formula(congress_composition_Q)

# We'll also make a formula for the dummy variables
dummies_formula = list_to_formula(dummy_columns)

# Estimating three variations of the model (adding controls)
lm_ideo = list()
lm_ideo.append(smf.ols(formula="Q('Ideology Score') ~ Politician", data=political_judges,
                  missing="drop").fit(cov_type='HC0'))

lm_ideo.append(smf.ols(formula="Q('Ideology Score') ~ Politician" + composition_formula
                  ,data=political_judges,
                  missing="drop").fit(cov_type='HC0'))

lm_ideo.append(smf.ols(formula="Q('Ideology Score') ~ Politician" + composition_formula + dummies_formula
                  ,data=political_judges,
                  missing="drop").fit(cov_type='HC0'))


# Notably, the inclusion of the dummies (important controls) flipped the sign of the Politician coefficient! 
summary_col(lm_ideo, stars=True)


# Repeating with absolute ideology as the outcome

# In[34]:


lm_asb_ideo = list()
lm_asb_ideo.append(smf.ols(formula="Q('Absolute Ideology') ~ Politician", data=political_judges,
                  missing="drop").fit(cov_type='HC0'))

lm_asb_ideo.append(smf.ols(formula="Q('Absolute Ideology') ~ Politician" + composition_formula
                  ,data=political_judges,
                  missing="drop").fit(cov_type='HC0'))

lm_asb_ideo.append(smf.ols(formula="Q('Absolute Ideology') ~ Politician" + composition_formula + dummies_formula
                  ,data=political_judges,
                  missing="drop").fit(cov_type='HC0'))

# Inclusion of the full set of dummies flipped the sign of the Politician coefficient, again
summary_col(lm_asb_ideo, stars=True)


# ## Partial Linear Regression With Lasso

# In[35]:


def partial_linear(y, d, X, yestimator, destimator, folds=3):
    """Estimate the partially linear model y = d*C + f(x) + e

    Parameters
    ----------
    y : array_like
        vector of outcomes
    d : array_like
        vector or matrix of regressors of interest
    X : array_like
        matrix of controls
    mlestimate : Estimator object for partialling out X. Must have ‘fit’
        and ‘predict’ methods.
    folds : int
        Number of folds for cross-fitting

    Returns
    -------
    ols : statsmodels regression results containing estimate of coefficient on d.
    yhat : cross-fitted predictions of y
    dhat : cross-fitted predictions of d
    """

    # we want predicted probabilities if y or d is discrete
    ymethod = "predict" if False==getattr(yestimator, "predict_proba",False) else "predict_proba"
    dmethod = "predict" if False==getattr(destimator, "predict_proba",False) else "predict_proba"
    # get the predictions
    yhat = cross_val_predict(yestimator,X,y,cv=folds,method=ymethod)
    dhat = cross_val_predict(destimator,X,d,cv=folds,method=dmethod)
    ey = np.array(y - yhat)
    ed = np.array(d - dhat)
    ols = sm.regression.linear_model.OLS(ey,ed).fit(cov_type='HC0')

    return(ols, yhat, dhat)

# Prepare data
formula="Q('Ideology Score') + Q('Absolute Ideology') + Politician ~ " + composition_formula + dummies_formula

yd, X = dmatrices(formula,political_judges)
ideo = yd[:,0]
abs_ideo = yd[:,1]
politician = yd[:,2]

# We use another list of alphas compared to the above, so this is number 2
alphas2 = np.exp(np.linspace(1.5, -10., 25))
lasso_ideo = linear_model.LassoCV(cv=6, alphas=alphas2, max_iter=500).fit(X,ideo)
lasso_abs_ideo = linear_model.LassoCV(cv=6, alphas=alphas2, max_iter=500).fit(X,abs_ideo)
lasso_politician = linear_model.LassoCV(cv=6, alphas=alphas2, max_iter=500).fit(X,politician)

fig, ax = plt.subplots(1,3, figsize=(15,5))

# Plotting the MSE for the two outcomes and the explanatory variable, "Politician"
def plotlassocv(l, ax) :
    alphas = l.alphas_
    mse = l.mse_path_.mean(axis=1)
    std_error = l.mse_path_.std(axis=1)
    ax.plot(alphas,mse)
    ax.fill_between(alphas, mse + std_error, mse - std_error, alpha=0.2)

    ax.set_ylabel('MSE +/- std error')
    ax.set_xlabel('alpha')
    ax.set_xlim([alphas[0], alphas[-1]])
    ax.set_xscale("log")
    return(ax)
   
ax[0] = plotlassocv(lasso_ideo,ax[0])
ax[0].set_title("MSE for Ideology Score")
ax[1] = plotlassocv(lasso_abs_ideo,ax[1])
ax[1].set_title("MSE for Absolute Ideology")
ax[2] = plotlassocv(lasso_politician,ax[2])
ax[2].set_title("MSE for Politician")

fig.tight_layout()


# In[36]:


# Summarizing the lasso model partial regression
pl_lasso_ideo = partial_linear(ideo, politician, X,
                          linear_model.Lasso(alpha=lasso_ideo.alpha_),
                          linear_model.Lasso(alpha=lasso_politician.alpha_))
pl_lasso_ideo[0].summary()


# In[37]:


# Now for absolute ideology
pl_lasso_abs_ideo = partial_linear(abs_ideo, politician, X,
                          linear_model.Lasso(alpha=lasso_abs_ideo.alpha_),
                          linear_model.Lasso(alpha=lasso_politician.alpha_))
pl_lasso_abs_ideo[0].summary()


# ## Partial Linear Regression with Random Forests

# In[38]:


# Fitting the forest model
forest_ideo = RandomForestRegressor(n_estimators = 100).fit(X,ideo)
forest_abs_ideo = RandomForestRegressor(n_estimators = 100).fit(X,abs_ideo)
fores_politician = RandomForestRegressor(n_estimators = 100).fit(X,politician)

# Summarizing the partial linear regression with Random Forests
pl_forest_ideo = partial_linear(ideo, politician, X,
                          RandomForestRegressor(n_estimators = 100),
                          RandomForestRegressor(n_estimators = 100))
pl_lasso_ideo[0].summary()


# In[39]:


# Absolute ideology
pl_forest_abs_ideo = partial_linear(abs_ideo, politician, X,
                          RandomForestRegressor(n_estimators = 100),
                          RandomForestRegressor(n_estimators = 100))
pl_forest_abs_ideo[0].summary()


# ## Partial Linear Regression with Neural Networks

# In[40]:


#Summarizing the neural network model's partial linear regression
pl_nn_ideo = partial_linear(ideo, politician, X,
                          nn_scaled_model, #from above
                          nn_scaled_model)

pl_nn_ideo[0].summary()


# In[41]:


pl_nn_abs_ideo = partial_linear(abs_ideo, politician, X,
                          nn_scaled_model,
                          nn_scaled_model)

pl_nn_abs_ideo[0].summary()


# ## Summary of Partial Linear Models 

# In[42]:


print("Outcome: Ideology (positive = conservative)")
summary_col([pl_lasso_ideo[0], pl_forest_ideo[0],pl_nn_ideo[0]],
            model_names=["Lasso", "Random Forest","Neural Network"] ,stars=False)


# In[43]:


print("Outcome: Absolute Ideology")
summary_col([pl_lasso_abs_ideo[0], pl_forest_abs_ideo[0], pl_nn_abs_ideo[0]],
            model_names=["Lasso", "Random Forest","Neural Network"],stars=False)


# Overall, due to the imprecise (and somewhat noisy estimates across methods), we don't have evidence that past political experience is highly correlated with either ideology or absolute ideology. For absolute ideology, there is suggestive evidence that past political experience is correlated with more ideologically centrist judges (lower absolute ideology). The story for raw ideology as the outcome is less clear, as the coefficients have different signs across methods.

# # Part 5 - Overview of Judge Decision Making

# We now turn to our final dataset, which contains every decision made by US Federal District Courts that can be categorized as political - over 100000 in total. We begin by showing key characteristics of the data.

# ## Data Overview

# In[44]:


# We only need some of this data
judge_decisions = decision_data[['Authoring Judge','Court Location','Number of Judges','Circuit','Year','Decision Ideology',
                                'Case Type','Case Category','Year of Appointment','Appointing President','Judge Party',
                                'Gender','Race','Year of Birth','ABA Rating','Congress','Unity']].copy()

# We begin by converting the decision ideology, which is either liberal or conservative, into a dummy.
def libcon_dummy(ideology):
    if ideology == 'Conservative':
        return 100
    if ideology == 'Liberal':
        return -100
    else:
        return np.nan
judge_decisions['Decision Ideology'] = judge_decisions['Decision Ideology'].apply(libcon_dummy)

# Here is what the data looks like
judge_decisions.head()


# First, we plot the average decision ideology by case type.

# In[45]:


# Get the list of case types with more than 1000 cases
high_vol_types = judge_decisions[['Case Type','Authoring Judge']].groupby('Case Type').count().reset_index()
high_vol_types['High Vol'] = [1 if count >= 1000 else 0 for count in high_vol_types['Authoring Judge']]
high_vol_types = high_vol_types[high_vol_types['High Vol'] == 1]
high_vol_types = high_vol_types.drop(columns = ['Authoring Judge','High Vol'])

ideology_by_type = (judge_decisions[['Case Type','Decision Ideology']].groupby('Case Type')
                    .mean().sort_values('Decision Ideology').reset_index())
ideology_by_type = ideology_by_type.merge(high_vol_types, on = 'Case Type', how = 'right')

plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(figsize=(11,8.5))
ax.set_facecolor('white')
ax.grid(False)
ax.set_title("Average Ideological Result by Case Type")
ax.set_xlabel("Average Ideological Score")
colors = ["b" if x < 0 else "r" for x in ideology_by_type['Decision Ideology'].values]

ideology_by_type.plot(kind = 'barh', x = 'Case Type', y = 'Decision Ideology', ax = ax, color = colors, legend = False)


# Nothing too crazy is happening here - certain types seem to be prone to a certain ideological result, but everything seems balanced overall. Next, we explore how the results have changed over time, and how they depend on judge ideology.

# In[46]:


ideology_by_party_year = (judge_decisions[['Year','Judge Party','Decision Ideology']].groupby(['Year','Judge Party',])
                          .mean().reset_index())
ideology_by_party_year = ideology_by_party_year[(ideology_by_party_year['Judge Party'] == 'Democrat') |
                                                (ideology_by_party_year['Judge Party'] == 'Republican')]
# 1933 is first year where both parties are present
ideology_by_party_year = ideology_by_party_year[ideology_by_party_year['Year'] >= 1933]
democrats = ideology_by_party_year["Judge Party"] == "Democrat"
republicans = ideology_by_party_year["Judge Party"] == "Republican"

fig, ax = plt.subplots(figsize=(11,8.5))
ax.set_facecolor('white')
ax.grid(False)

ax.plot(ideology_by_party_year[republicans]["Year"],
        ideology_by_party_year[republicans]["Decision Ideology"],"-o",label="Republicans",color='r')

ax.plot(ideology_by_party_year[democrats]["Year"],
        ideology_by_party_year[democrats]["Decision Ideology"],"-o",label="Democrats",color='b')

ax.legend()
ax.set_title("Decision Ideology By Judge Party and Time")
ax.set_xlabel("Year")
ax.set_ylabel("Mean Decision Ideology Score")


# Between 1930 and 1970, Democratic and Republican judges were practically indistinguishable! But since 1970, there has been a sharp divide, with judges sticking to their ideology more consistently. However, a similar trend appears for both parties. Overall, results seem to generally be more conservative. Since presidents appoint these judges, it is interesting to ask how results vary by appointing president. Could this be why things have changed since 1970?

# In[47]:


ideology_by_app_pres = (judge_decisions[judge_decisions['Year of Appointment']>=1933]
                        [['Appointing President','Decision Ideology']]
                        .groupby(['Appointing President'])
                        .mean().reset_index().reindex([9,12,2,0,10,1,5,8,6,7,3,11,4]))
fig, ax = plt.subplots(figsize=(11,8.5))
ax.set_facecolor('white')
ax.grid(False)
ax.set_title("Average Ideological Result by Appointing President")
ax.set_xlabel("Average Ideological Score")
colors = ["b" if x in ['F. ROOSEVELT','TRUMAN','KENNEDY','JOHNSON','CARTER','CLINTON','OBAMA'] 
          else "r" for x in ideology_by_app_pres['Appointing President'].values]

ideology_by_app_pres.plot(kind = 'barh', x = 'Appointing President', y = 'Decision Ideology',
                          ax = ax, color = colors, legend = False)


# This seems to explain some of the results! Roosevelt, Truman and Kennedy's Judges voted more conservative despite being appointed by a Democrat! Judges have generally stuck to their appointing president's ideology since then. What if, however, party and president have nothing to do with these decisions? Could personal attributes of judges drive their decision-making?

# In[48]:


ideology_by_gen_race = (judge_decisions[judge_decisions['Race'].isin(
                                        ['African-American/black','Asian-American','Latino/Hispanic','white/caucasian'])]
                       .pivot_table(index='Race', columns='Gender', values='Decision Ideology', margins=True))
ideology_by_gen_race


# This seems to be the case! Overall, male judges vote more conservatively than female judges. Meanwhile, Afrian-American and Asian-American judges are more liberal than the rest. However, when comparing the overall mean to the mean for white male judges, we realize that these are practically the same. The majority of judges are indeed white men - so while this might explain some individual nuances, it can't be driving the large divergence. Finally, we ask: could this be due to differences in the quality of judges?

# In[49]:


ideology_by_rating = (judge_decisions[judge_decisions['ABA Rating'].isin(
                                      ['Not Rated','Well-Qual.','Qualified','Not Qualified','Excep. Well-Qual.']) &
                                      judge_decisions['Judge Party'].isin(['Democrat','Republican'])]
                      .groupby(['ABA Rating','Judge Party'])[['Decision Ideology']].mean()
                      .unstack().reset_index().reindex([2,1,3,4,0]))
ideology_by_rating.columns = ['ABA Rating','Democrat','Republican']

# Add counts of each rating-party pair
count_by_rating = (judge_decisions[judge_decisions['ABA Rating'].isin(
                                      ['Not Rated','Well-Qual.','Qualified','Not Qualified','Excep. Well-Qual.']) &
                                      judge_decisions['Judge Party'].isin(['Democrat','Republican'])]
                      .groupby(['ABA Rating','Judge Party'])[['Decision Ideology']].count().unstack().reset_index())
count_by_rating.columns = ['ABA Rating','Democrat Count','Republican Count']
ideology_by_rating = ideology_by_rating.merge(count_by_rating, on = 'ABA Rating')

fig, ax = plt.subplots(figsize=(11,8.5))
ax.set_facecolor('white')
ax.grid(False)
ideology_by_rating.plot(kind = 'bar', x = 'ABA Rating', y = ['Democrat','Republican'], ax = ax, color =['b','r'])
ax.set_title("Average Ideological Result by ABA Rating and Party")
ax.set_ylabel("Average Ideological Result")
for i, v in enumerate(ideology_by_rating['Democrat'].values):
    text = ideology_by_rating[ideology_by_rating['Democrat'] == v]['Democrat Count'].iloc[0]
    if v > 0:
        ax.text(i - 0.25 , v + 1, text, color='b', fontdict = {'size': 10})
    else: 
        ax.text(i - 0.25 , v - 1, text, color='b', fontdict = {'size': 10})
for i, v in enumerate(ideology_by_rating['Republican'].values):
    text = ideology_by_rating[ideology_by_rating['Republican'] == v]['Republican Count'].iloc[0]
    if v > 0:
        ax.text(i + 0.05, v + 1, text, color='r', fontdict = {'size': 10})
    else: 
        ax.text(i + 0.05, v - 1, text, color='r', fontdict = {'size': 10})


# There is practically no difference in decision-making between judges that are not rated or not qualified, according to the ABA. However, we see that qualified, well-qualified, and exceptionally well-qualified judges are much more likely to default to their own ideology! Exceptionally well-qualified democratic judges hold this bias so hard that their mean decision is liberal, despite mean decisions generally being conservative. It seems that more qualified judges are also more ideological.

# ## Some Prediction

# Visually, it seems that a combination of factors, including case type, year, judge party, appointing president, ABA rating, gender and ethnicity, are determining factors in how judges make decisions. We ask one final question: is this complete? That is to say, do these factors combined account for everything? We run some statistical models to find out.

# In[50]:


# We clean the data some more and take a new subset with more factors
decisions = decision_data[list(judge_decisions) + ['House Democrats','House Republicans','Senate Democrats',
                                                   'Senate Republicans','House Independents','Senate Independents',
                                                   'Recess Appointment']]
decisions = decisions.drop(columns = ['Authoring Judge'])
decisions = decisions.dropna()

# We again convert the decision ideology, which is either liberal or conservative, into a dummy.
def libcon_dummy(ideology):
    if ideology == 'Conservative':
        return 1
    if ideology == 'Liberal':
        return 0
    else:
        return np.nan
decisions['Decision Ideology'] = decisions['Decision Ideology'].apply(libcon_dummy)

# Categorical variables
categories_dec = ['Court Location', 'Circuit', 'Case Type', 'Case Category', 'Appointing President', 'Judge Party', 'Gender',
              'Race', 'ABA Rating', 'Congress', 'Unity', 'Recess Appointment']

# Continuous variables 
continuous_variables_dec = list(filter(lambda col: col not in categories_dec + ['Decision Ideology'], list(decisions)))

# Split into test and train
X_dec_train, X_dec_test, y_dec_train, y_dec_test = prep_data(
    decisions, continuous_variables_dec, categories_dec, 'Decision Ideology'
)

# A bunch of different models
alphas = np.exp(np.linspace(-2., -12., 25))
lr_model = linear_model.LinearRegression(n_jobs = 100)
logistic_model = linear_model.LogisticRegression(solver="liblinear", max_iter = 1000, n_jobs = 100)
lasso_model = linear_model.LassoCV(cv=6, alphas = alphas,  max_iter=500, n_jobs = 100)
forest_model = RandomForestRegressor(n_estimators = 100, n_jobs = 100)
nn_scaled_model = pipeline.make_pipeline(
    preprocessing.StandardScaler(),  # this will do the input scaling
    neural_network.MLPRegressor((150,),activation = "logistic",
                                solver="adam",alpha=0.005))
models = { "OLS": lr_model, "Logistic": logistic_model, "Lasso": lasso_model,
          "Random Forest": forest_model, "Neural Network":nn_scaled_model}

MSE_by_model = pd.DataFrame(index = models.keys(),
                            columns= ["Train MSE","Test MSE"])

mse_list = [fit_and_report_mses(model,X_dec_train, X_dec_test, y_dec_train, y_dec_test) for model in models.values()]

MSE_by_model['Train MSE'] = ['%f' % result["mse_train"] for result in mse_list]
MSE_by_model['Test MSE'] = ['%f' % result["mse_test"] for result in mse_list]
MSE_by_model


# In[55]:


dev = st.stdev(decisions['Decision Ideology'])
print(f'The standard deviation for decision ideology is {dev}.')


# OLS does a terrible job of predicting the data! Clearly the relationship between these factors and judge decision making is not linear. However, when fed into logistic regression, lasso, random forests, and neural networks, the predictive capacity becomes extremely strong! All MSEs are much lower than the standard deviation. Clearly there is strong predictive power from these attributes.

# We unfortunately do not estimate nuisance function due to the complications induced by the fact that our regressors of interest are categorical as opposed to dummy variables.