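In this notebook, we will clean the raw judge attribute data, match it to judge ideology scores by name, visualise how judge ideology varies over time and across states, and fit models that predict ideology from judge attributes.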
import pandas as pd
import numpy as np
import os
import jellyfish
# Load raw data
judge_att_data = pd.read_csv('Judge Attribute Data.csv')

# Drop unnecessary columns, rename necessary columns.
# NOTE: 'liable' was listed twice in the original drop list; it appears once here.
columns_to_drop = [
    'name_original', '___l', '___j', '___char', 'elevate', 'dcother',
    'liable', 'dummy', 'religion', 'circuit',
    'songer_code', 'amon', 'crossl', 'pred', 'appt', 'temp',
    'trans', 'abamin', 'dsenate', 'rsenate', 'dhouse',
    'rhouse', 'fhouse', 'fsenate', 'drhouse', 'drsenate',
    'whouse', 'wsenate', 'nrhouse', 'nrsenate', 'dsens', 'rsens',
    'yeari', 'yearc', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'congresi',
    'unity', 'e7', 'e8', 'yearo', 'congreso', 'unityo', 'cityb',
    'badeg', 'bastate', 'bastatus', 'jddeg', 'jdstate', 'jdstatus',
    'grad1', 'grad2', 'tperm', 'fsens', 'drsens', 'wsens', 'nrsens',
    'osens', 'agego', 'service', 'csb', 'ba', 'bast', 'bapp', 'ls',
    'lsst', 'jdpp', 'graddeg1', 'graddeg2', 'statecab', 'state2',
    'recdate', 'ageon',
]
judge_att_data = judge_att_data.drop(columns=columns_to_drop)

# Raw column name -> readable label used by the rest of the notebook.
readable_column_names = {
    'name': 'Name',
    'circuit_original': 'Circuit',
    'id': 'ID',
    'pres': 'Appointing President',
    'yearl': 'Year of Departure',
    'yearb': 'Year of Birth',
    'yeard': 'Year of Death',
    'pleft': 'President when Departed',
    'left': 'Reason for Departing',
    'party': 'Judge Party',
    'district': 'District',
    'state': 'State',
    'city': 'City',
    'gender': 'Gender',
    'race': 'Race',
    'ayear': 'Year of Appointment',
    'crossa': 'Cross Appointment',
    'recess': 'Recess Appointment',
    'aba': 'ABA Rating',
    'assets': 'Assets',
    'congress': 'Congress',
    'unityi': 'Unity',
    'hdem': 'House Democrats',
    'hrep': 'House Republicans',
    'sdem': 'Senate Democrats',
    'srep': 'Senate Republicans',
    'hother': 'House Independents',
    'sother': 'Senate Independents',
    'networth': 'Net Worth',
    'appres': 'Appointing President Party',
}
judge_att_data = judge_att_data.rename(columns=readable_column_names)
# Replace zero values with missing for net worth and assets
def replace_zero_with_na(x):
    """Return NaN when ``x`` equals zero; otherwise return ``x`` unchanged."""
    return np.nan if x == 0 else x
# Apply the zero-to-NaN replacement element-wise to the two financial columns
judge_att_data['Assets'] = judge_att_data['Assets'].apply(replace_zero_with_na)
judge_att_data['Net Worth'] = judge_att_data['Net Worth'].apply(replace_zero_with_na)
# Turn the position indicator columns into dummies and rename
def turn_into_dummy(val):
    """Map a cell to a 0/1 indicator: 0 if the value is missing, 1 otherwise.

    ``pd.isna`` is used instead of ``np.isnan`` so that ``None`` and
    non-numeric scalars (e.g. strings) are classified rather than raising
    ``TypeError``.
    """
    return 0 if pd.isna(val) else 1
# The position indicator columns are the ones whose name starts with a
# lowercase 'p'. startswith() is also safe for an empty column name.
position_columns = [col for col in judge_att_data.columns if col.startswith('p')]
for col in position_columns:
    judge_att_data[col] = judge_att_data[col].apply(turn_into_dummy)
# Rename every position column in one call rather than rebuilding the frame
# once per column inside the loop.
judge_att_data = judge_att_data.rename(
    columns={col: 'Previous Position - ' + col[1:] for col in position_columns}
)
# Load ideology data, keeping only the judge name and the ideology score,
# and give both columns the labels used in the attribute data.
ideology_columns = {'judgename': 'Name', 'ideology_score': 'Ideology Score'}
judge_ideo_score = (
    pd.read_excel('Judge Ideology Scores.xlsx')[['judgename', 'ideology_score']]
    .rename(columns=ideology_columns)
)
def get_best_name_match_from_list(name, data_list, threshold=0.89, scorer=None):
    """Return the entry of ``data_list`` most similar to ``name``.

    Parameters
    ----------
    name : str
        The name to look up.
    data_list : iterable of str
        Candidate names to compare against.
    threshold : float, optional
        A candidate is only accepted if its score is strictly greater than
        this value. Defaults to 0.89, the value previously hard-coded.
    scorer : callable, optional
        ``scorer(candidate, name)`` returning a similarity score. Defaults to
        jellyfish's Jaro-Winkler similarity.

    Returns
    -------
    str
        The best-scoring candidate, or ``""`` if no candidate scores above
        ``threshold``. On ties the earliest candidate wins.
    """
    if scorer is None:
        # jellyfish renamed ``jaro_winkler`` to ``jaro_winkler_similarity`` and
        # later removed the old name; use whichever this installation provides.
        scorer = getattr(jellyfish, 'jaro_winkler_similarity', None)
        if scorer is None:
            scorer = jellyfish.jaro_winkler

    best_match = ""
    best_score = threshold
    for candidate in data_list:
        score = scorer(candidate, name)
        if score > best_score:
            best_score = score
            best_match = candidate
    return best_match
# Fuzzy-match each attribute-data name to an ideology-data name, left-join the
# ideology scores on that match, then tidy up the helper columns and save.
ideology_names = judge_ideo_score['Name']
judge_att_data['Closest Name'] = judge_att_data['Name'].apply(
    lambda judge_name: get_best_name_match_from_list(judge_name, ideology_names)
)
judge_att_data = (
    judge_att_data
    .merge(judge_ideo_score, left_on='Closest Name', right_on='Name', how='left')
    .drop(columns=['Name_y', 'Closest Name'])
    .rename(columns={'Name_x': 'Name'})
)
judge_att_data.to_csv('Judge Attribute and Ideology.csv', index=False)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import (
linear_model, metrics, neural_network, pipeline, preprocessing, model_selection
)
# This file contains judges' attributes and their ideology scores.
# Negative ideology score = liberal, positive ideology score = conservative;
# zero is approximately the median/centrist score.
judges = pd.read_csv("Judge Attribute and Ideology.csv")
# Ideology data are not consistent for early appointments, so keep only judges
# appointed after 1956. The .copy() makes the filtered frame independent, so
# the column assignments below do not trigger pandas' SettingWithCopyWarning.
judges = judges[judges["Year of Appointment"] > 1956].copy()
# Rescale the ideology variable for presentation.
judges["Ideology Score"] = 100 * judges["Ideology Score"]
# The absolute value of the ideology score measures a judge's distance from
# the ideological centre.
judges["Absolute Ideology"] = judges["Ideology Score"].abs()
# The dataset includes attributes of judges including:
# gender, race, age, political party, past experience (including in politics).
judges.head()
Name | Circuit | ID | Appointing President | Appointing President Party | Year of Departure | Year of Birth | Year of Death | President when Departed | Reason for Departing | ... | Previous Position - ccoun | Previous Position - ccom | Previous Position - ada | Previous Position - da | Previous Position - lother | Previous Position - lotherl | Previous Position - lawprof | Previous Position - private | Ideology Score | Absolute Ideology | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | Acker, William Marsh, Jr. | 11 | 10 | Reagan | Republican | 1996 | 1927.0 | 9999.0 | Clinton | Retired | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 40.700001 | 40.700001 |
3 | Ackerman, Harold Arnold | 3 | 15 | Carter | Democrat | 1994 | 1928.0 | 9999.0 | Clinton | Retired | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -30.599999 | 30.599999 |
4 | Ackerman, James Waldo | 7 | 20 | Ford | Republican | 1984 | 1926.0 | 1984.0 | Reagan | Died | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 6.100000 | 6.100000 |
5 | Acosta, Raymond L. | 1 | 25 | Reagan | Republican | 1994 | 1925.0 | 9999.0 | Clinton | Retired | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 53.799999 | 53.799999 |
8 | Adams, John R. | 6 | 30323 | Bush, 43 | Republican | 9999 | 1955.0 | 9999.0 | Active Ser. | Active Service | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16.850001 | 16.850001 |
5 rows × 72 columns
# To motivate the analysis, plot how the mean ideology (and mean absolute
# ideology) of judges changes with their year of appointment.
year_ideology = (
    judges.groupby("Year of Appointment")[["Absolute Ideology", "Ideology Score"]]
    .mean()
    .reset_index()
)
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(2, 1, figsize=(11, 8.5))
# One panel per series: (column to plot, y-axis label).
panels = [
    ("Ideology Score", "Mean Ideology Score"),
    ("Absolute Ideology", "Mean Absolute Ideology Score"),
]
for axis, (column, ylabel) in zip(ax, panels):
    # White background, no grid lines, labelled y-axis.
    axis.set_facecolor('white')
    axis.grid(False)
    axis.set_ylabel(ylabel)
    axis.plot(year_ideology["Year of Appointment"], year_ideology[column], "-o")
ax[0].set_title("Judge Ideology Varies by President...")
ax[1].set_title("...But Absolute Ideology is on the Rise")
ax[1].set_xlabel("Year of Judge Appointment")
fig.tight_layout()
# The graph above showed that judges have become more ideological.
# Next question: did this happen among Democratic judges, Republican judges,
# or both? Because ideology varies so much with the president in power, we
# group by the judge's year of birth rather than year of appointment, to see
# whether newer judges are more ideological than older ones.
age_ideology = (
    judges.groupby(["Year of Birth", "Judge Party"])[["Ideology Score", "Absolute Ideology"]]
    .mean()
    .reset_index()
)
fig, ax = plt.subplots(figsize=(11, 8.5))
ax.set_facecolor('white')
ax.grid(False)
# Only birth years from initial_age onwards are plotted.
initial_age = 1900
recent = age_ideology["Year of Birth"] >= initial_age
democrats = age_ideology["Judge Party"] == "Democrat"
republicans = age_ideology["Judge Party"] == "Republican"
# (row mask, legend label, line colour) for each party series.
party_series = [
    (democrats, "Democrats", "#0e44f5"),
    (republicans, "Republicans", "#f23417"),
]
for party_mask, legend_label, line_colour in party_series:
    selected = age_ideology[party_mask & recent]
    ax.plot(selected["Year of Birth"], selected["Ideology Score"], "-o",
            label=legend_label, color=line_colour)
ax.legend()
ax.set_title("Divergence of Judge Ideology By Party")
ax.set_xlabel("Year of Birth")
ax.set_ylabel("Mean Ideology Score")
Text(0, 0.5, 'Mean Ideology Score')
import pandas as pd
import numpy as np
import os
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
# Load judge ideology data and crosswalk between state names and codes
judge_ideology_raw = pd.read_csv('Judge Attribute and Ideology.csv')
state_geo = 'us-states.json'
state_name_to_code = pd.read_csv('State Name to Code.csv')

# Clean judge ideology data and take only necessary columns
judge_ideology = judge_ideology_raw[['Name', 'State', 'Year of Appointment',
                                     'Year of Departure', 'Ideology Score']]
judge_ideology = judge_ideology[judge_ideology['State'] != 'Puerto Rico']
# No `on=` is given, so pandas joins on the columns the two frames share.
judge_ideology = judge_ideology.merge(state_name_to_code, how='left')
# Keep only judges that have an ideology score.
judge_ideology = judge_ideology[judge_ideology['Ideology Score'].notna()]

# Build one row per judge-year pair; a pair exists if the judge was active in
# that year. Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append inside the loop copied the whole frame on every iteration
# and no longer exists in pandas 2.0.
judge_year_rows = []
for _, row in judge_ideology.iterrows():
    app_year = int(row['Year of Appointment'])
    # If the judge is still active in 2004, the year of departure is '9999',
    # so cap the last year at 2004.
    dep_year = min(2004, int(row['Year of Departure']))
    for year in range(app_year, dep_year + 1):
        judge_year_rows.append({'Name': row['Name'],
                                'State': row['State'],
                                'StateCode': row['StateCode'],
                                'Year': year,
                                'Ideology Score': row['Ideology Score']})
judge_ideo_by_year = pd.DataFrame(
    judge_year_rows,
    columns=['Name', 'State', 'StateCode', 'Year', 'Ideology Score'],
)

# Add absolute ideology score
judge_ideo_by_year['Absolute Ideology Score'] = judge_ideo_by_year['Ideology Score'].abs()

# Now group by year and state, and take means. The score columns are selected
# explicitly so the string 'Name' column is never passed to mean(), which
# raises in pandas 2.0.
state_ideo_by_year = (
    judge_ideo_by_year
    .groupby(['Year', 'State', 'StateCode'])[['Ideology Score', 'Absolute Ideology Score']]
    .mean()
    .reset_index()
)

# Scale ideology score for readability
state_ideo_by_year['Ideology Score'] = 100 * state_ideo_by_year['Ideology Score']
state_ideo_by_year['Absolute Ideology Score'] = 100 * state_ideo_by_year['Absolute Ideology Score']

# Here is what a random sample of the data looks like
state_ideo_by_year.sample(10)
Year | State | StateCode | Ideology Score | Absolute Ideology Score | |
---|---|---|---|---|---|
803 | 1949 | Louisiana | LA | 5.100000 | 7.850000 |
1315 | 1960 | Nevada | NV | 32.400000 | 32.400000 |
1316 | 1960 | New Jersey | NJ | -12.700000 | 12.785714 |
3134 | 1996 | Maine | ME | 4.100000 | 4.100000 |
2892 | 1991 | New York | NY | -1.774419 | 16.620930 |
1439 | 1963 | Arizona | AZ | -17.033334 | 17.033334 |
2860 | 1991 | Alabama | AL | 21.446154 | 32.984616 |
1912 | 1972 | Massachusetts | MA | -15.362500 | 19.962500 |
2754 | 1988 | Washington | WA | -1.825000 | 21.695000 |
2448 | 1982 | Washington | WA | -3.342857 | 30.257143 |
# Make interactive map of average state judge ideology by year.
# Each colour scale is a list of [position, colour] pairs, with positions
# running from 0.0 to 1.0.
# Scale running from blue (democrat) at 0.0 to red (republican) at 1.0
demrep_scl = [
[0.0, 'rgb(0,24,229)'],
[0.2, 'rgb(38,19,186)'],
[0.4, 'rgb(76,14,144)'],
[0.6, 'rgb(114,9,102)'],
[0.8, 'rgb(152,4,60)'],
[1.0, 'rgb(191,0,18)']
]
# Scale running from light (low) at 0.0 to dark (high) at 1.0
hl_scl = [
[0.0, 'rgb(247,232,206)'],
[0.2, 'rgb(248,219,171)'],
[0.4, 'rgb(249,207,137)'],
[0.6, 'rgb(250,194,102)'],
[0.8, 'rgb(251,182,68)'],
[1.0, 'rgb(252,170,34)']
]
def plot_ideology_by_year(beginyear = 1956, scl = None, absolute = False, endyear = 2004):
    """Draw an interactive US choropleth of mean judge ideology with a year slider.

    One choropleth trace is built per year from the module-level
    ``state_ideo_by_year`` frame; the slider switches which trace is visible.

    Parameters
    ----------
    beginyear : int
        First year shown on the slider.
    scl : list or None
        Colour scale passed to each trace's ``colorscale``.
    absolute : bool
        If True plot 'Absolute Ideology Score' (colour range 0 to 60),
        otherwise 'Ideology Score' (colour range -60 to 60).
    endyear : int
        Last year shown on the slider (inclusive). Defaults to 2004, the
        previously hard-coded final year.

    Returns
    -------
    The return value of ``plotly.offline.iplot``.
    """
    plotly.offline.init_notebook_mode()
    if absolute:
        ideo = 'Absolute Ideology Score'
        zmin = 0
        text = 'Average Absolute Judge Ideology Score by State and Year'
    else:
        ideo = 'Ideology Score'
        zmin = -60
        text = 'Average Judge Ideology Score by State and Year'

    years = range(beginyear, endyear + 1)

    # Create one choropleth trace per year to feed into plotly.
    data = []
    for year in years:
        # Filter once per year instead of once per trace attribute.
        year_rows = state_ideo_by_year[state_ideo_by_year['Year'] == year]
        data.append(dict(
            type='choropleth',
            # Only the first year starts visible, matching the slider's
            # initial position; otherwise every year's trace is drawn at once
            # until the slider is first moved.
            visible = (year == beginyear),
            marker = go.choropleth.Marker(
                line = go.choropleth.marker.Line(
                    color = 'rgb(255,255,255)',
                    width = 1.5
                )),
            hoverinfo = 'z+text',
            colorbar = go.choropleth.ColorBar(
                title = ideo,
                thickness = 40
            ),
            colorscale = scl,
            zmin = zmin,
            zmax = 60,
            autocolorscale = False,
            locations = year_rows['StateCode'],
            z = year_rows[ideo].astype(float),
            text = year_rows['State'],
            locationmode='USA-states'))

    # Create slider for the map: step i shows trace i and hides all others.
    steps = []
    for i, year in enumerate(years):
        visibility = [False] * len(data)
        visibility[i] = True
        steps.append(dict(method='restyle',
                          args=['visible', visibility],
                          label='Year {}'.format(year)))
    sliders = [dict(active=0,
                    pad={"t": 1},
                    steps=steps)]

    # Define layout
    layout = dict(geo = go.layout.Geo(
                      scope = 'usa',
                      projection = go.layout.geo.Projection(type = 'albers usa')),
                  sliders=sliders,
                  title = go.layout.Title(
                      text = text
                  ))

    # Create map with plotly
    fig = dict(data=data, layout=layout)
    return plotly.offline.iplot(fig)
# Signed ideology on the blue-to-red scale, then absolute ideology on the
# light-to-dark scale.
plot_ideology_by_year(scl = demrep_scl, absolute = False)
plot_ideology_by_year(scl = hl_scl, absolute = True)
This section is incomplete!
#now we turn to whether attributes of a judge can explain their ideology
import sklearn
from sklearn import (
linear_model, metrics, neural_network, pipeline,
model_selection, tree
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
#We will prepare the data for prediction
def prep_data(df, continuous_variables, categories, y_var, test_size=0.15):
    """Build a design matrix from ``df`` and split it into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    continuous_variables : list of str
        Columns copied into the design matrix unchanged.
    categories : list of str
        Columns that are one-hot encoded before being appended.
    y_var : str
        Name of the outcome column.
    test_size : float
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Output of ``train_test_split`` with ``random_state=42``.

    Notes
    -----
    The one-hot encoder is fitted on all rows of ``df`` before the split.
    """
    y = df[y_var].values
    X = np.zeros((y.size, 0))

    # Add continuous variables if exist
    if len(continuous_variables) > 0:
        X = np.hstack([X, df[continuous_variables].values])

    if len(categories) > 0:
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old name; fall back for older installations.
        try:
            ohe = preprocessing.OneHotEncoder(sparse_output=False)
        except TypeError:
            ohe = preprocessing.OneHotEncoder(sparse=False)
        X = np.hstack([X, ohe.fit_transform(df[categories])])

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    return X_train, X_test, y_train, y_test
def fit_and_report_mses(mod, X_train, X_test, y_train, y_test):
    """Fit ``mod`` on the training data and report its mean squared errors.

    Returns a dict with keys ``mse_train`` and ``mse_test`` holding the mean
    squared error of the fitted model's predictions on the training and test
    data respectively.
    """
    mod.fit(X_train, y_train)
    train_error = metrics.mean_squared_error(y_train, mod.predict(X_train))
    test_error = metrics.mean_squared_error(y_test, mod.predict(X_test))
    return {"mse_train": train_error, "mse_test": test_error}
# Dropping everything but the regressors and the outcome variable.
# Net Worth and Assets have NA/missing values for 45% of entries so we drop them.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument to drop().
judges_ideology = judges.drop(
    columns=["Name", "ID", "Year of Death", "Net Worth", "Assets", "Congress"]
)
# Removing rows that still contain NAs
judges_ideology = judges_ideology.dropna()
# Regressors used as-is: continuous variables, followed by the
# 'Previous Position - <code>' columns, which are already 0/1 indicators.
continuous_variables = [
    'Year of Appointment', 'Year of Birth', 'Year of Departure',
    'House Democrats', 'House Republicans',
    'Senate Democrats', 'Senate Republicans',
    'House Independents', 'Senate Independents',
]
# Codes of the previous-position indicator columns, in their original order.
previous_position_codes = [
    'ssc', 'slc', 'locct', 'sjdget', 'ausa', 'usa', 'sgo', 'sg', 'ago', 'ag',
    'cc', 'sp', 'mag', 'bank', 'terr', 'cab', 'asatty', 'satty', 'cabdept',
    'scab', 'scabdpt', 'aag', 'indreg1', 'reg1', 'reg2', 'reg3', 'house',
    'senate', 'gov', 'ssenate', 'shouse', 'mayor', 'ccoun', 'ccom', 'ada',
    'da', 'lother', 'lotherl', 'lawprof', 'private',
]
continuous_variables += ['Previous Position - ' + code for code in previous_position_codes]

# Categorical variables (these are one-hot encoded before modelling)
categories = [
    'Cross Appointment', 'Recess Appointment', "Unity", 'Circuit',
    'Appointing President', 'Appointing President Party',
    'President when Departed', 'Reason for Departing',
    'Judge Party', 'District', 'State', 'City',
    'Gender', 'Race', 'ABA Rating',
]
# Use prep_data to create training and testing data, once per outcome
# variable. The second call reassigns X_train and X_test; both calls pass the
# same frame and regressor lists and differ only in the outcome column.
X_train, X_test, ideo_train, ideo_test = prep_data(
judges_ideology,continuous_variables , categories, "Ideology Score"
)
X_train, X_test, abs_ideo_train, abs_ideo_test = prep_data(
judges_ideology,continuous_variables , categories, "Absolute Ideology"
)
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, pipeline
# Create four models (linear regression, lasso, random forest, neural network)
# whose MSEs on the training and test data are compared below.
lr_model = linear_model.LinearRegression()
# Lasso fitted with 5-fold cross-validation
lasso_model = linear_model.LassoCV(cv=5)
# Random forest with 10 trees
forest_model = RandomForestRegressor(n_estimators = 10)
# Neural network with hidden layers of 150, 100 and 50 units, wrapped in a
# pipeline so the inputs are standardised first
nn_scaled_model = pipeline.make_pipeline(
preprocessing.StandardScaler(), # this will do the input scaling
neural_network.MLPRegressor((150,100,50)))
#we'll run each model with both ideology and absolute ideology as the outcome
#later we will make this cleaner, more aesthetic...we'll also work on overfitting
print("Linear Regression -- Outcome: Ideology")
fit_and_report_mses(lr_model,X_train, X_test,ideo_train,ideo_test)
Linear Regression -- Outcome: Ideology
{'mse_train': 111.78053193381386, 'mse_test': 278.57250003044874}
print("Lasso -- Outcome: Ideology")
fit_and_report_mses(lasso_model,X_train, X_test,ideo_train,ideo_test)
Lasso -- Outcome: Ideology
{'mse_train': 777.1944044812514, 'mse_test': 776.4949630312258}
print("Random Forest -- Outcome: Ideology")
fit_and_report_mses(forest_model,X_train, X_test,ideo_train,ideo_test)
Random Forest -- Outcome: Ideology
{'mse_train': 19.967272180736455, 'mse_test': 132.07617718374257}
print("Neural Network -- Outcome: Ideology")
fit_and_report_mses(nn_scaled_model,X_train, X_test,ideo_train,ideo_test)
Neural Network -- Outcome: Ideology
{'mse_train': 11.026972272015236, 'mse_test': 209.33219156377234}
print("Linear Regression -- Outcome: Absolute Ideology")
fit_and_report_mses(lr_model,X_train, X_test,abs_ideo_train,abs_ideo_test)
Linear Regression -- Outcome: Absolute Ideology
{'mse_train': 118.77854044079918, 'mse_test': 228.6969772282095}
print("Lasso -- Outcome: Absolute Ideology")
fit_and_report_mses(lasso_model,X_train, X_test,abs_ideo_train,abs_ideo_test)
Lasso -- Outcome: Absolute Ideology
{'mse_train': 218.8407008692007, 'mse_test': 194.63559307148134}
print("Random Forest -- Outcome: Absolute Ideology")
fit_and_report_mses(forest_model,X_train, X_test,abs_ideo_train,abs_ideo_test)
Random Forest -- Outcome: Absolute Ideology
{'mse_train': 12.878166895696044, 'mse_test': 64.19114455535414}
print("Neural Network -- Outcome: Absolute Ideology")
fit_and_report_mses(nn_scaled_model,X_train, X_test,abs_ideo_train,abs_ideo_test)
Neural Network -- Outcome: Absolute Ideology
{'mse_train': 2.1850053522338215, 'mse_test': 140.2058481216295}