#!/usr/bin/env python
# coding: utf-8

# # Petition Maps
# 
# Several ways of looking at *that* petition data...
# 
# This notebook shows how to load in data from the petitions website and merge it with shapefile data so that we can plot it on a map.
# 
# First, let's get the petition data.

# In[1]:


#I cheated in finding this URL
#I useed browser devtools Network view to see what loaded when
#I loaded https://petitionmap.unboxedconsulting.com/?petition=241584
petitiondata_url = 'https://petition.parliament.uk/petitions/241584.json'


# In[2]:


import requests

petitiondata = requests.get(petitiondata_url).json()


# In[3]:


#The genson package will create a schema from a JSON object
#The schema concisely represents the different attributes in the JSON object
# along with what type they are
from genson import SchemaBuilder

builder = SchemaBuilder()
builder.add_object(petitiondata)
builder.to_schema()


# In[4]:


from pandas.io.json import json_normalize

df_country = json_normalize(petitiondata, ['data','attributes', 'signatures_by_country'])
df_country.head()


# In[5]:


df_constituency = json_normalize(petitiondata, ['data','attributes', 'signatures_by_constituency'])
df_constituency.sort_values('ons_code').head()


# Alasdair Rae publishes shapefiles / geojson files containing constituency boundary files as well as electoral data ( http://ajrae.staff.shef.ac.uk/#resources).
# 
# We can load these directly into a `geopandas` dataframe.

# In[6]:


electiondata_url = 'http://ajrae.staff.shef.ac.uk/wpc/geojson/uk_wpc_2018_with_data.geojson'


# In[7]:


import geopandas

#Enable inline plotting
get_ipython().run_line_magic('matplotlib', 'inline')

gdf = geopandas.read_file(electiondata_url)

gdf.head()


# In[8]:


gdf.columns


# Generate a map to show parliamentray majorities:

# In[14]:


#Set the default plot size
from matplotlib import pyplot as plt
fig, ax = plt.subplots(1, figsize=(12, 12))

ax = gdf.plot(column='majority', ax=ax)
ax.axis('off');


# We can also plot by party, though by default a default set of colour maps will be applied.

# In[15]:


fig, ax = plt.subplots(1, figsize=(12, 12))

ax = gdf.plot(column='Party' , ax=ax)
ax.axis('off');


# What we really want to do is to be able to define color maps based on party...

# In[17]:


from matplotlib.colors import ListedColormap

#Set up color maps by party
partycolors = {'Conservative':'blue', 
               'Labour':'red',
               'Independent':'black',
               'Liberal Democrat':'orange',
               'Labour/Co-operative':'red', 
               'Green':'green' , 
               'Speaker':'black',
               'DUP':'pink',
               'Sinn Féin':'darkgreen',
               'Scottish National Party':'yellow', 
               'Plaid Cymru':'brown'}

#The dataframe seems to assign items to categories based on the selected column sort order
#We can define a color map with a similar sorting
colors = [partycolors[k] for k in sorted(partycolors.keys())]

fig, ax = plt.subplots(1, figsize=(12, 12))

ax = gdf.plot(column='Party', cmap = ListedColormap(colors), ax=ax)
ax.axis('off');


# ## Merge in The Petition Data
# 
# We can merge the referendum data into the geodataframe using the constituency codes.
# 
# Before we do that, let's just check the data qulaity to make sure the code columns in each dataframe are consistent with each other.

# In[18]:


#Check that the ons_code and PCONCODE15 columns containing matching values
#Find symmetric differences between the code columns in each dataframe
#They should match... i.e. there should be no differences
set(df_constituency['ons_code']) ^ set(gdf['PCONCODE15'])


# In[19]:


#An example of why mathcing on something like constituency names is fraught with risk...
#Find symmetric differences between constituency names
#That is, are there names in one set not in another?
set(df_constituency['name']) ^ set(gdf['PCONNAME15'])


# Now we can add the consituency count into the geodataframe.
# 
# The easiest way to do this is to set the index of each dataframe to the cosntituency code. The *geopandas* package will then automatically align rows if we just add the `signature_count` column from the `df_constituency` dataframe to the `gdf` dataframe.

# In[20]:


gdf.set_index('PCONCODE15', inplace=True)
df_constituency.set_index('ons_code', inplace=True)
gdf['signature_count'] = df_constituency['signature_count']


# We can now plot a map of the number of people signing the petition by constituency.

# In[24]:


fig, ax = plt.subplots(1, figsize=(12, 12))

ax = gdf.plot(column='signature_count', ax=ax, legend=True,)
ax.axis('off');


# We can look at that as a percentage of the electorate in the constituency at the last general election:

# In[25]:


gdf['signature_count_pc'] = 100 * gdf['signature_count'] / gdf['electorate']

fig, ax = plt.subplots(1, figsize=(12, 12))

ax = gdf.plot(column='signature_count_pc', ax=ax, legend=True,)
ax.axis('off');


# ## Merge in Referendum Data
# 
# The referendum did not report results by Parliamentary Constituency, but estimates of votes by constituency have been created.
# 
# I'll use the estimates created by Chris Hanretty as described [here](https://medium.com/@chrishanretty/final-estimates-of-the-leave-vote-or-areal-interpolation-and-the-uks-referendum-on-eu-membership-5490b6cab878) and published here: *“Areal interpolation and the UK’s referendum on EU membership”, Chris Hanretty, Journal Of Elections, Public Opinion And Parties, Online Early Access, http://dx.doi.org/10.1080/17457289.2017.1287081* The data is published [here](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/S4DLWJ).
# 
# In some constituencies, actual results are known. A comparison between known and estimated results can be found in data collated by the House of Commons Library: [Brexit: votes by constituency](https://commonslibrary.parliament.uk/parliament-and-elections/elections-elections/brexit-votes-by-constituency/).
# 
# Unfortunately, we don't have estimates for the number of votes cast in each constituency.

# In[27]:


import pandas as pd

referendum_url = 'https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/S4DLWJ/TCBV7Q'
#The data is tab delimited
df_referendum = pd.read_csv(referendum_url, sep='\t')
df_referendum.head()


# The methodology all goes a bit approximate from now on...
# 
# First, let's get the electorate size by constituency in 2016. We could use the electorate value from the general election geodataframe, but this aside demonstrates how to load in data from an Excel file, if nothing else...
# 
# The ONS publish the data as an Excel spreadsheet [here](https://www.ons.gov.uk/peoplepopulationandcommunity/elections/electoralregistration/datasets/electoralstatisticsforuk).

# In[28]:


#Number of electors by constituency in 2016
electorate_url = 'https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/elections/electoralregistration/datasets/electoralstatisticsforuk/2016unformatted/elec5dt2unformattedelectoralstatisticsuk2016.xls'
fname = 'electorate.xls'
get_ipython().system('wget -O $fname  $electorate_url')


# In[29]:


#See what the sheets are in the file
xl = pd.ExcelFile(fname)

xl.sheet_names 


# In[30]:


#Table name does have the whitespace in it...
df_electorate = pd.read_excel(fname, sheetname='Table 2 ')
df_electorate.head()


# In[31]:


turnout = 0.72


# In[32]:


df_referendum.set_index('PCON11CD', inplace=True)
df_electorate.set_index('Code', inplace=True)

df_referendum['electorate'] = pd.merge(df_referendum, df_electorate, left_index=True, right_index=True)['Total electors 2016']

df_referendum.head()


# Let's guesstimate the votes cast. Assume a consistent turnout and then estimate votes cast based on the turnout and the electorate.

# In[33]:


#Estimated turnout - use the saem figure for all constituencies
#This is all very rule of thumb...
turnout = 0.72

df_referendum['leave votes'] = (turnout * df_referendum['electorate'] * df_referendum['Leave.estimate']).astype(int)
df_referendum['remain votes'] = (turnout * df_referendum['electorate'] * df_referendum['Remain.estimate']).astype(int)

df_referendum.head()


# In[34]:


df_referendum['referendum majority'] = (df_referendum['leave votes'] - df_referendum['remain votes']).abs()
#plotly express doesn't see boolean as a categorical. So make it a string.
df_referendum['leave'] = (df_referendum['leave votes'] > df_referendum['remain votes']).astype(str)

df_referendum.head()


# ## Charting the Referendum Counts
# 
# On social media, there has been a lot of commentary comparing the number of people signing the petition with the majhority in the constituency. There's a lot of apples and pears there, but we can maybe use this as an indicator of sentiment about how safe a seat is compared to the number of people expressing a view about revoking article 50 in each constituency.
# 
# So how about we plot the log of the number of signatories to the petition divided by the majority vote (either way) at the referendum over the log of the number of signatories to the petition divided by the majority at the general election.
# 
# Then we can read the quadrants as:
# 
# - *bottom left*: safe seat, convinced
# - *top left*: safe seat, unconvinced
# - *top right*: unsafe seat, unconvinced
# - *bottom right*: unsafe seat, convinced

# In[35]:


df_dodgystats = pd.merge(gdf, df_referendum[['referendum majority','leave']], left_index=True, right_index=True)

df_dodgystats.head()


# In[36]:


import numpy as np

df_dodgystats['safety'] = np.log10(df_dodgystats['signature_count'] / df_dodgystats['majority'])
df_dodgystats['confidence'] = np.log10(df_dodgystats['signature_count'] / df_dodgystats['referendum majority'])

df_dodgystats.head()


# Create a column that summarises info we want in the tooltip

# In[37]:


df_dodgystats['header'] = df_dodgystats[['PCONNAME15', 'WINNER15', 'majority',
                                         'referendum majority', 'signature_count']].astype(str).apply(lambda x: '<br>'.join(x), axis=1)


df_dodgystats['header'] = df_dodgystats[['PCONNAME15', 'WINNER15']].astype(str).apply(lambda x: '<br>'.join(x), axis=1)
df_dodgystats['header'] = df_dodgystats['header'] + '<br>Majority:'
df_dodgystats['header'] = df_dodgystats[['header', 'majority']].astype(str).apply(lambda x: ' '.join(x), axis=1)
df_dodgystats['header'] = df_dodgystats['header'] + '<br>Referendum Majority:'
df_dodgystats['header'] = df_dodgystats[['header', 'referendum majority']].astype(str).apply(lambda x: ' '.join(x), axis=1)
df_dodgystats['header'] = df_dodgystats['header'] + '<br>Petition Signatures:'
df_dodgystats['header'] = df_dodgystats[['header', 'signature_count']].astype(str).apply(lambda x: ' '.join(x), axis=1)


# In[39]:


import plotly_express as px

g = px.scatter(df_dodgystats, x='safety', y='confidence', color='leave', hover_name='header',
               title='UK Petition Scatter: Referendum Confidence vs Parliamentary Seat Saftey')
g


# In[44]:


df_dodgystats['alpha'] = df_dodgystats['safety'] * df_dodgystats['confidence']
df_dodgystats['alpha'] = (df_dodgystats['alpha']/df_dodgystats['alpha'].abs().max()).abs()


# In[52]:


def quadCol(row):
    if row['safety']>0 and row['confidence']> 0:
        col='unsafe_and_not_confident'
    elif row['safety']<0 and row['confidence']> 0:
        col='safe_and_not_confident'
    elif row['safety']>0 and row['confidence']< 0:
        col='unsafe_and_confident'
    elif row['safety']<0 and row['confidence']< 0:
        col='safe_and_confident'
    return col

df_dodgystats['safetycon'] = df_dodgystats.apply(lambda x: quadCol(x), axis=1)


# In[53]:


fig, ax = plt.subplots(1, figsize=(12, 12))

ax = df_dodgystats.plot(column='safetycon', ax=ax, legend=True,)
ax.axis('off');


# In[ ]: