%%HTML
<style>
/* style for presentation only */
.reveal .rendered_html table { font-size: 24px }
/* centre images */
.reveal .rendered_html img {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
    margin-left: auto;
    margin-right: auto;
}
</style>
from IPython.display import Image
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
graph_figsize = (10,6) # I'm forgetful and lazy
plt.rcParams.update({'figure.figsize': graph_figsize})
%matplotlib nbagg
# Jupyter Magics!
import geopandas as gp
import urllib.request
import zipfile
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from collections import defaultdict
import moviepy.editor as mpy
import moviepy.video as mpyv
import plotly.plotly as py
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from datetime import datetime
init_notebook_mode(connected=False)
import cufflinks
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm_notebook as tqdm
from ckanapi import RemoteCKAN # it's on pip
def build_odni_connection():
    """Be nice to OpenDataNI and tell them how old I am. (And that it's me)"""
    version_no = (pd.to_datetime('now') -
                  pd.to_datetime('1988/05/17')).days / 365
    ua = f'@Bolster/{version_no:.2f} (+http://bolster.online/)'
    return RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)
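# A quick, hedged sanity check of the helper above: package_list is a
# standard CKAN action, so this just confirms the portal answers
# (network-dependent).
odni = build_odni_connection()
odni.action.package_list()[:5]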
def get_year_files_for_enrolment_data(dest_base_dir):
    base_url = "https://www.education-ni.gov.uk"
    year_files = defaultdict(list)
    listing_path = "/articles/school-enrolments-school-level-data"
    soup = BeautifulSoup(requests.get(base_url + listing_path).text, 'lxml')
    for link in tqdm(soup.find_all('a'),
                     desc='years',
                     leave=False):
        if 'School enrolments - school level data 20' in ' '.join(map(str, link.contents)):
            year = link.get('href')[-6:-2]
            # Follow the year link and get all the relevant files
            year_soup = BeautifulSoup(
                requests.get(base_url + link.get('href')).text,
                'lxml'
            )
            for file_link in tqdm(year_soup.find_all('a'),
                                  desc=f'files in {year}',
                                  leave=False):
                year_dir = dest_base_dir.joinpath(str(year))
                year_dir.mkdir(parents=True, exist_ok=True)
                href = file_link.get('href', "")
                filename = href.split('/')[-1]
                if 'xls' in href.split('.')[-1].lower():
                    # See this requirement right 👆
                    # Mix of xls, XLSX, xlsx and XLS
                    dest_file = year_dir / filename
                    if not dest_file.exists():
                        urllib.request.urlretrieve(href, dest_file)
                    year_files[year].append(filename)
    return year_files
dest = Path('./data/education-ni/')
year_files = get_year_files_for_enrolment_data(dest)
sheets = defaultdict(list)
for year, files in year_files.items():
    for file in files:
        if 'post' in file:
            xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')
            sheets[year].extend(xls.sheet_names)
dict(sheets)
from collections import Counter
all_sheet_names = Counter([_ for d in sheets.values() for _ in d])
all_sheet_names.most_common()
df = pd.DataFrame.from_dict({
    year: [sn in sheet_names for sn in all_sheet_names]
    for year, sheet_names in sheets.items()
}, orient='index')
df.columns = all_sheet_names.keys()
df.T
import seaborn as sns
f,ax = plt.subplots(figsize=graph_figsize)
sns.heatmap(df.T, ax=ax)
def parse_reference_table(xls):
    """
    From an ExcelFile, clean up:
    * School Mgmt Type disaster
    * Inconsistent header depth
    * Multi-row header names
    * inconsistent headers (ref_key_map)
    * inconsistent col order
    * inconsistent caps/spacing (strip|lower)
    """
    cols = [
        'de ref',
        'school name',
        'school type',
        'address 1',
        'postcode',
        'urban_rural',
        'school management type',
        'district council',
        'parliamentary constituency',
        'town'
    ]
    categories = [
        'school type',
        'urban_rural',
        'school management type',
        'district council',
        'parliamentary constituency',
        'town'
    ]
    ref_key_map = {
        'denino': 'de ref',
        'urban/ rural': 'urban_rural',
        'schoolname': 'school name'
    }
    reference_value_rename = {
        'school management type': {
            'gmi': 'integrated',
            'controlled integrated': 'integrated',
            'roman catholic maintained': 'rc maintained',
            'grant maintained integrated': 'integrated',
            'voluntary - other managed': 'voluntary',
            'voluntary - rc managed': 'voluntary',
            'catholic maintained': 'rc maintained'
        }
    }

    def join_n_strip_n_lower(l):
        return ' '.join(l).strip().lower()

    if 'reference data' in xls.sheet_names:
        df = pd.read_excel(xls, 'reference data', header=None)
        # Header depth varies by year: if row 3 already holds data (an int
        # DE ref), the header is two rows deep, otherwise three
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
        try:
            # Collapse the multi-row header into single lowercased names
            df.columns = df.loc[1:h_range].fillna('').apply(join_n_strip_n_lower, axis=0).values
            df.rename(columns=ref_key_map, inplace=True)
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df = df[cols]
            df['de ref'] = df['de ref'].astype(int)
            df.set_index('de ref', inplace=True)
            for c in df:
                df[c] = df[c].str.lower().str.strip()
            df.replace(reference_value_rename, inplace=True)
            for c in categories:
                df[c].fillna('NA', inplace=True)
                df[c] = df[c].astype('category')
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
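# Minimal sketch of the multi-row header flattening used above, on toy data:
# the header rows get joined column-wise, then stripped and lowercased.
toy = pd.DataFrame([['DE', 'School'], ['ref', 'Name']])
toy.fillna('').apply(lambda l: ' '.join(l).strip().lower(), axis=0).values
# -> array(['de ref', 'school name'], dtype=object)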
def parse_enrolments_table(xls):
    """From an ExcelFile, clean up:
    * Inconsistent header depth
    * fucked up nans/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as join index and drop pointless fields
    """
    def strip_n_lower(s): return s.strip().lower()
    def unyearify(s): return int(s.replace('year ', ''))

    if 'enrolments' in xls.sheet_names:
        df = pd.read_excel(xls, 'enrolments', header=None, skipfooter=5)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
        try:
            df.columns = df.loc[3].fillna('').apply(strip_n_lower).values
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df.dropna(how='all', inplace=True, axis=0)
            df['de ref'] = df['de ref'].astype(int)
            df.drop('schoolname', axis=1, inplace=True)
            df.drop('total pupils', axis=1, inplace=True)
            df.set_index('de ref', inplace=True)
            df.rename(columns=unyearify, inplace=True)
            df = df.astype(float)
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
def parse_fsm_table(xls):
    """From an ExcelFile, clean up:
    * Inconsistent header depth
    * fucked up nans/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as join index and drop pointless fields
    """
    def strip_n_lower(s): return s.strip().lower()
    def unyearify(s): return int(s.replace('year ', ''))

    if 'free school meals' in xls.sheet_names:
        df = pd.read_excel(xls, 'free school meals', header=None, skipfooter=5)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
        try:
            df.columns = df.loc[3].fillna('').apply(strip_n_lower).values
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df.dropna(how='all', inplace=True, axis=0)
            df['de ref'] = df['de ref'].astype(int)
            df.drop('schoolname', axis=1, inplace=True)
            df.drop('free school meals', axis=1, inplace=True)
            df.set_index('de ref', inplace=True)
            df.replace('#', np.nan, inplace=True)  # '#' == undisclosed
            df.replace('*', 2.0, inplace=True)     # '*' == fewer than 5
            df.replace('!', 1, inplace=True)       # '!' == suppressed to avoid identification, so probably one or two
            df = df.astype(float)
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
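# The three disclosure flags above could equally go through one dict-based
# replace; a runnable sketch with the same substitutions:
pd.Series(['12', '#', '*', '!']).replace({'#': np.nan, '*': 2.0, '!': 1.0}).astype(float)
# -> 12.0, NaN, 2.0, 1.0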
def parse_available_table(xls):
    """From an ExcelFile, clean up:
    * Inconsistent header depth
    * fucked up nans/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as join index and drop pointless fields
    * Totally different schemas between years
    * Inconsistent metric naming
    * non numerical data flags (*/!)
    """
    ref_key_map = {
        'schoolname': 'school name',
        'total unfilled places': 'available places',
        'unfilled places': 'available places',
        'total approved enrolment number': 'approved enrolments'
    }

    def join_n_strip_n_lower(l):
        return ' '.join(l).strip().lower()

    if 'School level data' in xls.sheet_names:
        df = pd.read_excel(xls, 'School level data', header=None)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
    elif 'unfilled places' in xls.sheet_names:
        df = pd.read_excel(xls, 'unfilled places', header=None)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
    else:
        df = None
    if df is not None:
        try:
            df.columns = df.loc[1:h_range].fillna('').apply(join_n_strip_n_lower, axis=0).values
            df.rename(columns=ref_key_map, inplace=True)
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            # Whitespace-only cells are really nulls
            df = df.applymap(lambda x: np.nan if isinstance(x, str) and x.isspace() else x)
            df.dropna(how='all', axis=1, inplace=True)
            df.dropna(how='any', axis=0, inplace=True)
            if df.shape[1] == 6:  # recent years don't have fecking headers
                cols = list(df.columns)
                cols[0] = 'de ref'
                cols[1] = 'school name'
                df.columns = cols
            df.drop('school name', axis=1, inplace=True)
            df['de ref'] = df['de ref'].astype(int)
            df.set_index('de ref', inplace=True)
            df.replace('*', 2.0, inplace=True)  # '*' == fewer than 5
            df.replace('!', 1, inplace=True)    # '!' == suppressed to avoid identification, so probably one or two
            df.dropna(how='all', inplace=True, axis=1)
            df = df.astype(int)  # astype has no inplace kwarg
        except TypeError as e:
            print(e)
    return df
re_dfs = {}
av_dfs = {}
en_dfs = {}
fsm_dfs = {}
for year, files in year_files.items():
    for file in files:
        if 'post' in file:
            xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')
            df = parse_reference_table(xls)
            if df is not None:
                print(f'Got reference data for {year}')
                re_dfs[year] = df
            df = parse_enrolments_table(xls)
            if df is not None:
                print(f'Got enrolment data for {year}')
                en_dfs[year] = df
            df = parse_available_table(xls)
            if df is not None:
                print(f'Got available data for {year}')
                av_dfs[year] = df
            df = parse_fsm_table(xls)
            if df is not None:
                print(f'Got fsm data for {year}')
                fsm_dfs[year] = df
reference = pd.Panel(re_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
available = pd.Panel(av_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
enrolment = pd.Panel(en_dfs).sort_index().rename_axis('year').rename_axis('yeargroup', axis=2)
fsm = pd.Panel(fsm_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
reference.to_hdf('data.h5', 'reference')
available.to_hdf('data.h5', 'available')
enrolment.to_hdf('data.h5', 'enrolment')
fsm.to_hdf('data.h5','fsm')
reference
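# Round-trip sanity check (sketch): anything stashed in the HDF store above
# can come straight back with pd.read_hdf.
pd.read_hdf('data.h5', 'reference')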
if not Path('data/cons_pop.csv').exists():
    odni = build_odni_connection()
    for dataset in odni.action.package_show(id='population-estimates-for-northern-ireland')['resources']:
        if dataset['name'] == "Parliamentary Constituencies by single year of age and gender (mid-2001 to mid-2017)":
            cons_pop = pd.read_csv(dataset['url'])
            cons_pop.to_csv("data/cons_pop.csv", index=False)
cons_pop.head()
cons_pop = pd.read_csv('data/cons_pop.csv')
cons_pop['Mid_Year_Ending'] = cons_pop.Mid_Year_Ending.astype(int)
cons_pop['Population_Estimate'] = cons_pop.Population_Estimate.astype(float)
cons_pop['Age'] = cons_pop.Age.astype(int)
cons_pop.rename(columns={'Geo_Name':'constituency'}, inplace=True)
cons_pop['constituency'] = cons_pop.constituency.str.strip().str.lower()
cons_pop[(cons_pop.Gender == 'All Persons') & (cons_pop.Mid_Year_Ending == 2016)].head()
cons_pop.to_hdf('data.h5','cons_pop')
cons_gdf_zip = "http://osni-spatial-ni.opendata.arcgis.com/datasets/563dc2ec3d9943428e3fe68966d40deb_3.zip"
cons_gdf_shp = "OSNI_Open_Data_Largescale_Boundaries__Parliamentary_Constituencies_2008.shp"
if not Path('data/' + cons_gdf_shp).exists():
    urllib.request.urlretrieve(cons_gdf_zip, 'data/_tmp.zip')
    with zipfile.ZipFile('data/_tmp.zip') as z:
        z.extractall('data/')
    Path('data/_tmp.zip').unlink()
cons_gdf = gp.GeoDataFrame.from_file('data/' + cons_gdf_shp)
cons_gdf.rename(columns={'PC_NAME': 'constituency'}, inplace=True)
cons_gdf.drop(['OBJECTID', 'PC_ID'], axis=1, inplace=True)
cons_gdf['constituency'] = cons_gdf['constituency'].str.lower().str.strip()
cons_gdf.set_index('constituency', inplace=True)
cons_gdf.plot()
import fiona; fiona.supported_drivers
# `cons` wasn't defined earlier; the lowercased constituency names from the
# boundary frame fit here, since both datasets were normalised the same way
cons = cons_gdf.index
# Enrolment: sum over year groups for all schools in each constituency
en_df = pd.DataFrame.from_dict({
    con:
    enrolment[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], :].sum().sum()
    for con in cons
}).T
# Available places: sum for all schools in each constituency
av_df = pd.DataFrame.from_dict({
    con:
    available[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], 'available places'].sum()
    for con in cons
}).T
# Free School Meals
fsm_df = pd.DataFrame.from_dict({
    con:
    fsm[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], 'fsme'].sum()
    for con in cons
}).T
for c in av_df:
    cons_gdf[f"av_{c}"] = av_df[c]
    cons_gdf[f"av_{c}_rat"] = (av_df[c] / en_df[c])
for c in en_df:
    cons_gdf[f"en_{c}"] = en_df[c]
    cons_gdf[f"en_{c}_pk"] = en_df[c] / cons_gdf["Area_sqkm"]
for c in fsm_df:
    cons_gdf[f"fsm_{c}"] = fsm_df[c]
    cons_gdf[f"fsm_{c}_rat"] = (fsm_df[c] / en_df[c])
def get_winning_quartile(df, age_quartiles):
    age_counts = df.groupby("Age")['Population_Estimate'].sum()
    quartile_counts = pd.Series({
        f"{lower}-{upper}": age_counts.loc[lower:upper].sum()
        for lower, upper in zip([0] + list(age_quartiles), list(age_quartiles) + [99])
    })
    return quartile_counts.idxmax()
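# Toy sketch of the bucketing above. Note that .loc slicing is inclusive at
# both ends, so a boundary age lands in two buckets; fine for picking a
# winner, but worth knowing.
toy = pd.DataFrame({'Age': [3, 10, 40, 70], 'Population_Estimate': [100, 80, 60, 40]})
get_winning_quartile(toy, age_quartiles=[18, 37, 58])  # -> '0-18'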
pop_years = np.asarray(sorted(cons_pop.Mid_Year_Ending.unique()))  # an array, not a set, so it can be indexed below
# NB: annual_quartiles is computed a few cells below; this cell assumes it
# already exists (the notebook cells were run out of order)
for c in tqdm(reference.axes[0].astype(int), desc="Year"):
    # Preserve edu data and use the closest population year
    if c not in pop_years:
        yr = pop_years[np.abs(pop_years - c).argmin()]
    else:
        yr = c
    age_quartiles = annual_quartiles.loc[yr].astype(int)
    _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons')
                         & (cons_pop.Mid_Year_Ending == yr)
                         ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\
                          .unstack().rename(columns=lambda c: c.lower())
    _cons_pop['total'] = _cons_pop.sum(axis=1)
    _cons_pop['m_per_f'] = _cons_pop['males'] / _cons_pop['females']
    cons_gdf[f"pop_{c}"] = _cons_pop['total']
    cons_gdf[f"pop_{c}_males"] = _cons_pop['males']
    cons_gdf[f"pop_{c}_females"] = _cons_pop['females']
    cons_gdf[f"pop_{c}_m_per_f"] = _cons_pop['m_per_f']
    cons_gdf[f"topqt_{c}"] = cons_pop[
        (cons_pop.Gender != 'All Persons') & (cons_pop.Mid_Year_Ending == yr)
    ].groupby('constituency').apply(get_winning_quartile, age_quartiles=annual_quartiles.loc[yr].values)
    cons_gdf[f"en_{c}_pc"] = cons_gdf[f"en_{c}"] / cons_gdf[f"pop_{c}"]
    cons_gdf[f"fsm_{c}_pc"] = cons_gdf[f"fsm_{c}"] / cons_gdf[f"pop_{c}"]
    cons_pop_mean = cons_pop[(cons_pop.Gender.isin(["All Persons"])) & (cons_pop.Mid_Year_Ending == yr)]
    cons_pop_mean['popprod'] = cons_pop_mean[['Age', 'Population_Estimate']].product(axis=1)
    cons_pop_mean = cons_pop_mean.groupby('constituency')[['popprod', 'Population_Estimate']].sum()
    cons_gdf[f'age_{c}_avg'] = cons_pop_mean['popprod'] / cons_pop_mean['Population_Estimate']
    try:
        cons_gdf[f"av_{c}_pc"] = cons_gdf[f"av_{c}"] / cons_gdf[f"pop_{c}"]
    except KeyError:
        print(f"No data for av in {c}")
# GeoDataFrame.to_file can't write HDF5; dump boundaries+stats to a
# GeoPackage instead
cons_gdf.to_file('data/cons_gdf.gpkg', driver='GPKG')
def flatten_ages(s):
    return np.asarray(
        list(chain.from_iterable(
            [age] * int(n) for age, n in s.items()
        ))
    )

def get_age_quantiles(df, q=[0.25, 0.5, 0.75]):
    pop_sum = df.groupby('Age')['Population_Estimate'].sum()
    age_quartiles = np.quantile(flatten_ages(pop_sum), q)
    return pd.Series(dict(zip(q, age_quartiles)))
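# Sketch: flatten_ages expands a count-per-age Series into one entry per
# person, so np.quantile returns population-weighted age quantiles.
toy_counts = pd.Series({0: 2, 50: 2})  # two newborns, two fifty-year-olds
flatten_ages(toy_counts)  # -> array([ 0,  0, 50, 50])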
annual_cons_quartiles = cons_pop[(cons_pop.Gender == "All Persons")].groupby(["Mid_Year_Ending","constituency"]).apply(get_age_quantiles)
annual_quartiles = cons_pop[(cons_pop.Gender == "All Persons")].groupby("Mid_Year_Ending").apply(get_age_quantiles)
pop_years = cons_pop.Mid_Year_Ending.unique()
# cons_stats wasn't defined anywhere above; carrying the boundary frame (and
# its en_/av_/fsm_ columns) forward as the stats table is assumed here
cons_stats = cons_gdf.copy()
for c in tqdm(reference.axes[0].astype(int), desc="Year"):
    # Preserve edu data and use the closest population year
    if c not in pop_years:
        yr = pop_years[np.abs(pop_years - c).argmin()]
    else:
        yr = c
    age_quartiles = annual_quartiles.loc[yr].astype(int)
    _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons')
                         & (cons_pop.Mid_Year_Ending == yr)
                         ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\
                          .unstack().rename(columns=lambda c: c.lower())
    _cons_pop['total'] = _cons_pop.sum(axis=1)
    _cons_pop['m_per_f'] = _cons_pop['males'] / _cons_pop['females']
    cons_pop_qilted = pd.DataFrame.from_dict(
        {
            f"{lotile}-{qtile}": cons_pop[(cons_pop.Gender == 'All Persons')
                                          & (cons_pop.Mid_Year_Ending == yr)
                                          & (lotile <= cons_pop.Age) & (cons_pop.Age < qtile)
                                          ].groupby('constituency')['Population_Estimate'].sum()
            for lotile, qtile in zip([0] + list(age_quartiles), list(age_quartiles) + [99])
        }
    )
    cons_stats[f"pop_{c}"] = _cons_pop['total']
    cons_stats[f"pop_{c}_males"] = _cons_pop['males']
    cons_stats[f"pop_{c}_females"] = _cons_pop['females']
    cons_stats[f"pop_{c}_m_per_f"] = _cons_pop['m_per_f']
    cons_stats[f"topqt_{c}"] = cons_pop_qilted.idxmax(axis=1)
    cons_stats[f"en_{c}_pc"] = cons_stats[f"en_{c}"] / cons_pop_qilted.sum(axis=1)
    cons_pop_mean = cons_pop[(cons_pop.Gender.isin(["All Persons"])) & (cons_pop.Mid_Year_Ending == yr)]
    cons_pop_mean['popprod'] = cons_pop_mean[['Age', 'Population_Estimate']].product(axis=1)
    cons_pop_mean = cons_pop_mean.groupby('constituency')[['popprod', 'Population_Estimate']].sum()
    cons_stats[f'age_{c}_avg'] = cons_pop_mean['popprod'] / cons_pop_mean['Population_Estimate']
    try:
        cons_stats[f"av_{c}_pc"] = cons_stats[f"av_{c}"] / cons_pop_qilted.sum(axis=1)
    except KeyError:
        print(f"No data for av in {c}")
unyearify = lambda c: '_'.join([c.split('_')[0]] + c.split('_')[2:])
sns.set(rc={'figure.figsize': graph_figsize})
for year in range(2009, 2017):
    cons_stats_year = cons_stats[[c for c in cons_stats.columns if str(year) in c]].rename(columns=unyearify)
    f = sns.pairplot(data=cons_stats_year, hue='topqt',
                     vars=['age_avg', 'fsm_rat', 'pop_m_per_f'])
    f.fig.suptitle(f"Pairplot of Age, FSM, and Gender: ({year})")
    f.fig.tight_layout()
    p = Path(f"outputs/pairplot/{year}.png")
    p.parent.mkdir(parents=True, exist_ok=True)
    f.savefig(p)
    plt.close(f.fig)
import matplotlib.animation as animation
unyearify = lambda c: '_'.join([c.split('_')[0]] + c.split('_')[2:])
years = list(range(2009, 2017))
explanations = {
    'age_avg': "Average Age",
    'av_pc': "Available School Places per capita",
    'av_rat': "Ratio of Available Places to Actualised Enrolment",
    'av': "Available School Places",
    'en_pc': "Enrolled Pupils per capita",
    'en_pk': "Enrolled Pupils per $km^2$",
    'en': "Enrolled Pupils",
    'fsm_rat': "Ratio of FSM pupils to total pupils",
    'fsm': "FSM pupils",
    'pop': "Population",
    'pop_females': "Population (F)",
    'pop_males': "Population (M)",
    'pop_m_per_f': "Ratio of Males to Females",
    'topqt': "Most populous age quartile (Age or younger)"
}
for metric in set(map(unyearify, cons_stats.columns)):
    for year in years:
        f = None  # so the finally block can't hit an unbound name
        try:
            f, ax = plt.subplots(figsize=graph_figsize)
            ax.set_title(f"{explanations.get(metric, metric)}: ({year})")
            if metric == 'topqt':
                cons_stats[[c for c in cons_stats.columns if str(year) in c] + ['geometry']]\
                    .rename(columns=unyearify).plot(column='topqt', ax=ax, legend=True, scheme='Quantiles', k=3)
            else:
                cons_stats[[c for c in cons_stats.columns if str(year) in c] + ['geometry']]\
                    .rename(columns=unyearify).plot(column=metric, legend=True, ax=ax)
            ax.axis('off')
            f.tight_layout()
            p = Path(f"outputs/{metric}/{year}.png")
            p.parent.mkdir(parents=True, exist_ok=True)
            f.savefig(p)
        except Exception as e:
            print(f"{metric}:{year} failed: {e}")
        finally:
            if f:
                plt.close(f)
import moviepy.editor as mpy
import moviepy.video as mpyv
clips = {}
for p in Path('outputs/').iterdir():
    if not p.is_dir():
        continue
    metric = p.parts[-1]
    file_list = [str(_p) for _p in sorted(filter(lambda _p: _p.suffix == '.png', p.iterdir()))]
    clip = mpy.ImageSequenceClip(file_list, fps=1.5)
    clip = mpyv.fx.all.freeze(clip, t='end', freeze_duration=2)
    clip.write_gif(f"outputs/{metric}.gif")
    clips[metric] = clip
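# The Image class imported at the top can embed one of the rendered GIFs
# inline; a sketch, assuming the en_pc maps rendered above:
Image(filename='outputs/en_pc.gif')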
metric = 'Population Distribution'
for year in years:
    f = None
    try:
        f, ax = plt.subplots(figsize=graph_figsize)
        ax.set_title(f"{explanations.get(metric, metric)}: ({year})")
        df = cons_pop[(cons_pop.Gender == 'All Persons')
                      & (cons_pop.Mid_Year_Ending == year)]\
            .groupby(['Age', 'constituency'])['Population_Estimate']\
            .sum().unstack()
        sns.heatmap(df.T, ax=ax)
        f.tight_layout()
        p = Path(f"outputs/{metric}/{year}.png")
        p.parent.mkdir(parents=True, exist_ok=True)
        f.savefig(p)
    except Exception as e:
        print(f"{metric}:{year} failed: {e}")
    finally:
        if f:
            plt.close(f)
# colnorm wasn't defined above; a minimal column-wise max normalisation
# is assumed here
def colnorm(df):
    return df / df.max()

metric = 'Population Distribution _normed_'
for year in years:
    f = None
    try:
        f, ax = plt.subplots(figsize=graph_figsize)
        ax.set_title(f"{explanations.get(metric, metric)}: ({year})")
        df = cons_pop[(cons_pop.Gender == 'All Persons')
                      & (cons_pop.Mid_Year_Ending == year)]\
            .groupby(['Age', 'constituency'])['Population_Estimate']\
            .sum().unstack()
        sns.heatmap(colnorm(df.T), ax=ax)
        f.tight_layout()
        p = Path(f"outputs/{metric}/{year}.png")
        p.parent.mkdir(parents=True, exist_ok=True)
        f.savefig(p)
    except Exception as e:
        print(f"{metric}:{year} failed: {e}")
    finally:
        if f:
            plt.close(f)
# Re-run the GIF builder to pick up the new population-distribution frames
clips = {}
for p in Path('outputs/').iterdir():
    if not p.is_dir():
        continue
    metric = p.parts[-1]
    file_list = [str(_p) for _p in sorted(filter(lambda _p: _p.suffix == '.png', p.iterdir()))]
    clip = mpy.ImageSequenceClip(file_list, fps=1.5)
    clip = mpyv.fx.all.freeze(clip, t='end', freeze_duration=2)
    clip.write_gif(f"outputs/{metric}.gif")
    clips[metric] = clip
![](outputs/Population%20Distribution.gif)
year = 2016
school_type = reference[year][['school type', 'school management type', 'parliamentary constituency']]
school_type = school_type.dropna(how='all')
for c in school_type:
    school_type[c] = school_type[c].astype('category')
school_type.groupby(['parliamentary constituency', 'school type']).size().unstack('school type')
cons_stats = cons_stats.assign(**school_type.groupby(['parliamentary constituency', 'school type']).size().unstack('school type').to_dict(orient='series'))
cons_stats = cons_stats.assign(**school_type.groupby(['parliamentary constituency', 'school management type']).size().unstack('school management type').to_dict(orient='series'))
cons_stats
cons_stats['grampct'] = 100 * cons_stats['grammar'] / (cons_stats['secondary'] + cons_stats['grammar'])
f, ax = plt.subplots(figsize=graph_figsize)
cons_stats[['grampct', 'geometry']].plot(ax=ax, column='grampct', legend=True)
ax.axis('off')
ax.set_title('% of schools in constituency that are grammars')
f.tight_layout()
p = Path("outputs/grampct/2016.png")
p.parent.mkdir(parents=True, exist_ok=True)
f.savefig(p)
unyearify = lambda c: '_'.join([c.split('_')[0]] + c.split('_')[2:]) if len(c.split('_')) > 1 else c
allowed_list = ['grampct']
cons_stats_2016 = cons_stats[[c for c in cons_stats.columns if '2016' in c or c in allowed_list]].rename(columns=unyearify)
sns.pairplot(data=cons_stats_2016, hue='topqt',
             vars=['fsm_rat', 'grampct'])
cons_stats_2016
enrolment.sum().sum()