#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_cell_magic('HTML', '', ' \n')

# In[2]:

from IPython.display import Image

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
graph_figsize = (10, 6)  # I'm forgetful and lazy
plt.rcParams.update({'figure.figsize': graph_figsize})
get_ipython().run_line_magic('matplotlib', 'nbagg')  # Jupyter magics!

import geopandas as gp
import urllib.request
import zipfile
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns

from itertools import chain
from collections import defaultdict

import moviepy.editor as mpy
import moviepy.video as mpyv

import plotly.plotly as py
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from datetime import datetime
init_notebook_mode(connected=False)
import cufflinks

from bs4 import BeautifulSoup
import requests
from tqdm import tqdm_notebook as tqdm

from ckanapi import RemoteCKAN  # it's on pip


def build_odni_connection():
    """Be nice to OpenDataNI and tell them how old I am. (And that it's me)"""
    version_no = (pd.to_datetime('now') - pd.to_datetime('1988/05/17')).days / 365
    ua = f'@Bolster/{version_no:.2f} (+http://bolster.online/)'
    return RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)


# # Extraction, Transformation and Loading
#
# * Get the data
# * Clean the data
# * Store the data
#
# ## Extraction:
# * Use BS4 to walk the Department of Education listing pages for post-primary, school-level data

# In[ ]:

def get_year_files_for_enrolement_data(dest_base_dir):
    base_url = "https://www.education-ni.gov.uk"
    year_files = defaultdict(list)
    listing_path = "/articles/school-enrolments-school-level-data"
    soup = BeautifulSoup(requests.get(base_url + listing_path).text, 'lxml')
    for link in tqdm(soup.find_all('a'), desc='years', leave=False):
        if 'School enrolments - school level data 20' in ' '.join(map(str, link.contents)):
            year = link.get('href')[-6:-2]
            # Follow the year page and get all the relevant files
            year_soup = BeautifulSoup(
                requests.get(base_url + link.get('href')).text, 'lxml'
            )
            for file_link in tqdm(year_soup.find_all('a'),
                                  desc=f'files in {year}', leave=False):
                year_dir = dest_base_dir.joinpath(str(year))
                year_dir.mkdir(parents=True, exist_ok=True)
                href = file_link.get('href', "")
                filename = href.split('/')[-1]
                if 'xls' in href.split('.')[-1].lower():
                    # See this requirement right 👆
                    # Mix of xls, XLSX, xlsx and XLS
                    dest_file = year_dir / filename
                    if not dest_file.exists():
                        urllib.request.urlretrieve(href, dest_file)
                    year_files[year].append(filename)
    return year_files


dest = Path('./data/education-ni/')
year_files = get_year_files_for_enrolement_data(dest)

# In[ ]:

sheets = defaultdict(list)
for year, files in year_files.items():
    for file in files:
        if 'post' in file:
            xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')
            sheets[year].extend(xls.sheet_names)
dict(sheets)

# In[ ]:

from collections import Counter
all_sheet_names = Counter([_ for d in sheets.values() for _ in d])
all_sheet_names.most_common()

# In[ ]:

df = pd.DataFrame.from_dict({
    year: [sn in sheet_names for sn in all_sheet_names]
    for year, sheet_names in sheets.items()
}, orient='index')
df.columns = all_sheet_names.keys()
df.T

# In[ ]:

f, ax = plt.subplots(figsize=graph_figsize)
sns.heatmap(df.T, ax=ax)
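
# The workbooks use two- or three-row headers, so every parser below flattens
# them the same way: join the header rows, strip, lowercase. A minimal sketch
# of that trick on a toy frame (invented data, not the real DE schema):

# In[ ]:

toy = pd.DataFrame([
    ['DE ref', 'School', None],
    [None,     'Name',   'Type'],
    [101,      'st example', 'secondary'],
])
# Join rows 0-1 column-wise to get a single flat header row
toy.iloc[0:2].fillna('').apply(lambda l: ' '.join(l).strip().lower(), axis=0).values
# -> array(['de ref', 'school name', 'type'], dtype=object)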
# In[ ]:

def parse_reference_table(xls):
    """
    From an ExcelFile, clean up:
    * the School Mgmt Type disaster
    * inconsistent header depth
    * multi-row header names
    * inconsistent headers (ref_key_map)
    * inconsistent column order
    * inconsistent caps/spacing (strip|lower)
    """
    cols = [
        'de ref', 'school name', 'school type', 'address 1', 'postcode',
        'urban_rural', 'school management type', 'district council',
        'parliamentary constituency', 'town'
    ]
    categories = [
        'school type', 'urban_rural', 'school management type',
        'district council', 'parliamentary constituency', 'town'
    ]
    ref_key_map = {
        'denino': 'de ref',
        'urban/ rural': 'urban_rural',
        'schoolname': 'school name'
    }
    reference_value_rename = {
        'school management type': {
            'gmi': 'integrated',
            'controlled integrated': 'integrated',
            'roman catholic maintained': 'rc maintained',
            'grant maintained integrated': 'integrated',
            'voluntary - other managed': 'voluntary',
            'voluntary - rc managed': 'voluntary',
            'catholic maintained': 'rc maintained'
        }
    }
    join_n_strip_n_lower = lambda l: ' '.join(l).strip().lower()

    if 'reference data' in xls.sheet_names:
        df = pd.read_excel(xls, 'reference data', header=None)
        h_range = 2 if isinstance(df.iloc[3, 0], int) else 3
        try:
            df.columns = df.iloc[1:h_range + 1].fillna('').apply(
                join_n_strip_n_lower, axis=0).values
            df.rename(columns=ref_key_map, inplace=True)
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df = df[cols]
            df['de ref'] = df['de ref'].astype(int)
            df.set_index('de ref', inplace=True)
            for c in df:
                df[c] = df[c].str.lower().str.strip()
            df.replace(reference_value_rename, inplace=True)
            for c in categories:
                df[c].fillna('NA', inplace=True)
                df[c] = df[c].astype('category')
        except TypeError as e:
            print(e)
    else:
        df = None
    return df


def parse_enrolments_table(xls):
    """From an ExcelFile, clean up:
    * inconsistent header depth
    * messy NaNs/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as the join index and drop redundant fields
    """
    def strip_n_lower(s):
        return s.strip().lower()

    def unyearify(s):
        return int(s.replace('year ', ''))

    if 'enrolments' in xls.sheet_names:
        df = pd.read_excel(xls, 'enrolments', header=None, skip_footer=5)
        h_range = 2 if isinstance(df.iloc[3, 0], int) else 3
        try:
            df.columns = df.iloc[3].fillna('').apply(strip_n_lower).values
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df.dropna(how='all', inplace=True, axis=0)
            df['de ref'] = df['de ref'].astype(int)
            df.drop('schoolname', axis=1, inplace=True)
            df.drop('total pupils', axis=1, inplace=True)
            df.set_index('de ref', inplace=True)
            df.rename(columns=unyearify, inplace=True)
            df = df.astype(float)
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
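
# Both parsers above hand back frames indexed on 'de ref', which is the whole
# point: per-year tables can then be aligned with a plain index join. A toy
# sketch of the intended shape (invented values, not the real DE data):
ref_toy = pd.DataFrame({'school name': ['a school', 'b school'],
                        'parliamentary constituency': ['east', 'west']},
                       index=pd.Index([101, 102], name='de ref'))
en_toy = pd.DataFrame({8: [100.0, 90.0], 9: [95.0, 88.0]},
                      index=pd.Index([101, 102], name='de ref'))
en_toy.join(ref_toy).groupby('parliamentary constituency').sum()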

def parse_fsm_table(xls):
    """From an ExcelFile, clean up:
    * inconsistent header depth
    * messy NaNs/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as the join index and drop redundant fields
    """
    strip_n_lower = lambda s: s.strip().lower()

    if 'free school meals' in xls.sheet_names:
        df = pd.read_excel(xls, 'free school meals', header=None, skip_footer=5)
        h_range = 2 if isinstance(df.iloc[3, 0], int) else 3
        try:
            df.columns = df.iloc[3].fillna('').apply(strip_n_lower).values
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df.dropna(how='all', inplace=True, axis=0)
            df['de ref'] = df['de ref'].astype(int)
            df.drop('schoolname', axis=1, inplace=True)
            df.drop('free school meals', axis=1, inplace=True)
            df.set_index('de ref', inplace=True)
            df.replace('#', np.nan, inplace=True)  # '#' == undisclosed
            df.replace('*', 2.0, inplace=True)     # '*' == fewer than 5
            df.replace('!', 1, inplace=True)       # '!' == suppressed to avoid identification, so it's probably one or two
            df = df.astype(float)
        except TypeError as e:
            print(e)
    else:
        df = None
    return df


def parse_available_table(xls):
    """From an ExcelFile, clean up:
    * inconsistent header depth
    * messy NaNs/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as the join index and drop redundant fields
    * totally different schemas between years
    * inconsistent metric naming
    * non-numerical data flags (*/!)
    """
    ref_key_map = {
        'schoolname': 'school name',
        'total unfilled places': 'available places',
        'unfilled places': 'available places',
        'total approved enrolment number': 'approved enrolments'
    }
    join_n_strip_n_lower = lambda l: ' '.join(l).strip().lower()

    if 'School level data' in xls.sheet_names:
        df = pd.read_excel(xls, 'School level data', header=None)
        h_range = 2 if isinstance(df.iloc[3, 0], int) else 3
    elif 'unfilled places' in xls.sheet_names:
        df = pd.read_excel(xls, 'unfilled places', header=None)
        h_range = 2 if isinstance(df.iloc[3, 0], int) else 3
    else:
        df = None
    if df is not None:
        try:
            df.columns = df.iloc[1:h_range + 1].fillna('').apply(
                join_n_strip_n_lower, axis=0).values
            df.rename(columns=ref_key_map, inplace=True)
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df = df.applymap(
                lambda x: np.nan if isinstance(x, str) and x.isspace() else x)
            df.dropna(how='all', axis=1, inplace=True)
            df.dropna(how='any', axis=0, inplace=True)
            if df.shape[1] == 6:  # recent years don't have headers at all
                cols = list(df.columns)
                cols[0] = 'de ref'
                cols[1] = 'school name'
                df.columns = cols
            df.drop('school name', axis=1, inplace=True)
            df['de ref'] = df['de ref'].astype(int)
            df.set_index('de ref', inplace=True)
            df.replace('*', 2.0, inplace=True)  # '*' == fewer than 5
            df.replace('!', 1, inplace=True)    # '!' == suppressed to avoid identification, so it's probably one or two
            df.dropna(how='all', inplace=True, axis=1)
            df = df.astype(int)  # astype() has no inplace kwarg; reassign instead
        except TypeError as e:
            print(e)
    return df
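
# The DE workbooks use the same small-cell suppression flags throughout
# ('#' undisclosed, '*' fewer than five, '!' suppressed to avoid
# identification). The parsers above recode them inline; the same mapping
# could be applied in a single pass (a sketch of the recode used above, with
# the imputed values 2.0 and 1.0 being the author's guesses, as noted):

# In[ ]:

SUPPRESSION_FLAGS = {'#': np.nan, '*': 2.0, '!': 1.0}
pd.Series(['12', '*', '#', '!']).replace(SUPPRESSION_FLAGS).astype(float)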
# In[ ]:

re_dfs = {}
av_dfs = {}
en_dfs = {}
fsm_dfs = {}
for year, files in year_files.items():
    for file in files:
        if 'post' in file:
            xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')
            df = parse_reference_table(xls)
            if df is not None:
                print(f'Got reference data for {year}')
                re_dfs[year] = df
            df = parse_enrolments_table(xls)
            if df is not None:
                print(f'Got enrolment data for {year}')
                en_dfs[year] = df
            df = parse_available_table(xls)
            if df is not None:
                print(f'Got available data for {year}')
                av_dfs[year] = df
            df = parse_fsm_table(xls)
            if df is not None:
                print(f'Got fsm data for {year}')
                fsm_dfs[year] = df

reference = pd.Panel(re_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
available = pd.Panel(av_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
enrolment = pd.Panel(en_dfs).sort_index().rename_axis('year').rename_axis('yeargroup', axis=2)
fsm = pd.Panel(fsm_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)

reference.to_hdf('data.h5', 'reference')
available.to_hdf('data.h5', 'available')
enrolment.to_hdf('data.h5', 'enrolment')
fsm.to_hdf('data.h5', 'fsm')

# In[ ]:

reference

# # Population Data

# In[ ]:

if not Path('data/cons_pop.csv').exists():
    odni = build_odni_connection()
    for dataset in odni.action.package_show(id='population-estimates-for-northern-ireland')['resources']:
        if dataset['name'] == "Parliamentary Constituencies by single year of age and gender (mid-2001 to mid-2017)":
            cons_pop = pd.read_csv(dataset['url'])
            cons_pop.to_csv("data/cons_pop.csv", index=False)
            cons_pop.head()

cons_pop = pd.read_csv('data/cons_pop.csv')
cons_pop['Mid_Year_Ending'] = cons_pop.Mid_Year_Ending.astype(int)
cons_pop['Population_Estimate'] = cons_pop.Population_Estimate.astype(float)
cons_pop['Age'] = cons_pop.Age.astype(int)
cons_pop.rename(columns={'Geo_Name': 'constituency'}, inplace=True)
cons_pop['constituency'] = cons_pop.constituency.str.strip().str.lower()
cons_pop[(cons_pop.Gender == 'All Persons') & (cons_pop.Mid_Year_Ending == 2016)].head()
cons_pop.to_hdf('data.h5', 'cons_pop')

# # Constituency Maps

# In[ ]:

cons_gdf_zip = "http://osni-spatial-ni.opendata.arcgis.com/datasets/563dc2ec3d9943428e3fe68966d40deb_3.zip"
cons_gdf_shp = "OSNI_Open_Data_Largescale_Boundaries__Parliamentary_Constituencies_2008.shp"
if not Path('data/' + cons_gdf_shp).exists():
    urllib.request.urlretrieve(cons_gdf_zip, 'data/_tmp.zip')
    with zipfile.ZipFile('data/_tmp.zip') as z:
        z.extractall('data/')
    Path('data/_tmp.zip').unlink()

cons_gdf = gp.GeoDataFrame.from_file('data/' + cons_gdf_shp)
cons_gdf.rename(columns={'PC_NAME': 'constituency'}, inplace=True)
cons_gdf.drop(['OBJECTID', 'PC_ID'], axis=1, inplace=True)
cons_gdf['constituency'] = cons_gdf['constituency'].str.lower().str.strip()
cons_gdf.set_index('constituency', inplace=True)
cons_gdf.plot()

# In[ ]:

import fiona; fiona.supported_drivers
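
# If resuming from here rather than re-running the scrape, the tables written
# to data.h5 above round-trip straight back out of the HDF5 store (a sketch;
# same keys as the to_hdf calls earlier):

# In[ ]:

reference = pd.read_hdf('data.h5', 'reference')
available = pd.read_hdf('data.h5', 'available')
enrolment = pd.read_hdf('data.h5', 'enrolment')
fsm = pd.read_hdf('data.h5', 'fsm')
cons_pop = pd.read_hdf('data.h5', 'cons_pop')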
# In[ ]:

cons = cons_gdf.index  # assumed: the constituency list isn't defined in the preserved cells

# Enrolment: sum of year groups for all schools in each constituency
en_df = pd.DataFrame.from_dict({
    con: enrolment[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], :].sum().sum()
    for con in cons
}).T

# Available places: sum for all schools in each constituency
av_df = pd.DataFrame.from_dict({
    con: available[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], 'available places'].sum()
    for con in cons
}).T

# Free School Meals
fsm_df = pd.DataFrame.from_dict({
    con: fsm[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], 'fsme'].sum()
    for con in cons
}).T

for c in av_df:
    cons_gdf[f"av_{c}"] = av_df[c]
    cons_gdf[f"av_{c}_rat"] = av_df[c] / en_df[c]
for c in en_df:
    cons_gdf[f"en_{c}"] = en_df[c]
    cons_gdf[f"en_{c}_pk"] = en_df[c] / cons_gdf["Area_sqkm"]
for c in fsm_df:
    cons_gdf[f"fsm_{c}"] = fsm_df[c]
    cons_gdf[f"fsm_{c}_rat"] = fsm_df[c] / en_df[c]


def get_winning_quartile(df, age_quartiles):
    age_counts = df.groupby("Age")['Population_Estimate'].sum()
    quartile_counts = pd.Series({
        f"{lower}-{upper}": age_counts.loc[lower:upper].sum()
        for lower, upper in zip([0] + list(age_quartiles), list(age_quartiles) + [99])
    })
    return quartile_counts.idxmax()


# NB: depends on annual_quartiles, which is defined in a later cell; run that first
pop_years = cons_pop.Mid_Year_Ending.unique()
for c in tqdm(reference.axes[0].astype(int), desc="Year"):
    # Preserve the education data and use the closest population year
    if c not in pop_years:
        yr = pop_years[np.abs(pop_years - c).argmin()]
    else:
        yr = c
    age_quartiles = annual_quartiles.loc[yr].astype(int)
    _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons')
                         & (cons_pop.Mid_Year_Ending == yr)
                         ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\
                          .unstack().rename(columns=lambda c: c.lower())
    _cons_pop['total'] = _cons_pop.sum(axis=1)
    _cons_pop['m_per_f'] = _cons_pop['males'] / _cons_pop['females']
    cons_gdf[f"pop_{c}"] = _cons_pop['total']
    cons_gdf[f"pop_{c}_males"] = _cons_pop['males']
    cons_gdf[f"pop_{c}_females"] = _cons_pop['females']
    cons_gdf[f"pop_{c}_m_per_f"] = _cons_pop['m_per_f']
    cons_gdf[f"topqt_{c}"] = cons_pop[
        (cons_pop.Gender != 'All Persons') & (cons_pop.Mid_Year_Ending == yr)
    ].groupby('constituency').apply(get_winning_quartile,
                                    age_quartiles=annual_quartiles.loc[yr].values)
    cons_gdf[f"en_{c}_pc"] = cons_gdf[f"en_{c}"] / cons_gdf[f"pop_{c}"]
    cons_gdf[f"fsm_{c}_pc"] = cons_gdf[f"fsm_{c}"] / cons_gdf[f"pop_{c}"]

    cons_pop_mean = cons_pop[(cons_pop.Gender.isin(["All Persons"]))
                             & (cons_pop.Mid_Year_Ending == yr)].copy()
    cons_pop_mean['popprod'] = cons_pop_mean[['Age', 'Population_Estimate']].product(axis=1)
    cons_pop_mean = cons_pop_mean.groupby('constituency')[['popprod', 'Population_Estimate']].sum()
    cons_gdf[f'age_{c}_avg'] = cons_pop_mean['popprod'] / cons_pop_mean['Population_Estimate']
    try:
        cons_gdf[f"av_{c}_pc"] = cons_gdf[f"av_{c}"] / cons_gdf[f"pop_{c}"]
    except KeyError:
        print(f"No data for av in {c}")

# to_file() can't target the HDF store (its second argument is a driver);
# writing GeoJSON instead is the assumed intent
cons_gdf.to_file('data/cons_gdf.geojson', driver='GeoJSON')

# ### That's not a great way of summarising age distributions....
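
# A weighted mean collapses very different shapes onto the same number: a
# uniformly-young constituency and a bimodal kids-plus-parents one can share
# an average age. A toy illustration (invented counts) of why the quantile
# approach below works better:

# In[ ]:

for name, ages in [('young', pd.Series({20: 100, 25: 100})),
                   ('bimodal', pd.Series({5: 100, 40: 100}))]:
    flat = np.repeat(ages.index.values, ages.values)  # one entry per person
    print(name, 'mean:', flat.mean(),
          'quartiles:', np.quantile(flat, [0.25, 0.5, 0.75]))
# both have mean 22.5, but quartiles [20, 22.5, 25] vs [5, 22.5, 40]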
# In[ ]:

def flatten_ages(s):
    """Expand an {age: count} series into one entry per person."""
    return np.asarray(list(chain.from_iterable(
        [age] * int(n) for age, n in s.iteritems()
    )))


def get_age_quantiles(df, q=[0.25, 0.5, 0.75]):
    pop_sum = df.groupby('Age')['Population_Estimate'].sum()
    age_quartiles = np.quantile(flatten_ages(pop_sum), q)
    return pd.Series(dict(zip(q, age_quartiles)))


annual_cons_quartiles = cons_pop[(cons_pop.Gender == "All Persons")]\
    .groupby(["Mid_Year_Ending", "constituency"]).apply(get_age_quantiles)
annual_quartiles = cons_pop[(cons_pop.Gender == "All Persons")]\
    .groupby("Mid_Year_Ending").apply(get_age_quantiles)

# In[ ]:

pop_years = cons_pop.Mid_Year_Ending.unique()
for c in tqdm(reference.axes[0].astype(int), desc="Year"):
    # Preserve the education data and use the closest population year
    if c not in pop_years:
        yr = pop_years[np.abs(pop_years - c).argmin()]
    else:
        yr = c
    age_quartiles = annual_quartiles.loc[yr].astype(int)
    _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons')
                         & (cons_pop.Mid_Year_Ending == yr)
                         ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\
                          .unstack().rename(columns=lambda c: c.lower())
    _cons_pop['total'] = _cons_pop.sum(axis=1)
    _cons_pop['m_per_f'] = _cons_pop['males'] / _cons_pop['females']
    cons_pop_qilted = pd.DataFrame.from_dict({
        f"{lotile}-{qtile}": cons_pop[(cons_pop.Gender == 'All Persons')
                                      & (cons_pop.Mid_Year_Ending == yr)
                                      & (lotile <= cons_pop.Age)
                                      & (cons_pop.Age <= qtile)  # upper bound reconstructed; the source is truncated here
                                      ].groupby('constituency')['Population_Estimate'].sum()
        for lotile, qtile in zip([0] + list(age_quartiles), list(age_quartiles) + [99])
    })
    # The remainder of this cell is truncated in the source; a plausible
    # finish, mirroring the topqt assignment in the loop above:
    cons_gdf[f"topqt_{c}"] = cons_pop_qilted.idxmax(axis=1)

# In[ ]:

# NB: cons_stats (a per-constituency summary carrying, at least, 'grampct',
# 'topqt' and per-year 'fsm_*_rat' columns) is built in a cell that hasn't
# survived; unyearify below is reconstructed around the surviving
# "... 1 else c" fragment.
def unyearify(c):
    # Strip the year token, e.g. 'fsm_2016_rat' -> 'fsm_rat';
    # leave single-token names like 'grampct' alone
    parts = c.split('_')
    return '_'.join(p for p in parts if not p.isdigit()) if len(parts) > 1 else c

allowed_list = ['grampct']
cons_stats_2016 = cons_stats[[c for c in cons_stats.columns
                              if '2016' in c or c in allowed_list]].rename(columns=unyearify)
sns.pairplot(data=cons_stats_2016, hue='topqt', vars=['fsm_rat', 'grampct'])

# In[ ]:

cons_stats_2016

# In[ ]:

enrolment.sum().sum()