%%HTML
<style>
/* style for presentation only */
.reveal .rendered_html table { font-size: 24px }
/* centre images */
.reveal .rendered_html img {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
    margin-left: auto;
    margin-right: auto;
}
</style>
from IPython.display import Image
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
graph_figsize = (10,6) # I'm forgetful and lazy
plt.rcParams.update({'figure.figsize': graph_figsize})
%matplotlib nbagg
# Jupyter Magics!
import geopandas as gp
import urllib.request
import zipfile
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from collections import defaultdict
import moviepy.editor as mpy
import moviepy.video as mpyv
import plotly.plotly as py
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from datetime import datetime
init_notebook_mode(connected=False)
import cufflinks
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm_notebook as tqdm
from ckanapi import RemoteCKAN # it's on pip
def build_odni_connection():
    """Be nice to OpenDataNI and tell them how old I am. (And that it's me)"""
    version_no = (pd.to_datetime('now') -
                  pd.to_datetime('1988/05/17')).days / 365
    ua = f'@Bolster/{version_no:.2f} (+http://bolster.online/)'
    return RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)
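# A quick, hedged sanity check of the helper above: package_list is a
# standard CKAN action, so this just confirms the portal answers
# (network-dependent).
odni = build_odni_connection()
odni.action.package_list()[:5]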
def get_year_files_for_enrolment_data(dest_base_dir):
    base_url = "https://www.education-ni.gov.uk"
    year_files = defaultdict(list)
    listing_path = "/articles/school-enrolments-school-level-data"
    soup = BeautifulSoup(requests.get(base_url + listing_path).text, 'lxml')
    for link in tqdm(soup.find_all('a'),
                     desc='years',
                     leave=False):
        if 'School enrolments - school level data 20' in ' '.join(map(str, link.contents)):
            year = link.get('href')[-6:-2]
            # Follow the year link and get all the relevant files
            year_soup = BeautifulSoup(
                requests.get(base_url + link.get('href')).text,
                'lxml'
            )
            for file_link in tqdm(year_soup.find_all('a'),
                                  desc=f'files in {year}',
                                  leave=False):
                year_dir = dest_base_dir.joinpath(str(year))
                year_dir.mkdir(parents=True, exist_ok=True)
                href = file_link.get('href', "")
                filename = href.split('/')[-1]
                if 'xls' in href.split('.')[-1].lower():
                    # See this requirement right 👆
                    # Mix of xls, XLSX, xlsx and XLS
                    dest_file = year_dir / filename
                    if not dest_file.exists():
                        urllib.request.urlretrieve(href, dest_file)
                    year_files[year].append(filename)
    return year_files
dest = Path('./data/education-ni/')
year_files = get_year_files_for_enrolment_data(dest)
sheets = defaultdict(list)
for year, files in year_files.items():
    for file in files:
        if 'post' in file:
            xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')
            sheets[year].extend(xls.sheet_names)
dict(sheets)
from collections import Counter
all_sheet_names = Counter([_ for d in sheets.values() for _ in d])
all_sheet_names.most_common()
df = pd.DataFrame.from_dict({
    year: [sn in sheet_names for sn in all_sheet_names]
    for year, sheet_names in sheets.items()
}, orient='index')
df.columns = all_sheet_names.keys()
df.T
import seaborn as sns
f,ax = plt.subplots(figsize=graph_figsize)
sns.heatmap(df.T, ax=ax)
def parse_reference_table(xls):
    """
    From an ExcelFile, clean up:
    * School Mgmt Type disaster
    * Inconsistent header depth
    * Multi-row header names
    * inconsistent headers (ref_key_map)
    * inconsistent col order
    * inconsistent caps/spacing (strip|lower)
    """
    cols = [
        'de ref',
        'school name',
        'school type',
        'address 1',
        'postcode',
        'urban_rural',
        'school management type',
        'district council',
        'parliamentary constituency',
        'town'
    ]
    categories = [
        'school type',
        'urban_rural',
        'school management type',
        'district council',
        'parliamentary constituency',
        'town'
    ]
    ref_key_map = {
        'denino': 'de ref',
        'urban/ rural': 'urban_rural',
        'schoolname': 'school name'
    }
    reference_value_rename = {
        'school management type': {
            'gmi': 'integrated',
            'controlled integrated': 'integrated',
            'roman catholic maintained': 'rc maintained',
            'grant maintained integrated': 'integrated',
            'voluntary - other managed': 'voluntary',
            'voluntary - rc managed': 'voluntary',
            'catholic maintained': 'rc maintained'
        }
    }

    def join_n_strip_n_lower(l):
        return ' '.join(l).strip().lower()

    if 'reference data' in xls.sheet_names:
        df = pd.read_excel(xls, 'reference data', header=None)
        # Header depth varies by year: if row 3 already holds data (an int
        # DE ref), the header is two rows deep, otherwise three
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
        try:
            # Collapse the multi-row header into single lowercased names
            df.columns = df.loc[1:h_range].fillna('').apply(join_n_strip_n_lower, axis=0).values
            df.rename(columns=ref_key_map, inplace=True)
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df = df[cols]
            df['de ref'] = df['de ref'].astype(int)
            df.set_index('de ref', inplace=True)
            for c in df:
                df[c] = df[c].str.lower().str.strip()
            df.replace(reference_value_rename, inplace=True)
            for c in categories:
                df[c].fillna('NA', inplace=True)
                df[c] = df[c].astype('category')
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
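# Minimal sketch of the multi-row header flattening used above, on toy data:
# the header rows get joined column-wise, then stripped and lowercased.
toy = pd.DataFrame([['DE', 'School'], ['ref', 'Name']])
toy.fillna('').apply(lambda l: ' '.join(l).strip().lower(), axis=0).values
# -> array(['de ref', 'school name'], dtype=object)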
def parse_enrolments_table(xls):
    """From an ExcelFile, clean up:
    * Inconsistent header depth
    * fucked up nans/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as join index and drop pointless fields
    """
    def strip_n_lower(s): return s.strip().lower()
    def unyearify(s): return int(s.replace('year ', ''))

    if 'enrolments' in xls.sheet_names:
        df = pd.read_excel(xls, 'enrolments', header=None, skipfooter=5)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
        try:
            df.columns = df.loc[3].fillna('').apply(strip_n_lower).values
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df.dropna(how='all', inplace=True, axis=0)
            df['de ref'] = df['de ref'].astype(int)
            df.drop('schoolname', axis=1, inplace=True)
            df.drop('total pupils', axis=1, inplace=True)
            df.set_index('de ref', inplace=True)
            df.rename(columns=unyearify, inplace=True)
            df = df.astype(float)
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
def parse_fsm_table(xls):
    """From an ExcelFile, clean up:
    * Inconsistent header depth
    * fucked up nans/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as join index and drop pointless fields
    """
    def strip_n_lower(s): return s.strip().lower()
    def unyearify(s): return int(s.replace('year ', ''))

    if 'free school meals' in xls.sheet_names:
        df = pd.read_excel(xls, 'free school meals', header=None, skipfooter=5)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
        try:
            df.columns = df.loc[3].fillna('').apply(strip_n_lower).values
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            df.dropna(how='all', inplace=True, axis=0)
            df['de ref'] = df['de ref'].astype(int)
            df.drop('schoolname', axis=1, inplace=True)
            df.drop('free school meals', axis=1, inplace=True)
            df.set_index('de ref', inplace=True)
            df.replace('#', np.nan, inplace=True)  # '#' == undisclosed
            df.replace('*', 2.0, inplace=True)     # '*' == fewer than 5
            df.replace('!', 1, inplace=True)       # '!' == suppressed to avoid identification, so probably one or two
            df = df.astype(float)
        except TypeError as e:
            print(e)
    else:
        df = None
    return df
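# The three disclosure flags above could equally go through one dict-based
# replace; a runnable sketch with the same substitutions:
pd.Series(['12', '#', '*', '!']).replace({'#': np.nan, '*': 2.0, '!': 1.0}).astype(float)
# -> 12.0, NaN, 2.0, 1.0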
def parse_available_table(xls):
    """From an ExcelFile, clean up:
    * Inconsistent header depth
    * fucked up nans/nulls all over the place
    * inconsistent *footer* depth...
    * Set de ref as join index and drop pointless fields
    * Totally different schemas between years
    * Inconsistent metric naming
    * non numerical data flags (*/!)
    """
    ref_key_map = {
        'schoolname': 'school name',
        'total unfilled places': 'available places',
        'unfilled places': 'available places',
        'total approved enrolment number': 'approved enrolments'
    }

    def join_n_strip_n_lower(l):
        return ' '.join(l).strip().lower()

    if 'School level data' in xls.sheet_names:
        df = pd.read_excel(xls, 'School level data', header=None)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
    elif 'unfilled places' in xls.sheet_names:
        df = pd.read_excel(xls, 'unfilled places', header=None)
        h_range = 2 if isinstance(df.loc[3, 0], int) else 3
    else:
        df = None
    if df is not None:
        try:
            df.columns = df.loc[1:h_range].fillna('').apply(join_n_strip_n_lower, axis=0).values
            df.rename(columns=ref_key_map, inplace=True)
            df = df.drop(df.index[0:h_range + 1]).reset_index(drop=True)
            # Whitespace-only cells are really nulls
            df = df.applymap(lambda x: np.nan if isinstance(x, str) and x.isspace() else x)
            df.dropna(how='all', axis=1, inplace=True)
            df.dropna(how='any', axis=0, inplace=True)
            if df.shape[1] == 6:  # recent years don't have fecking headers
                cols = list(df.columns)
                cols[0] = 'de ref'
                cols[1] = 'school name'
                df.columns = cols
            df.drop('school name', axis=1, inplace=True)
            df['de ref'] = df['de ref'].astype(int)
            df.set_index('de ref', inplace=True)
            df.replace('*', 2.0, inplace=True)  # '*' == fewer than 5
            df.replace('!', 1, inplace=True)    # '!' == suppressed to avoid identification, so probably one or two
            df.dropna(how='all', inplace=True, axis=1)
            df = df.astype(int)  # astype has no inplace kwarg
        except TypeError as e:
            print(e)
    return df
re_dfs = {}
av_dfs = {}
en_dfs = {}
fsm_dfs = {}
for year, files in year_files.items():
    for file in files:
        if 'post' in file:
            xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')
            df = parse_reference_table(xls)
            if df is not None:
                print(f'Got reference data for {year}')
                re_dfs[year] = df
            df = parse_enrolments_table(xls)
            if df is not None:
                print(f'Got enrolment data for {year}')
                en_dfs[year] = df
            df = parse_available_table(xls)
            if df is not None:
                print(f'Got available data for {year}')
                av_dfs[year] = df
            df = parse_fsm_table(xls)
            if df is not None:
                print(f'Got fsm data for {year}')
                fsm_dfs[year] = df
reference = pd.Panel(re_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
available = pd.Panel(av_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
enrolment = pd.Panel(en_dfs).sort_index().rename_axis('year').rename_axis('yeargroup', axis=2)
fsm = pd.Panel(fsm_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)
reference.to_hdf('data.h5', 'reference')
available.to_hdf('data.h5', 'available')
enrolment.to_hdf('data.h5', 'enrolment')
fsm.to_hdf('data.h5','fsm')
reference
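# Round-trip sanity check (sketch): anything stashed in the HDF store above
# can come straight back with pd.read_hdf.
pd.read_hdf('data.h5', 'reference')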
if not Path('data/cons_pop.csv').exists():
    odni = build_odni_connection()
    for dataset in odni.action.package_show(id='population-estimates-for-northern-ireland')['resources']:
        if dataset['name'] == "Parliamentary Constituencies by single year of age and gender (mid-2001 to mid-2017)":
            cons_pop = pd.read_csv(dataset['url'])
            cons_pop.to_csv("data/cons_pop.csv", index=False)
cons_pop.head()
cons_pop = pd.read_csv('data/cons_pop.csv')
cons_pop['Mid_Year_Ending'] = cons_pop.Mid_Year_Ending.astype(int)
cons_pop['Population_Estimate'] = cons_pop.Population_Estimate.astype(float)
cons_pop['Age'] = cons_pop.Age.astype(int)
cons_pop.rename(columns={'Geo_Name':'constituency'}, inplace=True)
cons_pop['constituency'] = cons_pop.constituency.str.strip().str.lower()
cons_pop[(cons_pop.Gender == 'All Persons') & (cons_pop.Mid_Year_Ending == 2016)].head()
cons_pop.to_hdf('data.h5','cons_pop')
cons_gdf_zip = "http://osni-spatial-ni.opendata.arcgis.com/datasets/563dc2ec3d9943428e3fe68966d40deb_3.zip"
cons_gdf_shp = "OSNI_Open_Data_Largescale_Boundaries__Parliamentary_Constituencies_2008.shp"
if not Path('data/' + cons_gdf_shp).exists():
    urllib.request.urlretrieve(cons_gdf_zip, 'data/_tmp.zip')
    with zipfile.ZipFile('data/_tmp.zip') as z:
        z.extractall('data/')
    Path('data/_tmp.zip').unlink()
cons_gdf = gp.GeoDataFrame.from_file('data/' + cons_gdf_shp)
cons_gdf.rename(columns={'PC_NAME': 'constituency'}, inplace=True)
cons_gdf.drop(['OBJECTID', 'PC_ID'], axis=1, inplace=True)
cons_gdf['constituency'] = cons_gdf['constituency'].str.lower().str.strip()
cons_gdf.set_index('constituency', inplace=True)
cons_gdf.plot()
import fiona; fiona.supported_drivers
# `cons` wasn't defined earlier; the lowercased constituency names from the
# boundary frame fit here, since both datasets were normalised the same way
cons = cons_gdf.index
# Enrolment: sum over year groups for all schools in each constituency
en_df = pd.DataFrame.from_dict({
    con:
    enrolment[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], :].sum().sum()
    for con in cons
}).T
# Available places: sum for all schools in each constituency
av_df = pd.DataFrame.from_dict({
    con:
    available[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], 'available places'].sum()
    for con in cons
}).T
# Free School Meals
fsm_df = pd.DataFrame.from_dict({
    con:
    fsm[:, reference.major_axis[(reference.minor_xs('parliamentary constituency') == con).any(axis=1)], 'fsme'].sum()
    for con in cons
}).T
for c in av_df:
    cons_gdf[f"av_{c}"] = av_df[c]
    cons_gdf[f"av_{c}_rat"] = (av_df[c] / en_df[c])
for c in en_df:
    cons_gdf[f"en_{c}"] = en_df[c]
    cons_gdf[f"en_{c}_pk"] = en_df[c] / cons_gdf["Area_sqkm"]
for c in fsm_df:
    cons_gdf[f"fsm_{c}"] = fsm_df[c]
    cons_gdf[f"fsm_{c}_rat"] = (fsm_df[c] / en_df[c])
def get_winning_quartile(df, age_quartiles):
    age_counts = df.groupby("Age")['Population_Estimate'].sum()
    quartile_counts = pd.Series({
        f"{lower}-{upper}": age_counts.loc[lower:upper].sum()
        for lower, upper in zip([0] + list(age_quartiles), list(age_quartiles) + [99])
    })
    return quartile_counts.idxmax()
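# Toy sketch of the bucketing above. Note that .loc slicing is inclusive at
# both ends, so a boundary age lands in two buckets; fine for picking a
# winner, but worth knowing.
toy = pd.DataFrame({'Age': [3, 10, 40, 70], 'Population_Estimate': [100, 80, 60, 40]})
get_winning_quartile(toy, age_quartiles=[18, 37, 58])  # -> '0-18'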
pop_years = np.asarray(sorted(cons_pop.Mid_Year_Ending.unique()))  # an array, not a set, so it can be indexed below
# NB: annual_quartiles is computed a few cells below; this cell assumes it
# already exists (the notebook cells were run out of order)
for c in tqdm(reference.axes[0].astype(int), desc="Year"):
    # Preserve edu data and use the closest population year
    if c not in pop_years:
        yr = pop_years[np.abs(pop_years - c).argmin()]
    else:
        yr = c
    age_quartiles = annual_quartiles.loc[yr].astype(int)
    _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons')
                         & (cons_pop.Mid_Year_Ending == yr)
                         ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\
                          .unstack().rename(columns=lambda c: c.lower())
    _cons_pop['total'] = _cons_pop.sum(axis=1)
    _cons_pop['m_per_f'] = _cons_pop['males'] / _cons_pop['females']
    cons_gdf[f"pop_{c}"] = _cons_pop['total']
    cons_gdf[f"pop_{c}_males"] = _cons_pop['males']
    cons_gdf[f"pop_{c}_females"] = _cons_pop['females']
    cons_gdf[f"pop_{c}_m_per_f"] = _cons_pop['m_per_f']
    cons_gdf[f"topqt_{c}"] = cons_pop[
        (cons_pop.Gender != 'All Persons') & (cons_pop.Mid_Year_Ending == yr)
    ].groupby('constituency').apply(get_winning_quartile, age_quartiles=annual_quartiles.loc[yr].values)
    cons_gdf[f"en_{c}_pc"] = cons_gdf[f"en_{c}"] / cons_gdf[f"pop_{c}"]
    cons_gdf[f"fsm_{c}_pc"] = cons_gdf[f"fsm_{c}"] / cons_gdf[f"pop_{c}"]
    cons_pop_mean = cons_pop[(cons_pop.Gender.isin(["All Persons"])) & (cons_pop.Mid_Year_Ending == yr)]
    cons_pop_mean['popprod'] = cons_pop_mean[['Age', 'Population_Estimate']].product(axis=1)
    cons_pop_mean = cons_pop_mean.groupby('constituency')[['popprod', 'Population_Estimate']].sum()
    cons_gdf[f'age_{c}_avg'] = cons_pop_mean['popprod'] / cons_pop_mean['Population_Estimate']
    try:
        cons_gdf[f"av_{c}_pc"] = cons_gdf[f"av_{c}"] / cons_gdf[f"pop_{c}"]
    except KeyError:
        print(f"No data for av in {c}")
# GeoDataFrame.to_file can't write HDF5; dump boundaries+stats to a
# GeoPackage instead
cons_gdf.to_file('data/cons_gdf.gpkg', driver='GPKG')
def flatten_ages(s):
    return np.asarray(
        list(chain.from_iterable(
            [age] * int(n) for age, n in s.items()
        ))
    )

def get_age_quantiles(df, q=[0.25, 0.5, 0.75]):
    pop_sum = df.groupby('Age')['Population_Estimate'].sum()
    age_quartiles = np.quantile(flatten_ages(pop_sum), q)
    return pd.Series(dict(zip(q, age_quartiles)))
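# Sketch: flatten_ages expands a count-per-age Series into one entry per
# person, so np.quantile returns population-weighted age quantiles.
toy_counts = pd.Series({0: 2, 50: 2})  # two newborns, two fifty-year-olds
flatten_ages(toy_counts)  # -> array([ 0,  0, 50, 50])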
annual_cons_quartiles = cons_pop[(cons_pop.Gender == "All Persons")].groupby(["Mid_Year_Ending","constituency"]).apply(get_age_quantiles)
annual_quartiles = cons_pop[(cons_pop.Gender == "All Persons")].groupby("Mid_Year_Ending").apply(get_age_quantiles)
pop_years = cons_pop.Mid_Year_Ending.unique()
# cons_stats wasn't defined anywhere above; carrying the boundary frame (and
# its en_/av_/fsm_ columns) forward as the stats table is assumed here
cons_stats = cons_gdf.copy()
for c in tqdm(reference.axes[0].astype(int), desc="Year"):
    # Preserve edu data and use the closest population year
    if c not in pop_years:
        yr = pop_years[np.abs(pop_years - c).argmin()]
    else:
        yr = c
    age_quartiles = annual_quartiles.loc[yr].astype(int)
    _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons')
                         & (cons_pop.Mid_Year_Ending == yr)
                         ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\
                          .unstack().rename(columns=lambda c: c.lower())
    _cons_pop['total'] = _cons_pop.sum(axis=1)
    _cons_pop['m_per_f'] = _cons_pop['males'] / _cons_pop['females']
    cons_pop_qilted = pd.DataFrame.from_dict(
        {
            f"{lotile}-{qtile}": cons_pop[(cons_pop.Gender == 'All Persons')
                                          & (cons_pop.Mid_Year_Ending == yr)
                                          & (lotile <= cons_pop.Age) & (cons_pop.Age < qtile)
                                          ].groupby('constituency')['Population_Estimate'].sum()
            for lotile, qtile in zip([0] + list(age_quartiles), list(age_quartiles) + [99])
        }
    )
    cons_stats[f"pop_{c}"] = _cons_pop['total']
    cons_stats[f"pop_{c}_males"] = _cons_pop['males']
    cons_stats[f"pop_{c}_females"] = _cons_pop['females']
    cons_stats[f"pop_{c}_m_per_f"] = _cons_pop['m_per_f']
    cons_stats[f"topqt_{c}"] = cons_pop_qilted.idxmax(axis=1)
    cons_stats[f"en_{c}_pc"] = cons_stats[f"en_{c}"] / cons_pop_qilted.sum(axis=1)
    cons_pop_mean = cons_pop[(cons_pop.Gender.isin(["All Persons"])) & (cons_pop.Mid_Year_Ending == yr)]
    cons_pop_mean['popprod'] = cons_pop_mean[['Age', 'Population_Estimate']].product(axis=1)
    cons_pop_mean = cons_pop_mean.groupby('constituency')[['popprod', 'Population_Estimate']].sum()
    cons_stats[f'age_{c}_avg'] = cons_pop_mean['popprod'] / cons_pop_mean['Population_Estimate']
    try:
        cons_stats[f"av_{c}_pc"] = cons_stats[f"av_{c}"] / cons_pop_qilted.sum(axis=1)
    except KeyError:
        print(f"No data for av in {c}")
unyearify = lambda c: '_'.join([c.split('_')[0]] + c.split('_')[2:])
sns.set(rc={'figure.figsize': graph_figsize})
for year in range(2009, 2017):
    cons_stats_year = cons_stats[[c for c in cons_stats.columns if str(year) in c]].rename(columns=unyearify)
    f = sns.pairplot(data=cons_stats_year, hue='topqt',
                     vars=['age_avg', 'fsm_rat', 'pop_m_per_f'])
    f.fig.suptitle(f"Pairplot of Age, FSM, and Gender: ({year})")
    f.fig.tight_layout()
    p = Path(f"outputs/pairplot/{year}.png")
    p.parent.mkdir(parents=True, exist_ok=True)
    f.savefig(p)
    plt.close(f.fig)
import matplotlib.animation as animation
unyearify = lambda c: '_'.join([c.split('_')[0]] + c.split('_')[2:])
years = list(range(2009, 2017))
explanations = {
    'age_avg': "Average Age",
    'av_pc': "Available School Places per capita",
    'av_rat': "Ratio of Available Places to Actualised Enrolment",
    'av': "Available School Places",
    'en_pc': "Enrolled Pupils per capita",
    'en_pk': "Enrolled Pupils per $km^2$",
    'en': "Enrolled Pupils",
    'fsm_rat': "Ratio of FSM pupils to total pupils",
    'fsm': "FSM pupils",
    'pop': "Population",
    'pop_females': "Population (F)",
    'pop_males': "Population (M)",
    'pop_m_per_f': "Ratio of Males to Females",
    'topqt': "Most populous age quartile (Age or younger)"
}
for metric in set(map(unyearify, cons_stats.columns)):
    for year in years:
        f = None  # so the finally block can't hit an unbound name
        try:
            f, ax = plt.subplots(figsize=graph_figsize)
            ax.set_title(f"{explanations.get(metric, metric)}: ({year})")
            if metric == 'topqt':
                cons_stats[[c for c in cons_stats.columns if str(year) in c] + ['geometry']]\
                    .rename(columns=unyearify).plot(column='topqt', ax=ax, legend=True, scheme='Quantiles', k=3)
            else:
                cons_stats[[c for c in cons_stats.columns if str(year) in c] + ['geometry']]\
                    .rename(columns=unyearify).plot(column=metric, legend=True, ax=ax)
            ax.axis('off')
            f.tight_layout()
            p = Path(f"outputs/{metric}/{year}.png")
            p.parent.mkdir(parents=True, exist_ok=True)
            f.savefig(p)
        except Exception as e:
            print(f"{metric}:{year} failed: {e}")
        finally:
            if f:
                plt.close(f)
import moviepy.editor as mpy
import moviepy.video as mpyv
clips = {}
for p in Path('outputs/').iterdir():
    if not p.is_dir():
        continue
    metric = p.parts[-1]
    file_list = [str(_p) for _p in sorted(filter(lambda _p: _p.suffix == '.png', p.iterdir()))]
    clip = mpy.ImageSequenceClip(file_list, fps=1.5)
    clip = mpyv.fx.all.freeze(clip, t='end', freeze_duration=2)
    clip.write_gif(f"outputs/{metric}.gif")
    clips[metric] = clip
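# The Image class imported at the top can embed one of the rendered GIFs
# inline; a sketch, assuming the en_pc maps rendered above:
Image(filename='outputs/en_pc.gif')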
metric = 'Population Distribution'
for year in years:
    f = None
    try:
        f, ax = plt.subplots(figsize=graph_figsize)
        ax.set_title(f"{explanations.get(metric, metric)}: ({year})")
        df = cons_pop[(cons_pop.Gender == 'All Persons')
                      & (cons_pop.Mid_Year_Ending == year)]\
            .groupby(['Age', 'constituency'])['Population_Estimate']\
            .sum().unstack()
        sns.heatmap(df.T, ax=ax)
        f.tight_layout()
        p = Path(f"outputs/{metric}/{year}.png")
        p.parent.mkdir(parents=True, exist_ok=True)
        f.savefig(p)
    except Exception as e:
        print(f"{metric}:{year} failed: {e}")
    finally:
        if f:
            plt.close(f)
# colnorm wasn't defined above; a minimal column-wise max normalisation
# is assumed here
def colnorm(df):
    return df / df.max()

metric = 'Population Distribution _normed_'
for year in years:
    f = None
    try:
        f, ax = plt.subplots(figsize=graph_figsize)
        ax.set_title(f"{explanations.get(metric, metric)}: ({year})")
        df = cons_pop[(cons_pop.Gender == 'All Persons')
                      & (cons_pop.Mid_Year_Ending == year)]\
            .groupby(['Age', 'constituency'])['Population_Estimate']\
            .sum().unstack()
        sns.heatmap(colnorm(df.T), ax=ax)
        f.tight_layout()
        p = Path(f"outputs/{metric}/{year}.png")
        p.parent.mkdir(parents=True, exist_ok=True)
        f.savefig(p)
    except Exception as e:
        print(f"{metric}:{year} failed: {e}")
    finally:
        if f:
            plt.close(f)
# Re-run the GIF builder to pick up the new population-distribution frames
clips = {}
for p in Path('outputs/').iterdir():
    if not p.is_dir():
        continue
    metric = p.parts[-1]
    file_list = [str(_p) for _p in sorted(filter(lambda _p: _p.suffix == '.png', p.iterdir()))]
    clip = mpy.ImageSequenceClip(file_list, fps=1.5)
    clip = mpyv.fx.all.freeze(clip, t='end', freeze_duration=2)
    clip.write_gif(f"outputs/{metric}.gif")
    clips[metric] = clip
![](outputs/Population%20Distribution.gif)
year = 2016
school_type = reference[year][['school type', 'school management type', 'parliamentary constituency']]
school_type = school_type.dropna(how='all')
for c in school_type:
    school_type[c] = school_type[c].astype('category')
school_type.groupby(['parliamentary constituency', 'school type']).size().unstack('school type')
cons_stats = cons_stats.assign(**school_type.groupby(['parliamentary constituency', 'school type']).size().unstack('school type').to_dict(orient='series'))
cons_stats = cons_stats.assign(**school_type.groupby(['parliamentary constituency', 'school management type']).size().unstack('school management type').to_dict(orient='series'))
cons_stats
cons_stats['grampct'] = 100 * cons_stats['grammar'] / (cons_stats['secondary'] + cons_stats['grammar'])
f, ax = plt.subplots(figsize=graph_figsize)
cons_stats[['grampct', 'geometry']].plot(ax=ax, column='grampct', legend=True)
ax.axis('off')
ax.set_title('% of schools in constituency that are grammars')
f.tight_layout()
p = Path("outputs/grampct/2016.png")
p.parent.mkdir(parents=True, exist_ok=True)
f.savefig(p)
unyearify = lambda c: '_'.join([c.split('_')[0]] + c.split('_')[2:]) if len(c.split('_')) > 1 else c
allowed_list = ['grampct']
cons_stats_2016 = cons_stats[[c for c in cons_stats.columns if '2016' in c or c in allowed_list]].rename(columns=unyearify)
sns.pairplot(data=cons_stats_2016, hue='topqt',
             vars=['fsm_rat', 'grampct'])
cons_stats_2016
enrolment.sum().sum()