#!/usr/bin/env python # coding: utf-8 # # Local Authority Housing Returns # # Metadata for [local authority housing statistics 2017 to 2018](https://www.gov.uk/government/publications/completing-local-authority-housing-statistics-2017-to-2018-guidance-notes). # In[1]: import pandas as pd # In[2]: #The simplest full dataset has code based column labels lahs_2017_18 = pd.read_csv('LAHS_2017_18/LAHS_2017-18.csv') #TIdy up whitespace in column names lahs_2017_18.columns = [c.strip() for c in lahs_2017_18.columns] #Find Island data lahs_2017_18 = lahs_2017_18[lahs_2017_18['Area name'].str.contains('Wight')] #Grab a dict of the Island data (colnames are dict keys, cell values are dict values) lahs_2017_18_dict = lahs_2017_18.to_dict(orient='records')[0] lahs_2017_18 # In[54]: #Simple data quality check #Number of informative / non-empty / non-zero cols vs total cols lahs_2017_18.replace('0','').replace('-','').iloc[0].astype(bool).sum(), len(lahs_2017_18.columns) # In[3]: #Read the bulk upload guidance doc xl = pd.read_excel('LAHS_guidance_notes_bulk_upload.xlsx', sheet_name=None) #Clean it... for k in xl.keys(): #Drop empty cols and empty rows xl[k] = xl[k].dropna(axis=0, how='all').dropna(axis=1, how='all').reset_index(drop=True) # In[4]: #What are the sheets? for k in xl.keys(): print(k) # In[5]: import re #Create a lookup of stuff metadata={} #Just grab sheetnames starting with one letter in range A-J then a space... for k in [k for k in xl.keys() if re.search('^[ABCDEFGHIJ]\s',k) ]: metadata[k.split()[0]]={'sheet':k,'fullname':xl[k].columns[0], 'name':' '.join(k.split()[1:]), 'desc':''} xl[k].columns = list(range(0,len(xl[k].columns))) metadata # ## A Stock # In[169]: k = 'A' # In[170]: xl[metadata[k]['sheet']] # In[206]: xl[metadata[k]['sheet']].replace(to_replace=lahs_2017_18_dict).fillna('') # ## C Allocations # In[215]: k = 'C' # In[216]: tmp = xl[metadata[k]['sheet']] tmp # In[8]: txt=''' In terms of the households required by numbers of bedrooms: \t- {cc1aa} required 1 bedroom \t- {cc1ab} required 2 bedrooms \t- {cc1ac} required 3 bedrooms \t- {cc1ad} required more than bedrooms \t- {cc1ae} required an unspecified number of bedrooms. ''' print(txt.format( **lahs_2017_18_dict )) # In[217]: tmp.replace(to_replace=lahs_2017_18_dict).fillna('') # ## I Affordable Housing Supply # In[212]: k = 'I' # In[213]: tmp = xl[metadata[k]['sheet']] tmp # In[214]: tmp.replace(to_replace=lahs_2017_18_dict).fillna('') # ## J Affordable Housing Starts' # In[208]: k = 'J' # In[209]: metadata['J']['desc']= 'Completed by all Local Authorities. Report units STARTED during financial year 2017-18.\n \ Units that have been started and completed in the year will be counted both in section I and J.' # In[210]: tmp = xl[metadata[k]['sheet']] tmp # In[211]: tmp.replace(to_replace=lahs_2017_18_dict).fillna('') # In[133]: j1 = tmp.loc[8:15,[1,2,3]].reset_index(drop=True) j1.columns = tmp.loc[4,[0]].tolist() + tmp.loc[5,[2,3]].tolist() j1 # In[134]: j2 = tmp.loc[19:26,[1,2,3]] j2.columns = tmp.loc[16,[0,2,3]].tolist() j2 # In[135]: j3 = tmp.loc[30:37,[1,2,3]] j3.columns = tmp.loc[27,[0,2,3]].tolist() j3 # ## Unused But Possibly Useful Elsewhere # In[166]: #https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729 #Python string multireplacement import re def multireplace(string, replacements): """ Given a string and a replacement map, it returns the replaced string. :param str string: string to execute replacements on :param dict replacements: replacement dictionary {value to find: value to replace} :rtype: str """ # Place longer ones first to keep shorter substrings from matching where the longer ones should take place # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce # 'hey ABC' and not 'hey ABc' substrs = sorted(replacements, key=len, reverse=True) # Create a big OR regex that matches any of the substrings to replace regexp = re.compile('|'.join(map(re.escape, substrs))) # For each match, look up the new string in the replacements return regexp.sub(lambda match: replacements[match.group(0)], string)