#!/usr/bin/env python
# coding: utf-8

# # Local Authority Housing Returns
# 
# Metadata for [local authority housing statistics 2017 to 2018](https://www.gov.uk/government/publications/completing-local-authority-housing-statistics-2017-to-2018-guidance-notes).

# In[1]:


import pandas as pd


# In[2]:


#The simplest full dataset has code based column labels
lahs_2017_18 = pd.read_csv('LAHS_2017_18/LAHS_2017-18.csv')

#TIdy up whitespace in column names
lahs_2017_18.columns = [c.strip() for c in lahs_2017_18.columns]

#Find Island data
lahs_2017_18 = lahs_2017_18[lahs_2017_18['Area name'].str.contains('Wight')]

#Grab a dict of the Island data (colnames are dict keys, cell values are dict values)
lahs_2017_18_dict = lahs_2017_18.to_dict(orient='records')[0]

lahs_2017_18


# In[54]:


#Simple data quality check
#Number of informative / non-empty / non-zero cols vs total cols
lahs_2017_18.replace('0','').replace('-','').iloc[0].astype(bool).sum(), len(lahs_2017_18.columns)


# In[3]:


#Read the bulk upload guidance doc
xl = pd.read_excel('LAHS_guidance_notes_bulk_upload.xlsx', sheet_name=None)

#Clean it...
for k in xl.keys():
    #Drop empty cols and empty rows
    xl[k] = xl[k].dropna(axis=0, how='all').dropna(axis=1, how='all').reset_index(drop=True)
    

# In[4]:


#What are the sheets?
for k in xl.keys():
    print(k)


# In[5]:


import re

#Create a lookup of stuff
metadata={}
#Just grab sheetnames starting with one letter in range A-J then a space...
for k in [k for k in xl.keys() if re.search('^[ABCDEFGHIJ]\s',k) ]:
    
    metadata[k.split()[0]]={'sheet':k,'fullname':xl[k].columns[0],
                            'name':' '.join(k.split()[1:]), 'desc':''}
    xl[k].columns = list(range(0,len(xl[k].columns)))
                         
metadata


# ## A Stock

# In[169]:


k = 'A'


# In[170]:


xl[metadata[k]['sheet']]


# In[206]:


xl[metadata[k]['sheet']].replace(to_replace=lahs_2017_18_dict).fillna('')


# ## C Allocations

# In[215]:


k = 'C'


# In[216]:


tmp = xl[metadata[k]['sheet']]
tmp


# In[8]:


txt='''
In terms of the households required by numbers of bedrooms:

\t- {cc1aa} required 1 bedroom
\t- {cc1ab} required 2 bedrooms
\t- {cc1ac} required 3 bedrooms
\t- {cc1ad} required more than bedrooms
\t- {cc1ae} required an unspecified number of bedrooms.
'''

print(txt.format( **lahs_2017_18_dict ))


# In[217]:


tmp.replace(to_replace=lahs_2017_18_dict).fillna('')


# ## I Affordable Housing Supply

# In[212]:


k = 'I'


# In[213]:


tmp = xl[metadata[k]['sheet']]
tmp


# In[214]:


tmp.replace(to_replace=lahs_2017_18_dict).fillna('')


# ## J Affordable Housing Starts'

# In[208]:


k = 'J'


# In[209]:


metadata['J']['desc']= 'Completed by all Local Authorities. Report units STARTED during financial year 2017-18.\n \
             Units that have been started and completed in the year will be counted both in section I and J.'


# In[210]:


tmp = xl[metadata[k]['sheet']]
tmp


# In[211]:


tmp.replace(to_replace=lahs_2017_18_dict).fillna('')


# In[133]:


j1 = tmp.loc[8:15,[1,2,3]].reset_index(drop=True)
j1.columns = tmp.loc[4,[0]].tolist() + tmp.loc[5,[2,3]].tolist()
j1


# In[134]:


j2 = tmp.loc[19:26,[1,2,3]]
j2.columns = tmp.loc[16,[0,2,3]].tolist()
j2


# In[135]:


j3 = tmp.loc[30:37,[1,2,3]]
j3.columns = tmp.loc[27,[0,2,3]].tolist()
j3


# ## Unused But Possibly Useful Elsewhere

# In[166]:


#https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729
#Python string multireplacement
import re

def multireplace(string, replacements):
    """
    Given a string and a replacement map, it returns the replaced string.
    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :rtype: str
    """
    # Place longer ones first to keep shorter substrings from matching where the longer ones should take place
    # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce
    # 'hey ABC' and not 'hey ABc'
    substrs = sorted(replacements, key=len, reverse=True)

    # Create a big OR regex that matches any of the substrings to replace
    regexp = re.compile('|'.join(map(re.escape, substrs)))

    # For each match, look up the new string in the replacements
    return regexp.sub(lambda match: replacements[match.group(0)], string)