!wget https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/390701/Local_Authority_Housing_Statistics_dataset_2013-14.xlsx import pandas as pd dfx=pd.ExcelFile('Local_Authority_Housing_Statistics_dataset_2013-14.xlsx') dfx.sheet_names #Menu sheet parse to identify sheets A-I import re sd=re.compile(r'Section (\w) - (.*)$') sheetDetails={} for row in dfx.parse('Menu')[[1]].values: if str(row[0]).startswith('Section'): sheetDetails[sd.match(row[0]).group(1)]=sd.match(row[0]).group(2) sheetDetails dfx.parse('A',header=None)[:15] df=dfx.parse('A',header=None) df=df.dropna(how='all') row = df[df.apply(lambda x: (x == "DCLG code").any(), axis=1)].index.tolist()[0]#.values[0] # will be an array row df=dfx.parse('A',header=row).dropna(how='all').dropna(how='all',axis=1) df=df[df['DCLG code'].notnull()].reset_index(drop=True) df[:5] def dfgrabber(dfx,sheet): #First pass - identify row for headers df=dfx.parse(sheet,header=None) df=df.dropna(how='all') row = df[df.apply(lambda x: (x == "DCLG code").any(), axis=1)].index.tolist()[0]#.values[0] # will be an array #Second pass - generate dataframe df=dfx.parse(sheet,header=row).dropna(how='all').dropna(how='all',axis=1) df=df[df['DCLG code'].notnull()].reset_index(drop=True) df.columns=[c.split(' ')[0] for c in df.columns] return df,row dfgrabber(dfx,'A')[:5] pd.read_csv('messyindex.csv',header=[0,1,2]) #Next step - decode the column codes row=7 df=dfx.parse('A',header=None) title=df[0][[0]][0] title df=df[1:row+1].dropna(how='all').dropna(how='all',axis=1) df xx=dfx.parse('A',header=None)[1:row].dropna(how='all') xx #Fill down xx.fillna(method='ffill', axis=0,inplace=True) #Fill across xx=xx.fillna(method='ffill', axis=1) xx xx=xx.append(dfx.parse('A',header=None)[row:row+1]) xx dfx.parse('A',header=None)[row:row+1] #Add empty row trick via #http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb import numpy as np def addemptyrow(df): df = df.append(pd.Series([np.nan]*len(df.columns),index=df.columns),ignore_index=True) return df xx=addemptyrow(xx) xx.to_csv('multi_index.csv',header=False,index=False) mxx=pd.read_csv('multi_index.csv',header=[0,1,2]) mxx for c in mxx.columns.get_level_values(0).tolist(): if c.startswith('Unnamed'): mxx = mxx.drop(c, level=0, axis=1) mxx dd=mxx.to_dict(orient='list') dd keyx={} for r in dd: keyx[dd[r][0].split(' ')[0]]=r keyx import math import collections def coldecoder(dfx,sheet,row): zz=dfx.parse(sheet,header=None) stitle=zz[0][[0]][0] xx=zz[1:row].dropna(how='all') #Fill down xx.fillna(method='ffill', axis=0,inplace=True) #Fill across xx=xx.fillna(method='ffill', axis=1) #How many rows in the header? keydepth=len(xx) header=[i for i in range(0,keydepth)] xx=xx.append(zz[row:row+1]) xx.to_csv('multi_index.csv',header=False,index=False,encoding='utf-8') mxx=pd.read_csv('multi_index.csv',header=header,encoding='utf-8') for c in mxx.columns.get_level_values(0).tolist(): if c.startswith('Unnamed'): mxx = mxx.drop(c, level=0, axis=1) dd=mxx.to_dict(orient='list') keyx=collections.OrderedDict() #{} for r in dd: if not pd.isnull(dd[r][0]): keyx[dd[r][0].split(' ')[0]]=r return stitle,keyx,keydepth df,row=dfgrabber(dfx,'B') sname,skey,kd=coldecoder(dfx,'B',row) sname df,row=dfgrabber(dfx,'C') sname,skey,kd=coldecoder(dfx,'C',row) skey df kq={} for k in skey: kq[k]=[] for j in skey[k]: if j not in kq[k]: kq[k].append(j) kq colmapper={} for kkq in kq: curr_level = colmapper depth=0 for path in kq[kkq]: depth=depth+1 if path not in curr_level: if depth