import pandas as pd

dfs = pd.read_html('http://esewa.epfoservices.in/ecr_dashboard/summary_of_reg_estt.php', infer_types=False, skiprows=2)
dfs[0].head()

import os.path
from urllib import urlretrieve

def get_epf(offices, path):
    data = []
    base = 'http://esewa.epfoservices.in/ecr_dashboard/export_establishment_wise_missing_detail.php?office_name='
    
    # Weird workaround because pd.read_excel does not support encoding_override
    xl = pd.ExcelFile.__new__(pd.ExcelFile)
    
    for office in offices:
        excel_file = os.path.join(path, office) + '.xls'
        if not os.path.exists(excel_file):
            urlretrieve(base + office, excel_file)
        try:
            book = xlrd.open_workbook(excel_file, encoding_override='cp1252')
        except UnicodeDecodeError:
            book = xlrd.open_workbook(excel_file, encoding_override='utf8')
        setattr(xl, 'book', book)
        df = xl.parse('Sheet 1', skiprows=2)
        df['Office'] = office
        data.append(df)
    
    return pd.concat(data, ignore_index=True)

path = 'd:/site/gramener.com/viz/epf/establishments/'
datafile = os.path.join(path, 'data.csv')
if os.path.exists(datafile):
    data = pd.read_csv(datafile, encoding='cp1252')
else:
    data = get_epf(offices=dfs[0][5][:120], path=path)
    data.to_csv(datafile, index=False, encoding='cp1252')

data.sort('On the basis of ECR (wage month Feb-2013 onwards)', ascending=False).head(20)[['Office', 'Estt. Name', 'On the basis of ECR (wage month Feb-2013 onwards)']]