import pandas as pd
dfs = pd.read_html('http://esewa.epfoservices.in/ecr_dashboard/summary_of_reg_estt.php', infer_types=False, skiprows=2)
dfs[0].head()
import os.path
from urllib import urlretrieve
def get_epf(offices, path):
data = []
base = 'http://esewa.epfoservices.in/ecr_dashboard/export_establishment_wise_missing_detail.php?office_name='
# Weird workaround because pd.read_excel does not support encoding_override
xl = pd.ExcelFile.__new__(pd.ExcelFile)
for office in offices:
excel_file = os.path.join(path, office) + '.xls'
if not os.path.exists(excel_file):
urlretrieve(base + office, excel_file)
try:
book = xlrd.open_workbook(excel_file, encoding_override='cp1252')
except UnicodeDecodeError:
book = xlrd.open_workbook(excel_file, encoding_override='utf8')
setattr(xl, 'book', book)
df = xl.parse('Sheet 1', skiprows=2)
df['Office'] = office
data.append(df)
return pd.concat(data, ignore_index=True)
path = 'd:/site/gramener.com/viz/epf/establishments/'
datafile = os.path.join(path, 'data.csv')
if os.path.exists(datafile):
data = pd.read_csv(datafile, encoding='cp1252')
else:
data = get_epf(offices=dfs[0][5][:120], path=path)
data.to_csv(datafile, index=False, encoding='cp1252')
data.sort('On the basis of ECR (wage month Feb-2013 onwards)', ascending=False).head(20)[['Office', 'Estt. Name', 'On the basis of ECR (wage month Feb-2013 onwards)']]