import pandas as pd dfs = pd.read_html('http://esewa.epfoservices.in/ecr_dashboard/summary_of_reg_estt.php', infer_types=False, skiprows=2) dfs[0].head() import os.path from urllib import urlretrieve def get_epf(offices, path): data = [] base = 'http://esewa.epfoservices.in/ecr_dashboard/export_establishment_wise_missing_detail.php?office_name=' # Weird workaround because pd.read_excel does not support encoding_override xl = pd.ExcelFile.__new__(pd.ExcelFile) for office in offices: excel_file = os.path.join(path, office) + '.xls' if not os.path.exists(excel_file): urlretrieve(base + office, excel_file) try: book = xlrd.open_workbook(excel_file, encoding_override='cp1252') except UnicodeDecodeError: book = xlrd.open_workbook(excel_file, encoding_override='utf8') setattr(xl, 'book', book) df = xl.parse('Sheet 1', skiprows=2) df['Office'] = office data.append(df) return pd.concat(data, ignore_index=True) path = 'd:/site/gramener.com/viz/epf/establishments/' datafile = os.path.join(path, 'data.csv') if os.path.exists(datafile): data = pd.read_csv(datafile, encoding='cp1252') else: data = get_epf(offices=dfs[0][5][:120], path=path) data.to_csv(datafile, index=False, encoding='cp1252') data.sort('On the basis of ECR (wage month Feb-2013 onwards)', ascending=False).head(20)[['Office', 'Estt. Name', 'On the basis of ECR (wage month Feb-2013 onwards)']]