Crawl the ECI election statistics

http://eci.nic.in/eci_main1/ElectionStatistics.aspx has PDFs of past election results. Let's first download them all and convert to text.

In [1]:
import os
from os.path import exists
from subprocess import call
from urllib import urlopen, urlretrieve
from urlparse import urljoin

from lxml.html import parse
import pandas as pd
In [2]:
# Use [xpdf](http://www.foolabs.com/xpdf/) to convert PDF to text.
# The default below is a machine-specific Windows path; set the PDF_TO_TEXT
# environment variable to point at pdftotext on any other machine.
PDF_TO_TEXT = os.environ.get('PDF_TO_TEXT', 'D:/Apps/xpdf/pdftotext.exe')
In [3]:
# ECI page that has the PDFs of past election results. Relative links found
# on the page are resolved against this URL in download().
base = 'http://eci.nic.in/eci_main1/ElectionStatistics.aspx'
# Fetch the page over HTTP and parse it into an lxml document tree
tree = parse(urlopen(base))
In [4]:
# Names of the text files handled so far, e.g. '1951.txt'
files = set()
def download(year, link):
    '''Download a year's election results from link and convert to text.

    year: label used for the file names; '1951' gives raw/1951.pdf and
        raw/1951.txt.
    link: URL of the PDF. A relative link is resolved against `base`.

    Files that already exist are reused, so re-running skips the download
    and the conversion. The text file's name is added to `files`.
    '''
    # urlretrieve does not create missing folders, so make sure raw/ exists
    if not os.path.isdir('raw'):
        os.makedirs('raw')
    pdf_file = os.path.join('raw', year + '.pdf')
    if not exists(pdf_file):
        urlretrieve(urljoin(base, link), pdf_file)
    # Swap only the extension (str.replace would also hit '.pdf' elsewhere
    # in the path)
    text_file = os.path.splitext(pdf_file)[0] + '.txt'
    if not exists(text_file):
        # -layout asks pdftotext to preserve the page's physical layout
        call([PDF_TO_TEXT, '-layout', pdf_file, text_file])
    files.add(year + '.txt')

# Get all rows from the first table in <div id="c">
# We pick only the first link, that has the
# constituency-wise detailed results
for td in tree.findall('//*[@id="c"]/table[1]//td'):
    if td.text is None:
        continue
    # The first word of the cell's text is used as the year label
    year = td.text.strip().split(' ')[0]
    anchor = td.find('.//a')
    # Skip cells whose text is only whitespace (would save to 'raw/.pdf')
    # and cells without a usable link (td.find returns None for those)
    if not year or anchor is None or not anchor.get('href'):
        continue
    download(year, anchor.get('href'))

# 2009 results are elsewhere. Hard code the link
download('2009', 'http://eci.nic.in/eci_main/archiveofge2009/Stats/VOLI/25_ConstituencyWiseDetailedResult.pdf')

Now, we'll convert these into a CSV file with the relevant data.

Manual processing

At this point, there's some manual munging of the text files. Ideally, I'd have liked to avoid this, but it's just so much faster to process some of this content manually than to write a program to do it.

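I'll document what I did at some point. But a few notes in the meantime:

- The parsing code below opens `1951.txt` to `2009.txt` from the current working directory, while the download step writes its output to `raw/`. So the cleaned-up text files need to be saved in the working directory.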

Resume automated extraction

In [5]:
import re
import logging
In [6]:
# Column names of a candidate row in each year's text file, in the order the
# columns appear. old_text_parse() zips these names with the parts of a row.
fieldlist = {}

# 1951 and 1957: no SEX column
for _year in ('1951', '1957'):
    fieldlist[_year + '.txt'] = ['NAME', 'PARTY', 'VOTES', '%']

# 1962 to 1999: SEX column added
for _year in ('1962', '1967', '1971', '1977', '1980', '1984', '1985',
              '1989', '1991', '1992', '1996', '1998', '1999'):
    fieldlist[_year + '.txt'] = ['NAME', 'SEX', 'PARTY', 'VOTES', '%']

# 2004 and 2009 each have their own, wider layout
fieldlist['2004.txt'] = ['NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
                         'GENERAL VOTES', 'POSTAL VOTES', 'VOTES']
fieldlist['2009.txt'] = ['#', 'NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
                         'GENERAL VOTES', 'POSTAL VOTES', 'VOTES',
                         '% ELECTORS', '% VOTES']

del _year  # loop helper only; keep the namespace clean
In [7]:
def old_text_parse(filename):
    '''Parse one election's text file into a DataFrame of candidate rows.

    filename: a key of `fieldlist`, such as '1951.txt'. The file is opened
        exactly as named (relative to the working directory). The part of
        the name before the first '.' becomes the YEAR column, and a name
        starting with '1' selects the patterns used for the 19xx files.

    Returns a DataFrame with STATE, PC, YEAR, ELECTORS and the columns
    listed in fieldlist[filename] except '%'. Values are kept as the text
    found in the file; nothing is converted to numbers here.

    Each non-blank line is tried, in order, as: a constituency heading, a
    state heading, an electors line, a candidate row (as many parts,
    separated by two or more spaces, as there are fields), or the
    continuation of the previous candidate's name (a single part). Anything
    else is logged as a warning and skipped.
    '''
    if filename.startswith('1'):
        re_state = re.compile(r'^ {25,}[A-Za-z].*')
        re_electors = re.compile(r'ELECTORS *: *(\d+)')
    else:
        re_state = re.compile(r'^[A-Z][A-Za-z& ]+$')
        re_electors = re.compile(r'Total Electors *(\d+)(.*)')

    re_constituency = re.compile(r'Constituency *:? *(\d+) *\.? *(.*)', re.IGNORECASE)
    # Serial number in front of a candidate's name, e.g. '12 . '
    re_name = re.compile(r'^\d+ *\. *')
    # Reservation marker after a constituency name
    re_scst = re.compile(r' *\((SC|ST)\)')

    fields = fieldlist[filename]
    results, electors = [], {}
    state, constituency = None, None
    # Most recent candidate row; None until the first one has been read
    row = None
    # The with-block closes the file even if parsing fails part-way
    with open(filename) as handle:
        for ln, line in enumerate(handle, 1):
            text = line.strip()
            # Blank lines carry no data. Without this check they would count
            # as a one-part continuation and add a trailing space to the
            # previous candidate's name.
            if not text:
                continue

            match = re_constituency.match(line)
            if match:
                # Keep only the name: drop anything after a double space,
                # then the (SC)/(ST) marker
                constituency = match.group(2).split('  ')[0].upper()
                constituency = re_scst.sub('', constituency)
                continue

            match = re_state.match(line)
            if match:
                state = text.upper()
                continue

            match = re_electors.match(line)
            if match:
                electors[state, constituency] = match.group(1)
                continue

            parts = re.split(r'  +', text)
            if len(parts) == len(fields):
                row = dict(zip(fields, parts))
            elif len(parts) == 1:
                if row is None:
                    # Nothing to continue yet: report it instead of failing
                    # on an unbound variable
                    logging.warning('%s:%d: text before any candidate row: %s',
                                    filename, ln, line)
                    continue
                # A lone part is the rest of a name that wrapped onto the next
                # line. `row` is the dict already stored in `results`, so this
                # updates the stored row.
                row['NAME'] = row['NAME'] + ' ' + text
                continue
            else:
                logging.warning('%s:%d: %d parts, not %d: %s',
                                filename, ln, len(parts), len(fields), line)
                continue

            row['STATE'] = state
            row['PC'] = constituency
            row['NAME'] = re_name.sub('', row['NAME'])
            results.append(row)

    results = pd.DataFrame(results).set_index(['STATE', 'PC'])
    results['YEAR'] = filename.split('.')[0]
    # electors is keyed by (state, constituency); assigning it as a Series
    # aligns it with the (STATE, PC) index of the rows
    results['ELECTORS'] = pd.Series(electors)
    if '%' in results:
        del results['%']
    return results.reset_index()
In [8]:
# Parse every year's text file (in filename order) and stack the per-year
# tables into one, keeping only the columns wanted in the output
logging.basicConfig(level=logging.INFO)

results = pd.concat(
    [old_text_parse(filename) for filename in sorted(fieldlist)],
    ignore_index=True)
results = results['YEAR STATE PC NAME SEX PARTY AGE CATEGORY VOTES ELECTORS'.split(' ')]
In [9]:
# Cleanse the results
# rename.csv has the columns Field, Source and Target: in the results column
# named by Field, every value equal to Source is replaced with Target.
rename = pd.read_csv('rename.csv').set_index(['Field', 'Source'])['Target']
for col in rename.index.get_level_values(0).unique():
    # .loc stands in for the .ix indexer, which pandas has removed. The result
    # is assigned back because inplace=True on results[col] may act on a
    # temporary copy and leave results unchanged.
    results[col] = results[col].replace(rename.loc[col].to_dict())
In [10]:
# Calculations
results['VOTES'] = results['VOTES'].astype(float)
# '#' is the candidate's position within a constituency for a given year:
# 1 for the most votes, with tied candidates sharing the better position
results['#'] = results.groupby(['YEAR', 'STATE', 'PC'])['VOTES'].rank(method='min', ascending=False)
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
# Within each constituency the rows run from most to fewest votes.
results = results.sort_values(['YEAR', 'STATE', 'PC', 'VOTES'], ascending=[True, True, True, False])
# float_format='%.0f' writes the float vote counts and ranks without decimals
results.to_csv('parliament.csv', index=False, float_format='%.0f')