Myneta.info has analysed the election affidavits of many candidates. This scraper converts that data into CSVs. (Of course, we could always ask ADR, and they'd probably happily provide it. But I find it faster to write a scraper than to wait for people to arrive at the office.)
The pages are very structured. We'll begin with the candidate summary page.
import os
import time
import urllib
import hashlib
import pandas as pd
from lxml.html import parse
if not os.path.exists('.cache'):
    os.makedirs('.cache')

# If the cached file is older than 15 days, download it again
OLD = time.time() - 15 * 24 * 60 * 60

yearkey = {
    2014: 'ls2014',
    2009: 'ls2009',
    2004: 'loksabha2004',
}

def get(url):
    # Cache pages under the SHA1 hash of their URL
    path = os.path.join('.cache', hashlib.sha1(url).hexdigest()) + '.html'
    if not os.path.exists(path) or os.stat(path).st_mtime < OLD:
        print url
        urllib.urlretrieve(url, path)
    return parse(open(path))
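get() is a simple cached downloader: the first call fetches the page into .cache, and any repeat call within 15 days parses the local copy instead of hitting the network. For example, with the 2014 summary URL that candidates() builds below:

tree = get('http://myneta.info/ls2014/index.php?action=summary&subAction=candidates_analyzed&sort=candidate')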
def candidates(year):
    url = 'http://myneta.info/{:s}/index.php?action=summary&subAction=candidates_analyzed&sort=candidate'
    tree = get(url.format(yearkey[year]))
    results = []
    for row in tree.findall('.//table')[-1].findall('tr'):
        td = row.findall('td')
        results.append({
            'Year': year,
            'Sno': td[0].text,
            'ID': int(td[1].find('a').get('href').split('=')[-1]),
            'Candidate': td[1].find('a').text,
            'Constituency': td[2].text,
            'Party': td[3].text,
            'Criminal Cases': int(td[4].text_content()),
            'Education': td[5].text,
            'Total Assets': int(td[6].text.replace(u'Rs\xa0', '').replace(',', '').replace('Nil', '0')),
            'Total Liabilities': int(td[7].text.replace(u'Rs\xa0', '').replace(',', '').replace('Nil', '0')),
        })
    return pd.DataFrame(results)
ls2014 = candidates(2014)
# The summary page does not provide the state and PC code,
# so let's introduce those, at least for 2014.
pc2014 = pd.read_csv('pc2014.csv').set_index('Constituency')
ls2014['ST_CODE'] = ls2014['Constituency'].apply(lambda v: pc2014['ST_CODE'].get(v, ''))
ls2014['PC_CODE'] = ls2014['Constituency'].apply(lambda v: pc2014['PC_CODE'].get(v, ''))
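pc2014.csv maps constituency names to the Election Commission's state and PC codes. I'm assuming a layout along these lines (these rows match the output below):

Constituency,ST_CODE,PC_CODE
NAWADA,S04,39
AZAMGARH,S24,69
CHENNAI SOUTH,S22,3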
# However, some corrections are required for constituency names that
# repeat across states
index = ls2014[(ls2014['Constituency'] == 'AURANGABAD') & (ls2014['ID'] > 5000)].index
ls2014.loc[index, 'ST_CODE'] = 'S13'
ls2014.loc[index, 'PC_CODE'] = 19
index = ls2014[(ls2014['Constituency'] == 'MAHARAJGANJ') & (ls2014['ID'] > 9000)].index
ls2014.loc[index, 'ST_CODE'] = 'S24'
ls2014.loc[index, 'PC_CODE'] = 63
index = ls2014[(ls2014['Constituency'] == 'HAMIRPUR') & (ls2014['ID'] < 7000)].index
ls2014.loc[index, 'ST_CODE'] = 'S24'
ls2014.loc[index, 'PC_CODE'] = 47
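A quick sanity check (not part of the original run) that no constituency was left unmapped:

# Any rows still missing a state code point to names absent from pc2014.csv
ls2014[ls2014['ST_CODE'] == '']['Constituency'].unique()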
# Save to disk
ls2014.to_csv('myneta.2014.csv', index=False)
ls2014.head()
| | Candidate | Constituency | Criminal Cases | Education | ID | Party | Sno | Total Assets | Total Liabilities | Year | ST_CODE | PC_CODE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Kaushal Yadav | NAWADA | 8 | Post Graduate | 148 | JD(U) | 1 | 154566136 | 2604969 | 2014 | S04 | 39 |
| 1 | Kiran Sharma | AZAMGARH | 0 | 8th Pass | 9487 | Bhartiya Shakti Chetna Party | 2 | 3509407 | 325000 | 2014 | S24 | 69 |
| 2 | M. Aamir Rashadi | AZAMGARH | 1 | Others | 9496 | Rashtriya Ulama Council | 3 | 2191523 | 0 | 2014 | S24 | 69 |
| 3 | Rakesh Kumar Giri | MAHARAJGANJ | 0 | Graduate Professional | 9706 | IND | 4 | 306023 | 0 | 2014 | S24 | 63 |
| 4 | (Kuppal)G.Devadoss | CHENNAI SOUTH | 0 | 8th Pass | 6912 | IND | 5 | 3630000 | 850000 | 2014 | S22 | 3 |
5 rows × 12 columns
ls2009 = candidates(2009)
ls2009.to_csv('myneta.2009.csv', index=False)
ls2004 = candidates(2004)
ls2004.to_csv('myneta.2004.csv', index=False)
Let's scrape the IPC sections and the asset breakup next.
(As I suspected, writing the scraper took less time (40 min) than I think it would have taken to get the information, even from an organisation as friendly as ADR.)
import re
re_ipcs = re.compile(r'(\d+) charges related to .*?IPC Section\-(\d+)')
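The criminal cases section phrases each charge as something like "N charges related to ... (IPC Section-NNN)", which is what this regex pulls out. A quick check on a made-up line:

re_ipcs.findall('3 charges related to Cheating (IPC Section-420)')
# [('3', '420')]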
def candidate(year, id):
    url = 'http://myneta.info/{:s}/candidate.php?candidate_id={:d}'.format(yearkey[year], id)
    tree = get(url)
    result = []
    ipcs = tree.xpath(".//h3[contains(text(), 'Brief Details of IPCs')]")
    if len(ipcs):
        ipcs = ipcs[0].getparent().text_content()
        for count, ipc_section in re_ipcs.findall(ipcs):
            result.append({
                'Type': 'IPC',
                'Year': year,
                'ID': id,
                'Key': ipc_section,
                'Value': int(count)
            })
    # 'ovable Assets' matches both the 'Movable Assets' and the
    # 'Immovable Assets' headings
    for heading in tree.xpath(".//h3[contains(text(), 'ovable Assets')]"):
        # Ignore the 1st header row and the last total row of the table
        for row in heading.getparent().getnext().findall('.//tr')[1:-1]:
            cells = row.findall('.//td')
            # Since rowspan is used for some cells, the description is in
            # the 1st or the 2nd column
            key = cells[0].text_content()
            if key[0].islower():
                key = cells[1].text_content()
            if 'Total' in key:
                key = 'Total as per Affidavit'
            result.append({
                'Type': 'Assets',
                'Year': year,
                'ID': id,
                'Key': key,
                'Value': int(re.sub(r'\D', '', cells[-1].find('.//b').text) or 0)
            })
    return pd.DataFrame(result)
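As a quick spot-check before the full run: candidate 148 (Kaushal Yadav of NAWADA in the table above) should come back as long-format IPC and asset rows.

candidate(2014, 148).head()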
ls2014_details = []
for index, row in ls2014.iterrows():
    ls2014_details.append(candidate(2014, row['ID']))
ls2014_details = pd.concat(ls2014_details)
ls2014_details.to_csv('myneta.details.2014.csv', index=False)
ls2014_details.head()
| | ID | Key | Type | Value | Year |
|---|---|---|---|---|---|
| 0 | 148 | 420 | IPC | 3 | 2014 |
| 1 | 148 | 467 | IPC | 2 | 2014 |
| 2 | 148 | 468 | IPC | 2 | 2014 |
| 3 | 148 | 307 | IPC | 1 | 2014 |
| 4 | 148 | 379 | IPC | 1 | 2014 |
5 rows × 5 columns
Some URLs have share information; the next step is to scrape those as well.