Myneta.info has analysed the election affidavits of many candidates. This scraper converts that data into CSVs. (Of course, we could always ask ADR, and they'd probably happily provide it. But I find it faster to write a scraper than to wait for people to arrive at the office.)
The pages are very structured. We'll begin with the candidate summary page.
import os
import time
import urllib
import hashlib
import pandas as pd
from lxml.html import parse
if not os.path.exists('.cache'):
    os.makedirs('.cache')

# If the cached file is older than 15 days, download it again
OLD = time.time() - 15 * 24 * 60 * 60

yearkey = {
    2014: 'ls2014',
    2009: 'ls2009',
    2004: 'loksabha2004',
}

def get(url):
    # Cache pages under the SHA1 hash of their URL
    path = os.path.join('.cache', hashlib.sha1(url).hexdigest()) + '.html'
    if not os.path.exists(path) or os.stat(path).st_mtime < OLD:
        print url
        urllib.urlretrieve(url, path)
    return parse(open(path))
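get() is a simple cached downloader: the first call fetches the page into .cache, and any repeat call within 15 days parses the local copy instead of hitting the network. For example, with the 2014 summary URL that candidates() builds below:

tree = get('http://myneta.info/ls2014/index.php?action=summary&subAction=candidates_analyzed&sort=candidate')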
def candidates(year):
    url = 'http://myneta.info/{:s}/index.php?action=summary&subAction=candidates_analyzed&sort=candidate'
    tree = get(url.format(yearkey[year]))
    results = []
    for row in tree.findall('.//table')[-1].findall('tr'):
        td = row.findall('td')
        results.append({
            'Year': year,
            'Sno': td[0].text,
            'ID': int(td[1].find('a').get('href').split('=')[-1]),
            'Candidate': td[1].find('a').text,
            'Constituency': td[2].text,
            'Party': td[3].text,
            'Criminal Cases': int(td[4].text_content()),
            'Education': td[5].text,
            'Total Assets': int(td[6].text.replace(u'Rs\xa0', '').replace(',', '').replace('Nil', '0')),
            'Total Liabilities': int(td[7].text.replace(u'Rs\xa0', '').replace(',', '').replace('Nil', '0')),
        })
    return pd.DataFrame(results)
ls2014 = candidates(2014)
# The summary page does not provide the state and PC code,
# so let's introduce those, at least for 2014.
pc2014 = pd.read_csv('pc2014.csv').set_index('Constituency')
ls2014['ST_CODE'] = ls2014['Constituency'].apply(lambda v: pc2014['ST_CODE'].get(v, ''))
ls2014['PC_CODE'] = ls2014['Constituency'].apply(lambda v: pc2014['PC_CODE'].get(v, ''))
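pc2014.csv maps constituency names to the Election Commission's state and PC codes. I'm assuming a layout along these lines (these rows match the output below):

Constituency,ST_CODE,PC_CODE
NAWADA,S04,39
AZAMGARH,S24,69
CHENNAI SOUTH,S22,3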
# However, some corrections are required for constituency names that
# repeat across states
index = ls2014[(ls2014['Constituency'] == 'AURANGABAD') & (ls2014['ID'] > 5000)].index
ls2014.loc[index, 'ST_CODE'] = 'S13'
ls2014.loc[index, 'PC_CODE'] = 19
index = ls2014[(ls2014['Constituency'] == 'MAHARAJGANJ') & (ls2014['ID'] > 9000)].index
ls2014.loc[index, 'ST_CODE'] = 'S24'
ls2014.loc[index, 'PC_CODE'] = 63
index = ls2014[(ls2014['Constituency'] == 'HAMIRPUR') & (ls2014['ID'] < 7000)].index
ls2014.loc[index, 'ST_CODE'] = 'S24'
ls2014.loc[index, 'PC_CODE'] = 47
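A quick sanity check (not part of the original run) that no constituency was left unmapped:

# Any rows still missing a state code point to names absent from pc2014.csv
ls2014[ls2014['ST_CODE'] == '']['Constituency'].unique()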
# Save to disk
ls2014.to_csv('myneta.2014.csv', index=False)
ls2014.head()
| | Candidate | Constituency | Criminal Cases | Education | ID | Party | Sno | Total Assets | Total Liabilities | Year | ST_CODE | PC_CODE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Kaushal Yadav | NAWADA | 8 | Post Graduate | 148 | JD(U) | 1 | 154566136 | 2604969 | 2014 | S04 | 39 |
| 1 | Kiran Sharma | AZAMGARH | 0 | 8th Pass | 9487 | Bhartiya Shakti Chetna Party | 2 | 3509407 | 325000 | 2014 | S24 | 69 |
| 2 | M. Aamir Rashadi | AZAMGARH | 1 | Others | 9496 | Rashtriya Ulama Council | 3 | 2191523 | 0 | 2014 | S24 | 69 |
| 3 | Rakesh Kumar Giri | MAHARAJGANJ | 0 | Graduate Professional | 9706 | IND | 4 | 306023 | 0 | 2014 | S24 | 63 |
| 4 | (Kuppal)G.Devadoss | CHENNAI SOUTH | 0 | 8th Pass | 6912 | IND | 5 | 3630000 | 850000 | 2014 | S22 | 3 |
5 rows × 12 columns
ls2009 = candidates(2009)
ls2009.to_csv('myneta.2009.csv', index=False)
ls2004 = candidates(2004)
ls2004.to_csv('myneta.2004.csv', index=False)
Let's scrape the IPC sections and the asset breakup next.
(As I suspected, writing the scraper took less time (40 min) than I think it would have taken to get the information, even from an organisation as friendly as ADR.)
import re
re_ipcs = re.compile(r'(\d+) charges related to .*?IPC Section\-(\d+)')
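The criminal cases section phrases each charge as something like "N charges related to ... (IPC Section-NNN)", which is what this regex pulls out. A quick check on a made-up line:

re_ipcs.findall('3 charges related to Cheating (IPC Section-420)')
# [('3', '420')]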
def candidate(year, id):
    url = 'http://myneta.info/{:s}/candidate.php?candidate_id={:d}'.format(yearkey[year], id)
    tree = get(url)
    result = []
    ipcs = tree.xpath(".//h3[contains(text(), 'Brief Details of IPCs')]")
    if len(ipcs):
        ipcs = ipcs[0].getparent().text_content()
        for count, ipc_section in re_ipcs.findall(ipcs):
            result.append({
                'Type': 'IPC',
                'Year': year,
                'ID': id,
                'Key': ipc_section,
                'Value': int(count)
            })
    # 'ovable Assets' matches both the 'Movable Assets' and the
    # 'Immovable Assets' headings
    for heading in tree.xpath(".//h3[contains(text(), 'ovable Assets')]"):
        # Ignore the 1st header row and the last total row of the table
        for row in heading.getparent().getnext().findall('.//tr')[1:-1]:
            cells = row.findall('.//td')
            # Since rowspan is used for some cells, the description is in
            # the 1st or the 2nd column
            key = cells[0].text_content()
            if key[0].islower():
                key = cells[1].text_content()
            if 'Total' in key:
                key = 'Total as per Affidavit'
            result.append({
                'Type': 'Assets',
                'Year': year,
                'ID': id,
                'Key': key,
                'Value': int(re.sub(r'\D', '', cells[-1].find('.//b').text) or 0)
            })
    return pd.DataFrame(result)
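As a quick spot-check before the full run: candidate 148 (Kaushal Yadav of NAWADA in the table above) should come back as long-format IPC and asset rows.

candidate(2014, 148).head()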
ls2014_details = []
for index, row in ls2014.iterrows():
    ls2014_details.append(candidate(2014, row['ID']))
ls2014_details = pd.concat(ls2014_details)
ls2014_details.to_csv('myneta.details.2014.csv', index=False)
ls2014_details.head()
| | ID | Key | Type | Value | Year |
|---|---|---|---|---|---|
| 0 | 148 | 420 | IPC | 3 | 2014 |
| 1 | 148 | 467 | IPC | 2 | 2014 |
| 2 | 148 | 468 | IPC | 2 | 2014 |
| 3 | 148 | 307 | IPC | 1 | 2014 |
| 4 | 148 | 379 | IPC | 1 | 2014 |
5 rows × 5 columns
Some URLs have share information; the next step is to scrape those as well.