"""Download election-result PDFs, convert them to text and build parliament.csv.

The script runs in two stages:

1. Scrape the index page at ``base`` for per-year result PDFs, download
   them into ``raw/`` and convert each one to text with xpdf's pdftotext.
2. Parse every text file listed in ``fieldlist`` into candidate rows,
   clean the values using ``rename.csv`` and write ``parliament.csv``.
"""
import logging
import os
import re
from os.path import exists
from subprocess import call

try:  # Python 3 module layout
    from urllib.parse import urljoin
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2 module layout
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin

import pandas as pd
from lxml.html import parse

# Use xpdf's pdftotext to convert PDF to text
PDF_TO_TEXT = 'D:/Apps/xpdf/pdftotext.exe'

# NOTE(review): the index-page URL is empty in this file; it must be filled
# in before the script can run.
base = ''
tree = parse(urlopen(base))
files = set()


def download(year, link):
    '''Download a year's election results from link and convert to text.

    The PDF is saved as raw/<year>.pdf and converted to raw/<year>.txt.
    Either step is skipped when its output file already exists. The name
    '<year>.txt' is recorded in the module-level ``files`` set.
    '''
    pdf_file = os.path.join('raw', year + '.pdf')
    if not exists(pdf_file):
        urlretrieve(urljoin(base, link), pdf_file)
    text_file = pdf_file.replace('.pdf', '.txt')
    if not exists(text_file):
        # -layout keeps the column layout of the PDF in the text output
        call([PDF_TO_TEXT, '-layout', pdf_file, text_file])
    files.add(year + '.txt')


# Get all cells from the first table in the page at `base`.
# We pick only the first link, that has the
# constituency-wise detailed results
for td in tree.findall('//*[@id="c"]/table[1]//td'):
    if td.text is None:
        continue
    year = td.text.strip().split(' ')[0]
    anchor = td.find('.//a')
    if anchor is None:
        # A cell with text but no link has nothing to download
        continue
    download(year, anchor.get('href'))

# 2009 results are elsewhere. Hard code the link
# NOTE(review): the hard-coded 2009 link is empty in this file.
download('2009', '')


# Column names, in order, of a candidate row in each year's text file
fieldlist = {
    '1951.txt': ['NAME', 'PARTY', 'VOTES', '%'],
    '1957.txt': ['NAME', 'PARTY', 'VOTES', '%'],
    '1962.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1967.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1971.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1977.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1980.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1984.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1985.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1989.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1991.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1992.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1996.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1998.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '1999.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],
    '2004.txt': ['NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
                 'GENERAL VOTES', 'POSTAL VOTES', 'VOTES'],
    '2009.txt': ['#', 'NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
                 'GENERAL VOTES', 'POSTAL VOTES', 'VOTES',
                 '% ELECTORS', '% VOTES'],
}


def old_text_parse(filename):
    '''Parse one pdftotext output file into a DataFrame of candidate rows.

    filename must be a key of ``fieldlist``; its field list decides how many
    whitespace-separated parts a candidate line must have. Each line is
    classified, in this order, as a constituency header, a state header, an
    electors line, a candidate row, or a single-token continuation of the
    previous candidate's NAME. Any other line is logged and skipped.

    Returns a DataFrame with STATE and PC columns, the file's fields
    (without '%'), YEAR (the filename stem) and ELECTORS.
    '''
    # Files whose name starts with '1' (the 19xx years) use a different
    # layout for state headers and elector counts than the later files.
    if filename.startswith('1'):
        re_state = re.compile(r'^ {25,}[A-Za-z].*')
        re_electors = re.compile(r'ELECTORS *: *(\d+)')
    else:
        re_state = re.compile(r'^[A-Z][A-Za-z& ]+$')
        re_electors = re.compile(r'Total Electors *(\d+)(.*)')
    re_constituency = re.compile(
        r'Constituency *:? *(\d+) *\.? *(.*)', re.IGNORECASE)
    # NOTE(review): this pattern was split across a line break in the
    # original file; rejoined here -- confirm the intended whitespace.
    re_name = re.compile(r'^\d+ *\. *')
    re_scst = re.compile(r' *\((SC|ST)\)')

    fields = fieldlist[filename]
    results, electors = [], {}
    state, constituency = None, None
    row = None  # most recent candidate row; target of continuation lines

    # NOTE(review): download() writes its text files under raw/, but this
    # opens `filename` relative to the current directory -- confirm.
    with open(filename) as handle:
        for ln, line in enumerate(handle):
            match = re_constituency.match(line)
            if match:
                # NOTE(review): the right-hand side was truncated in the
                # original file; reconstructed as the constituency name
                # (second regex group) -- confirm.
                constituency = match.group(2).split(' ')[0].upper()
                constituency = re_scst.sub('', constituency)
                continue

            match = re_state.match(line)
            if match:
                state = line.strip().upper()
                continue

            match = re_electors.match(line)
            if match:
                # NOTE(review): the right-hand side was truncated in the
                # original file; reconstructed as the captured elector
                # count -- confirm.
                electors[state, constituency] = match.group(1)
                continue

            # NOTE(review): splitting on a single space also splits
            # multi-word names; presumably the original separator was wider
            # -- confirm against the pdftotext -layout output.
            parts = re.split(r' +', line.strip())
            if len(parts) == len(fields):
                row = dict(zip(fields, parts))
            elif len(parts) == 1:
                if row is None:
                    # Nothing parsed yet, so there is no name to extend
                    continue
                # `row` is the dict already stored in `results`, so this
                # extends the previous candidate's name in place.
                row['NAME'] = row['NAME'] + ' ' + line.strip()
                continue
            else:
                logging.warning('%s:%d: %d parts, not %d: %s', filename,
                                ln + 1, len(parts), len(fields), line)
                continue

            row['STATE'] = state
            row['PC'] = constituency
            row['NAME'] = re_name.sub('', row['NAME'])
            results.append(row)

    results = pd.DataFrame(results).set_index(['STATE', 'PC'])
    results['YEAR'] = filename.split('.')[0]
    # `electors` is keyed by (state, constituency), so it aligns with the
    # (STATE, PC) index set above.
    results['ELECTORS'] = pd.Series(electors)
    if '%' in results:
        del results['%']
    return results.reset_index()


# Parse the text files
logging.basicConfig(level=logging.INFO)
results = []
for filename in sorted(fieldlist):
    results.append(old_text_parse(filename))
results = pd.concat(results, ignore_index=True)[
    'YEAR STATE PC NAME SEX PARTY AGE CATEGORY VOTES ELECTORS'.split(' ')]

# Cleanse the results
rename = pd.read_csv('rename.csv').set_index(['Field', 'Source'])['Target']
for col in rename.index.get_level_values(0).unique():
    # Assign back instead of replacing in place on a column selection, which
    # is not guaranteed to write through to the frame.
    results[col] = results[col].replace(rename.loc[col].to_dict())

# Calculations
results['VOTES'] = results['VOTES'].astype(float)
results['#'] = results.groupby(['YEAR', 'STATE', 'PC'])['VOTES'].rank(
    method='min', ascending=False)
# DataFrame.sort was replaced by sort_values in newer pandas; use whichever
# this installation provides.
sort_rows = getattr(results, 'sort_values', None) or results.sort
sort_rows(['YEAR', 'STATE', 'PC', 'VOTES'],
          ascending=[True, True, True, False], inplace=True)
results.to_csv('parliament.csv', index=False, float_format='%.0f')