"""Scrape the library list at accesstoresearch.org.uk and record postcodes.

The script downloads the index page, collects every library's name and URL,
fetches each library's own page, takes the first UK postcode found there
(with a few hand-written corrections) and writes the results to CSV files.
"""

# imports
import requests
import string
from bs4 import BeautifulSoup
import re
import csv

LIBRARY_INDEX_URL = 'http://www.accesstoresearch.org.uk/libraries'

# Seconds to wait for a single HTTP response before giving up, so that one
# unresponsive site cannot hang the whole run.
REQUEST_TIMEOUT = 30

CSV_HEADER = ['Library', 'url', 'postcode']

# Inclusive (first, last) index ranges of the library list; each range is
# written to its own file, named after the first index.
SEGMENTS = [(0, 100), (101, 200), (201, 235)]

# The UK Postcode REGEX came from
# http://www.regxlib.com/REDetails.aspx?regexp_id=260
ukpc = re.compile(r'([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)')

# The "doesn't work" list: postcodes that come up as the first regex match on
# a page but need the page-specific lookup implemented in process().
tocorrect = ['OX1 1ND', 'ME14 1LQ', 'HX1 1UJ']


def _second_contact_item(soup):
    """Return the second <ul class="contactitem"> element, or None."""
    items = soup.find_all('ul', class_='contactitem')
    return items[1] if len(items) > 1 else None


# Maps each postcode in `tocorrect` to a function that picks the element
# holding the postcode to use instead.
_ELEMENT_FINDERS = {
    'OX1 1ND': lambda soup: soup.find(['span', 'div'], class_='postal-code'),
    'ME14 1LQ': lambda soup: soup.find(
        'span', id='ctl00__mainContent_uxPostcodeLabel'),
    'HX1 1UJ': _second_contact_item,
}


def process(raw_postcode, page):
    """
    Given a postcode from the 'doesn't work' list, process page correctly.

    raw_postcode -- the postcode found by the plain regex search.
    page         -- the HTML text of the library's page.

    Returns the postcode read from the page-specific element.  If
    raw_postcode has no special handling, or the expected element or a
    postcode inside it cannot be found, raw_postcode is returned unchanged
    instead of raising.
    """
    finder = _ELEMENT_FINDERS.get(raw_postcode)
    if finder is None:
        return raw_postcode
    # NOTE(review): no parser is named here, so bs4 picks whichever one is
    # installed; pin one once the expected parser is confirmed.
    elem = finder(BeautifulSoup(page))
    if elem is None:
        return raw_postcode
    match = ukpc.search(elem.get_text())
    return match.group(0) if match else raw_postcode


def fetch_libraries():
    """Return a list of [name, url] lists read from the index page.

    Raises RuntimeError when the 'col-lft' container is missing, because
    nothing useful can be scraped without it.
    """
    page = requests.get(LIBRARY_INDEX_URL, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.text)
    librarylist = soup.find('div', class_='col-lft')
    if librarylist is None:
        raise RuntimeError(
            "no <div class='col-lft'> found at %s" % LIBRARY_INDEX_URL)
    libraries = librarylist.find_all('ul')
    # Progress output: how many entries were found and what one looks like.
    print(len(libraries))
    if libraries:
        print(libraries[0])

    libs = []
    for library in libraries:
        link = library.find('a')
        if link is None or not link.get('href'):
            # An entry without a link cannot be followed; skip it.
            continue
        libs.append([link.text, link['href']])
    if libs:
        print(libs[0])
    return libs


def find_postcode(url):
    """Return the postcode for the library page at url, or '' if none.

    The first regex match in the page text is used; matches listed in
    `tocorrect` are replaced by the result of process().  A failed request
    is reported on stdout and yields '' so the remaining libraries are
    still processed.
    """
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as err:
        print('Could not fetch %s: %s' % (url, err))
        return ''
    match = ukpc.search(page.text)
    if not match:
        return ''
    postcode = match.group(0)
    if postcode in tocorrect:
        postcode = process(postcode, page.text)
    return postcode


def write_csv(filename, rows):
    """Write the header row followed by rows to filename."""
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


def main():
    """Scrape every library and write the full and segmented CSV files."""
    libs = fetch_libraries()
    for library in libs:
        # Always append a value so every row has the same three columns.
        library.append(find_postcode(library[1]))

    write_csv('libraries.csv', libs)

    for first, last in SEGMENTS:
        # The bounds are inclusive, so the slice must end at last + 1;
        # slicing to `last` would drop rows 100, 200 and 235.
        write_csv('libraries%s.csv' % first, libs[first:last + 1])


if __name__ == '__main__':
    main()