#The first thing we need to do is bring in some helper libraries
#We're going to load in web pages, so a tool for doing that
import urllib2
#We may need to do some complex string matching, which typically requires the use of regular expressions
import re
#There are various tools to help us extract information from the HTML defining a web page. I'm using BeautifulSoup
#If you don't have BeautifulSoup installed, uncomment and execute the following shell command
#!pip install beautifulsoup
from BeautifulSoup import BeautifulSoup

#First we need to load in the page from the target web address/URL
#urllib2.urlopen(url) opens the connection
#.read() reads in the HTML from the connection
#BeautifulSoup() parses the HTML and puts it in a form we can work with
url='http://www.ofsted.gov.uk/inspection-reports/find-inspection-report/provider/ELS/116734'
soup = BeautifulSoup(urllib2.urlopen(url).read())

#We can search the soup to look for span elements with the specified class
#A list of results is returned so we pick the first (in fact, only) result, which has index value [0]
#Then we want to look at the text that is contained within that span element
print soup('span', {'class': 'ins-judgement ins-judgement-2'})[0].text

#The class name may vary from report to report, so we can use a regular expression to match it instead
print soup('span', {'class': re.compile(r"^ins-judgement.*")})[0].text

#Wrap the fetch-and-extract steps up as a function we can reuse
def stripper(url):
    soup=BeautifulSoup(urllib2.urlopen(url).read())
    return soup('span', {'class': re.compile(r"ins-judgement.*")})[0].text

url='http://www.ofsted.gov.uk/inspection-reports/find-inspection-report/provider/ELS/116734'
outcome=stripper(url)
print outcome

#Identify a results page, then make some soup from it
url='http://www.ofsted.gov.uk/inspection-reports/find-inspection-report/results/any/21/any/any/any/any/any/any/any/week/0/0?page=0'
soup=BeautifulSoup(urllib2.urlopen(url).read())

#Each search result is a list item in the results list; pull the link out of each one
for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
    urlstub=result.find('a')['href']
    print urlstub

#The links are relative, so prefix them with the domain before passing them to stripper()
def fullstripper(urlstub):
    url='http://www.ofsted.gov.uk'+urlstub
    return stripper(url)

for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
    urlstub=result.find('a')['href']
    print fullstripper(urlstub)

for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
    urlstub=result.find('a')['href']
    #Rather than print the outcome, pop it into a variable
    outcome=fullstripper(urlstub)
    #Find the first <p> element, split on commas, get last item
    pc=result.find('p').text.split(',')[-1]
    print outcome, pc
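#An aside, not part of the original recipe: stripper() will throw an error if a page
#fails to load or has no judgement span on it. Here's a minimal defensive sketch;
#the name robustStripper is made up for illustration.
def robustStripper(url):
    try:
        soup=BeautifulSoup(urllib2.urlopen(url).read())
    except urllib2.URLError:
        #The page wouldn't load, so signal that with None rather than crashing
        return None
    spans=soup('span', {'class': re.compile(r"ins-judgement.*")})
    #If no judgement span was found, also return None
    return spans[0].text if spans else None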
#This time, also grab the school's unique reference number (URN) from the link
for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
    urlstub=result.find('a')['href']
    #Grab the URN from the end of the URL stub
    urn=urlstub.split('/')[-1]
    #Rather than print the outcome, pop it into a variable
    outcome=fullstripper(urlstub)
    #Find the first <p> element, split on commas, get last item
    pc=result.find('p').text.split(',')[-1]
    print outcome, pc, urn
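#A small aside, not in the original: the last comma-separated item typically comes
#back with a leading space, so a hypothetical helper like this could tidy the
#extracted postcode fragment up before we use it anywhere fussy.
def tidyPostcode(fragment):
    #Trim surrounding whitespace and normalise the case, e.g. ' mk7 6aa' -> 'MK7 6AA'
    return fragment.strip().upper()

print tidyPostcode(' mk7 6aa')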
#We need another helper library for working with the JSON the geocoder returns
import json

#Define a function to get latitude and longitude for a given UK postcode
def geoCodePostcode(postcode):
    #No spaces allowed in the postcode we pass to the geocoding service
    url='http://uk-postcodes.com/postcode/'+postcode.replace(' ','')+'.json'
    data = json.load(urllib2.urlopen(url))
    return data['geo']['lat'],data['geo']['lng']

#Let's try it
pc='MK7 6AA'
lat,lng = geoCodePostcode(pc)
print pc, lat, lng

#Now add the geocoding step to the result-scraping loop
for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
    urlstub=result.find('a')['href']
    #Grab the URN from the end of the URL stub
    urn=urlstub.split('/')[-1]
    #Rather than print the outcome, pop it into a variable
    outcome=fullstripper(urlstub)
    #Find the first <p> element, split on commas, get last item
    pc=result.find('p').text.split(',')[-1]
    #Geocode the postcode
    lat,lng = geoCodePostcode(pc)
    print outcome, pc, urn, lat, lng
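#An efficiency aside, not part of the original recipe: if we geocode lots of schools,
#the same postcode may crop up more than once. A simple cache in front of
#geoCodePostcode() avoids repeat calls to the service; geoCache and
#cachedGeoCodePostcode are names invented for this sketch.
geoCache={}
def cachedGeoCodePostcode(postcode):
    #Only call the web service the first time we see a given postcode
    if postcode not in geoCache:
        geoCache[postcode]=geoCodePostcode(postcode)
    return geoCache[postcode]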
#Finally, pull out the school name as well
for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
    urlstub=result.find('a')['href']
    #Grab the URN from the end of the URL stub
    urn=urlstub.split('/')[-1]
    #Rather than print the outcome, pop it into a variable
    outcome=fullstripper(urlstub)
    #Find the first <p> element, split on commas, get last item
    pc=result.find('p').text.split(',')[-1]
    #Geocode the postcode
    lat,lng = geoCodePostcode(pc)
    #Get the school name and strip out the cruft
    name=result.find('a').text.strip('1')
    print outcome, pc, urn, lat, lng, name
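#A courtesy aside, not in the original: each pass through that loop makes two or three
#web requests, so it's polite to pause between them rather than hammer the servers.
#A minimal sketch using the standard time module; politeOpen is a made-up name.
import time
def politeOpen(url, delay=1):
    #Wait a moment before opening the URL
    time.sleep(delay)
    return urllib2.urlopen(url)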
#Let's have a quick try at parsing the line that says which results are displayed...
txt=soup('p',{'class':'resultsSummary'})[0].text
print txt

#We can use a regular expression to parse out the values of interest
m = re.match(r".*Displaying \d* to (\d*) of (\d*) matches.*", txt)
print m.group(1), m.group(2)

#Set the scraper running flag to True...
running=True
#Start with results page 0
page=0
#While we've still got results to fetch...
while running:
    #Create the results page URL
    stub='http://www.ofsted.gov.uk/inspection-reports/find-inspection-report/results/any/21/any/any/any/any/any/any/any/week/0/0?page='
    url=stub+str(page)
    soup=BeautifulSoup(urllib2.urlopen(url).read())
    page=page+1
    print page,'...',
    #Extracting results and then fetching info on each result would go here
    bit=soup('p',{'class':'resultsSummary'})[0].text
    m = re.match(r".*Displaying \d* to (\d*) of (\d*) matches.*", bit)
    #When the "to" count equals the total count we've reached the last page
    if m.group(1)==m.group(2):
        running = False

#Now do it for real, grabbing the data from every results page
running = True
page=0
#I'm going to build a list of reports
reports=[]
while running:
    stub='http://www.ofsted.gov.uk/inspection-reports/find-inspection-report/results/any/21/any/any/any/any/any/any/any/week/0/0?page='
    soup=BeautifulSoup(urllib2.urlopen(stub+str(page)).read())
    page=page+1
    print page,'...',
    for result in soup('ul',{'class':'resultsList'})[0].findAll('li'):
        urlstub=result.find('a')['href']
        #Grab the URN from the end of the URL stub
        urn=urlstub.split('/')[-1]
        #Rather than print the outcome, pop it into a variable
        outcome=fullstripper(urlstub)
        #Find the first <p> element, split on commas, get last item
        pc=result.find('p').text.split(',')[-1]
        #Geocode the postcode
        lat,lng = geoCodePostcode(pc)
        #Get the school name and strip out the cruft
        name=result.find('a').text.strip('1')
        #Rather than print the data, let's add it to the report list as a line item
        #print outcome, pc, urn, lat, lng, name
        reports.append([outcome, pc, urn, lat, lng, name])
    bit=soup('p',{'class':'resultsSummary'})[0].text
    m = re.match(r".*Displaying \d* to (\d*) of (\d*) matches.*", bit)
    if m.group(1)==m.group(2):
        running = False
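#An alternative sketch, not used in this recipe: rather than testing the summary line
#on every page, we could read the total match count once and work out how many pages
#there are up front, assuming a fixed number of results per page (set perPage to
#however many the site actually shows).
m = re.match(r".*Displaying \d* to (\d*) of (\d*) matches.*", txt)
total=int(m.group(2))
perPage=10
#Integer ceiling division: e.g. 95 matches at 10 per page needs 10 pages
numPages=(total+perPage-1)/perPage
print numPages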
#Preview the first few report lines
for report in reports[:5]:
    print report

#Get the CSV helper library that makes sure we write nice CSV out
import csv

f = csv.writer(open('sampleReport.csv', 'wb+'))
f.writerow(['outcome', 'pc', 'urn', 'lat', 'lng', 'name'])
for report in reports:
    f.writerow(report)

#Use a command-line command to preview the head of the CSV file
!head sampleReport.csv
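#A final sanity check, not in the original: read the file back in with csv.reader
#to confirm the rows survived the round trip.
reader = csv.reader(open('sampleReport.csv', 'rb'))
for i, row in enumerate(reader):
    print row
    #Just show the header and the first couple of data rows
    if i>=2: break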