IPython-Scrapy

This notebook is a minimal proof-of-concept Scrapy-IPython integration.

To try this notebook, create a 'tmp' subfolder in the directory from which 'ipython notebook' is launched, and run

python -m SimpleHTTPServer        # Python 2 (on Python 3: python -m http.server)

from this 'tmp' folder.

Code for downloading webpages via Scrapy:

In [1]:
from __future__ import print_function
import multiprocessing
import os
import re
import sys
from multiprocessing.queues import Queue

import lxml.etree
import lxml.html
from scrapy import project, signals
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import XPathSelector, XmlXPathSelector, HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher

TMP_DIR = './tmp'

class ResponseItem(Item):
    response = Field()

class ResponseSpider(BaseSpider):
    """Spider that fetches exactly one URL and yields its Response wrapped
    in a :class:`ResponseItem`."""
    name = 'response_spider'

    def __init__(self, url):
        self.url = url
        super(ResponseSpider, self).__init__()

    def start_requests(self):
        # dont_filter: the single target URL must never be dropped by the dupe filter.
        return [Request(self.url, self.parse, dont_filter=True)]

    def parse(self, response):
        # Strip the callback before shipping the response out: a Request that
        # still holds a bound-method callback fails to serialize.
        bare_request = response.request.replace(callback=None)
        return ResponseItem(response=response.replace(request=bare_request))
 
    
class CrawlerWorker(multiprocessing.Process):
    """Run a Scrapy crawl in a child process and put the collected items on
    ``result_queue`` when the crawl finishes."""

    def __init__(self, result_queue, spider, settings=None):
        multiprocessing.Process.__init__(self)
        self.settings = settings or get_project_settings()
        self.result_queue = result_queue
        self.spider = spider
        self.items = []
        # NOTE(review): connected here, in the parent process; run() executes in
        # the child, so this presumably only works because fork() copies the
        # dispatcher state into the child — likely breaks with a 'spawn' start
        # method. TODO confirm.
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        # Accumulate every scraped item; flushed to the queue at the end of run().
        self.items.append(item)

    def run(self):
        # install()/configure() are legacy Scrapy (0.x-era) crawler APIs;
        # the call order below matters for that API.
        self.crawler = CrawlerProcess(self.settings)
        self.crawler.install()
        self.crawler.configure()
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
        

def _download(url):
    """Fetch ``url`` via a Scrapy crawl running in a child process.

    Returns the scrapy Response object collected by :class:`ResponseSpider`.
    Raises RuntimeError if the crawl produced no items.
    """
    # Use the multiprocessing.Queue() factory: instantiating
    # multiprocessing.queues.Queue directly requires an explicit ctx argument
    # on Python 3 (and the factory is the documented API on Python 2 as well).
    result_queue = multiprocessing.Queue()
    spider = ResponseSpider(url)
    worker = CrawlerWorker(result_queue, spider)
    worker.start()
    items = result_queue.get()
    # Drain bookkeeping: don't block process exit on the queue's feeder thread.
    result_queue.cancel_join_thread()
    worker.join()
    if not items:
        raise RuntimeError('crawl returned no response item for %s' % url)
    return items[0]['response']

def set_base(body, base):
    """Insert ``<base href="base">`` right after the opening ``<head>`` tag.

    Leaves ``body`` untouched if it already contains a ``<base`` tag or has
    no head element. Unlike a plain string replace, this matches ``<head>``
    case-insensitively, tolerates attributes on the tag, and inserts the
    base tag only once.
    """
    if '<base' in body:
        return body
    # Callable replacement avoids re-escaping issues if `base` contains
    # backslashes or group references.
    return re.sub(
        r'<head[^>]*>',
        lambda m: m.group(0) + '<base href="%s">' % base,
        body,
        count=1,
        flags=re.IGNORECASE,
    )

def download(url):
    """
    Download 'url' using Scrapy. Return Response.
    """
    resp = _download(url)
    # Rewrite relative links against the original URL so the page renders
    # correctly when served from the local snapshot.
    patched_body = set_base(resp.body, url)
    return resp.replace(body=patched_body)

Code for highlighting XPaths and displaying HTML in IPython cells:

In [13]:
from IPython import display

def _show_in_iframe(local_url):
    """Render an IFRAME pointing at ``local_url`` in the current output cell.

    The surrounding buttons are placeholder UI, kept from the proof of concept.
    """
    # (Removed an unused local `fname` that was computed and never read.)
    html = """<html><body>
    <p><input type='button' value='Do we need'> <input type='button' value='some UI controls?'></p>
    <hr>
    <iframe style='width:800px; height:600px;' src="%s"></iframe>
    </body></html>""" % local_url
    display.display(display.HTML(html))


def show_in_iframe(html):
    """Write ``html`` to TMP_DIR/output.html and display it via an IFRAME.

    Assumes a local HTTP server is serving TMP_DIR on port 8000 (see the
    setup notes at the top of the notebook).
    """
    fname = os.path.join(TMP_DIR, 'output.html')
    # The file is opened in binary mode; encode text input so this also works
    # on Python 3 / with unicode bodies (on Python 2, str is bytes: no-op).
    data = html if isinstance(html, bytes) else html.encode('utf-8')
    with open(fname, 'wb') as f:
        f.write(data)
    _show_in_iframe('http://127.0.0.1:8000/output.html')
        

def _highlight(hxs):
    el = hxs._root
    el.attrib['style'] = 'background-color: yellow;' + el.get('style', '')    


def show_hxs_select(hxs, xpath):
    """Highlight every node matched by ``xpath`` within ``hxs`` and render
    the resulting document inline via an IFRAME."""
    for match in hxs.select(xpath):
        _highlight(match)
    rendered = lxml.html.tostring(hxs._root.getroottree())
    show_in_iframe(rendered)

    
def show_xpath(url, xpath):
    """Download ``url``, highlight all nodes matched by ``xpath``, and render
    the annotated page inline."""
    selector = HtmlXPathSelector(download(url))
    show_hxs_select(selector, xpath)

Usage example:

In [19]:
show_xpath('http://crawlera.com', '//a[contains(text(), "i")]')