#!/usr/bin/env python3
# coding: utf-8

# ## PycURL can be used to fetch objects identified by a URL from a Python program, similar to the urllib module.
# 1. PycURL is mature, very fast, and supports a large number of features.
# 2. PycURL is aimed at advanced developers: if you need dozens of concurrent, fast, and reliable connections, or any of the sophisticated features listed above, then PycURL is for you.

# #### Here we create a script that retrieves multiple URLs using a pool of pre-configured curl handles.

# In[1]:

import pycurl

# #### URLs to retrieve

# In[61]:

urls = ["http://www.google.com/", "http://www.imdb.com/"]

# #### Number of connections (1-10000)

# In[55]:

num_conn = 10

# #### Create a list of curl objects

# In[56]:

a = pycurl.CurlMulti()
a.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None                                    # slot for a per-handle response buffer
    c.setopt(pycurl.FOLLOWLOCATION, 1)             # follow HTTP redirects
    c.setopt(pycurl.MAXREDIRS, 5)                  # but no more than 5 of them
    c.setopt(pycurl.CONNECTTIMEOUT, 30)            # give up connecting after 30 seconds
    c.setopt(pycurl.MAX_RECV_SPEED_LARGE, 65536)   # cap download speed at 64 KiB/s
    c.setopt(pycurl.NOSIGNAL, 1)                   # do not use signals for timeouts
    a.handles.append(c)

# In[57]:

freelist = a.handles[:]

# #### PycURL does not provide storage for the network response - that is the application's job. Therefore we must set up a buffer (in the form of a BytesIO object) and instruct PycURL to write to that buffer.

# In[58]:

from io import BytesIO

# In[59]:

htmlcontainer = ""

# #### To issue a network request with PycURL, the following steps are required:
#
# 1. Create a pycurl.Curl instance.
# 2. Use setopt to set the request options.
# 3. Call perform to carry out the transfer.

# In[62]:

while urls and freelist:
    url = urls.pop()
    c = freelist.pop()
    b = BytesIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, b.write)        # write the response body into the buffer
    c.perform()
    htmlcontainer += b.getvalue().decode("utf-8", errors="replace")
    c.close()

# In[63]:

print(htmlcontainer)

# #### Data extraction can now be done with Beautiful Soup or regular expressions.
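
# #### As a minimal sketch of that last step, the cell below parses the fetched HTML with Beautiful Soup. It assumes the third-party bs4 package is installed and reuses the htmlcontainer string built in the loop above; the variable names page_title and links are illustrative, not part of any API.

# In[ ]:

from bs4 import BeautifulSoup

soup = BeautifulSoup(htmlcontainer, "html.parser")   # parse the concatenated HTML
page_title = soup.title.string if soup.title else None   # title of the first page, if any
links = [tag.get("href") for tag in soup.find_all("a") if tag.get("href")]   # collect hyperlinks
print(page_title)
print(len(links), "links found")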