#!/usr/bin/env python3
# coding: utf-8

# ## PycURL can be used to fetch objects identified by a URL from a Python program, similar to the urllib module.
# 1. PycURL is mature, very fast, and supports a large number of features.
# 2. PycURL is aimed at advanced developers: if you need dozens of concurrent, fast, and reliable connections, or any of the sophisticated features listed above, then PycURL is for you.

# #### Here we create a script that retrieves multiple URLs using a pool of pre-configured curl handles.

# In[1]:

import pycurl

# #### URLs to retrieve

# In[61]:

urls = ["http://www.google.com/", "http://www.imdb.com/"]

# #### Number of connections (1-10000)

# In[55]:

num_conn = 10

# #### Create a list of curl objects

# In[56]:

a = pycurl.CurlMulti()
a.handles = []
for i in range(num_conn):
    c = pycurl.Curl()
    c.fp = None                                    # slot for a per-handle response buffer
    c.setopt(pycurl.FOLLOWLOCATION, 1)             # follow HTTP redirects
    c.setopt(pycurl.MAXREDIRS, 5)                  # but no more than 5 of them
    c.setopt(pycurl.CONNECTTIMEOUT, 30)            # give up connecting after 30 seconds
    c.setopt(pycurl.MAX_RECV_SPEED_LARGE, 65536)   # cap download speed at 64 KiB/s
    c.setopt(pycurl.NOSIGNAL, 1)                   # do not use signals for timeouts
    a.handles.append(c)

# In[57]:

freelist = a.handles[:]

# #### PycURL does not provide storage for the network response - that is the application's job. Therefore we must set up a buffer (in the form of a BytesIO object) and instruct PycURL to write to that buffer.

# In[58]:

from io import BytesIO

# In[59]:

htmlcontainer = ""

# #### To issue a network request with PycURL, the following steps are required:
#
# 1. Create a pycurl.Curl instance.
# 2. Use setopt to set the request options.
# 3. Call perform to carry out the transfer.

# In[62]:

while urls and freelist:
    url = urls.pop()
    c = freelist.pop()
    b = BytesIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, b.write)        # write the response body into the buffer
    c.perform()
    htmlcontainer += b.getvalue().decode("utf-8", errors="replace")
    c.close()

# In[63]:

print(htmlcontainer)

# #### Data extraction can now be done with Beautiful Soup or regular expressions.
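
# #### As a minimal sketch of that last step, the cell below parses the fetched HTML with Beautiful Soup. It assumes the third-party bs4 package is installed and reuses the htmlcontainer string built in the loop above; the variable names page_title and links are illustrative, not part of any API.

# In[ ]:

from bs4 import BeautifulSoup

soup = BeautifulSoup(htmlcontainer, "html.parser")   # parse the concatenated HTML
page_title = soup.title.string if soup.title else None   # title of the first page, if any
links = [tag.get("href") for tag in soup.find_all("a") if tag.get("href")]   # collect hyperlinks
print(page_title)
print(len(links), "links found")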