#!/usr/bin/env python
# coding: utf-8

# # Crawling Web Pages using Beautiful Soup

# This notebook crawls apress.com's blog page to:
# + extract a list of recent blog post titles and their URLs
# + extract the content of each blog post as plain text
#
# using **requests** and **BeautifulSoup**

# In[1]:

# import required libraries
import requests
from time import sleep
from bs4 import BeautifulSoup

# ## Utilities

# In[2]:

def get_post_mapping(content):
    """Extract blog post titles and URLs from a response body.

    Args:
        content (bytes): response body returned by requests.get (response.content)

    Returns:
        list: a list of dictionaries with keys 'title' and 'url'
    """
    post_detail_list = []
    post_soup = BeautifulSoup(content, "lxml")
    h3_content = post_soup.find_all("h3")

    for h3 in h3_content:
        # skip headings that do not wrap an anchor tag
        if h3.a is None:
            continue
        post_detail_list.append(
            {'title': h3.a.get_text(), 'url': h3.a.attrs.get('href')}
        )

    return post_detail_list


def get_post_content(content):
    """Extract a blog post's body text from a response body.

    Args:
        content (bytes): response body returned by requests.get (response.content)

    Returns:
        str: the blog post's content as plain text
    """
    text_soup = BeautifulSoup(content, "lxml")
    # the post body lives in a div with class 'cms-richtext'
    para_list = text_soup.find_all("div", {'class': 'cms-richtext'})
    if not para_list:
        return ""

    return para_list[0].get_text()

# ## Crawl the Web

# Set the URL and get the list of blog posts to be parsed

# In[3]:

crawl_url = "http://www.apress.com/in/blog/all-blog-posts"
post_url_prefix = "http://www.apress.com"

# Crawl the recent posts page of the website

# In[4]:

response = requests.get(crawl_url)

# Extract blog post titles and URLs from the response object

# In[5]:

blog_post_details = []
if response.status_code == 200:
    blog_post_details = get_post_mapping(response.content)

# For each recent post, crawl the post page and parse its plain-text content using Beautiful Soup

# In[6]:

if blog_post_details:
    print("Blog posts found:{}".format(len(blog_post_details)))

    for post in blog_post_details:
        print("Crawling content for post titled:", post.get('title'))
        # post URLs on the listing page are relative, so prepend the site prefix
        post_response = requests.get(post_url_prefix + post.get('url'))

        if post_response.status_code == 200:
            post['content'] = get_post_content(post_response.content)

        # be polite to the server: pause between requests
        print("Waiting for 10 secs before crawling next post...\n\n")
        sleep(10)

    print("Content crawled for all posts")

# Print the title and the first 250 characters of content for the first 5 posts

# In[7]:

for post in blog_post_details[:5]:
    print("title:{}\n-----------".format(post['title']))
    print("content:{}\n".format(post.get('content', '')[0:250]))
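# As a natural next step, the crawled posts can be persisted for later
# processing. The cell below is a minimal sketch, not part of the original
# crawl: it dumps the collected titles, URLs, and content to a JSON file.
# The filename `crawled_posts.json` is an arbitrary choice.

# In[8]:

import json

# write the crawled posts (title, url, content) to disk as JSON
with open("crawled_posts.json", "w", encoding="utf-8") as fp:
    json.dump(blog_post_details, fp, ensure_ascii=False, indent=2)

print("Saved {} posts to crawled_posts.json".format(len(blog_post_details)))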