#!/usr/bin/env python
# coding: utf-8

# # Crawling Web Pages using Beautiful Soup

# This notebook crawls apress.com's blog page to:
# + extract a list of recent blog post titles and their URLs
# + extract the content of each blog post as plain text
#
# using **requests** and **BeautifulSoup**

# In[1]:

# import required libraries
import requests
from time import sleep
from bs4 import BeautifulSoup

# ## Utilities

# In[2]:

def get_post_mapping(content):
    """Extract blog post titles and URLs from a response body.

    Args:
        content (bytes): response body returned by requests.get (response.content)

    Returns:
        list: a list of dictionaries with keys 'title' and 'url'
    """
    post_detail_list = []
    post_soup = BeautifulSoup(content, "lxml")
    h3_content = post_soup.find_all("h3")

    for h3 in h3_content:
        # skip headings that do not wrap an anchor tag
        if h3.a is None:
            continue
        post_detail_list.append(
            {'title': h3.a.get_text(), 'url': h3.a.attrs.get('href')}
        )

    return post_detail_list


def get_post_content(content):
    """Extract a blog post's body text from a response body.

    Args:
        content (bytes): response body returned by requests.get (response.content)

    Returns:
        str: the blog post's content as plain text
    """
    text_soup = BeautifulSoup(content, "lxml")
    # the post body lives in a div with class 'cms-richtext'
    para_list = text_soup.find_all("div", {'class': 'cms-richtext'})
    if not para_list:
        return ""

    return para_list[0].get_text()

# ## Crawl the Web

# Set the URL and get the list of blog posts to be parsed

# In[3]:

crawl_url = "http://www.apress.com/in/blog/all-blog-posts"
post_url_prefix = "http://www.apress.com"

# Crawl the recent posts page of the website

# In[4]:

response = requests.get(crawl_url)

# Extract blog post titles and URLs from the response object

# In[5]:

blog_post_details = []
if response.status_code == 200:
    blog_post_details = get_post_mapping(response.content)

# For each recent post, crawl the post page and parse its plain-text content using Beautiful Soup

# In[6]:

if blog_post_details:
    print("Blog posts found:{}".format(len(blog_post_details)))

    for post in blog_post_details:
        print("Crawling content for post titled:", post.get('title'))
        # post URLs on the listing page are relative, so prepend the site prefix
        post_response = requests.get(post_url_prefix + post.get('url'))

        if post_response.status_code == 200:
            post['content'] = get_post_content(post_response.content)

        # be polite to the server: pause between requests
        print("Waiting for 10 secs before crawling next post...\n\n")
        sleep(10)

    print("Content crawled for all posts")

# Print the title and the first 250 characters of content for the first 5 posts

# In[7]:

for post in blog_post_details[:5]:
    print("title:{}\n-----------".format(post['title']))
    print("content:{}\n".format(post.get('content', '')[0:250]))
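# As a natural next step, the crawled posts can be persisted for later
# processing. The cell below is a minimal sketch, not part of the original
# crawl: it dumps the collected titles, URLs, and content to a JSON file.
# The filename `crawled_posts.json` is an arbitrary choice.

# In[8]:

import json

# write the crawled posts (title, url, content) to disk as JSON
with open("crawled_posts.json", "w", encoding="utf-8") as fp:
    json.dump(blog_post_details, fp, ensure_ascii=False, indent=2)

print("Saved {} posts to crawled_posts.json".format(len(blog_post_details)))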