#!/usr/bin/env python
# coding: utf-8

# # WordPress scraper
#
# Suggestions for downloading WordPress blogs.
#
# **Note:** WordPress blogs have different configurations and versions, so the code has to be adapted: first inspect the source code of the rendered html, then change the ``soup.findAll`` calls to match either the permanent URL in **Step 1** or the content of the blog post in **Step 2**.
#
# **Note 2:** Scraping blogs can violate the site's terms of use and might get your connection banned from visiting the blog. The content is probably also copyrighted. Use wisely.

# In[ ]:


import requests
from bs4 import BeautifulSoup


# ## Step 1 - Fetch URLs of fulltext articles

# In[ ]:


urlfile = open("urllist.txt", 'w')  # Opens an output file for storing permanent URLs.
baseurl = "http://urloftheblog.com"


# In[ ]:


# Set the upper bound of range() to cover the number of archive pages you want to fetch.
for counter in range(1, 2):
    try:
        url = baseurl + "/page/" + str(counter) + '/'
        print("Counter: " + str(counter))
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        # Change below according to the rendered source code of the blog html.
        # What you want is the direct URL to each blog post.
        posturls = soup.findAll("h2", {"class": "entry-title"})
        for p in posturls:
            #print(p.find('a').attrs['href'])
            urlfile.write(p.find('a').attrs['href'] + "\n")
    except requests.exceptions.ConnectionError:  # Add more exceptions if needed.
        print("There was a connection error for " + url)

urlfile.close()


# ## Step 2 - Get the full text body of each blog post and write to file.

# In[ ]:


posturls = [line.rstrip('\n') for line in open('urllist.txt')]  # Load URLs from Step 1.
print(posturls[0])  # Just to check that the list is full of URLs.


# In[ ]:


blogcontentfile = open("blogcontent.txt", 'w')  # Open up a file to store content.
articlecounter = 0
failedarticles = []  # If this one grows, store the failed URLs in a file or something.

for url in posturls:
    try:
        req = requests.get(url)
        apxsoup = BeautifulSoup(req.text, "lxml")
        # Change below according to the rendered source code of the blog html.
        # What you want is the full text body of each blog post.
        postbody = apxsoup.findAll("div", {"class": "entry-content"})
        for p in postbody:
            articlecounter += 1
            print(str(articlecounter) + ". " + url)
            #print("-----\n" + url + "\n" + p.text)
            blogcontentfile.write("-----\n" + url + "\n" + p.text)
    except requests.exceptions.RequestException as e:
        print(e)
        failedarticles.append(url)

blogcontentfile.close()

print("The following URLs failed to download:")
print(failedarticles)
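

# ## Extra - Inspecting the html to find the right selectors
#
# The two ``soup.findAll`` calls above assume a theme that wraps post titles in ``<h2 class="entry-title">`` and post bodies in ``<div class="entry-content">``. Other themes use other markup, so a minimal sketch like the one below can help you find the right tag and class before editing **Step 1** and **Step 2**. The URL is the same placeholder as above, and the class names mentioned in the comments ("post-content", "article-body") are only examples of what you might see, not something this notebook relies on.

# In[ ]:


import requests
from bs4 import BeautifulSoup

r = requests.get("http://urloftheblog.com/page/1/")
soup = BeautifulSoup(r.text, "lxml")

# Print every heading that contains a link, together with its class attribute.
# The class that appears exactly once per post is usually the one to use in Step 1.
for tag in soup.findAll(["h1", "h2", "h3"]):
    link = tag.find("a")
    if link is not None and link.get("href"):
        print(tag.name, tag.get("class"), link["href"])

# Print the classes of all divs; names such as "entry-content", "post-content"
# or "article-body" are typical candidates for Step 2.
for div in soup.findAll("div"):
    if div.get("class"):
        print("div", div.get("class"))


# ## Extra - Looping until the archive runs out
#
# **Step 1** fetches a fixed range of archive pages. On most WordPress installations the archive returns a 404 status once you go past the last page, so an alternative (just a sketch, not part of the original two-step workflow) is to keep requesting pages until the status code is no longer 200. The selector is the same ``entry-title`` assumption as in **Step 1**.

# In[ ]:


import requests
from bs4 import BeautifulSoup

baseurl = "http://urloftheblog.com"
page = 1
all_post_urls = []

while True:
    r = requests.get(baseurl + "/page/" + str(page) + "/")
    if r.status_code != 200:  # Past the last archive page (or some other error).
        break
    soup = BeautifulSoup(r.text, "lxml")
    for h2 in soup.findAll("h2", {"class": "entry-title"}):
        all_post_urls.append(h2.find("a").attrs["href"])
    page += 1

print(str(len(all_post_urls)) + " post URLs collected")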