WordPress scraper

Suggestions for downloading WordPress blogs.

Note: WordPress blogs run different themes, configurations, and versions, so the code has to be adapted. First inspect the source of the rendered HTML, then change the soup.findAll calls so they match either the permanent URL of each post (Step 1) or the content of the blog post (Step 2).
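
For example, if the theme renders post titles with a different tag and class than the ones used below, the selector has to change to match. A minimal sketch (the markup and the class name post-title are made up for illustration; inspect your own blog's HTML to find the real structure):

In [ ]:
from bs4 import BeautifulSoup

# Hypothetical markup from a different theme.
html = '<h1 class="post-title"><a href="http://urloftheblog.com/hello-world/">Hello world</a></h1>'
soup = BeautifulSoup(html, "lxml")

# The tag and class passed to findAll must match what the theme actually uses.
for h in soup.findAll("h1", {"class": "post-title"}):
    print(h.find('a').attrs['href'])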

Note 2: Scraping a blog can violate its terms of service and may get your IP address blocked from the site. The content is most likely copyrighted as well. Use wisely.
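
One way to be less intrusive, and less likely to get blocked, is to pause between requests and send an identifying User-Agent. A small sketch (the header text and delay are arbitrary examples):

In [ ]:
import time
import requests

# Example header; put in something that identifies you and how to reach you.
headers = {"User-Agent": "blog-archiver (contact: you@example.com)"}

def polite_get(url, delay=2):
    time.sleep(delay)  # wait a couple of seconds between requests
    return requests.get(url, headers=headers, timeout=30)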

In [ ]:
import requests
import io
from bs4 import BeautifulSoup

Step 1 - Fetch the URLs of the full-text articles

In [ ]:
urlfile = open("urllist.txt", 'w') #opens an output file for storing permanent urls

baseurl = "http://urloftheblog.com"
In [ ]:
counter = 1

for i in range(0, 1): # Only fetches one archive page; raise the upper bound, or see the open-ended variant after this cell.
    try:
        url = baseurl + "/page/" + str(counter) + '/'
        print("Counter:" + str(counter))
        print(url)
        r = requests.get(url)
        file_like_obj = io.StringIO(r.text) # Turns the response text into a file-like object
        lines = file_like_obj.read()

        soup = BeautifulSoup(lines, "lxml")
        
        # Change below according to the rendered source code of the blog html.
        # What you want is the direct URL to each blog post.
        posturls = soup.findAll("h2", { "class" : "entry-title" }) 

        for p in posturls:
            #print(p.find('a').attrs['href'])
            urlfile.write(p.find('a').attrs['href'] + "\n")
        counter += 1
    except requests.exceptions.ConnectionError: # Add more exceptions if needed.
        print("There was a connection error for " + url)
    

urlfile.close()
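
Instead of hard-coding the number of archive pages in range(0, 1), the loop can run until the blog has no more pages. A sketch of that idea, assuming the blog answers with a 404 past the last /page/N/ (most WordPress installations do):

In [ ]:
import requests
from bs4 import BeautifulSoup

baseurl = "http://urloftheblog.com"

with open("urllist.txt", 'w') as urlfile:
    counter = 1
    while True:
        url = baseurl + "/page/" + str(counter) + '/'
        r = requests.get(url)
        if r.status_code == 404:  # past the last archive page
            break
        soup = BeautifulSoup(r.text, "lxml")
        posturls = soup.findAll("h2", {"class": "entry-title"})
        if not posturls:  # page exists but lists no posts
            break
        for p in posturls:
            urlfile.write(p.find('a').attrs['href'] + "\n")
        counter += 1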

Step 2 - Get the full text body of each blog post and write to file.

In [ ]:
posturls = [line.rstrip('\n') for line in open('urllist.txt')] # Load urls from Step 1
print(posturls[0]) #just to check that the list is full of urls.
In [ ]:
blogcontentfile = open("blogcontent.txt", 'w') # Open up a file to store content.

articlecounter = 0

failedarticles = [] # If this one grows, write the URLs to a file so they can be retried (see the sketch at the end).

for url in posturls:
    try:
        req = requests.get(url)
        file_like_object = io.StringIO(req.text) 
        apxlines = file_like_object.read()
        apxsoup = BeautifulSoup(apxlines, "lxml")
        
        # Change below according to the rendered source code of the blog html.
        # What you want here is the element that wraps the body of each blog post.
        postbody = apxsoup.findAll("div", { "class" : "entry-content" })

        for p in postbody:
            articlecounter += 1
            print(str(articlecounter) + ". " + url)
            #print("-----\n" + url + "\n" + p.text)
            blogcontentfile.write("-----\n" + url + "\n" + p.text)
    except requests.exceptions.RequestException as e:  
        print(e)
        failedarticles.append(url)

blogcontentfile.close()

print("The following URLs failed to download:")
print(failedarticles)
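
If failedarticles grows, the URLs can be written to a file so the downloads can be retried later, as the comment above suggests. A minimal sketch, reusing failedarticles from the cell above (the filename is just an example):

In [ ]:
# Store the failed URLs so Step 2 can be re-run on them later.
if failedarticles:
    with open("failedurls.txt", 'w') as failfile:
        for url in failedarticles:
            failfile.write(url + "\n")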