Suggestions for downloading Wordpress blogs.
Note: WordPress blogs have different configurations and versions, so the code has to be adapted: first inspect the source code of the rendered HTML, then change the soup.findAll
calls to match either the permanent URL markup in Step 1 or the post-content markup in Step 2.
Note 2: Scraping blogs can violate a site's terms of service and might get your connection banned from the blog. The content is probably also copyrighted. Use wisely.
import requests
import io
from bs4 import BeautifulSoup
# Step 1: walk the blog's paginated index (/page/N/) and write the permanent
# URL of every post to urllist.txt, one URL per line.
baseurl = "http://urloftheblog.com"
NUM_PAGES = 1  # how many index pages to scan; raise as needed for a real blog

# 'with' guarantees the file is closed even if a request blows up mid-loop
# (the original open()/close() pair leaked the handle on an exception).
with open("urllist.txt", 'w') as urlfile:  # output file for permanent urls
    for counter in range(1, NUM_PAGES + 1):
        url = baseurl + "/page/" + str(counter) + '/'
        print("Counter:" + str(counter))
        print(url)
        try:
            r = requests.get(url)
            # r.text already holds the whole page as a string; the original
            # io.StringIO round-trip added nothing.
            soup = BeautifulSoup(r.text, "lxml")
            # Change below according to the rendered source code of the blog html.
            # What you want is the direct url to each blog post.
            posturls = soup.findAll("h2", {"class": "entry-title"})
            for p in posturls:
                urlfile.write(p.find('a').attrs['href'] + "\n")
        except requests.exceptions.RequestException:
            # NOTE: requests failures derive from RequestException, NOT the
            # builtin ConnectionError the original caught -- that clause could
            # never fire for a failed requests.get(). Add narrower exceptions
            # here if needed.
            print("There was a connection error for " + url)
# Step 2: fetch every post URL gathered in Step 1 and append each post body
# to blogcontent.txt, separated by "-----" markers; failed URLs are collected
# and reported at the end.
with open('urllist.txt') as urlsrc:  # 'with' closes the handle the original leaked
    posturls = [line.rstrip('\n') for line in urlsrc]  # load urls from Step 1

# Guard the sanity-check print: the original posturls[0] raised IndexError
# when Step 1 produced an empty file.
if posturls:
    print(posturls[0])  # just to check that the list is full of urls
else:
    print("Warning: urllist.txt is empty -- nothing to download.")

articlecounter = 0
failedarticles = []  # If this one grows, store them in a file or something.

with open("blogcontent.txt", 'w') as blogcontentfile:  # file to store content
    for url in posturls:
        try:
            req = requests.get(url)
            # req.text is already a string; no io.StringIO round-trip needed.
            apxsoup = BeautifulSoup(req.text, "lxml")
            # Change below according to the rendered source code of the blog html.
            # What you want is the element holding each post's body text.
            postbody = apxsoup.findAll("div", {"class": "entry-content"})
            for p in postbody:
                articlecounter += 1
                print(str(articlecounter) + ". " + url)
                blogcontentfile.write("-----\n" + url + "\n" + p.text)
        except requests.exceptions.RequestException as e:
            # Best-effort download: log the failure and move on to the next URL.
            print(e)
            failedarticles.append(url)

print("The following URLs failed to download:")
print(failedarticles)