#!/usr/bin/env python
# coding: utf-8

# # WordPress scraper
#
# Suggestions for downloading WordPress blogs.
#
# **Note:** WordPress blogs have different configurations and versions, so the code has to be adapted: first inspect the source code of the rendered html, then change the ``soup.findAll`` calls to match either the permanent URL in **Step 1** or the content of the blog post in **Step 2**.
#
# **Note 2:** Scraping blogs can violate the site's terms of use and might get your connection banned from visiting the blog. The content is probably also copyrighted. Use wisely.

# In[ ]:


import requests
from bs4 import BeautifulSoup


# ## Step 1 - Fetch URLs of fulltext articles

# In[ ]:


urlfile = open("urllist.txt", 'w')  # Opens an output file for storing permanent URLs.
baseurl = "http://urloftheblog.com"


# In[ ]:


# Set the upper bound of range() to cover the number of archive pages you want to fetch.
for counter in range(1, 2):
    try:
        url = baseurl + "/page/" + str(counter) + '/'
        print("Counter: " + str(counter))
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")
        # Change below according to the rendered source code of the blog html.
        # What you want is the direct URL to each blog post.
        posturls = soup.findAll("h2", {"class": "entry-title"})
        for p in posturls:
            #print(p.find('a').attrs['href'])
            urlfile.write(p.find('a').attrs['href'] + "\n")
    except requests.exceptions.ConnectionError:  # Add more exceptions if needed.
        print("There was a connection error for " + url)

urlfile.close()


# ## Step 2 - Get the full text body of each blog post and write to file.

# In[ ]:


posturls = [line.rstrip('\n') for line in open('urllist.txt')]  # Load URLs from Step 1.
print(posturls[0])  # Just to check that the list is full of URLs.


# In[ ]:


blogcontentfile = open("blogcontent.txt", 'w')  # Open up a file to store content.
articlecounter = 0
failedarticles = []  # If this one grows, store the failed URLs in a file or something.

for url in posturls:
    try:
        req = requests.get(url)
        apxsoup = BeautifulSoup(req.text, "lxml")
        # Change below according to the rendered source code of the blog html.
        # What you want is the full text body of each blog post.
        postbody = apxsoup.findAll("div", {"class": "entry-content"})
        for p in postbody:
            articlecounter += 1
            print(str(articlecounter) + ". " + url)
            #print("-----\n" + url + "\n" + p.text)
            blogcontentfile.write("-----\n" + url + "\n" + p.text)
    except requests.exceptions.RequestException as e:
        print(e)
        failedarticles.append(url)

blogcontentfile.close()

print("The following URLs failed to download:")
print(failedarticles)
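

# ## Extra - Inspecting the html to find the right selectors
#
# The two ``soup.findAll`` calls above assume a theme that wraps post titles in ``<h2 class="entry-title">`` and post bodies in ``<div class="entry-content">``. Other themes use other markup, so a minimal sketch like the one below can help you find the right tag and class before editing **Step 1** and **Step 2**. The URL is the same placeholder as above, and the class names mentioned in the comments ("post-content", "article-body") are only examples of what you might see, not something this notebook relies on.

# In[ ]:


import requests
from bs4 import BeautifulSoup

r = requests.get("http://urloftheblog.com/page/1/")
soup = BeautifulSoup(r.text, "lxml")

# Print every heading that contains a link, together with its class attribute.
# The class that appears exactly once per post is usually the one to use in Step 1.
for tag in soup.findAll(["h1", "h2", "h3"]):
    link = tag.find("a")
    if link is not None and link.get("href"):
        print(tag.name, tag.get("class"), link["href"])

# Print the classes of all divs; names such as "entry-content", "post-content"
# or "article-body" are typical candidates for Step 2.
for div in soup.findAll("div"):
    if div.get("class"):
        print("div", div.get("class"))


# ## Extra - Looping until the archive runs out
#
# **Step 1** fetches a fixed range of archive pages. On most WordPress installations the archive returns a 404 status once you go past the last page, so an alternative (just a sketch, not part of the original two-step workflow) is to keep requesting pages until the status code is no longer 200. The selector is the same ``entry-title`` assumption as in **Step 1**.

# In[ ]:


import requests
from bs4 import BeautifulSoup

baseurl = "http://urloftheblog.com"
page = 1
all_post_urls = []

while True:
    r = requests.get(baseurl + "/page/" + str(page) + "/")
    if r.status_code != 200:  # Past the last archive page (or some other error).
        break
    soup = BeautifulSoup(r.text, "lxml")
    for h2 in soup.findAll("h2", {"class": "entry-title"}):
        all_post_urls.append(h2.find("a").attrs["href"])
    page += 1

print(str(len(all_post_urls)) + " post URLs collected")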