#!/usr/bin/env python
# coding: utf-8

# In[9]:


import os
# Work from the notebook folder; this fails fast if that folder is missing.
os.chdir('/Users/muneebalam/Desktop/ipython/')
import os.path
# Create the download/output folder if needed. exist_ok=True replaces the
# separate exists()-then-mkdir() check, which could race with another process
# creating the folder in between.
os.makedirs('/Users/muneebalam/Desktop/ipython/chilcot/', exist_ok=True)
os.chdir('/Users/muneebalam/Desktop/ipython/chilcot/')
import urllib.request
# Site root of the Iraq Inquiry website.
urlbase = 'http://www.iraqinquiry.org.uk'
from PyPDF2 import PdfFileReader


# In[2]:


def download():
    """Download every PDF linked from the Iraq Inquiry report page.

    Fetches ``http://www.iraqinquiry.org.uk/the-report/``, scans the raw HTML
    for link targets ending in ``.pdf`` and saves each one into
    ``/Users/muneebalam/Desktop/ipython/chilcot/`` under the last path
    component of its URL. A file that already exists on disk is not fetched
    again. One progress line is printed per link that is processed.

    Link targets are resolved against the report page URL, so site-relative
    (``/media/x.pdf``), page-relative and absolute targets all work.

    Raises:
        urllib.error.URLError: if the report page or a PDF cannot be fetched.
        OSError: if a downloaded file cannot be written.
    """
    # Imported here so the function does not depend on urllib.request having
    # loaded urllib.parse as a side effect.
    from urllib.parse import urljoin

    index_url = 'http://www.iraqinquiry.org.uk/the-report/'
    savedir = '/Users/muneebalam/Desktop/ipython/chilcot/'
    marker = 'href="'

    with urllib.request.urlopen(index_url) as r:
        page = r.read().decode('utf-8')

    # Splitting on '.pdf' leaves one chunk per occurrence (plus a trailing
    # remainder that is dropped); the link target is the text after the last
    # 'href="' in the chunk.
    chunks = page.split('.pdf')[:-1]
    total = len(chunks)
    for n, chunk in enumerate(chunks, start=1):
        start = chunk.rfind(marker)
        if start == -1:
            # '.pdf' occurred with no href before it (e.g. in link text).
            # Slicing from rfind() + 6 here would build a garbage URL.
            continue
        target = chunk[start + len(marker):]
        if '"' in target:
            # The href attribute was already closed, so this '.pdf' is not
            # part of a link target.
            continue

        url = urljoin(index_url, target + '.pdf')
        savefile = savedir + url[url.rfind('/')+1:]
        print(n, ' of ', total, url)
        if os.path.exists(savefile):
            continue

        with urllib.request.urlopen(url) as response:
            data = response.read()
        # Written only after the whole body has been read, so a failed
        # download does not leave a truncated file behind.
        with open(savefile, 'wb') as w:
            w.write(data)


# In[12]:


#download()  # uncomment to fetch the PDFs before parsing
# Only PDFs are listed: parse() writes its .txt output into this same folder,
# and PdfFileReader cannot open those (or other stray files) on a later run.
# Sorted so the processing order does not depend on os.listdir().
files = sorted(
    os.path.join('/Users/muneebalam/Desktop/ipython/chilcot/', x)
    for x in os.listdir('/Users/muneebalam/Desktop/ipython/chilcot/')
    if x.lower().endswith('.pdf')
)


# In[29]:


def parse(file):
    """Extract the text of every page of a PDF and save it as a text file.

    Each page's text is flattened onto one line: a newline after a full stop
    becomes a space, other newlines are removed, and double spaces are
    collapsed once. Pages are joined with newlines and written, UTF-8
    encoded, to ``<file without extension>-<index of last page>.txt``. A PDF
    with no pages produces an empty ``...-0.txt``. A page whose extraction
    raises is reported on stdout and left out of the output.

    Args:
        file: Path of the PDF to read.
    """
    r = PdfFileReader(file)
    numpages = r.getNumPages()

    # splitext only strips a real extension; slicing at rfind('.') would drop
    # the last character of a name without a dot, or cut at a dot in a
    # directory name.
    fbase = os.path.splitext(file)[0]
    alltext = []
    for i in range(numpages):
        try:
            text = r.getPage(i).extractText()
            text = text.replace('.\n', '. ').replace('\n', '').replace('  ', ' ')
            alltext.append(text)
        except Exception as e:
            # Best effort: one unreadable page should not lose the whole file.
            print(file, 'page', i, e, e.args)

    # The suffix is the index of the last page, which keeps existing output
    # names unchanged. It is computed from numpages, not the loop variable,
    # because the loop variable is unbound when the PDF has no pages.
    last_index = max(numpages - 1, 0)
    outname = '{0:s}-{1:d}.txt'.format(fbase, last_index)
    # Explicit encoding: extracted text is often non-ASCII, and the locale
    # default may not be able to encode it.
    with open(outname, 'w', encoding='utf-8') as w:
        w.write('\n'.join(alltext))
    print('Done with', file)


# In[30]:


# Convert each listed file to text, one at a time.
for path in files:
    parse(path)


# In[ ]:


# In[ ]:


# In[ ]: