#!/usr/bin/env python
# coding: utf-8
"""Download the PDFs linked from the Iraq Inquiry report page and dump their text.

This file is an IPython notebook export; the ``# In[n]:`` markers are the
original cell boundaries.  Running it changes into the working directory
(creating the ``chilcot`` sub-directory if needed), lists the PDFs already
saved there and writes one ``.txt`` file of extracted text next to each PDF.
The download step is left disabled, as in the original notebook.
"""

# In[9]:

import os
import os.path
import urllib.parse
import urllib.request

from PyPDF2 import PdfFileReader

# Hard-coded locations used throughout the script.
WORKDIR = '/Users/muneebalam/Desktop/ipython/'
SAVEDIR = '/Users/muneebalam/Desktop/ipython/chilcot/'
urlbase = 'http://www.iraqinquiry.org.uk'

os.chdir(WORKDIR)
os.makedirs(SAVEDIR, exist_ok=True)
os.chdir(SAVEDIR)


# In[2]:

def download():
    """Save every PDF linked from the report index page into ``SAVEDIR``.

    The index page is scanned for ``href="..."`` attributes whose value ends
    in ``.pdf``.  Each link is resolved against ``urlbase`` and saved under
    its own file name.  Files that already exist on disk are not downloaded
    again.

    Raises:
        urllib.error.URLError: if the index page or a PDF cannot be fetched.
        UnicodeDecodeError: if the index page is not valid UTF-8.
    """
    with urllib.request.urlopen('http://www.iraqinquiry.org.uk/the-report/') as r:
        page = r.read().decode('utf-8')

    # Splitting on '.pdf' leaves, in every chunk except the last, the text
    # that precedes one '.pdf' occurrence; the link target is whatever
    # follows the final href=" in that chunk.
    chunks = page.split('.pdf')[:-1]
    total = len(chunks)
    for i, chunk in enumerate(chunks):
        href_at = chunk.rfind('href="')
        target = chunk[href_at + 6:]
        # No href at all, or the attribute was closed before '.pdf' was
        # reached: this '.pdf' is ordinary page text rather than a link.
        if href_at == -1 or '"' in target:
            print(i + 1, ' of ', total, 'skipped (no PDF link found)')
            continue

        # urljoin copes with absolute links and with relative links that
        # lack a leading slash; for '/path' links it equals concatenation.
        url = urllib.parse.urljoin(urlbase, target + '.pdf')
        savefile = SAVEDIR + url[url.rfind('/') + 1:]
        print(i + 1, ' of ', total, url)
        if os.path.exists(savefile):
            continue

        with urllib.request.urlopen(url) as response:
            data = response.read()
        with open(savefile, 'wb') as w:
            w.write(data)


# In[12]:

# Uncomment to (re)fetch the PDFs before parsing.
#download()

# Only PDFs are parsed: the directory also accumulates the generated .txt
# files (and possibly hidden files), which PdfFileReader cannot open.
files = [SAVEDIR + x for x in os.listdir(SAVEDIR) if x.lower().endswith('.pdf')]


# In[29]:

def parse(file):
    """Extract the text of one PDF and write it to a text file beside it.

    Each page is extracted separately.  Within a page, a line break that
    follows a full stop becomes a space and every other line break is
    removed.  Pages are then joined with a newline.  A page that fails to
    extract is reported on stdout and skipped, so one bad page does not
    lose the rest of the document.

    The output is named ``<pdf path without extension>-<last page index>.txt``.
    For a PDF with no pages the index is 0 and the file is empty.

    Args:
        file: path of the PDF to read.
    """
    r = PdfFileReader(file)
    numpages = r.getNumPages()
    fbase = os.path.splitext(file)[0]

    alltext = []
    for i in range(numpages):
        try:
            text = r.getPage(i).extractText()
        except Exception as e:
            # Deliberately broad: best-effort extraction, report and move on.
            print(file, 'page', i, e, e.args)
            continue
        # NOTE(review): the collapsed source shows a no-op replace(' ', ' ')
        # here; it presumably collapsed double spaces before the file's
        # whitespace was mangled -- confirm against the original notebook.
        text = text.replace('.\n', '. ').replace('\n', '').replace('  ', ' ')
        alltext.append(text)

    # The file name carries the index of the last page; computing it from
    # numpages avoids depending on the loop variable, which is unbound when
    # the PDF has no pages.
    last_index = max(numpages - 1, 0)
    # NOTE(review): writing once, after the loop, is the reading assumed
    # when re-indenting the collapsed source -- confirm.
    with open('{0:s}-{1:d}.txt'.format(fbase, last_index), 'w', encoding='utf-8') as w:
        w.write('\n'.join(alltext))
    print('Done with', file)


# In[30]:

for file in files:
    parse(file)


# In[ ]:


# In[ ]:


# In[ ]: