import httplib2
from apiclient.discovery import build
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import run
import base64
import re
import requests
from lxml import etree
from StringIO import StringIO
import itertools as it
import urllib2
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from collections import defaultdict
from lxml.etree import fromstring
import sqlite3
from datetime import datetime
import zlib

# Path to the client_secret.json file downloaded from the Developer Console
CLIENT_SECRET_FILE = 'client_secret.json'

# Check https://developers.google.com/gmail/api/auth/scopes for all available scopes
OAUTH_SCOPE = 'https://www.googleapis.com/auth/gmail.readonly'

# Location of the credentials storage file
STORAGE = Storage('gmail.storage')

# Start the OAuth flow to retrieve credentials
flow = flow_from_clientsecrets(CLIENT_SECRET_FILE, scope=OAUTH_SCOPE)
http = httplib2.Http()

# Try to retrieve credentials from storage or run the flow to generate them
credentials = STORAGE.get()
if credentials is None or credentials.invalid:
    credentials = run(flow, STORAGE, http=http)

# Authorize the httplib2.Http object with our credentials
http = credentials.authorize(http)

# Build the Gmail service from discovery
gmail_service = build('gmail', 'v1', http=http)

# 'me' is the currently logged-in user; keep only the label named 'Links'
labels = gmail_service.users().labels().list(userId='me').execute()['labels']
label_id = filter(lambda x: x['name'] == 'Links', labels)[0]['id']


def get_message_ids():
    """Page through all messages in `label_id`."""
    next_page = None
    while True:
        if next_page is not None:
            response = gmail_service.users().messages().list(
                userId='me', labelIds=[label_id], pageToken=next_page).execute()
        else:
            response = gmail_service.users().messages().list(
                userId='me', labelIds=[label_id]).execute()
        messages = response.get('messages')
        next_page = response.get('nextPageToken')
        for el in messages:
            yield el['id']
        if next_page is None:
            break


def message_bodies():
    """Yield the decoded body of every message under the label."""
    for ctr, message_id in enumerate(get_message_ids()):
        message = gmail_service.users().messages().get(
            userId='me', id=message_id, format='full').execute()
        try:
            body = message['payload']['parts'][0]['body']['data']  # MIME
        except KeyError:
            body = message['payload']['body']['data']  # text/plain
        body = base64.b64decode(str(body), '-_')  # URL-safe base64
        yield body


# Liberal URL-matching regex
pattern = (r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|'
           r'www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
           r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
           r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
           r'[^\s`!()\[\]{};:\'\".,<>?\«\»\“\”\‘\’]))')


def is_feedly(body):
    return 'feedly.com' in body


def urls():
    """Yield every URL found in the message bodies."""
    for body in message_bodies():
        matches = re.findall(pattern, body)
        if is_feedly(body):
            match = matches[0]
            yield match[0]  # Feedly e-mail: first URL is link to original story
        else:
            for match in matches:
                yield match[0]


exclude = ['packtpub.com', 'disqus', '@', 'list-manage', 'utm_', 'ref=', 'campaign-archive']


def urls_filtered():
    """Drop URLs that contain any of the `exclude` substrings."""
    for url in urls():
        if not any(excluded in url.lower() for excluded in exclude):
            yield url


def is_hn(url):
    return 'news.ycombinator.com' in url


parser = etree.HTMLParser()


def urls_hn_filtered():
    """Replace Hacker News comment links with the URL of the story they point to."""
    for url in urls_filtered():
        if is_hn(url) and (re.search(r'item\?id=', url) is None):
            continue  # do not keep HN links that do not point to an article
        elif is_hn(url):
            r = requests.get(url)
            if r.status_code != 200:
                continue  # download of HN html failed, skip
            root = etree.parse(StringIO(r.text), parser).getroot()
            title = root.find(".//td[@class='title']")
            try:
                a = [child for child in title.getchildren() if child.tag == 'a'][0]
            except AttributeError:
                continue  # title is None
            story_url = a.get('href')
            yield story_url
        else:
            yield url


def unique_urls():
    """Yield each URL only once."""
    seen = defaultdict(bool)
    for url in urls_hn_filtered():
        key = hash(url)
        if seen[key]:
            continue
        else:
            seen[key] = True
            yield url


def pdf_from_url_to_txt(url):
    """Download a PDF and extract its text with pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the url provided as an argument to the function and read the content
    f = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to StringIO object
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string


def resource_text():
    """Yield (url, extracted text) pairs for PDFs and HTML pages."""
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0'}
    html_parser = etree.HTMLParser(recover=True, encoding='utf-8')
    for url in unique_urls():
        if url.endswith('.pdf'):
            try:
                text = pdf_from_url_to_txt(url)
            except:
                continue  # something went wrong, just skip ahead
            yield url, text
        else:
            try:
                r = requests.get(url, headers=headers)
            except:
                continue  # something went wrong with HTTP GET, just skip ahead
            if r.status_code != 200:
                continue
            if 'text/html' not in r.headers.get('content-type', ''):
                continue
            # from: http://stackoverflow.com/a/23929292 and http://stackoverflow.com/a/15830619
            try:
                document = fromstring(r.text.encode('utf-8'), html_parser)
            except:
                continue  # error parsing document, just skip ahead
            yield url, '\n'.join(etree.XPath('//text()')(document))


text_generator = resource_text()

try:
    db = sqlite3.connect('gmail_extracted_text.db')
    db.execute('CREATE TABLE gmail (date text, url text, compression text, extracted blob)')
    db.commit()
    db.close()
except sqlite3.OperationalError:
    pass  # table gmail already exists

while True:
    try:
        db = sqlite3.connect('gmail_extracted_text.db')
        url, text = text_generator.next()
        now = str(datetime.now())
        if isinstance(text, unicode):
            text = zlib.compress(text.encode('utf-8'))  # to decompress: zlib.decompress(text).decode('utf-8')
        else:
            text = zlib.compress(text)
        db.execute('INSERT INTO gmail VALUES (?, ?, ?, ?)',
                   (unicode(now), unicode(url), u'zlib', sqlite3.Binary(text)))
        db.commit()
    except StopIteration:
        break  # generator consumed, stop calling .next() on it
    except Exception, e:
        print e
        continue  # some other exception was thrown by the pipeline, just skip ahead
    finally:
        db.close()  # tidy up
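
# Reading the stored text back out: a minimal sketch, assuming only the
# 'gmail_extracted_text.db' file and the 'gmail' table created above. The
# helper name `stored_documents` is purely illustrative; decompression mirrors
# the inline note above (zlib.decompress(blob).decode('utf-8')).
def stored_documents():
    db = sqlite3.connect('gmail_extracted_text.db')
    try:
        rows = db.execute('SELECT date, url, compression, extracted FROM gmail')
        for date, url, compression, extracted in rows:
            # `extracted` comes back as a read-only buffer; convert before inflating
            if compression == 'zlib':
                raw = zlib.decompress(str(extracted))
            else:
                raw = str(extracted)
            # bodies were utf-8 encoded before compression when they were unicode
            yield date, url, raw.decode('utf-8', 'replace')
    finally:
        db.close()

# Usage:
# for date, url, text in stored_documents():
#     print url, len(text)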