import httplib2
from apiclient.discovery import build
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import run
import base64
import re
import requests
from lxml import etree
from StringIO import StringIO
import itertools as it
import urllib2
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from collections import defaultdict
from lxml.etree import fromstring
import sqlite3
from datetime import datetime
import zlib

# Path to the client_secret.json file downloaded from the Developer Console
CLIENT_SECRET_FILE = 'client_secret.json'

# Check https://developers.google.com/gmail/api/auth/scopes for all available scopes
OAUTH_SCOPE = 'https://www.googleapis.com/auth/gmail.readonly'

# Location of the credentials storage file
STORAGE = Storage('gmail.storage')

# Start the OAuth flow to retrieve credentials
flow = flow_from_clientsecrets(CLIENT_SECRET_FILE, scope=OAUTH_SCOPE)
http = httplib2.Http()

# Try to retrieve credentials from storage or run the flow to generate them
credentials = STORAGE.get()
if credentials is None or credentials.invalid:
    credentials = run(flow, STORAGE, http=http)

# Authorize the httplib2.Http object with our credentials
http = credentials.authorize(http)

# Build the Gmail service from discovery
gmail_service = build('gmail', 'v1', http=http)

# 'me' is the currently logged-in user; keep only the label named 'Links'
labels = gmail_service.users().labels().list(userId='me').execute()['labels']
label_id = filter(lambda x: x['name'] == 'Links', labels)[0]['id']


def get_message_ids():
    """Page through all messages in `label_id`."""
    next_page = None
    while True:
        if next_page is not None:
            response = gmail_service.users().messages().list(
                userId='me', labelIds=[label_id], pageToken=next_page).execute()
        else:
            response = gmail_service.users().messages().list(
                userId='me', labelIds=[label_id]).execute()
        messages = response.get('messages')
        next_page = response.get('nextPageToken')
        for el in messages:
            yield el['id']
        if next_page is None:
            break


def message_bodies():
    """Yield the decoded body of every message under the label."""
    for ctr, message_id in enumerate(get_message_ids()):
        message = gmail_service.users().messages().get(
            userId='me', id=message_id, format='full').execute()
        try:
            body = message['payload']['parts'][0]['body']['data']  # MIME
        except KeyError:
            body = message['payload']['body']['data']  # text/plain
        body = base64.b64decode(str(body), '-_')  # URL-safe base64
        yield body


# Liberal URL-matching regex
pattern = (r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|'
           r'www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
           r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))'
           r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
           r'[^\s`!()\[\]{};:\'\".,<>?\«\»\“\”\‘\’]))')


def is_feedly(body):
    return 'feedly.com' in body


def urls():
    """Yield every URL found in the message bodies."""
    for body in message_bodies():
        matches = re.findall(pattern, body)
        if is_feedly(body):
            match = matches[0]
            yield match[0]  # Feedly e-mail: first URL is link to original story
        else:
            for match in matches:
                yield match[0]


exclude = ['packtpub.com', 'disqus', '@', 'list-manage', 'utm_', 'ref=', 'campaign-archive']


def urls_filtered():
    """Drop URLs that contain any of the `exclude` substrings."""
    for url in urls():
        if not any(excluded in url.lower() for excluded in exclude):
            yield url


def is_hn(url):
    return 'news.ycombinator.com' in url


parser = etree.HTMLParser()


def urls_hn_filtered():
    """Replace Hacker News comment links with the URL of the story they point to."""
    for url in urls_filtered():
        if is_hn(url) and (re.search(r'item\?id=', url) is None):
            continue  # do not keep HN links that do not point to an article
        elif is_hn(url):
            r = requests.get(url)
            if r.status_code != 200:
                continue  # download of HN html failed, skip
            root = etree.parse(StringIO(r.text), parser).getroot()
            title = root.find(".//td[@class='title']")
            try:
                a = [child for child in title.getchildren() if child.tag == 'a'][0]
            except AttributeError:
                continue  # title is None
            story_url = a.get('href')
            yield story_url
        else:
            yield url


def unique_urls():
    """Yield each URL only once."""
    seen = defaultdict(bool)
    for url in urls_hn_filtered():
        key = hash(url)
        if seen[key]:
            continue
        else:
            seen[key] = True
            yield url


def pdf_from_url_to_txt(url):
    """Download a PDF and extract its text with pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the url provided as an argument to the function and read the content
    f = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to StringIO object
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                  caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string


def resource_text():
    """Yield (url, extracted text) pairs for PDFs and HTML pages."""
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0'}
    html_parser = etree.HTMLParser(recover=True, encoding='utf-8')
    for url in unique_urls():
        if url.endswith('.pdf'):
            try:
                text = pdf_from_url_to_txt(url)
            except:
                continue  # something went wrong, just skip ahead
            yield url, text
        else:
            try:
                r = requests.get(url, headers=headers)
            except:
                continue  # something went wrong with HTTP GET, just skip ahead
            if r.status_code != 200:
                continue
            if 'text/html' not in r.headers.get('content-type', ''):
                continue
            # from: http://stackoverflow.com/a/23929292 and http://stackoverflow.com/a/15830619
            try:
                document = fromstring(r.text.encode('utf-8'), html_parser)
            except:
                continue  # error parsing document, just skip ahead
            yield url, '\n'.join(etree.XPath('//text()')(document))


text_generator = resource_text()

try:
    db = sqlite3.connect('gmail_extracted_text.db')
    db.execute('CREATE TABLE gmail (date text, url text, compression text, extracted blob)')
    db.commit()
    db.close()
except sqlite3.OperationalError:
    pass  # table gmail already exists

while True:
    try:
        db = sqlite3.connect('gmail_extracted_text.db')
        url, text = text_generator.next()
        now = str(datetime.now())
        if isinstance(text, unicode):
            text = zlib.compress(text.encode('utf-8'))  # to decompress: zlib.decompress(text).decode('utf-8')
        else:
            text = zlib.compress(text)
        db.execute('INSERT INTO gmail VALUES (?, ?, ?, ?)',
                   (unicode(now), unicode(url), u'zlib', sqlite3.Binary(text)))
        db.commit()
    except StopIteration:
        break  # generator consumed, stop calling .next() on it
    except Exception, e:
        print e
        continue  # some other exception was thrown by the pipeline, just skip ahead
    finally:
        db.close()  # tidy up
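
# Reading the stored text back out: a minimal sketch, assuming only the
# 'gmail_extracted_text.db' file and the 'gmail' table created above. The
# helper name `stored_documents` is purely illustrative; decompression mirrors
# the inline note above (zlib.decompress(blob).decode('utf-8')).
def stored_documents():
    db = sqlite3.connect('gmail_extracted_text.db')
    try:
        rows = db.execute('SELECT date, url, compression, extracted FROM gmail')
        for date, url, compression, extracted in rows:
            # `extracted` comes back as a read-only buffer; convert before inflating
            if compression == 'zlib':
                raw = zlib.decompress(str(extracted))
            else:
                raw = str(extracted)
            # bodies were utf-8 encoded before compression when they were unicode
            yield date, url, raw.decode('utf-8', 'replace')
    finally:
        db.close()

# Usage:
# for date, url, text in stored_documents():
#     print url, len(text)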