#!/usr/bin/env python
# coding: utf-8

# # Harvesting series

# In[8]:


import time
import csv
import os
import math
import string
import requests
import pandas as pd
from PIL import Image, ImageOps
from requests import ConnectionError
from recordsearch_tools.utilities import retry
from recordsearch_tools.client import RSSearchClient, RSSeriesClient
from tinydb import TinyDB, Query
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO
from IPython.display import Image as DImage
from IPython.core.display import HTML

# Plotly helps us make pretty charts
import plotly.offline as py
import plotly.graph_objs as go

# This lets Plotly draw charts in cells
py.init_notebook_mode()


# In[2]:


# What series do you want to harvest?
# Insert the series id between the quotes.
series = 'B13'


# ## The harvesting code

# In[13]:


class SeriesHarvester():
    def __init__(self, series, control=None, images_only=False):
        self.series = series
        self.control = control
        if not images_only:
            self.total_pages = None
            self.pages_complete = 0
            self.client = RSSearchClient()
            self.prepare_harvest()
        self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))
        self.items = self.db.table('items')
        self.images = self.db.table('images')

    def get_total(self):
        return self.client.total_results

    def prepare_harvest(self):
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        self.total_pages = math.floor(int(total_results) / self.client.results_per_page) + 1
        print(self.total_pages)

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def start_harvest(self, page=None):
        Record = Query()
        if not page:
            page = self.pages_complete + 1
        while self.pages_complete < self.total_pages:
            if self.control:
                response = self.client.search(series=self.series, page=page, control=self.control, sort='9')
            else:
                response = self.client.search(series=self.series, page=page, sort='9')
            for result in response['results']:
                self.items.upsert(result, Record.identifier == result['identifier'])
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            time.sleep(1)
        
    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def harvest_images(self):
        Record = Query()
        items = self.items.search(Record.digitised_status == True)
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier']))
            if not os.path.exists(directory):
                os.makedirs(directory)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if not os.path.exists(filename):
                    img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)
                    response = requests.get(img_url, headers=headers, stream=True, verify=False)
                    response.raise_for_status()
                    try:
                        image = Image.open(BytesIO(response.content))
                    except IOError:
                        print('Not an image')
                    else:
                        width, height = image.size
                        image.save(filename)
                        del response
                        image_meta = {
                            'image_id': '{}-{}'.format(item['identifier'], page),
                            'identifier': item['identifier'],
                            'page': page,
                            'width': width,
                            'height': height
                        }
                        self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])
                        print('Image saved')
            time.sleep(1)


# In[4]:


def harvest_series(series):
    h = SeriesHarvester(series=series)
    h.start_harvest()


# In[5]:


def harvest_large_series(series, control_range=None):
    '''
    RecordSearch will not return more than 20,000 results.
    If a series has more than 20,000 items you'll need to break it up.
    The easiest way to do this is to add a param for control_symbol.
    This function will break break a series harvest down into a series of harvests --
    using each letter and number with a wildcard as the control_symbol parameter.
    This should be enough to harvest most large series, but in some cases you might need to supply a custom list of control_symbol prefixes.
    '''
    if not control_range:
        control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(number) + '*' for number in range(0, 10)]
    for control in control_range:
        print(control)
        h = SeriesHarvester(series=series, control=control)
        h.start_harvest()
    

# In[16]:


def harvest_images(series, images_only=False):
    h = SeriesHarvester(series=series, images_only=images_only)
    h.harvest_images()


# ## Functions to process harvested data

# In[5]:


def convert_to_df(series):
    '''
    Get the series data from TinyDB and save as a Pandas dataframe.
    Also flattens the date dictionary, and does a bit of ordering.
    '''
    
    # Load the series db
    db = TinyDB('data/db-{}.json'.format(series.replace('/', '-')))
    items = db.table('items')
    
    # Let's convert the database into a simple list
    item_list = [i for i in items]
    
    # Now let's turm that list into a Pandas Dataframe
    df = pd.DataFrame(item_list)
    
    # The 'contents_date' column is a dictionary, we need to flatten this out so we can easily work with the values
    df = pd.concat([df, pd.DataFrame((d for idx, d in df['contents_dates'].iteritems()))], axis=1)

    # Delete the old date field
    del df['contents_dates']

    # Rename column
    df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True)
    
    # Put columns in preferred order
    df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']]
    df.sort_values(['identifier'])
    
    return df   


# In[6]:


def save_as_csv(series):
    '''
    Converts harvested data in TinyDB to a CSV file, via a Pandas dataframe.
    '''
    df = convert_to_df(series)
    df.to_csv('data/{}.csv'.format(series.replace('/', '-')), index=False)


# ## Running the harvests

# In[1]:


# This is a list of series relating to the White Australia Policy
# Large series (> 20,000 items) will be harvested separately
series_list = ['B6003', 'BP343/15', 'D2860', 'D5036', 'D596', 'E752', 'J2481', 'J2482', 'J2483', 'J3115', 'K1145', 'P437', 'P526', 'PP4/2', 'PP6/1', 'SP11/26', 'SP11/6', 'SP115/1', 'SP115/10', 'SP42/1', 'SP726/1', 'ST84/1']


# In[ ]:


# Loop through series list, harvesting each in turn
for s in series_list:
    harvest_series(s)


# In[17]:


# Loop through series list, harvesting each in turn
for s in ['B13']:
    harvest_images(s, images_only=True)


# In[ ]:


# B13 is > 20,000 items
harvest_large_series('B13')


# In[3]:


# A1 is a large series that needs a custom control range to harvest
# This generates a list of control_symbol prefixes that should break it down into harvestable chunks (< 20,000 items)
# This should also work with series that use the the current year as the prefix, eg 1935/190
# Just feed this range to harvest_large_series() -- eg. harvest_large_series('A1', control_range)
control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(num) + '*' for num in range(2,10)] + ['1{}*'.format(num2) for num2 in [str(num) for num in range(0,9)]] + ['19{}*'.format(num2) for num2 in [str(num) for num in range(0,10)]]
print(control_range)


# In[ ]:


# Harvest A1
harvest_large_series('A1', control_range)


# ## Saving as CSV-formatted files

# In[9]:


# Assuming you've already harvested the series!
for s in series_list:
    save_as_csv(s)


# In[10]:


# Don't forget B13
save_as_csv('B13')


# In[ ]: