#!/usr/bin/env python
# coding: utf-8

# # Transkribus workflow tests
#
# Demonstrates basic workflow components for individual rows extracted from Stock Exchange images:
#
# * Log into Transkribus
# * Create a collection
# * From a row image, generate an XML file in the Transkribus `Page` format
# * Upload the image and XML file to Transkribus
# * Trigger HTR
# * Download XML with the results of the HTR

# In[8]:

get_ipython().system('pip install lxml')

# In[1]:

import copy
import os

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Session with automatic retries on transient server errors
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))

# Use JSON rather than XML
s.headers = {'Accept': 'application/json'}

# ## Login
#
# Saves a JSESSIONID cookie in the session for future requests.

# In[2]:

credentials = {
    'user': 'tim@discontents.com.au',
    'pw': 'hP8sQKPn9fLDKV'
}

# In[3]:

# Log in
response = s.post('https://transkribus.eu/TrpServer/rest/auth/login', data=credentials)

# Check that JSESSIONID has been set
response.cookies

# ## View collections

# In[4]:

# Get collections
response = s.get('https://transkribus.eu/TrpServer/rest/collections/list')

# In[5]:

response.json()

# ## Create a new collection

# In[ ]:

new_collection = {'collName': 'api-test2'}
coll_response = s.post('https://transkribus.eu/TrpServer/rest/collections/createCollection', params=new_collection)

# In[ ]:

coll_response.text

# ## Upload images and xml

# In[11]:

from lxml import etree
from PIL import Image

PAGE_NS = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}'


def generate_xml_for_image(image_path=None):
    '''
    Prepares PAGE XML for upload, inserting the image filename and dimensions as required.
    Returns the name of the XML file it writes.
    '''
    with open('page_xml.xml', 'rb') as xml_file:
        template = etree.parse(xml_file)
    root = template.getroot()
    # Get the dimensions of the row image
    img = Image.open(image_path)
    w, h = img.size
    image_file = os.path.basename(image_path)
    # Point the Page element at the image
    page = root.find(PAGE_NS + 'Page')
    page.set('imageFilename', image_file)
    page.set('imageWidth', str(w))
    page.set('imageHeight', str(h))
    # The single TextRegion covers the whole image
    tr = page.find(PAGE_NS + 'TextRegion')
    tr_coords = tr.find(PAGE_NS + 'Coords')
    tr_coords.set('points', '0,0 0,{h} {w},{h} {w},0'.format(w=w, h=h))
    # The single TextLine sits 15 pixels inside the region
    tl = tr.find(PAGE_NS + 'TextLine')
    tl_coords = tl.find(PAGE_NS + 'Coords')
    tl_coords.set('points', '15,15 {w},15 {w},{h} 15,{h}'.format(w=w - 15, h=h - 15))
    xml_file_name = '{}.xml'.format(image_file[:-4])
    with open(xml_file_name, 'wb') as new_xml:
        new_xml.write(etree.tostring(template, pretty_print=True))
    return xml_file_name
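# `generate_xml_for_image()` above reads a `page_xml.xml` template from the working directory, but that file isn't included in this notebook. The next cell is a rough sketch of what a minimal PAGE (2013-07-15) template might look like -- it only contains the elements the function looks up (`Page`, `TextRegion`, `TextLine` and their `Coords`), and the metadata values and ids are placeholders, not anything required by Transkribus.

# In[ ]:

# Write a guessed minimal page_xml.xml template to the working directory.
# The dimensions and coordinate points are dummies -- generate_xml_for_image()
# overwrites them for each row image.
page_xml_template = '''<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">
  <Metadata>
    <Creator>Transkribus workflow tests</Creator>
    <Created>2019-01-01T00:00:00</Created>
    <LastChange>2019-01-01T00:00:00</LastChange>
  </Metadata>
  <Page imageFilename="" imageWidth="0" imageHeight="0">
    <TextRegion id="region_1">
      <Coords points="0,0 0,0 0,0 0,0"/>
      <TextLine id="line_1">
        <Coords points="0,0 0,0 0,0 0,0"/>
        <TextEquiv>
          <Unicode></Unicode>
        </TextEquiv>
      </TextLine>
    </TextRegion>
  </Page>
</PcGts>
'''

with open('page_xml.xml', 'w') as template_file:
    template_file.write(page_xml_template)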
# In[12]:

# Basic template for the upload metadata; upload_doc() fills in the real values
doc_payload = {
    "md": {
        "title": "Test",
        "author": "Sydney Stock Exchange",
        "genre": "",
        "writer": ""
    },
    "pageList": {"pages": [
        {
            "fileName": "N193-150_0428-col-2-14.jpg",
            "pageXmlName": "N193-150_0428-col-2-14.xml",
            "pageNr": 1
        }
    ]}
}

# In[13]:

def upload_doc(coll_id, image_path, doc_name='Test'):
    '''
    Uploads image and XML files to Transkribus.
    '''
    # Prepare the PAGE XML file and get its filename
    xml_file = generate_xml_for_image(image_path)
    image_file = os.path.basename(image_path)
    # Copy and modify the payload template (deepcopy so the nested dicts aren't shared)
    payload = copy.deepcopy(doc_payload)
    payload['md']['title'] = doc_name
    payload['pageList']['pages'][0]['fileName'] = image_file
    payload['pageList']['pages'][0]['pageXmlName'] = xml_file
    # Post the metadata
    response = s.post('https://transkribus.eu/TrpServer/rest/uploads?collId={}'.format(coll_id), json=payload)
    print(response.url)
    # Get the upload id from the response to submit with the files
    upload_id = response.json()['uploadId']
    print(upload_id)
    # Upload the image and xml files
    with open(image_path, 'rb') as img, open(xml_file, 'rb') as xml:
        files = {'img': img, 'xml': xml}
        response = s.put('https://transkribus.eu/TrpServer/rest/uploads/{}'.format(upload_id), files=files)
    return upload_id

# In[14]:

doc_id = upload_doc(40099, 'data/columns/rows-test/sample/N193-150_0428-col-2-14.jpg')

# ## Initiate HTR

# In[14]:

params = {
    'id': doc_id,
    'pages': 1
}
h = s.post('https://transkribus.eu/TrpServer/rest/recognition/{}/{}/htrCITlab'.format(40099, 133), params=params)

# In[16]:

job_id = h.json()

# In[17]:

# Check on the status of the job (put in a loop)
j = s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id))

# In[19]:

j.json()['success']

# In[20]:

# If job success is true, get the full document details
r = s.get('https://transkribus.eu/TrpServer/rest/collections/{}/{}/fulldoc'.format(40099, doc_id))

# In[21]:

# Get the url of the latest transcript for the first page and download it
page = 0
docinfo = r.json()
xml_url = docinfo['pageList']['pages'][page]['tsList']['transcripts'][0]['url']
x = requests.get(xml_url)
x.text

# Save as xml

# In[ ]:
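# Two loose ends from the cells above: the job status check is meant to run in a loop, and the downloaded PAGE XML still needs to be saved. The cell below is a rough sketch of both, reusing the `s` session, `job_id`, `x` and `etree` from earlier -- the polling interval, the output filename, and relying on the `success` flag alone (rather than also checking for failed jobs) are simplifying assumptions, not Transkribus requirements.

# In[ ]:

import time

# Poll the job until it reports success (a more careful loop would also bail
# out if the job fails or is cancelled)
while not s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id)).json()['success']:
    time.sleep(30)

# Save the downloaded PAGE XML next to the notebook (arbitrary filename)
with open('N193-150_0428-col-2-14-htr.xml', 'w', encoding='utf-8') as xml_out:
    xml_out.write(x.text)

# Pull the transcribed text out of the PAGE XML
ns = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
tree = etree.fromstring(x.content)
for line in tree.findall('.//page:TextLine/page:TextEquiv/page:Unicode', namespaces=ns):
    print(line.text)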