#!/usr/bin/env python
# coding: utf-8

# # Transkribus workflow tests
#
# Demonstrates basic workflow components for individual rows extracted from Stock Exchange images:
#
# * Log into Transkribus
# * Create a collection
# * From a row image, generate an XML file in the Transkribus `Page` format
# * Upload the image and XML file to Transkribus
# * Trigger HTR
# * Download XML with the results of the HTR

# In[8]:

get_ipython().system('pip install lxml')

# In[1]:

import copy
import os

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Session with automatic retries on transient server errors
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))

# Use JSON rather than XML
s.headers = {'Accept': 'application/json'}

# ## Login
#
# Saves a JSESSIONID cookie in the session for future requests.

# In[2]:

credentials = {
    'user': 'tim@discontents.com.au',
    'pw': 'hP8sQKPn9fLDKV'
}

# In[3]:

# Log in
response = s.post('https://transkribus.eu/TrpServer/rest/auth/login', data=credentials)

# Check that JSESSIONID has been set
response.cookies

# ## View collections

# In[4]:

# Get collections
response = s.get('https://transkribus.eu/TrpServer/rest/collections/list')

# In[5]:

response.json()

# ## Create a new collection

# In[ ]:

new_collection = {'collName': 'api-test2'}
coll_response = s.post('https://transkribus.eu/TrpServer/rest/collections/createCollection', params=new_collection)

# In[ ]:

coll_response.text

# ## Upload images and xml

# In[11]:

from lxml import etree
from PIL import Image

PAGE_NS = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}'


def generate_xml_for_image(image_path=None):
    '''
    Prepares PAGE XML for upload, inserting the image filename and dimensions as required.
    Returns the name of the XML file it writes.
    '''
    with open('page_xml.xml', 'rb') as xml_file:
        template = etree.parse(xml_file)
    root = template.getroot()
    # Get the dimensions of the row image
    img = Image.open(image_path)
    w, h = img.size
    image_file = os.path.basename(image_path)
    # Point the Page element at the image
    page = root.find(PAGE_NS + 'Page')
    page.set('imageFilename', image_file)
    page.set('imageWidth', str(w))
    page.set('imageHeight', str(h))
    # The single TextRegion covers the whole image
    tr = page.find(PAGE_NS + 'TextRegion')
    tr_coords = tr.find(PAGE_NS + 'Coords')
    tr_coords.set('points', '0,0 0,{h} {w},{h} {w},0'.format(w=w, h=h))
    # The single TextLine sits 15 pixels inside the region
    tl = tr.find(PAGE_NS + 'TextLine')
    tl_coords = tl.find(PAGE_NS + 'Coords')
    tl_coords.set('points', '15,15 {w},15 {w},{h} 15,{h}'.format(w=w - 15, h=h - 15))
    xml_file_name = '{}.xml'.format(image_file[:-4])
    with open(xml_file_name, 'wb') as new_xml:
        new_xml.write(etree.tostring(template, pretty_print=True))
    return xml_file_name
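# `generate_xml_for_image()` above reads a `page_xml.xml` template from the working directory, but that file isn't included in this notebook. The next cell is a rough sketch of what a minimal PAGE (2013-07-15) template might look like -- it only contains the elements the function looks up (`Page`, `TextRegion`, `TextLine` and their `Coords`), and the metadata values and ids are placeholders, not anything required by Transkribus.

# In[ ]:

# Write a guessed minimal page_xml.xml template to the working directory.
# The dimensions and coordinate points are dummies -- generate_xml_for_image()
# overwrites them for each row image.
page_xml_template = '''<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">
  <Metadata>
    <Creator>Transkribus workflow tests</Creator>
    <Created>2019-01-01T00:00:00</Created>
    <LastChange>2019-01-01T00:00:00</LastChange>
  </Metadata>
  <Page imageFilename="" imageWidth="0" imageHeight="0">
    <TextRegion id="region_1">
      <Coords points="0,0 0,0 0,0 0,0"/>
      <TextLine id="line_1">
        <Coords points="0,0 0,0 0,0 0,0"/>
        <TextEquiv>
          <Unicode></Unicode>
        </TextEquiv>
      </TextLine>
    </TextRegion>
  </Page>
</PcGts>
'''

with open('page_xml.xml', 'w') as template_file:
    template_file.write(page_xml_template)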
# In[12]:

# Basic template for the upload metadata; upload_doc() fills in the real values
doc_payload = {
    "md": {
        "title": "Test",
        "author": "Sydney Stock Exchange",
        "genre": "",
        "writer": ""
    },
    "pageList": {"pages": [
        {
            "fileName": "N193-150_0428-col-2-14.jpg",
            "pageXmlName": "N193-150_0428-col-2-14.xml",
            "pageNr": 1
        }
    ]}
}

# In[13]:

def upload_doc(coll_id, image_path, doc_name='Test'):
    '''
    Uploads image and XML files to Transkribus.
    '''
    # Prepare the PAGE XML file and get its filename
    xml_file = generate_xml_for_image(image_path)
    image_file = os.path.basename(image_path)
    # Copy and modify the payload template (deepcopy so the nested dicts aren't shared)
    payload = copy.deepcopy(doc_payload)
    payload['md']['title'] = doc_name
    payload['pageList']['pages'][0]['fileName'] = image_file
    payload['pageList']['pages'][0]['pageXmlName'] = xml_file
    # Post the metadata
    response = s.post('https://transkribus.eu/TrpServer/rest/uploads?collId={}'.format(coll_id), json=payload)
    print(response.url)
    # Get the upload id from the response to submit with the files
    upload_id = response.json()['uploadId']
    print(upload_id)
    # Upload the image and xml files
    with open(image_path, 'rb') as img, open(xml_file, 'rb') as xml:
        files = {'img': img, 'xml': xml}
        response = s.put('https://transkribus.eu/TrpServer/rest/uploads/{}'.format(upload_id), files=files)
    return upload_id

# In[14]:

doc_id = upload_doc(40099, 'data/columns/rows-test/sample/N193-150_0428-col-2-14.jpg')

# ## Initiate HTR

# In[14]:

params = {
    'id': doc_id,
    'pages': 1
}
h = s.post('https://transkribus.eu/TrpServer/rest/recognition/{}/{}/htrCITlab'.format(40099, 133), params=params)

# In[16]:

job_id = h.json()

# In[17]:

# Check on the status of the job (put in a loop)
j = s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id))

# In[19]:

j.json()['success']

# In[20]:

# If job success is true, get the full document details
r = s.get('https://transkribus.eu/TrpServer/rest/collections/{}/{}/fulldoc'.format(40099, doc_id))

# In[21]:

# Get the url of the latest transcript for the first page and download it
page = 0
docinfo = r.json()
xml_url = docinfo['pageList']['pages'][page]['tsList']['transcripts'][0]['url']
x = requests.get(xml_url)
x.text

# Save as xml

# In[ ]:
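# Two loose ends from the cells above: the job status check is meant to run in a loop, and the downloaded PAGE XML still needs to be saved. The cell below is a rough sketch of both, reusing the `s` session, `job_id`, `x` and `etree` from earlier -- the polling interval, the output filename, and relying on the `success` flag alone (rather than also checking for failed jobs) are simplifying assumptions, not Transkribus requirements.

# In[ ]:

import time

# Poll the job until it reports success (a more careful loop would also bail
# out if the job fails or is cancelled)
while not s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id)).json()['success']:
    time.sleep(30)

# Save the downloaded PAGE XML next to the notebook (arbitrary filename)
with open('N193-150_0428-col-2-14-htr.xml', 'w', encoding='utf-8') as xml_out:
    xml_out.write(x.text)

# Pull the transcribed text out of the PAGE XML
ns = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
tree = etree.fromstring(x.content)
for line in tree.findall('.//page:TextLine/page:TextEquiv/page:Unicode', namespaces=ns):
    print(line.text)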