Transkribus workflow tests

Demonstrate basic workflow components for individual rows extracted from Stock Exchange images.

  • Log into Transkribus
  • Create a collection
  • From a row image generate an xml file in the Transkribus Page format
  • Upload image and xml file to Transkribus
  • Trigger HTR
  • Download XML with results of HTR
In [8]:
!pip install lxml
Requirement already satisfied (use --upgrade to upgrade): lxml in /cvmfs/sft.cern.ch/lcg/views/LCG_91python3/x86_64-slc6-gcc62-opt/lib/python3.5/site-packages/lxml-2.3-py3.5-linux-x86_64.egg
You are using pip version 8.1.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
In [1]:
import copy
import getpass
import os

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# A single session is shared by every API call so that the login cookie is reused.
s = requests.Session()
# Retry up to 5 times, with backoff, when the server answers 502, 503 or 504.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
# Use JSON rather than XML.
# Update the existing header mapping instead of replacing it, so the session
# keeps its other default headers and only the Accept header changes.
s.headers.update({'Accept': 'application/json'})

Login

Saves a JSESSIONID cookie in the session for future requests.

In [2]:
# Credentials are taken from the TRANSKRIBUS_USER / TRANSKRIBUS_PASSWORD
# environment variables; anything missing is prompted for (the password without
# echo), so that no secret is stored in the notebook itself.
# NOTE(review): a password was previously hard-coded in this cell — treat it as
# compromised and change it.
credentials = {
    'user': os.environ.get('TRANSKRIBUS_USER') or input('Transkribus user: '),
    'pw': os.environ.get('TRANSKRIBUS_PASSWORD') or getpass.getpass('Transkribus password: ')
}
In [3]:
# Login
response = s.post('https://transkribus.eu/TrpServer/rest/auth/login', data=credentials)
# Stop here, rather than in a later cell, if the server rejected the login
response.raise_for_status()
# Check that JSESSIONID has been set
# NOTE(review): the cookie value shown in the output is a session id — clear
# this cell's output before sharing the notebook.
response.cookies
Out[3]:
<RequestsCookieJar[Cookie(version=0, name='JSESSIONID', value='0F55A6E6A69248DE8E13BB1B393E91B5', port=None, port_specified=False, domain='.transkribus.eu', domain_specified=True, domain_initial_dot=False, path='/TrpServer', path_specified=True, secure=True, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)]>

View collections

In [4]:
# Get collections
response = s.get('https://transkribus.eu/TrpServer/rest/collections/list')
# Surface HTTP errors now instead of failing later when the body is decoded
response.raise_for_status()
In [5]:
# Decode the JSON body of the collections response and display it
response.json()
Out[5]:
[{'colId': 4,
  'colName': 'Transkribus Cloud',
  'crowdsourcing': False,
  'defaultForApp': 'ALL',
  'elearning': False,
  'nrOfDocuments': 19,
  'pageId': 11941,
  'role': 'Transcriber',
  'thumbUrl': 'https://dbis-thure.uibk.ac.at/f/Get?id=IAZSJCHTXHCUXXLQRMIBNWXN&fileType=thumb',
  'type': 'trpCollection',
  'url': 'https://dbis-thure.uibk.ac.at/f/Get?id=IAZSJCHTXHCUXXLQRMIBNWXN&fileType=view'},
 {'colId': 21108,
  'colName': 'ANU-Archives-Sydney-Stock-Exchange',
  'crowdsourcing': False,
  'description': 'created by [email protected]',
  'elearning': False,
  'nrOfDocuments': 76,
  'pageId': 3301753,
  'role': 'Owner',
  'thumbUrl': 'https://dbis-thure.uibk.ac.at/f/Get?id=CSELIXSKNOXVKGYUHGKUEWGD&fileType=thumb',
  'type': 'trpCollection',
  'url': 'https://dbis-thure.uibk.ac.at/f/Get?id=CSELIXSKNOXVKGYUHGKUEWGD&fileType=view'},
 {'colId': 40091,
  'colName': 'Stock-Exchange-Rows-Test',
  'crowdsourcing': False,
  'description': 'created by [email protected]',
  'elearning': False,
  'nrOfDocuments': 2,
  'pageId': 5830132,
  'role': 'Owner',
  'thumbUrl': 'https://dbis-thure.uibk.ac.at/f/Get?id=ZILBLAGOJDDCGBZCDHDCIVXV&fileType=thumb',
  'type': 'trpCollection',
  'url': 'https://dbis-thure.uibk.ac.at/f/Get?id=ZILBLAGOJDDCGBZCDHDCIVXV&fileType=view'},
 {'colId': 40099,
  'colName': 'api-test',
  'crowdsourcing': False,
  'description': 'created by [email protected]',
  'elearning': False,
  'nrOfDocuments': 17,
  'pageId': 5878293,
  'role': 'Owner',
  'thumbUrl': 'https://dbis-thure.uibk.ac.at/f/Get?id=CHZRPWYXWNMGHUBQYXNZADNZ&fileType=thumb',
  'type': 'trpCollection',
  'url': 'https://dbis-thure.uibk.ac.at/f/Get?id=CHZRPWYXWNMGHUBQYXNZADNZ&fileType=view'},
 {'colId': 40536,
  'colName': 'api-test2',
  'crowdsourcing': False,
  'description': 'created by [email protected]',
  'elearning': False,
  'nrOfDocuments': 0,
  'role': 'Owner',
  'type': 'trpCollection'},
 {'colId': 40537,
  'colName': 'api-test2',
  'crowdsourcing': False,
  'description': 'created by [email protected]',
  'elearning': False,
  'nrOfDocuments': 0,
  'role': 'Owner',
  'type': 'trpCollection'}]

Create a new collection

In [ ]:
# Name of the collection to create, sent as the 'collName' query parameter.
# NOTE(review): every run of this cell issues another create request with the
# same name; the collections listing already contains two 'api-test2' entries.
new_collection = {
    'collName': 'api-test2'
}
coll_response = s.post('https://transkribus.eu/TrpServer/rest/collections/createCollection', params=new_collection)
# Surface HTTP errors instead of silently continuing
coll_response.raise_for_status()
In [ ]:
# Display the raw body of the create-collection response
coll_response.text

Upload images and xml

In [11]:
# import lxml
from lxml import etree
from PIL import Image

def generate_xml_for_image(image_path=None):
    '''
    Prepares a PAGE XML file for upload, inserting the image name and dimensions.

    Reads the template 'page_xml.xml' from the current working directory, sets
    the Page attributes and the Coords of the first TextRegion and of its first
    TextLine, and writes the result to '<image file name without extension>.xml'
    in the current working directory.

    Parameters:
        image_path -- path to the image file. Required; the None default is
                      kept only so the signature stays unchanged.

    Returns:
        The name of the XML file that was written.

    Raises:
        ValueError -- if image_path is not supplied.
    '''
    if not image_path:
        raise ValueError('image_path is required')
    ns = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}'
    # Inset, in pixels, of the text line box from the image edges
    margin = 15
    with open('page_xml.xml', 'rb') as xml_file:
        template = etree.parse(xml_file)
    root = template.getroot()
    # Only the dimensions are needed, so release the image file straight away
    with Image.open(image_path) as img:
        w, h = img.size
    image_file = os.path.basename(image_path)
    page = root.find(ns + 'Page')
    page.set('imageFilename', image_file)
    page.set('imageWidth', str(w))
    page.set('imageHeight', str(h))
    # The text region covers the whole image. Points are space-separated "x,y"
    # pairs; there must be no comma between one pair and the next.
    tr = page.find(ns + 'TextRegion')
    tr_coords = tr.find(ns + 'Coords')
    tr_coords.set('points', '0,0 0,{h} {w},{h} {w},0'.format(w=w, h=h))
    # The text line is the same rectangle inset by the margin on every side
    tl = tr.find(ns + 'TextLine')
    tl_coords = tl.find(ns + 'Coords')
    tl_coords.set('points', '{m},{m} {w},{m} {w},{h} {m},{h}'.format(m=margin, w=w - margin, h=h - margin))
    # splitext copes with extensions of any length (e.g. '.jpeg', '.tiff')
    xml_name = '{}.xml'.format(os.path.splitext(image_file)[0])
    with open(xml_name, 'wb') as new_xml:
        new_xml.write(etree.tostring(template, pretty_print=True))
    return xml_name
        
In [12]:
# Skeleton of the metadata posted when registering an upload: document-level
# fields under "md" and a single page entry under "pageList". upload_doc
# overwrites the title and the two file names before sending it.
doc_payload = dict(
    md=dict(
        title="Test",
        author="Sydney Stock Exchange",
        genre="",
        writer=""
    ),
    pageList=dict(
        pages=[
            dict(
                fileName="N193-150_0428-col-2-14.jpg",
                pageXmlName="N193-150_0428-col-2-14-1.xml",
                pageNr=1
            )
        ]
    )
)
In [13]:
def upload_doc(coll_id, image_path, doc_name='Test'):
    '''
    Uploads an image and its PAGE XML file to Transkribus.

    Parameters:
        coll_id -- id of the collection to upload into.
        image_path -- path to the image file.
        doc_name -- title written into the upload metadata.

    Returns:
        The 'uploadId' value from the response to the metadata request.

    Raises:
        requests.HTTPError -- if either request returns an error status.
    '''
    image_file = os.path.basename(image_path)
    # Prepare XML file. The helper writes '<image name minus its last 4
    # characters>.xml' to the working directory; prefer a name it returns and
    # otherwise derive that same name, so the file uploaded is the one generated.
    xml_file = generate_xml_for_image(image_path) or '{}.xml'.format(image_file[:-4])
    # Work on a deep copy: the template's nested dicts are shared, so a shallow
    # copy would let these assignments alter doc_payload itself.
    payload = copy.deepcopy(doc_payload)
    payload['md']['title'] = doc_name
    payload['pageList']['pages'][0]['fileName'] = image_file
    payload['pageList']['pages'][0]['pageXmlName'] = xml_file
    # Post metadata
    response = s.post('https://transkribus.eu/TrpServer/rest/uploads?collId={}'.format(coll_id), json=payload)
    response.raise_for_status()
    print(response.url)
    # Get upload id from response to submit with files
    upload_id = response.json()['uploadId']
    print(upload_id)
    # Upload the xml and image files, closing both once the request has been sent
    with open(image_path, 'rb') as img_fh, open(xml_file, 'rb') as xml_fh:
        response = s.put(
            'https://transkribus.eu/TrpServer/rest/uploads/{}'.format(upload_id),
            files={'img': img_fh, 'xml': xml_fh}
        )
    response.raise_for_status()
    return upload_id
    
In [14]:
# Upload the sample row image to collection 40099.
# NOTE(review): upload_doc returns the upload id; the cells below use it as the
# document id — confirm the two are the same.
doc_id = upload_doc(40099, 'data/columns/rows-test/sample/N193-150_0428-col-2-14.jpg')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-14-c32a3d3dc1b0> in <module>()
----> 1 doc_id = upload_doc(40099, 'data/columns/rows-test/sample/N193-150_0428-col-2-14.jpg')

<ipython-input-13-cad04b74991a> in upload_doc(coll_id, image_path, doc_name)
      4     '''
      5     # Prepare XML file
----> 6     generate_xml_for_image(image_path)
      7     image_file = os.path.basename(image_path)
      8     xml_file = '{}-1.xml'.format(image_file[:-4])

<ipython-input-11-aabe38252e42> in generate_xml_for_image(image_path)
      8     '''
      9     with open('page_xml.xml', 'rb') as xml_file:
---> 10         template = lxml.etree.parse(xml_file)
     11     print(template)
     12     root = template.getroot()

AttributeError: module 'lxml' has no attribute 'etree'

Initiate HTR

In [14]:
# Collection holding the uploaded document ('api-test' in the collections listing)
COLL_ID = 40099
# Second id in the recognition URL; the downloaded transcript's Creator
# metadata reports htr_id=133, so this is presumably the HTR model id.
HTR_MODEL_ID = 133
# Document and page to run recognition on
params = {
    'id': doc_id,
    'pages': 1
}
h = s.post('https://transkribus.eu/TrpServer/rest/recognition/{}/{}/htrCITlab'.format(COLL_ID, HTR_MODEL_ID), params=params)
# Surface HTTP errors before the body is read as a job id
h.raise_for_status()
In [16]:
# The decoded response body is kept as the job id; it is interpolated into the
# jobs URL when the status is checked.
job_id = h.json()
In [17]:
# Check on the status of the job (put in a loop)
j = s.get('https://transkribus.eu/TrpServer/rest/jobs/{}'.format(job_id))
# Surface HTTP errors instead of failing later when the body is decoded
j.raise_for_status()
In [19]:
# Display the 'success' field of the job status response
j.json()['success']
Out[19]:
True
In [20]:
# If job success is true, then get the results
r = s.get('https://transkribus.eu/TrpServer/rest/collections/{}/{}/fulldoc'.format(40099, doc_id))
# Surface HTTP errors instead of failing later when the body is decoded
r.raise_for_status()
In [21]:
# Get the results
# Index of the page within the document's page list
page = 0
docinfo = r.json()
# URL of the first transcript listed for that page
xml_url = docinfo['pageList']['pages'][page]['tsList']['transcripts'][0]['url']
# This request goes outside the session, so give it an explicit timeout
# (requests waits indefinitely by default) and check the status before use.
x = requests.get(xml_url, timeout=60)
x.raise_for_status()
x.text # Save as xml 
Out[21]:
'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">\n    <Metadata>\n        <Creator>prov=University of Rostock/Institute of Mathematics/CITlab/Gundram Leifert/[email protected]:name=English Writing M1(htr_id=133)::::v=2.3.1\nprov=University of Rostock/Institute of Mathematics/CITlab/Tobias Gruening/[email protected]:name=de.uros.citlab.module.baseline2polygon.B2PSeamMultiOriented:v=2.3.1\nnull</Creator>\n        <LastChange>2019-05-08T08:27:34.876+02:00</LastChange>\n    </Metadata>\n    <Page imageFilename="N193-150_0428-col-2-14.jpg" imageWidth="1115" imageHeight="86">\n        <ReadingOrder>\n            <OrderedGroup id="ro_1557296855054" caption="Regions reading order">\n                <RegionRefIndexed index="0" regionRef="r1"/>\n            </OrderedGroup>\n        </ReadingOrder>\n        <TextRegion id="r1" custom="readingOrder {index:0;}">\n            <Coords points="0,0 0,80 1110,80, 1110,0"/>\n            <TextLine id="l1" custom="readingOrder {index:0;}">\n                <Coords points="17,282 23,282 25,281 26,279 27,278 29,277 30,275 31,274 33,273 34,271 35,270 37,269 38,267 51,254 52,252 54,251 55,249 56,248 58,247 59,245 60,244 62,243 63,241 64,240 66,239 67,237 76,228 77,226 79,225 80,224 81,222 83,221 84,220 85,218 87,217 88,215 89,214 91,213 92,211 93,210 95,209 96,207 97,206 99,205 100,203 101,202 102,200 104,199 105,198 106,196 108,195 109,194 110,192 112,191 113,190 114,188 116,187 117,185 118,184 120,183 121,181 122,180 124,179 125,177 134,168 135,166 137,165 138,164 139,162 141,161 142,160 143,158 145,157 146,156 147,154 149,153 150,151 163,138 164,136 166,135 167,134 168,132 170,131 171,130 172,128 174,127 175,126 176,124 177,123 
179,122 180,120 196,104 197,102 199,101 200,100 201,98 202,97 204,96 205,94 206,93 208,92 209,90 216,83 217,83 218,82 220,81 221,79 222,78 224,77 225,77 226,78 228,79 229,79 233,75 236,75 237,76 240,76 241,78 242,78 244,79 253,79 254,80 256,81 257,83 258,84 260,85 267,85 268,84 269,84 271,82 273,82 275,81 276,82 277,81 279,80 280,78 282,76 284,77 286,77 288,78 289,79 296,79 297,81 299,82 300,83 301,85 304,82 305,82 307,81 308,79 309,78 311,78 312,79 313,79 315,78 320,78 321,76 324,76 325,78 327,76 328,76 329,75 332,75 333,76 336,76 340,80 341,80 343,79 344,79 345,80 349,80 351,81 354,81 355,80 356,78 360,78 361,77 363,76 365,76 367,77 370,77 375,82 376,82 378,81 383,81 384,79 386,79 387,78 388,76 390,76 391,75 396,75 398,76 402,76 406,72 414,72 415,73 416,75 418,73 419,73 420,72 423,72 424,73 426,73 427,72 434,72 435,73 438,73 439,74 440,76 442,77 446,81 449,81 451,83 455,83 457,82 458,81 466,81 467,80 469,79 470,78 475,78 477,79 478,79 479,80 483,80 485,79 487,77 489,77 490,79 497,79 499,81 503,81 506,84 508,85 509,86 530,86 532,85 533,83 534,82 536,81 537,82 541,82 544,85 546,85 548,84 552,84 556,80 560,80 565,75 570,75 572,73 573,75 574,75 576,76 581,76 582,77 584,77 585,79 586,78 588,78 589,80 590,81 592,82 593,84 595,85 596,86 608,86 609,85 611,85 612,83 613,85 615,86 616,87 617,89 619,90 620,91 621,93 623,94 632,103 634,104 635,106 636,107 638,108 639,110 641,111 642,112 643,114 645,115 646,115 651,120 653,120 654,122 655,122 658,119 659,117 661,116 662,115 663,113 665,112 666,111 667,109 669,108 670,107 671,107 673,106 675,106 677,105 678,104 679,102 681,101 682,100 683,98 684,97 686,96 687,94 692,89 703,89 704,87 706,87 707,86 708,84 710,83 711,82 712,80 714,79 715,79 720,74 722,74 724,76 726,77 727,79 728,80 730,81 731,83 732,84 734,85 738,85 740,83 742,83 743,81 747,81 748,80 750,81 757,88 766,88 767,87 769,86 770,86 775,81 785,81 786,80 787,80 790,83 795,83 797,82 798,82 799,80 801,79 802,78 803,78 805,77 806,76 807,76 809,75 810,73 823,73 825,72 827,72 
829,73 830,73 835,78 837,78 838,77 839,78 841,77 842,77 844,78 845,79 846,79 848,81 850,81 852,79 853,81 854,81 856,79 858,79 860,81 862,81 865,78 866,78 868,79 874,79 876,80 877,79 880,79 881,78 882,76 884,76 885,77 888,77 889,79 890,80 892,81 893,83 894,84 896,85 907,85 908,84 911,84 912,82 916,78 917,78 918,77 920,78 921,80 923,81 924,81 925,82 927,83 928,85 929,86 943,86 945,84 959,84 960,83 972,83 973,81 975,80 977,80 979,81 980,83 982,84 983,85 991,85 992,84 995,84 996,85 998,86 1027,86 1028,84 1030,83 1032,83 1034,82 1035,80 1036,79 1039,79 1040,80 1042,80 1043,82 1044,83 1047,83 1048,81 1050,80 1051,79 1054,79 1055,77 1056,77 1059,80 1062,80 1063,81 1065,82 1066,84 1067,85 1079,85 1081,84 1082,82 1085,79 1098,79 1097,3 1089,3 1087,4 1085,4 1083,3 1082,2 1081,2 1079,3 1075,3 1074,5 1071,5 1070,3 1069,2 1067,1 1066,-1 1064,-1 1063,1 1062,2 1061,4 1056,4 1055,5 1052,5 1051,4 1050,2 1047,2 1046,4 1044,4 1043,3 1042,1 1039,1 1038,0 1036,-1 1031,-1 1030,0 1028,1 1027,3 1026,3 1024,4 1011,4 1010,3 1008,3 1007,2 1006,0 1004,-1 1003,-1 1002,1 1000,2 999,3 998,5 996,5 995,3 994,3 991,6 986,6 983,9 979,9 978,10 976,9 963,9 961,8 959,8 957,7 955,7 953,5 948,5 947,4 933,4 932,3 931,3 929,2 928,0 927,-1 919,-1 915,3 912,3 911,2 909,1 906,1 905,-1 904,-2 897,-2 896,0 894,1 893,2 892,4 890,5 889,5 888,6 887,8 885,9 873,9 872,11 868,11 866,12 865,14 855,24 853,24 852,26 848,30 847,30 845,31 843,29 840,29 839,27 837,27 836,29 833,29 832,27 831,28 829,28 824,33 817,33 816,32 815,30 813,30 812,31 811,31 808,28 806,27 805,25 804,25 802,24 793,24 792,23 790,24 789,23 788,23 786,24 785,26 784,27 781,27 780,28 778,28 777,30 776,31 772,31 766,37 764,37 762,38 761,38 760,37 756,37 752,33 748,33 746,34 745,36 742,36 741,34 740,33 738,33 737,34 736,36 734,37 733,39 730,39 728,41 726,40 725,39 724,37 720,37 718,36 717,36 713,32 712,32 710,34 708,34 706,32 703,32 699,28 698,30 697,31 695,32 694,31 693,31 691,30 690,29 689,27 687,26 686,26 683,23 675,23 674,25 673,25 671,24 667,28 666,26 
665,26 662,29 659,29 658,28 657,26 654,26 653,28 651,29 650,29 649,31 647,32 646,33 643,33 642,35 641,36 639,37 638,39 637,40 634,40 633,42 629,42 622,35 621,35 619,34 618,32 616,31 614,31 612,33 611,31 610,30 608,29 607,29 606,27 604,29 603,29 602,30 600,31 598,31 596,30 595,29 594,29 592,30 586,30 584,29 583,29 582,28 580,28 579,26 578,25 576,24 573,24 572,25 571,27 569,25 567,25 565,24 563,24 561,25 560,27 559,28 557,29 556,31 555,32 554,34 552,34 551,35 546,35 544,34 543,34 541,32 540,32 539,34 538,35 536,36 535,38 531,42 528,42 527,43 526,45 524,45 523,43 522,42 520,41 519,39 518,41 509,41 508,40 507,38 505,38 504,40 503,40 501,41 500,42 499,44 498,45 496,45 495,47 494,48 492,48 491,47 489,47 488,48 480,48 479,47 477,46 476,44 475,43 473,44 471,44 469,43 468,42 467,40 461,40 460,42 457,42 456,41 455,41 453,39 449,39 448,41 447,41 445,39 443,39 441,38 440,38 438,39 437,41 435,41 433,40 432,40 430,41 429,42 428,44 427,45 425,46 424,48 423,46 420,46 419,45 413,45 412,47 409,44 405,44 404,43 402,43 401,41 400,40 394,40 393,39 389,39 385,43 382,43 381,44 380,46 377,46 376,45 368,45 366,43 365,43 364,42 362,42 361,41 360,39 358,38 357,37 355,36 353,36 349,32 341,32 339,33 338,34 337,34 334,37 333,39 331,40 330,40 327,43 326,43 325,44 323,43 322,41 319,41 318,42 317,40 313,40 307,46 306,46 305,47 303,46 302,44 301,43 297,43 295,42 294,42 291,45 289,45 282,38 280,37 279,35 278,35 276,34 268,26 263,26 262,28 260,28 259,26 258,26 256,28 254,28 252,29 251,29 250,31 248,32 247,33 246,35 244,36 243,35 242,33 240,32 235,32 234,34 227,34 226,35 224,35 216,27 213,27 212,26 211,27 204,27 203,26 200,26 199,27 197,26 196,26 195,28 193,26 192,28 191,28 189,29 188,30 187,32 185,33 180,33 177,30 176,31 175,29 173,28 172,27 170,27 169,25 166,25 165,27 162,27 161,25 157,25 156,27 154,28 153,30 152,31 150,30 149,28 148,27 146,27 145,26 142,26 140,28 138,28 137,27 136,27 134,29 133,29 132,30 129,30 125,34 124,34 122,33 118,33 117,32 116,30 114,29 113,30 112,32 110,33 106,33 105,34 
104,34 102,36 101,36 100,34 98,33 97,33 96,32 94,31 93,29 92,28 90,28 89,27 87,27 86,25 85,25 82,28 79,25 78,27 77,28 74,28 73,30 69,30 66,27 63,27 62,26 54,26 51,29 47,29 46,27 43,27 42,29 41,30 39,31 38,33 33,38 31,37 30,36 29,37 27,38 23,38 19,34 18,35 13,35"/>\n                <Baseline points="20,55 120,55 220,56 420,59 520,58 620,57 720,56 920,47 1020,34 1090,33"/>\n                <TextEquiv>\n                    <Unicode>f Soha SteCralh  of0sf e</Unicode>\n                </TextEquiv>\n            </TextLine>\n            <TextEquiv>\n                <Unicode>f Soha SteCralh  of0sf e</Unicode>\n            </TextEquiv>\n        </TextRegion>\n    </Page>\n</PcGts>\n'
In [ ]: