LayoutXLM was proposed in LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
It is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. More, it’s a multilingual extension of the LayoutLMv2 model trained on 53 languages.
It relies on an external OCR engine to get words and bboxes from the document image. Thus, let's run in this APP an OCR engine ourselves (PyTesseract) as we'll need to do it in real life to get the bounding boxes, then run LayoutXLM base (already fine-tuned on the DocLayNet dataset at paragraph level: pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512) on the individual tokens and visualize the result at paragraph level!
%%capture
!pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
%%capture
# !sudo apt-get install poppler-utils
# !pip install pdf2image
# source: https://levelup.gitconnected.com/4-python-libraries-to-convert-pdf-to-images-7a09eba83a09
# source: https://pypi.org/project/pypdfium2/
!pip install -U pypdfium2
%%capture
!pip install -q langdetect
%%capture
!sudo apt install tesseract-ocr-all # english + osd (Orientation and script detection module)
# !sudo apt-get install tesseract-ocr-por # portuguese
# import os
# print(os.popen(f'cat /etc/debian_version').read())
# print(os.popen(f'cat /etc/issue').read())
# print(os.popen(f'apt search tesseract').read())
!pip install pytesseract
!pip install -q transformers sentencepiece datasets gradio pypdf
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.8/6.8 MB 33.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 43.5 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 468.7/468.7 KB 30.0 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.7/15.7 MB 37.9 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 246.9/246.9 KB 17.9 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 28.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.8/199.8 KB 9.9 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 212.2/212.2 KB 15.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 110.5/110.5 KB 8.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.9/132.9 KB 11.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 30.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.5/129.5 KB 10.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 71.5/71.5 KB 6.5 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 45.7/45.7 KB 3.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.1/57.1 KB 366.6 kB/s eta 0:00:00 Preparing metadata (setup.py) ... done ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.8/57.8 KB 5.1 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.1/144.1 KB 9.5 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50.5/50.5 KB 4.0 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 114.2/114.2 KB 7.5 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 264.6/264.6 KB 15.0 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 158.8/158.8 KB 9.0 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.9/66.9 KB 3.2 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 69.6/69.6 KB 4.5 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 KB 4.4 MB/s eta 0:00:00 Building wheel for ffmpy (setup.py) ... done
import os
from operator import itemgetter
import collections
import string
import re
import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError
import pypdfium2 as pdfium
import langdetect
from langdetect import detect_langs
import pytesseract
import pandas as pd
import numpy as np
import random
from google.colab import files
import tempfile
from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2
# In Colab, use cv2_imshow instead of cv2.imshow
from google.colab.patches import cv2_imshow
from IPython.display import display
import itertools
import gradio as gr
import pathlib
from pathlib import Path
import shutil
# categories colors
label2color = {
'Caption': 'brown',
'Footnote': 'orange',
'Formula': 'gray',
'List-item': 'yellow',
'Page-footer': 'red',
'Page-header': 'red',
'Picture': 'violet',
'Section-header': 'orange',
'Table': 'green',
'Text': 'blue',
'Title': 'pink'
}
# bounding boxes start and end of a sequence
cls_box = [0, 0, 0, 0]
sep_box = [1000, 1000, 1000, 1000]
# model
model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
# tokenizer
tokenizer_id = "xlm-roberta-base"
# (tokenization) The maximum length of a feature (sequence)
if str(384) in model_id:
max_length = 384
elif str(512) in model_id:
max_length = 512
else:
print("Error with max_length of chunks!")
# (tokenization) overlap
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
# max PDF page images that will be displayed
max_imgboxes = 2
# get files
examples_dir = 'files/'
Path(examples_dir).mkdir(parents=True, exist_ok=True)
from huggingface_hub import hf_hub_download
files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
for file_name in files:
path_to_file = hf_hub_download(
repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
filename = "files/" + file_name,
repo_type = "space"
)
shutil.copy(path_to_file,examples_dir)
# path to files
image_wo_content = examples_dir + "wo_content.png" # image without content
pdf_blank = examples_dir + "blank.pdf" # blank PDF
image_blank = examples_dir + "blank.png" # blank image
## get langdetect2Tesseract dictionary
t = "files/languages_tesseract.csv"
l = "files/languages_iso.csv"
df_t = pd.read_csv(t)
df_l = pd.read_csv(l)
langs_t = df_t["Language"].to_list()
langs_t = [lang_t.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_t in langs_t]
langs_l = df_l["Language"].to_list()
langs_l = [lang_l.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_l in langs_l]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()
Tesseract2langdetect, langdetect2Tesseract = dict(), dict()
for lang_t, langcode_t in zip(langs_t,langscode_t):
try:
if lang_t == "Chinese - Simplified".lower().strip().translate(str.maketrans('', '', string.punctuation)): lang_t = "chinese"
index = langs_l.index(lang_t)
langcode_l = langscode_l[index]
Tesseract2langdetect[langcode_t] = langcode_l
except:
continue
langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
Downloading (…)in/files/example.pdf: 0%| | 0.00/343k [00:00<?, ?B/s]
Downloading (…)main/files/blank.pdf: 0%| | 0.00/1.15k [00:00<?, ?B/s]
Downloading (…)main/files/blank.png: 0%| | 0.00/3.99k [00:00<?, ?B/s]
Downloading (…)es/languages_iso.csv: 0%| | 0.00/2.52k [00:00<?, ?B/s]
Downloading (…)guages_tesseract.csv: 0%| | 0.00/2.13k [00:00<?, ?B/s]
Downloading (…)files/wo_content.png: 0%| | 0.00/27.6k [00:00<?, ?B/s]
# get text and bounding boxes from an image
# https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract
# https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655
def get_data_paragraph(results, factor, conf_min=0):
data = {}
for i in range(len(results['line_num'])):
level = results['level'][i]
block_num = results['block_num'][i]
par_num = results['par_num'][i]
line_num = results['line_num'][i]
top, left = results['top'][i], results['left'][i]
width, height = results['width'][i], results['height'][i]
conf = results['conf'][i]
text = results['text'][i]
if not (text == '' or text.isspace()):
if conf >= conf_min:
tup = (text, left, top, width, height)
if block_num in list(data.keys()):
if par_num in list(data[block_num].keys()):
if line_num in list(data[block_num][par_num].keys()):
data[block_num][par_num][line_num].append(tup)
else:
data[block_num][par_num][line_num] = [tup]
else:
data[block_num][par_num] = {}
data[block_num][par_num][line_num] = [tup]
else:
data[block_num] = {}
data[block_num][par_num] = {}
data[block_num][par_num][line_num] = [tup]
# get paragraphs dicionnary with list of lines
par_data = {}
par_idx = 1
for _, b in data.items():
for _, p in b.items():
line_data = {}
line_idx = 1
for _, l in p.items():
line_data[line_idx] = l
line_idx += 1
par_data[par_idx] = line_data
par_idx += 1
# get lines of texts, grouped by paragraph
texts_pars = list()
row_indexes = list()
texts_lines = list()
texts_lines_par = list()
row_index = 0
for _,par in par_data.items():
count_lines = 0
lines_par = list()
for _,line in par.items():
if count_lines == 0: row_indexes.append(row_index)
line_text = ' '.join([item[0] for item in line])
texts_lines.append(line_text)
lines_par.append(line_text)
count_lines += 1
row_index += 1
# lines.append("\n")
row_index += 1
texts_lines_par.append(lines_par)
texts_pars.append(' '.join(lines_par))
# lines = lines[:-1]
# get paragraphes boxes (par_boxes)
# get lines boxes (line_boxes)
par_boxes = list()
par_idx = 1
line_boxes, lines_par_boxes = list(), list()
line_idx = 1
for _, par in par_data.items():
xmins, ymins, xmaxs, ymaxs = list(), list(), list(), list()
line_boxes_par = list()
count_line_par = 0
for _, line in par.items():
xmin, ymin = line[0][1], line[0][2]
xmax, ymax = (line[-1][1] + line[-1][3]), (line[-1][2] + line[-1][4])
line_boxes.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
line_boxes_par.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
xmins.append(xmin)
ymins.append(ymin)
xmaxs.append(xmax)
ymaxs.append(ymax)
line_idx += 1
count_line_par += 1
xmin, ymin, xmax, ymax = min(xmins), min(ymins), max(xmaxs), max(ymaxs)
par_bbox = [int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)]
par_boxes.append(par_bbox)
lines_par_boxes.append(line_boxes_par)
par_idx += 1
return texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes
# rescale image to get 300dpi
def set_image_dpi_resize(image):
"""
Rescaling image to 300dpi while resizing
:param image: An image
:return: A rescaled image
"""
length_x, width_y = image.size
factor = min(1, float(1024.0 / length_x))
size = int(factor * length_x), int(factor * width_y)
# image_resize = image.resize(size, Image.Resampling.LANCZOS)
image_resize = image.resize(size, Image.LANCZOS)
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='1.png')
temp_filename = temp_file.name
image_resize.save(temp_filename, dpi=(300, 300))
return factor, temp_filename
# it is important that each bounding box should be in (upper left, lower right) format.
# source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129
def upperleft_to_lowerright(bbox):
x0, y0, x1, y1 = tuple(bbox)
if bbox[2] < bbox[0]:
x0 = bbox[2]
x1 = bbox[0]
if bbox[3] < bbox[1]:
y0 = bbox[3]
y1 = bbox[1]
return [x0, y0, x1, y1]
# convert boundings boxes (left, top, width, height) format to (left, top, left+widght, top+height) format.
def convert_box(bbox):
x, y, w, h = tuple(bbox) # the row comes in (left, top, width, height) format
return [x, y, x+w, y+h] # we turn it into (left, top, left+widght, top+height) to get the actual box
# LiLT model gets 1000x10000 pixels images
def normalize_box(bbox, width, height):
return [
int(1000 * (bbox[0] / width)),
int(1000 * (bbox[1] / height)),
int(1000 * (bbox[2] / width)),
int(1000 * (bbox[3] / height)),
]
# LiLT model gets 1000x10000 pixels images
def denormalize_box(bbox, width, height):
return [
int(width * (bbox[0] / 1000)),
int(height * (bbox[1] / 1000)),
int(width* (bbox[2] / 1000)),
int(height * (bbox[3] / 1000)),
]
# get back original size
def original_box(box, original_width, original_height, coco_width, coco_height):
return [
int(original_width * (box[0] / coco_width)),
int(original_height * (box[1] / coco_height)),
int(original_width * (box[2] / coco_width)),
int(original_height* (box[3] / coco_height)),
]
def get_blocks(bboxes_block, categories, texts):
# get list of unique block boxes
bbox_block_dict, bboxes_block_list, bbox_block_prec = dict(), list(), list()
for count_block, bbox_block in enumerate(bboxes_block):
if bbox_block != bbox_block_prec:
bbox_block_indexes = [i for i, bbox in enumerate(bboxes_block) if bbox == bbox_block]
bbox_block_dict[count_block] = bbox_block_indexes
bboxes_block_list.append(bbox_block)
bbox_block_prec = bbox_block
# get list of categories and texts by unique block boxes
category_block_list, text_block_list = list(), list()
for bbox_block in bboxes_block_list:
count_block = bboxes_block.index(bbox_block)
bbox_block_indexes = bbox_block_dict[count_block]
category_block = np.array(categories, dtype=object)[bbox_block_indexes].tolist()[0]
category_block_list.append(category_block)
text_block = np.array(texts, dtype=object)[bbox_block_indexes].tolist()
text_block = [text.replace("\n","").strip() for text in text_block]
if id2label[category_block] == "Text" or id2label[category_block] == "Caption" or id2label[category_block] == "Footnote":
text_block = ' '.join(text_block)
else:
text_block = '\n'.join(text_block)
text_block_list.append(text_block)
return bboxes_block_list, category_block_list, text_block_list
# function to sort bounding boxes
def get_sorted_boxes(bboxes):
# sort by y from page top to bottom
sorted_bboxes = sorted(bboxes, key=itemgetter(1), reverse=False)
y_list = [bbox[1] for bbox in sorted_bboxes]
# sort by x from page left to right when boxes with same y
if len(list(set(y_list))) != len(y_list):
y_list_duplicates_indexes = dict()
y_list_duplicates = [item for item, count in collections.Counter(y_list).items() if count > 1]
for item in y_list_duplicates:
y_list_duplicates_indexes[item] = [i for i, e in enumerate(y_list) if e == item]
bbox_list_y_duplicates = sorted(np.array(sorted_bboxes, dtype=object)[y_list_duplicates_indexes[item]].tolist(), key=itemgetter(0), reverse=False)
np_array_bboxes = np.array(sorted_bboxes)
np_array_bboxes[y_list_duplicates_indexes[item]] = np.array(bbox_list_y_duplicates)
sorted_bboxes = np_array_bboxes.tolist()
return sorted_bboxes
# sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
def sort_data(bboxes, categories, texts):
sorted_bboxes = get_sorted_boxes(bboxes)
sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
sorted_categories = np.array(categories, dtype=object)[sorted_bboxes_indexes].tolist()
sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
return sorted_bboxes, sorted_categories, sorted_texts
# sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
def sort_data_wo_labels(bboxes, texts):
sorted_bboxes = get_sorted_boxes(bboxes)
sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
return sorted_bboxes, sorted_texts
# get filename and images of PDF pages
def pdf_to_images(uploaded_pdf):
# file name of the uploaded PDF
filename = next(iter(uploaded_pdf))
try:
PdfReader(filename)
except PdfReadError:
print("Invalid PDF file.")
else:
try:
# images = convert_from_path(path_to_file, last_page=max_imgboxes)
pdf = pdfium.PdfDocument(str(filename))
version = pdf.get_version() # get the PDF standard version
n_pages = len(pdf) # get the number of pages in the document
page_indices = [i for i in range(n_pages)] # pages until last_page
images = list(pdf.render(
pdfium.PdfBitmap.to_pil,
page_indices = page_indices,
scale = 300/72, # 300dpi resolution
))
num_imgs = len(images)
print(f'The PDF "{filename}"" was converted into {num_imgs} images.')
print("Now, you can extract data from theses images (text, bounding boxes...).")
except:
print(f"Error with the PDF {filename}: it was not converted into images.")
print()
else:
# display images
if num_imgs > 0:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(20,10))
columns = 5
for i, image in enumerate(images):
plt.subplot(int(num_imgs / columns + 1), columns, i + 1)
plt.xticks(color="white")
plt.yticks(color="white")
plt.tick_params(bottom = False)
plt.tick_params(left = False)
plt.imshow(image)
return filename, images
# Extraction of image data (text and bounding boxes)
def extraction_data_from_image(images):
num_imgs = len(images)
if num_imgs > 0:
# https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
results, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes, images_pixels = dict(), dict(), dict(), dict(), dict(), dict(), dict(), dict(), dict()
images_ids_list, texts_lines_list, texts_pars_list, texts_lines_par_list, par_boxes_list, line_boxes_list, lines_par_boxes_list, images_list, images_pixels_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list()
try:
for i,image in enumerate(images):
# image preprocessing
# https://docs.opencv.org/3.0-beta/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.html
img = image.copy()
factor, path_to_img = set_image_dpi_resize(img) # Rescaling to 300dpi
img = Image.open(path_to_img)
img = np.array(img, dtype='uint8') # convert PIL to cv2
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
# OCR PyTesseract | get langs of page
txt = pytesseract.image_to_string(img, config=custom_config)
txt = txt.strip().lower()
txt = re.sub(r" +", " ", txt) # multiple space
txt = re.sub(r"(\n\s*)+\n+", "\n", txt) # multiple line
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
try:
langs = detect_langs(txt)
langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
langs_string = '+'.join(langs)
except:
langs_string = "eng"
langs_string += '+osd'
custom_config = f'--oem 3 --psm 3 -l {langs_string}' # default config PyTesseract: --oem 3 --psm 3
# OCR PyTesseract | get data
results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
# results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
# get image pixels
images_pixels[i] = feature_extractor(images[i], return_tensors="pt").pixel_values
texts_lines[i], texts_pars[i], texts_lines_par[i], row_indexes[i], par_boxes[i], line_boxes[i], lines_par_boxes[i] = get_data_paragraph(results[i], factor, conf_min=0)
texts_lines_list.append(texts_lines[i])
texts_pars_list.append(texts_pars[i])
texts_lines_par_list.append(texts_lines_par[i])
par_boxes_list.append(par_boxes[i])
line_boxes_list.append(line_boxes[i])
lines_par_boxes_list.append(lines_par_boxes[i])
images_ids_list.append(i)
images_pixels_list.append(images_pixels[i])
images_list.append(images[i])
page_no_list.append(i)
num_pages_list.append(num_imgs)
except:
print(f"There was an error within the extraction of PDF text by the OCR!")
else:
from datasets import Dataset
dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "images_pixels": images_pixels_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts_line": texts_lines_list, "texts_par": texts_pars_list, "texts_lines_par": texts_lines_par_list, "bboxes_par": par_boxes_list, "bboxes_lines_par":lines_par_boxes_list})
# print(f"The text data was successfully extracted by the OCR!")
return dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes
def prepare_inference_features_paragraph(example, cls_box = cls_box, sep_box = sep_box):
images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, images_pixels_list = list(), list(), list(), list(), list(), list()
# get batch
# batch_page_hash = example["page_hash"]
batch_images_ids = example["images_ids"]
batch_images = example["images"]
batch_images_pixels = example["images_pixels"]
batch_bboxes_par = example["bboxes_par"]
batch_texts_par = example["texts_par"]
batch_images_size = [image.size for image in batch_images]
batch_width, batch_height = [image_size[0] for image_size in batch_images_size], [image_size[1] for image_size in batch_images_size]
# add a dimension if not a batch but only one image
if not isinstance(batch_images_ids, list):
batch_images_ids = [batch_images_ids]
batch_images = [batch_images]
batch_images_pixels = [batch_images_pixels]
batch_bboxes_par = [batch_bboxes_par]
batch_texts_par = [batch_texts_par]
batch_width, batch_height = [batch_width], [batch_height]
# process all images of the batch
for num_batch, (image_id, image_pixels, boxes, texts_par, width, height) in enumerate(zip(batch_images_ids, batch_images_pixels, batch_bboxes_par, batch_texts_par, batch_width, batch_height)):
tokens_list = []
bboxes_list = []
# add a dimension if only on image
if not isinstance(texts_par, list):
texts_par, boxes = [texts_par], [boxes]
# convert boxes to original
normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
# sort boxes with texts
# we want sorted lists from top to bottom of the image
boxes, texts_par = sort_data_wo_labels(normalize_bboxes_par, texts_par)
count = 0
for box, text_par in zip(boxes, texts_par):
tokens_par = tokenizer.tokenize(text_par)
num_tokens_par = len(tokens_par) # get number of tokens
tokens_list.extend(tokens_par)
bboxes_list.extend([box] * num_tokens_par) # number of boxes must be the same as the number of tokens
# use of return_overflowing_tokens=True / stride=doc_stride
# to get parts of image with overlap
# source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts
encodings = tokenizer(" ".join(texts_par),
truncation=True,
padding="max_length",
max_length=max_length,
stride=doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True
)
otsm = encodings.pop("overflow_to_sample_mapping")
offset_mapping = encodings.pop("offset_mapping")
# Let's label those examples and get their boxes
sequence_length_prev = 0
for i, offsets in enumerate(offset_mapping):
# truncate tokens, boxes and labels based on length of chunk - 2 (special tokens <s> and </s>)
sequence_length = len(encodings.input_ids[i]) - 2
if i == 0: start = 0
else: start += sequence_length_prev - doc_stride
end = start + sequence_length
sequence_length_prev = sequence_length
# get tokens, boxes and labels of this image chunk
bb = [cls_box] + bboxes_list[start:end] + [sep_box]
# as the last chunk can have a length < max_length
# we must to add [tokenizer.pad_token] (tokens), [sep_box] (boxes) and [-100] (labels)
if len(bb) < max_length:
bb = bb + [sep_box] * (max_length - len(bb))
# append results
input_ids_list.append(encodings["input_ids"][i])
attention_mask_list.append(encodings["attention_mask"][i])
bb_list.append(bb)
images_ids_list.append(image_id)
chunks_ids_list.append(i)
images_pixels_list.append(image_pixels)
return {
"images_ids": images_ids_list,
"chunk_ids": chunks_ids_list,
"input_ids": input_ids_list,
"attention_mask": attention_mask_list,
"normalized_bboxes": bb_list,
"images_pixels": images_pixels_list
}
from torch.utils.data import Dataset
class CustomDataset(Dataset):
def __init__(self, dataset, tokenizer):
self.dataset = dataset
self.tokenizer = tokenizer
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
# get item
example = self.dataset[idx]
encoding = dict()
encoding["images_ids"] = example["images_ids"]
encoding["chunk_ids"] = example["chunk_ids"]
encoding["input_ids"] = example["input_ids"]
encoding["attention_mask"] = example["attention_mask"]
encoding["bbox"] = example["normalized_bboxes"]
encoding["images_pixels"] = example["images_pixels"]
return encoding
import torch.nn.functional as F
# get predictions at token level
def predictions_token_level(images, custom_encoded_dataset):
num_imgs = len(images)
if num_imgs > 0:
chunk_ids, input_ids, bboxes, pixels_values, outputs, token_predictions = dict(), dict(), dict(), dict(), dict(), dict()
images_ids_list = list()
for i,encoding in enumerate(custom_encoded_dataset):
# get custom encoded data
image_id = encoding['images_ids']
chunk_id = encoding['chunk_ids']
input_id = torch.tensor(encoding['input_ids'])[None]
attention_mask = torch.tensor(encoding['attention_mask'])[None]
bbox = torch.tensor(encoding['bbox'])[None]
pixel_values = torch.tensor(encoding["images_pixels"])
# save data in dictionnaries
if image_id not in images_ids_list: images_ids_list.append(image_id)
if image_id in chunk_ids: chunk_ids[image_id].append(chunk_id)
else: chunk_ids[image_id] = [chunk_id]
if image_id in input_ids: input_ids[image_id].append(input_id)
else: input_ids[image_id] = [input_id]
if image_id in bboxes: bboxes[image_id].append(bbox)
else: bboxes[image_id] = [bbox]
if image_id in pixels_values: pixels_values[image_id].append(pixel_values)
else: pixels_values[image_id] = [pixel_values]
# get prediction with forward pass
with torch.no_grad():
output = model(
input_ids=input_id.to(device),
attention_mask=attention_mask.to(device),
bbox=bbox.to(device),
image=pixel_values.to(device)
)
# save probabilities of predictions in dictionnary
if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1))
else: outputs[image_id] = [F.softmax(output.logits.squeeze(), dim=-1)]
return outputs, images_ids_list, chunk_ids, input_ids, bboxes
else:
print("An error occurred while getting predictions!")
from functools import reduce
# Get predictions (line level)
def predictions_paragraph_level(outputs, images_ids_list, chunk_ids, input_ids, bboxes):
ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
if len(images_ids_list) > 0:
for i, image_id in enumerate(images_ids_list):
# get image information
images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
image = images_list[0]
width, height = image.size
# get data
chunk_ids_list = chunk_ids[image_id]
outputs_list = outputs[image_id]
input_ids_list = input_ids[image_id]
bboxes_list = bboxes[image_id]
# create zeros tensors
ten_probs = torch.zeros((outputs_list[0].shape[0] - 2)*len(outputs_list), outputs_list[0].shape[1])
ten_input_ids = torch.ones(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list)), dtype =int)
ten_bboxes = torch.zeros(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list), 4), dtype =int)
if len(outputs_list) > 1:
for num_output, (output, input_id, bbox) in enumerate(zip(outputs_list, input_ids_list, bboxes_list)):
start = num_output*(max_length - 2) - max(0,num_output)*doc_stride
end = start + (max_length - 2)
if num_output == 0:
ten_probs[start:end,:] += output[1:-1]
ten_input_ids[:,start:end] = input_id[:,1:-1]
ten_bboxes[:,start:end,:] = bbox[:,1:-1,:]
else:
ten_probs[start:start + doc_stride,:] += output[1:1 + doc_stride]
ten_probs[start:start + doc_stride,:] = ten_probs[start:start + doc_stride,:] * 0.5
ten_probs[start + doc_stride:end,:] += output[1 + doc_stride:-1]
ten_input_ids[:,start:start + doc_stride] = input_id[:,1:1 + doc_stride]
ten_input_ids[:,start + doc_stride:end] = input_id[:,1 + doc_stride:-1]
ten_bboxes[:,start:start + doc_stride,:] = bbox[:,1:1 + doc_stride,:]
ten_bboxes[:,start + doc_stride:end,:] = bbox[:,1 + doc_stride:-1,:]
else:
ten_probs += outputs_list[0][1:-1]
ten_input_ids = input_ids_list[0][:,1:-1]
ten_bboxes = bboxes_list[0][:,1:-1]
ten_probs_list, ten_input_ids_list, ten_bboxes_list = ten_probs.tolist(), ten_input_ids.tolist()[0], ten_bboxes.tolist()[0]
bboxes_list = list()
input_ids_dict, probs_dict = dict(), dict()
bbox_prev = [-100, -100, -100, -100]
for probs, input_id, bbox in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list):
bbox = denormalize_box(bbox, width, height)
if bbox != bbox_prev and bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
bboxes_list.append(bbox)
input_ids_dict[str(bbox)] = [input_id]
probs_dict[str(bbox)] = [probs]
elif bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
input_ids_dict[str(bbox)].append(input_id)
probs_dict[str(bbox)].append(probs)
bbox_prev = bbox
probs_bbox = dict()
for i,bbox in enumerate(bboxes_list):
probs = probs_dict[str(bbox)]
probs = np.array(probs).T.tolist()
probs_label = list()
for probs_list in probs:
prob_label = reduce(lambda x, y: x*y, probs_list)
prob_label = prob_label**(1./(len(probs_list))) # normalization
probs_label.append(prob_label)
max_value = max(probs_label)
max_index = probs_label.index(max_value)
probs_bbox[str(bbox)] = max_index
bboxes_list_dict[image_id] = bboxes_list
input_ids_dict_dict[image_id] = input_ids_dict
probs_dict_dict[image_id] = probs_bbox
df[image_id] = pd.DataFrame()
df[image_id]["bboxes"] = bboxes_list
df[image_id]["texts"] = [tokenizer.decode(input_ids_dict[str(bbox)]) for bbox in bboxes_list]
df[image_id]["labels"] = [id2label[probs_bbox[str(bbox)]] for bbox in bboxes_list]
return probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df
else:
print("An error occurred while getting predictions!")
# Get labeled images with lines bounding boxes
def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
labeled_images = list()
for i, image_id in enumerate(images_ids_list):
# get image
images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
image = images_list[0]
width, height = image.size
# get predicted boxes and labels
bboxes_list = bboxes_list_dict[image_id]
probs_bbox = probs_dict_dict[image_id]
draw = ImageDraw.Draw(image)
# https://stackoverflow.com/questions/66274858/choosing-a-pil-imagefont-by-font-name-rather-than-filename-and-cross-platform-f
font = font_manager.FontProperties(family='sans-serif', weight='bold')
font_file = font_manager.findfont(font)
font_size = 30
font = ImageFont.truetype(font_file, font_size)
for bbox in bboxes_list:
predicted_label = id2label[probs_bbox[str(bbox)]]
draw.rectangle(bbox, outline=label2color[predicted_label])
draw.text((bbox[0] + 10, bbox[1] - font_size), text=predicted_label, fill=label2color[predicted_label], font=font)
labeled_images.append(image)
return labeled_images
# get data of encoded chunk
def get_encoded_chunk_inference(index_chunk=None):
# get datasets
example = dataset
encoded_example = encoded_dataset
# get randomly a document in dataset
if index_chunk == None: index_chunk = random.randint(0, len(encoded_example)-1)
encoded_example = encoded_example[index_chunk]
encoded_image_ids = encoded_example["images_ids"]
# get the image
example = example.filter(lambda example: example["images_ids"] == encoded_image_ids)[0]
image = example["images"] # original image
width, height = image.size
page_no = example["page_no"]
num_pages = example["num_pages"]
# get boxes, texts, categories
bboxes, input_ids = encoded_example["normalized_bboxes"][1:-1], encoded_example["input_ids"][1:-1]
bboxes = [denormalize_box(bbox, width, height) for bbox in bboxes]
num_tokens = len(input_ids) + 2
# get unique bboxes and corresponding labels
bboxes_list, input_ids_list = list(), list()
input_ids_dict = dict()
bbox_prev = [-100, -100, -100, -100]
for i, (bbox, input_id) in enumerate(zip(bboxes, input_ids)):
if bbox != bbox_prev:
bboxes_list.append(bbox)
input_ids_dict[str(bbox)] = [input_id]
else:
input_ids_dict[str(bbox)].append(input_id)
# start_indexes_list.append(i)
bbox_prev = bbox
# do not keep "</s><pad><pad>..."
if input_ids_dict[str(bboxes_list[-1])][0] == (tokenizer.convert_tokens_to_ids('</s>')):
del input_ids_dict[str(bboxes_list[-1])]
bboxes_list = bboxes_list[:-1]
# get texts by line
input_ids_list = input_ids_dict.values()
texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
# display DataFrame
df = pd.DataFrame({"texts": texts_list, "input_ids": input_ids_list, "bboxes": bboxes_list})
return image, df, num_tokens, page_no, num_pages
# display chunk of PDF image and its data
def display_chunk_paragraphs_inference(index_chunk=None):
# get image and image data
image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
# get data from dataframe
input_ids = df["input_ids"]
texts = df["texts"]
bboxes = df["bboxes"]
print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
# display image with bounding boxes
print(">> PDF image with bounding boxes of paragraphs\n")
draw = ImageDraw.Draw(image)
labels = list()
for box, text in zip(bboxes, texts):
color = "red"
draw.rectangle(box, outline=color)
# resize image to original
width, height = image.size
image = image.resize((int(0.5*width), int(0.5*height)))
# convert to cv and display
img = np.array(image, dtype='uint8') # PIL to cv2
cv2_imshow(img)
cv2.waitKey(0)
# display image dataframe
print("\n>> Dataframe of annotated paragraphs\n")
cols = ["texts", "bboxes"]
df = df[cols]
display(df)
from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model
# tokenizer = LayoutXLMTokenizerFast.from_pretrained(model_id)
model = LayoutLMv2ForTokenClassification.from_pretrained(model_id);
model.to(device);
# feature extractor
from transformers import LayoutLMv2FeatureExtractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
# tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
Downloading (…)lve/main/config.json: 0%| | 0.00/3.32k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 0%| | 0.00/1.48G [00:00<?, ?B/s]
/usr/local/lib/python3.9/dist-packages/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py:30: FutureWarning: The class LayoutLMv2FeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use LayoutLMv2ImageProcessor instead. warnings.warn(
Downloading (…)lve/main/config.json: 0%| | 0.00/615 [00:00<?, ?B/s]
Downloading (…)tencepiece.bpe.model: 0%| | 0.00/5.07M [00:00<?, ?B/s]
Downloading (…)/main/tokenizer.json: 0%| | 0.00/9.10M [00:00<?, ?B/s]
# get labels
id2label = model.config.id2label
label2id = model.config.label2id
num_labels = len(id2label)
#@title Get PDF and extract data by OCR
from google.colab import files
uploaded_pdf = files.upload()
Saving example.pdf to example.pdf
#@title Converting PDF to Images
filename, images = pdf_to_images(uploaded_pdf)
The PDF "example.pdf"" was converted into 12 images. Now, you can extract data from theses images (text, bounding boxes...).
#@title Extraction of image data (text and bounding boxes)... this may take a while...
dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes = extraction_data_from_image(images)
#@title (checking) Display paragraphs bounding boxes of the first page
# https://techtutorialsx.com/2020/12/29/python-opencv-draw-rectangles/
i = 0
image = images[i].copy() # PIL
width, height = image.size
img = np.array(image, dtype='uint8') # PIL to cv2
# paragraphs
for box in par_boxes[i]:
cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 3) # blue
# lines
# for box in line_boxes[i]:
# cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 2) # red
img = cv2.resize(img, (int(width/4), int(height/4)))
cv2_imshow(img)
cv2.waitKey(0)
df = pd.DataFrame({"texts": texts_pars[i], "bboxes paragraphs": par_boxes[i]})
display(df)
texts | bboxes paragraphs | |
---|---|---|
0 | BRIEFING | [307, 220, 615, 276] |
1 | The European Commission's annual rule of law r... | [562, 581, 1914, 920] |
2 | SUMMARY | [302, 1032, 557, 1073] |
3 | The annual rule of law reports, launched by th... | [300, 1124, 2178, 1429] |
4 | The methodology adopted by the Commission prov... | [300, 1477, 2178, 1683] |
5 | The Member States are involved throughout the ... | [300, 1725, 2178, 2030] |
6 | The reports have met with some criticism from ... | [300, 2078, 2180, 2173] |
7 | IN THIS BRIEFING | [1204, 2241, 1548, 2272] |
8 | Introduction | [1279, 2321, 1502, 2355] |
9 | Methodology for preparing the annual rule of l... | [1276, 2374, 2141, 2466] |
10 | First annual rule of law report (September 202... | [1276, 2481, 2136, 2626] |
11 | Council position | [1276, 2636, 1570, 2677] |
12 | Expert and stakeholder views | [1279, 2691, 1807, 2720] |
13 | Conclusions | [1276, 2742, 1494, 2774] |
14 | ANNEX: Overview of the main findings of the 20... | [1276, 2798, 2139, 2888] |
15 | EPRS | European Parliamentary Research Service | [530, 3127, 1952, 3171] |
Now, we need to prepare our data in the format of the model.
encoded_dataset = dataset.map(prepare_inference_features_paragraph, batched=True, batch_size=64, remove_columns=dataset.column_names)
Map: 0%| | 0/12 [00:00<?, ? examples/s]
(Checking) Display a random annotated chunk image and its dataframe.
Note: the image is squared because of its normalization to 1000px vs 1000px in the encoded dataset (necessary for training the model).
# get and image from random chunk
display_chunk_paragraphs_inference()
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Chunk (512 tokens) of the PDF (page: 7 / 12) >> PDF image with bounding boxes of paragraphs
>> Dataframe of annotated paragraphs
texts | bboxes | |
---|---|---|
0 | The European Commission's annual rule of law r... | [1238, 157, 2175, 192] |
1 | developments which are eroding democracy, affe... | [297, 284, 2180, 427] |
2 | draws attention to what it perceives as the la... | [297, 449, 2180, 708] |
3 | Legal researcher (University of Magdeburg), wr... | [297, 750, 2175, 1062] |
4 | In December 2020, the Council of Bars and Law ... | [300, 1112, 1969, 1140] |
5 | , regretting that it did not focus sufficientl... | [297, 1192, 2178, 1603] |
6 | Conclusions | [300, 1638, 672, 1690] |
7 | The Commission‘s annual rule of law reports pr... | [297, 1746, 2180, 2399] |
We end our data preparation with a new class that keeps only the information needed for inference.
custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
Now, we can get the predictions!
LayoutXLM outputs labels at the token level, but we are interested in the predicted labels at the line level.
#@title Get predictions (token level)
outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
#@title Get predictions (paragraph level)
probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(outputs, images_ids_list, chunk_ids, input_ids, bboxes)
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
#@title Get labeled images with paragraphs bounding boxes
labeled_images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
Filter: 0%| | 0/12 [00:00<?, ? examples/s]
print(f"Number of PDF page images: {len(labeled_images)}")
Number of PDF page images: 12
#@title Labeled images
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(15,10))
columns = 3
for i, image in enumerate(labeled_images):
plt.subplot(int(len(images) / columns + 1), columns, i + 1)
plt.xticks(color="white")
plt.yticks(color="white")
plt.tick_params(bottom = False)
plt.tick_params(left = False)
plt.imshow(image)
#@title Display one labeled image (full size)
num_page = 0
print(f"Image of the labeled page {num_page} (at paragraph level).")
labeled_images[num_page]
Image of the labeled page 0 (at paragraph level).
#@title Save all labeled pages
dest = "/content/labeled_images/"
!rm -rf $dest
for num_img, image in enumerate(labeled_images):
folder = dest + filename.replace(".pdf", "") + "/"
Path(folder).mkdir(parents=True, exist_ok=True)
path_to_img = str(folder) + "img_" + str(num_img) + ".png"
image.save(path_to_img)
# download
zipfile = dest.replace("labeled_images/","labeled_images.zip")
!zip -r $zipfile $dest
from google.colab import files
files.download(zipfile)
#@title DataFrame of one image
num_page = 0
print(f"DataFrame of the page {num_page} (at paragraph level).")
example = dataset[num_page]
df_num_page = df[num_page]
width, height = example["images"].size
# apply same transformations
bboxes_par_list = [denormalize_box(normalize_box(upperleft_to_lowerright(bbox), width, height), width, height) for bbox in example['bboxes_par']]
texts_list = list()
for bbox_par, label in zip(df_num_page["bboxes"].tolist(), df_num_page["labels"].tolist()):
index_par = bboxes_par_list.index(bbox_par)
bboxes_lines_par_list = dataset[num_page]["bboxes_lines_par"][index_par]
texts_lines_par_list = dataset[num_page]["texts_lines_par"][index_par]
boxes, texts = sort_data_wo_labels(bboxes_lines_par_list, texts_lines_par_list)
# apply text startegy in function of label
if label == "Text" or label == "Caption" or label == "Footnote":
texts = ' '.join(texts)
else:
texts = '\n'.join(texts)
texts_list.append(texts)
df_num_page["Paragraph text"] = texts_list
cols = ["bboxes", "Paragraph text", "labels"]
df_num_page[cols]
DataFrame of the page 0 (at paragraph level).
bboxes | Paragraph text | labels | |
---|---|---|---|
0 | [305, 217, 612, 273] | BRIEFING | Section-header |
1 | [560, 578, 1912, 919] | The European Commission's\nannual rule of law ... | Section-header |
2 | [300, 1031, 555, 1069] | SUMMARY | Section-header |
3 | [297, 1122, 2175, 1427] | The annual rule of law reports, launched by th... | Text |
4 | [297, 1476, 2175, 1680] | The methodology adopted by the Commission prov... | Text |
5 | [297, 1722, 2175, 2027] | The Member States are involved throughout the ... | List-item |
6 | [297, 2076, 2178, 2171] | The reports have met with some criticism from ... | Text |
7 | [1203, 2238, 1545, 2269] | IN THIS BRIEFING | Section-header |
8 | [1277, 2318, 1501, 2353] | Introduction | Section-header |
9 | [1275, 2371, 2138, 2462] | Methodology for preparing the annual rule of\n... | Section-header |
10 | [1275, 2480, 2133, 2623] | First annual rule of law report (September 202... | Text |
11 | [1275, 2634, 1567, 2676] | Council position | Section-header |
12 | [1277, 2690, 1806, 2718] | Expert and stakeholder views | Section-header |
13 | [1275, 2739, 1493, 2771] | Conclusions | Section-header |
14 | [1275, 2795, 2138, 2887] | ANNEX: Overview of the main findings of the\n2... | Section-header |
15 | [528, 3125, 1950, 3167] | EPRS | European Parliamentary Research Service | Page-footer |
#@title Save DataFrame of all pages in one json file
json_file = dict()
json_file["filename"] = filename
num_pages = len(df)
json_file["num_pages"] = num_pages
for num_page in range(num_pages):
example = dataset[num_page]
df_num_page = df[num_page]
width, height = example["images"].size
json_file["width"] = width
json_file["height"] = height
# apply same transformations
bboxes_par_list = [denormalize_box(normalize_box(upperleft_to_lowerright(bbox), width, height), width, height) for bbox in example['bboxes_par']]
texts_list = list()
for bbox_par, label in zip(df_num_page["bboxes"].tolist(), df_num_page["labels"].tolist()):
index_par = bboxes_par_list.index(bbox_par)
bboxes_lines_par_list = dataset[num_page]["bboxes_lines_par"][index_par]
texts_lines_par_list = dataset[num_page]["texts_lines_par"][index_par]
boxes, texts = sort_data_wo_labels(bboxes_lines_par_list, texts_lines_par_list)
# apply text strategy in function of label
if label == "Text" or label == "Caption" or label == "Footnote":
texts = ' '.join(texts)
else:
texts = '\n'.join(texts)
texts_list.append(texts)
# df_num_page["Paragraph text"] = texts_list
json_file[num_page] = {
"bboxes": df_num_page["bboxes"].tolist(),
"Paragraph text": texts_list,
"labels": df_num_page["labels"].tolist()
}
# cols = ["bboxes", "Paragraph text", "labels"]
# df_num_page = df_num_page[cols]
# # save
# folder = dest + filename.replace(".pdf", "") + "/"
# Path(folder).mkdir(parents=True, exist_ok=True)
# path_to_csv = str(folder) + "page_" + str(num_img) + ".csv"
# df_num_page.to_csv(path_to_csv)
# save json
import json
dest = "/content/pdfs_json/"
Path(dest).mkdir(parents=True, exist_ok=True)
path_to_json = dest + filename.replace(".pdf", ".json")
with open(path_to_json, 'w') as f:
json.dump(json_file, f)
# download
from google.colab import files
files.download(path_to_json)