#!/usr/bin/env python
# coding: utf-8

# # Lines
#
# We show how the pipeline detects lines on the page, and we provide
# critical examples to judge how successful the method is.
#
# Reference: [lines](https://among.github.io/fusus/fusus/lines.html).

# In[1]:


# Reload edited modules automatically before each cell is executed.
get_ipython().run_line_magic("load_ext", "autoreload")
get_ipython().run_line_magic("autoreload", "2")


# In[2]:


from fusus.book import Book


# In[3]:


# The book object used by every cell below.
B = Book(cd="~/github/among/fusus/example")


# In[4]:


# cd to the book directory
# NOTE(review): this is a shell escape whose target is the current directory
# (`pwd`), so it presumably leaves the notebook's working directory
# unchanged -- confirm whether this cell is still needed.
get_ipython().system("cd `pwd`")


# We show the line division in every block of text on every page.
# Check visually whether all lines have been detected correctly.
#
# The `histogram` stage shows the blocks that have been detected on the page,
# and within the blocks the histograms that correspond to the ink distribution.
#
# We mark the start and end of lines by orange and purple dots, which are
# obtained by a rolling median filter over the first and last black pixel
# position on each pixel line.
#
# In each block the main line bands are shown.
# A green rule marks the start of a band, a red rule the end.
# The space between bands is greyed out.
# We show the `main` bands, which are derived directly from the histogram.
#
# The main bands may not contain *all* the ink, but do not worry: the bands
# are used to target the cleaning of marks, and are not visible to the rest
# of the processing stages.
#
# Check in particular:
# * whether short lines have been detected
# * whether consecutive lines are not treated as one line.
#
# Page 101 is a critical page: both errors are likely to occur!
# In[5]:


def checkLines(pg, quiet=True, **kwargs):
    """Process pages up to the layout stage and show their line histograms.

    Each page is run through `B.process` with OCR switched off and
    `uptoLayout=True`, after which its `histogram` stage is displayed.

    Parameters
    ----------
    pg:
        Page specification, passed to `B.process` as `pages`.
        If `None`, every entry of `B.allPagesList` is processed in turn.
    quiet: boolean, optional True
        Passed through to `B.process`.
    **kwargs:
        Additional keyword arguments, passed through to `B.process`.

    Returns
    -------
    object | None
        The result of the last `B.process` call, or `None` when `pg` is
        `None` and `B.allPagesList` is empty.
    """

    def showHistogram(pages):
        # Single place for the process-and-show sequence, so the single-page
        # and the all-pages paths cannot drift apart.
        page = B.process(
            batch=False,
            pages=pages,
            doOcr=False,
            uptoLayout=True,
            quiet=quiet,
            **kwargs,
        )
        page.show(stage="histogram")
        return page

    if pg is not None:
        return showHistogram(pg)

    # Start from None so that an empty page list yields None instead of
    # raising UnboundLocalError on the return below.
    page = None
    for eachPg in B.allPagesList:
        page = showHistogram(eachPg)
    return page


# In[6]:


# B.configure(blurY=None, peakSignificant=0.1, peakProminenceY=None, valleyProminenceY=None, debug=0)


# ## Check a single page

# In[7]:


page = checkLines(101)


# ## Check all pages

# In[8]:


page = checkLines(None)


# In[ ]: