#!/usr/bin/env python
# coding: utf-8

# # Inspect
#
# We show how to inspect results and intermediate results
# using the example pages in the *example* directory.
#
# Reference: [page](https://among.github.io/fusus/fusus/page.html).

# Enable auto-loading of changed code.
# This is handy if you are a developer who is changing `fusus` code
# and wants to load the changed code on the fly.

# In[1]:


get_ipython().run_line_magic("load_ext", "autoreload")
get_ipython().run_line_magic("autoreload", "2")


# In[2]:


from fusus.book import Book


# In[3]:


B = Book(cd="~/github/among/fusus/example")


# In[4]:


# cd to the book directory
# NOTE(review): this goes through the IPython shell escape, so it presumably
# does not change the working directory of the kernel itself -- confirm
# whether the `%cd` magic was intended here.
get_ipython().system("cd `pwd`")


# In[5]:


get_ipython().system("echo `pwd`")


# In[6]:


B.availablePages()


# ## Inspection
#
# We run a single page and switch batch mode off.

# In[7]:


page = B.process(batch=False, pages="47")


# ### Inspect intermediate stages
#
# We get a handle to the page object that has been processed,
# and because batch mode is off, it has retained its intermediate results.
#
# Let's show all stages, but heavily reduced:

# In[8]:


page.show(stage="proofword")


# In[9]:


page.show(width=200)


# Note the data stages:
#
# * `markData` with the cleaning data.
# * `char` with the OCR data at character level.
# * `word` with the OCR data at word level.
# * `line` with the locations of the line boxes.
#
# The proof stages (`proofword` and `proofchar`) are HTML pages that cannot easily be shown inside the notebook.
# It is better to go to the indicated file on your system and view them directly.

# Now the histogram and the `cleanh` stage at true size:

# In[10]:


page.show(stage="histogram,cleanh")


# The grey rectangles are the traces of wiped marks.
#
# But what goes to the OCR engine is stage `clean`:

# In[11]:


page.show(stage="clean")


# ### Inspect bands
#
# Let's have a look at all the bands.
#
# We project the bands on the histogram of the page.

# In[12]:


for doBands in ("high", "low", "mid", "main", "broad", "inter"):
    page.show(stage="histogram", band=doBands)


# We can also show several bands together:

# In[13]:


page.show(stage="histogram", band="low,high")


# # Cleaning
#
# Proper cleaning requires some interactive adjusting of parameters.
# So we need feedback as to what marks have been cleaned and why.

# ## Connectedness
#
# We can use the `boxed` stage to inspect the results of cleaning.
#
# The letters above the boxes refer to the bands in which those marks are searched:
#
# letter | band
# --- | ---
# ` ` | main
# `l` | low
# `m` | mid
# `h` | high
# `b` | broad
# `i` | inter
#
# The numbers above the boxes are the numbers of the marks.
# By running `B.availableMarks()` you see the marks with their numbers.
#
# Marks in *green* boxes will not be removed.
# They are apparently part of a letter, not an isolated blob of ink.
#
# The algorithm computes a degree of connection, and if that is bigger than 0.1, the mark counts as connected.
# The connection degrees are printed below the marks (times 100), so 10 is the cutoff value.
# Green marks are always higher than 10.
#
# Marks in *orange* boxes will be removed.
# They have a small connection degree, less than 0.1, but usually much smaller. If it is near zero, it will be left out.

# In[14]:


page.show(stage="boxed")


# We can also ask for a log of the cleaning process which gives a bit more information:

# In[15]:


page.show(stage="markData")


# The starred entries correspond to the green boxes.
#
# The connectivity times 100 is the number under the boxes.

# Now we only want to see the effects of cleaning for specific marks (in specific bands):

# In[16]:


page.show(stage="boxed", band="high,mid", mark="comma,a")


# ## Cleaning parameters
#
# The parameters that steer the cleaning process are
#
# * `connectBorder`
# * `connectRatio`
# * `connectThreshold`
#
# See the
# [settings specs](https://among.github.io/fusus/fusus/parameters.html).

# In[ ]: