%pip install scikit-image
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: scikit-image in /home/leopard/.local/lib/python3.10/site-packages (0.20.0) Requirement already satisfied: tifffile>=2019.7.26 in /home/leopard/.local/lib/python3.10/site-packages (from scikit-image) (2023.2.28) Requirement already satisfied: packaging>=20.0 in /usr/lib/python3/dist-packages (from scikit-image) (21.3) Requirement already satisfied: numpy>=1.21.1 in /usr/lib/python3/dist-packages (from scikit-image) (1.21.5) Requirement already satisfied: imageio>=2.4.1 in /home/leopard/.local/lib/python3.10/site-packages (from scikit-image) (2.26.0) Requirement already satisfied: PyWavelets>=1.1.1 in /home/leopard/.local/lib/python3.10/site-packages (from scikit-image) (1.4.1) Requirement already satisfied: lazy_loader>=0.1 in /home/leopard/.local/lib/python3.10/site-packages (from scikit-image) (0.1) Requirement already satisfied: networkx>=2.8 in /home/leopard/.local/lib/python3.10/site-packages (from scikit-image) (3.0) Requirement already satisfied: scipy>=1.8 in /usr/lib/python3/dist-packages (from scikit-image) (1.8.0) Requirement already satisfied: pillow>=9.0.1 in /usr/lib/python3/dist-packages (from scikit-image) (9.0.1) Note: you may need to restart the kernel to use updated packages.
from skimage.io import imread
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
# Read the image and reduce it to a single-channel grayscale array,
# whatever channel layout the file uses.
from skimage.color import rgba2rgb

img = imread("sample_data/luzi2.png")
if img.ndim > 2:  # colour image: the last axis holds the channels
    if img.shape[-1] == 4:
        # rgb2gray only accepts 3-channel input, so composite the alpha
        # channel away first (rgba2rgb blends onto white by default).
        img = rgba2rgb(img)
    img = rgb2gray(img)

plt.figure(figsize=(10, 10))
plt.axis("off")
plt.imshow(img, cmap="gray")
plt.show()  # preview
One of the common ways of finding the line height of a document is to analyze its horizontal projection profile. The horizontal projection profile (HPP) is the array of the sums of the rows of a two-dimensional image. Where there is more white space, we see more peaks. These peaks give us an idea of where the segmentation between two lines can be done.
Before we compute the projection, we should also threshold the image to remove noise.
from skimage.filters import threshold_otsu
import numpy as np
def horizontal_projections(sobel_image):
    """Return the horizontal projection profile of a 2-D image.

    Each entry of the result is the sum of one row of ``sobel_image``
    (the reduction runs over axis 1).
    """
    row_totals = np.sum(sobel_image, axis=1)
    return row_totals
def binarize_image(image):
    """Binarize ``image`` with Otsu's threshold.

    Returns a boolean array of the same shape that is True wherever the
    pixel value is strictly below the threshold.
    """
    # Derive the threshold from the argument itself; the previous version
    # read the module-level ``img`` and silently ignored ``image``.
    threshold = threshold_otsu(image)
    return image < threshold
# Binarize the page, compute its row-wise projection profile and show the
# profile next to the binarized image.
binarized_image = binarize_image(img)
hpp = horizontal_projections(binarized_image)

fig, (ax1, ax2) = plt.subplots(figsize=(15, 8), ncols=2, nrows=1)

# left panel: the projection profile
ax1.plot(hpp)
ax1.set_title("horizontal_projections")

# right panel: the binarized page, without axis decorations
ax2.imshow(binarized_image, cmap="gray")
ax2.set_title("binarized_image")
ax2.axis("off")

plt.show()
As you can see, where there is more white space there are peaks in the graph. We will use this information to locate the regions in which the separation line can be found using the A* algorithm.
# find the midway where we can make a threshold and extract the peaks regions
def find_peak_regions(hpp, threshold):
    """Collect the entries of ``hpp`` that lie strictly below ``threshold``.

    Returns a list of ``[index, value]`` pairs, in the order the entries
    appear in ``hpp``.
    """
    return [[row, value] for row, value in enumerate(hpp) if value < threshold]
# Choose the cut-off used to select the rows handed to find_peak_regions.
# We use the midpoint between the smallest and largest projection value;
# this needs further research and may not work on all images.
hpp_min, hpp_max = np.min(hpp), np.max(hpp)
# (max - min) / 2 alone is only the midpoint when min == 0, so add the
# minimum back to get the real midway value.
threshold = hpp_min + (hpp_max - hpp_min) / 2
peaks = find_peak_regions(hpp, threshold)
# Build the index array from the pairs directly so that an empty result
# yields an empty integer array instead of an indexing error.
peaks_indexes = np.array([row for row, _ in peaks], dtype=int)

# Black out every selected row in a copy of the image (vectorized, instead
# of testing each row index against the array one by one).
segmented_img = np.copy(img)
segmented_img[peaks_indexes, :] = 0

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
# The threshold line is drawn on the left axes, so the title belongs there.
ax1.set_title("threshold line")
ax1.plot(hpp)
ax1.plot([0, img.shape[0]], [threshold, threshold])
ax2.imshow(segmented_img, cmap="gray")
plt.show()
The threshold value helped us pick the regions of the image that are probably white. The black regions indicate where we need to run our path-planning algorithm for line segmentation.
# Group the selected rows into runs of consecutive indexes; path planning is
# done once per run.
diff_between_consec_numbers = np.diff(peaks_indexes)  # difference between consecutive numbers
# diff[i] == peaks_indexes[i + 1] - peaks_indexes[i], so a jump found at
# position i means the next run STARTS at position i + 1.  np.split cuts
# *before* the given positions, hence the "+ 1"; without it every run began
# with the last row of the previous run.
indexes_with_larger_diff = np.where(diff_between_consec_numbers > 1)[0].flatten() + 1
peak_groups = np.split(peaks_indexes, indexes_with_larger_diff)
# remove very small regions, these are basically errors in algorithm because of our threshold value
peak_groups = [item for item in peak_groups if len(item) > 10]
print("peak groups found", len(peak_groups))
peak groups found 16
#a star path planning algorithm
from heapq import *
def heuristic(a, b):
    """Return the squared Euclidean distance between 2-D points ``a`` and ``b``."""
    d_first = b[0] - a[0]
    d_second = b[1] - a[1]
    return d_first ** 2 + d_second ** 2
def astar(array, start, goal):
    """Search a path from ``start`` to ``goal`` on a 2-D grid with A*.

    Parameters:
        array: 2-D array indexed as ``array[row][col]``; cells equal to 1
            are treated as obstacles and never entered.
        start: ``(row, col)`` tuple to start from (the start cell itself is
            not checked against the grid).
        goal: ``(row, col)`` tuple to reach.

    Returns:
        A list of ``(row, col)`` tuples walking back from ``goal`` towards
        ``start`` (``start`` itself is not included), or ``[]`` when the
        goal cannot be reached or ``start == goal``.
    """
    # 8-connected neighbourhood: straight and diagonal moves.
    neighbors = [(0, 1), (0, -1), (1, 0), (-1, 0), (1, 1), (1, -1), (-1, 1), (-1, -1)]
    n_rows, n_cols = array.shape[0], array.shape[1]

    close_set = set()
    came_from = {}
    gscore = {start: 0}
    fscore = {start: heuristic(start, goal)}
    oheap = []
    heappush(oheap, (fscore[start], start))

    while oheap:
        current = heappop(oheap)[1]

        if current == goal:
            # Walk the predecessor links back; the start has no predecessor,
            # so the loop stops before adding it.
            data = []
            while current in came_from:
                data.append(current)
                current = came_from[current]
            return data

        close_set.add(current)
        for i, j in neighbors:
            neighbor = current[0] + i, current[1] + j

            # Skip cells outside the grid and obstacle cells.
            if not (0 <= neighbor[0] < n_rows and 0 <= neighbor[1] < n_cols):
                continue
            if array[neighbor[0]][neighbor[1]] == 1:
                continue

            tentative_g_score = gscore[current] + heuristic(current, neighbor)

            # Every cell that was ever pushed has an entry in gscore, so
            # "already seen with a score at least as good" can be answered
            # with one dict lookup instead of rebuilding a list of all heap
            # entries for each neighbour.
            best_known = gscore.get(neighbor)
            if best_known is not None and tentative_g_score >= best_known:
                continue

            came_from[neighbor] = current
            gscore[neighbor] = tentative_g_score
            fscore[neighbor] = tentative_g_score + heuristic(neighbor, goal)
            heappush(oheap, (fscore[neighbor], neighbor))

    return []
# Visualize the peak images: one subplot per peak group, stacked vertically.
# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is exactly one group (plt.subplots would otherwise return a
# bare Axes object that cannot be subscripted).
fig, ax = plt.subplots(nrows=len(peak_groups), ncols=1, figsize=(15, 20), squeeze=False)
for index, sub_image_index in enumerate(peak_groups):
    # rows of the page from the first up to (not including) the last row of the group
    sub_image = img[sub_image_index[0]:sub_image_index[-1]]
    ax[index, 0].axis("off")
    ax[index, 0].imshow(sub_image, cmap="gray")
plt.show()
# now that everything is cleaner, it's time to segment all the lines using the A* algorithm
def get_binary(img):
    """Convert ``img`` to a 0/1 integer mask using Otsu's threshold.

    Pixels less than or equal to the threshold become 1, all others 0.
    When the mean of ``img`` is exactly 0.0 or 1.0 the input is returned
    unchanged and no threshold is computed.
    """
    average = np.mean(img)
    already_uniform = average == 0.0 or average == 1.0
    if not already_uniform:
        cutoff = threshold_otsu(img)
        # multiplying by 1 turns the boolean mask into integers
        return (img <= cutoff) * 1
    return img
# Run A* through every peak group, from the middle of its left edge to the
# middle of its right edge, and keep the resulting paths in page coordinates.
binary_image = get_binary(img)
segment_separating_lines = []
for i, sub_image_index in enumerate(peak_groups):
    nmap = binary_image[sub_image_index[0]:sub_image_index[-1]]
    mid_row = nmap.shape[0] // 2
    path = np.array(astar(nmap, (mid_row, 0), (mid_row, nmap.shape[1] - 1)))
    if path.size == 0:
        # astar returns [] when it finds no route; np.array([]) is 1-D and
        # path[:, 0] would raise IndexError, so skip this group explicitly.
        print("no separating path found for peak group", i)
        continue
    # astar works in sub-image coordinates; shift rows back to page coordinates
    offset_from_top = sub_image_index[0]
    path[:, 0] += offset_from_top
    segment_separating_lines.append(path)
# visualize a sample: show the second peak group and the path A* finds
# through it (in sub-image coordinates).
cluster_of_interest = peak_groups[1]
first_row, last_row = cluster_of_interest[0], cluster_of_interest[-1]
offset_from_top = first_row
nmap = binary_image[first_row:last_row, :]

plt.figure(figsize=(20, 20))
plt.imshow(nmap, cmap="gray")

# search from the middle of the left edge to the middle of the right edge
middle_row = int(nmap.shape[0] / 2)
left_edge = (middle_row, 0)
right_edge = (middle_row, nmap.shape[1] - 1)
path = np.array(astar(nmap, left_edge, right_edge))

# column 1 of the path holds the x positions, column 0 the y positions
plt.plot(path[:, 1], path[:, 0])
[<matplotlib.lines.Line2D at 0x7fcd740dd4e0>]
offset_from_top = cluster_of_interest[0]

# Two panels side by side: the page overlaid with every separating path on
# the right, the untouched page on the left.
fig, ax = plt.subplots(ncols=2, figsize=(20, 10))
for path in segment_separating_lines:
    xs = path[:, 1]
    ys = path[:, 0]
    ax[1].plot(xs, ys)
ax[1].imshow(img, cmap="gray")
ax[0].imshow(img, cmap="gray")
<matplotlib.image.AxesImage at 0x7fcd73afbcd0>
# TODO: the below splitting image algorithm is too simple
# I need to write a cleaner one.
# For every pair of neighbouring separating lines, crop the page from the
# smallest row of the first line to the largest row of the second line.
seperated_images = [
    img[np.min(current_line[:, 0]):np.max(following_line[:, 0])]
    for current_line, following_line in zip(
        segment_separating_lines, segment_separating_lines[1:]
    )
]
# visualize it: one subplot per separated line image.
# squeeze=False keeps ``ax`` a 2-D array even for a single row, so the
# indexing below does not fail when only one line image was produced.
fig, ax = plt.subplots(figsize=(10, 10), nrows=len(seperated_images), squeeze=False)
for index, line_image in enumerate(seperated_images):
    ax[index, 0].imshow(line_image, cmap="gray")
plt.show()
from skimage.filters import threshold_otsu
# Binarize the first separated line with Otsu's threshold: pixels strictly
# above the threshold become True.
first_line = seperated_images[0]
thresh = threshold_otsu(first_line)
binary = first_line > thresh

# Vertical projection: add up each column (axis 0), giving one value per
# x position of the line image.
vertical_projection = binary.sum(axis=0)

# Plot the binarized line on top and its vertical projection below.
fig, ax = plt.subplots(nrows=2, figsize=(20, 10))
top_axis, bottom_axis = ax
# plt.xlim acts on the current axes -- presumably the last subplot created
plt.xlim(0, first_line.shape[1])
top_axis.imshow(binary, cmap="gray")
bottom_axis.plot(vertical_projection)
[<matplotlib.lines.Line2D at 0x7fcd728b66b0>]
height = first_line.shape[0]

# Walk over the vertical projection and record the length of every run of
# consecutive all-white columns (projection equal to the image height).
# A run is recorded when a non-white column ends it.
whitespace_lengths = []
whitespace = 0
for column_total in vertical_projection:
    if column_total == height:
        whitespace += 1
        continue
    if whitespace:
        whitespace_lengths.append(whitespace)
    whitespace = 0  # start counting the next run from scratch

print("whitespaces:", whitespace_lengths)
avg_white_space_length = np.mean(whitespace_lengths)
print("average whitespace lenght:", avg_white_space_length)
whitespaces: [5, 4, 3, 170, 67, 65, 50, 11, 62, 70, 5, 31, 79, 5, 41] average whitespace lenght: 44.53333333333333
# Find the whitespace runs that are longer than the average run; the column
# in the middle of each such run becomes a divider position.
divider_indexes = []
whitespace_length = 0
for index, column_total in enumerate(vertical_projection):
    if column_total == height:
        whitespace_length += 1
        continue
    # a non-white column closes the current run
    is_long_gap = whitespace_length != 0 and whitespace_length > avg_white_space_length
    if is_long_gap:
        half_run = int(whitespace_length / 2)
        divider_indexes.append(index - half_run)
    whitespace_length = 0  # reset it
print(divider_indexes)
[99, 472, 800, 872, 1141, 1313, 1949, 2385]
# Create the word blocks from divider_indexes: every pair of neighbouring
# dividers becomes one (start_column, end_column) window.
divider_indexes = np.array(divider_indexes)
dividers = np.column_stack((divider_indexes[:-1], divider_indexes[1:]))

# Now plot the findings, one subplot per window.  squeeze=False keeps ``ax``
# a 2-D array so the indexing also works when there is a single window.
fig, ax = plt.subplots(nrows=len(dividers), figsize=(5, 10), squeeze=False)
for index, window in enumerate(dividers):
    ax[index, 0].axis("off")
    ax[index, 0].imshow(first_line[:, window[0]:window[1]], cmap="gray")