"""Process Stock Exchange images, detecting the positions of columns and headers."""
import numpy as np
import cv2
import math
import statistics
import os
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
from tqdm.auto import tqdm
from statistics import mean
import re
from fuzzywuzzy import fuzz
import tempfile
from pathlib import Path
# These OCR image preprocessing steps are based on https://stackoverflow.com/a/43493383
# I don't really understand why this particular combination of filters works, but it does seem to improve OCR results

# Intensity cut-off handed to cv2.threshold() for the first (fixed-level, THRESH_BINARY)
# pass in image_smoothening().
BINARY_THRESHOLD = 200
def process_image_for_ocr(file_path):
    '''
    Prepare an image for OCR.

    The image is re-saved as a temporary JPEG by set_image_dpi(), and that file is
    then thresholded, smoothed and resized by remove_noise_and_smooth().

    Parameters:
        file_path – path of the image to prepare.

    Returns the processed image array produced by remove_noise_and_smooth().
    '''
    # TODO : Implement using opencv
    temp_filename = set_image_dpi(file_path)
    try:
        return remove_noise_and_smooth(temp_filename)
    finally:
        # The temporary file is created with delete=False, so nothing else removes it.
        # Without this a JPEG is left behind in the temp directory for every image processed.
        try:
            os.remove(temp_filename)
        except OSError:
            # Cleanup is best-effort – a file we can't delete shouldn't fail the OCR step.
            pass
def set_image_dpi(file_path):
    '''
    Re-save an image as a temporary JPEG tagged with a resolution of 300 dpi.

    Only the dpi metadata changes – the pixel dimensions of the saved copy are the
    same as those of the source image.

    Parameters:
        file_path – path of the image to convert.

    Returns the path of the temporary .jpg file. The file is created with
    delete=False, so the caller is responsible for removing it.
    '''
    # Reserve a unique file name, then close the handle straight away so it isn't
    # leaked (and so the file can be reopened for writing on every platform).
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
        temp_filename = temp_file.name
    # NOTE(review): an upscaled copy used to be built here with Image.ANTIALIAS but was
    # never saved. That constant no longer exists in Pillow >= 10, so the unused resize
    # has been dropped rather than ported. Confirm the unscaled image is what's wanted.
    with Image.open(file_path) as im:
        im.save(temp_filename, dpi=(300, 300))
    return temp_filename
def image_smoothening(img):
    '''
    Binarise an image using three thresholding passes.

    A fixed threshold at BINARY_THRESHOLD is applied first, followed by an Otsu
    threshold, a Gaussian blur with a (1, 1) kernel, and a final Otsu threshold.

    Returns the thresholded image from the last pass.
    '''
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image) – only the image is of interest
    _, fixed_pass = cv2.threshold(img, BINARY_THRESHOLD, 255, cv2.THRESH_BINARY)
    _, otsu_pass = cv2.threshold(fixed_pass, 0, 255, otsu_binary)
    blurred = cv2.GaussianBlur(otsu_pass, (1, 1), 0)
    _, final_pass = cv2.threshold(blurred, 0, 255, otsu_binary)
    return final_pass
def remove_noise_and_smooth(file_name):
    '''
    Load an image in greyscale and clean it up for OCR.

    Two binarised versions of the image are produced – one by adaptive mean
    thresholding followed by a morphological open and close, the other by
    image_smoothening() – and combined with a bitwise OR. The result is passed
    through resize() before being returned.
    '''
    # Flag 0 loads the image as a single greyscale channel
    grey = cv2.imread(file_name, 0)
    adaptive = cv2.adaptiveThreshold(
        grey.astype(np.uint8), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 41, 3
    )
    kernel = np.ones((1, 1), np.uint8)
    opened = cv2.morphologyEx(adaptive, cv2.MORPH_OPEN, kernel)
    closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, kernel)
    smoothed = image_smoothening(grey)
    combined = cv2.bitwise_or(smoothed, closed)
    height, width = combined.shape[:2]
    return resize(combined, height, width)
def find_lines(img):
    '''
    Detect straight lines in a colour image.

    Returns the result of cv2.HoughLinesP – each entry holds the end points
    (x1, y1, x2, y2) of one line segment.

    The parameter values used here were arrived at through a lot of trial and error.
    '''
    grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Convert to black and white
    binary = cv2.threshold(grey, 125, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # A median blur wipes out most of the text
    despeckled = cv2.medianBlur(binary, 11)
    # Morphological opening to try and strengthen the lines that are left
    opened = cv2.morphologyEx(despeckled, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8), iterations=1)
    # Base the Canny thresholds on the median grey level of the image
    median_level = np.median(grey)
    canny_lower = int(max(0, (1.0 - 0.33) * median_level))
    canny_upper = int(min(255, (1.0 + 0.33) * median_level))
    edges = cv2.Canny(opened, canny_lower, canny_upper, apertureSize=3)
    # minLineLength and maxLineGap have a dramatic effect on the number of lines found.
    # Something that looks like one straight line to the eye can really be a run of short
    # segments separated by tiny gaps, so raising minLineLength cuts down the noise from
    # text, but can also throw away genuine column rules.
    return cv2.HoughLinesP(
        image=edges,
        rho=1,
        theta=np.pi/180,
        threshold=200,
        lines=np.array([]),
        minLineLength=100,
        maxLineGap=100,
    )
def find_margin(df):
    '''
    Estimate the left margin from OCR results.

    Takes the rows of df where 'level' is 4 and 'left' is under 150, and returns
    the mean of their 'left' values rounded to an integer.
    (Presumably df is the dataframe returned by pytesseract.image_to_data, where
    level 4 rows describe lines of text – confirm against caller.)
    '''
    is_margin_row = (df['level'] == 4) & (df['left'] < 150)
    average_left = df.loc[is_margin_row]['left'].mean()
    return int(round(average_left))
def find_col_width(df):
    '''
    Collect candidate x positions for the column heading words.

    Looks through the word-level rows (level 5) of df with 'left' under 1500 for
    words longer than 5 characters that fuzzily match 'buyers', 'closing' or
    'quotations', and records the 'left' value of each match lying beyond 625.

    Matching is repeated at confidence levels of 100, 90 and 80, so a word that
    matches strongly is recorded once for every level it satisfies.

    Returns a list of 'left' values (empty if nothing matched).
    '''
    candidates = []
    # The set of words to check is the same on every pass, so select and sort it once
    words = df.loc[(df['level'] == 5) & (df['left'] < 1500)].sort_values(by='top')
    for confidence in reversed(range(80, 110, 10)):
        for heading in ['buyers', 'closing', 'quotations']:
            for word in words.itertuples():
                # The text column can hold non-string values (eg NaN or numbers),
                # so coerce to a string before measuring or lower-casing it
                text = str(word.text)
                if len(text) > 5 and fuzz.partial_ratio(heading, text.lower()) >= confidence:
                    if word.left > 625:
                        candidates.append(word.left)
    return candidates
def find_header_height(df):
    '''
    Collect candidate y positions for the column heading words.

    Looks through the word-level rows (level 5) of df with 'top' under 1750,
    'left' under 3750 and 'height' under 90 for words longer than 5 characters
    that fuzzily match one of the expected heading words, and records the 'top'
    value of each match.

    Matching is repeated at confidence levels of 100, 90 and 80, so a word that
    matches strongly is recorded once for every level it satisfies.

    Returns a list of 'top' values (empty if nothing matched).
    '''
    candidates = []
    # The set of words to check is the same on every pass, so select and sort it once
    words = df.loc[
        (df['level'] == 5) & (df['top'] < 1750) & (df['left'] < 3750) & (df['height'] < 90)
    ].sort_values(by=['top', 'left'])
    for confidence in reversed(range(80, 110, 10)):
        for heading in ['shares', 'closing', 'sales', 'quotations', 'buyers', 'sellers', 'business']:
            for word in words.itertuples():
                # The text column can hold non-string values (eg NaN or numbers),
                # so coerce to a string before measuring or lower-casing it
                text = str(word.text)
                if len(text) > 5 and fuzz.partial_ratio(heading, text.lower()) >= confidence:
                    candidates.append(word.top)
    return candidates
def find_header(img_path):
    '''
    Use OCR to locate the column headings in an image.

    The image is prepared with process_image_for_ocr() and its top 1750 rows are
    OCRd twice, using Tesseract page segmentation modes 4 and 6. The candidates
    found by find_col_width() and find_header_height() on both runs are pooled.

    Returns a tuple (col_width, header_height) holding the integer median of each
    pool of candidates, or 0 where no candidates were found.
    '''
    img = process_image_for_ocr(img_path)
    _, width = img.shape[:2]
    # The header is always at the top, so only OCR the top slice rather than the whole page
    top_slice = img[0:1750, 0:width]
    width_candidates = []
    height_candidates = []
    # The psm setting can make a big difference to the results, but not predictably –
    # sometimes one mode does better, sometimes the other – so run both and pool the results.
    for psm in [4, 6]:
        df = pytesseract.image_to_data(top_slice, config=f'--psm {psm} --oem 1 -l eng', output_type=pytesseract.Output.DATAFRAME)
        width_candidates.extend(find_col_width(df))
        height_candidates.extend(find_header_height(df))
    # statistics.median() raises on an empty list, so fall back to 0 when nothing was found
    header_height = int(statistics.median(height_candidates)) if height_candidates else 0
    col_width = int(statistics.median(width_candidates)) if width_candidates else 0
    return (col_width, header_height)
def check_for_skew(lines):
    '''
    Estimate how far an image is skewed from the near-vertical lines detected in it.

    Only lines spanning more than 150 px vertically and with x1 greater than 300 are
    considered, and of those only lines at an angle between 80 and 100 degrees.

    Returns the median angle of those lines minus 90 (so 0 means no skew).
    Raises statistics.StatisticsError if no line qualifies.
    '''
    vertical_angles = []
    for line in lines:
        for x1, y1, x2, y2 in line:
            # Skip lines that are too short, or too close to the left edge
            if abs(y1 - y2) <= 150 or x1 <= 300:
                continue
            # Always measure from the upper end point to the lower one,
            # so that the angle comes out between 0 and 180 degrees
            if y2 > y1:
                rise, run = y2 - y1, x2 - x1
            else:
                rise, run = y1 - y2, x1 - x2
            degrees = math.degrees(math.atan2(rise, run))
            # Keep the angle if the line is vertical-ish
            if 80 <= degrees <= 100:
                vertical_angles.append(degrees)
    # The median of the saved angles, expressed as an offset from vertical
    return statistics.median(vertical_angles) - 90
def deskew(img, angle):
    '''
    Rotate an image about its centre by the supplied angle.

    The output has the same dimensions as the input. Cubic interpolation is used,
    and areas exposed by the rotation are filled by replicating the border pixels.

    Returns the rotated image.
    '''
    height, width = img.shape[:2]
    centre = (width // 2, height // 2)
    # Rotation only – the scale factor is left at 1.0
    rotation = cv2.getRotationMatrix2D(centre, angle, 1.0)
    return cv2.warpAffine(img, rotation, (width, height), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
def add_grid(img):
    '''
    Draw a grid of 1 px lines, 100 px apart, over an image.
    Handy when trying to interpret the results of column detection.

    The lines are drawn directly onto img, which is also returned.
    '''
    height, width = img.shape[:2]
    grid_colour = (255, 0, 0)
    # Vertical lines
    for x in range(0, width, 100):
        cv2.line(img, (x, 0), (x, height), grid_colour, 1)
    # Horizontal lines
    for y in range(0, height, 100):
        cv2.line(img, (0, y), (width, y), grid_colour, 1)
    return img
def find_top(lines):
    '''
    Approximate the height of the header using horizontal lines near the top of the page.
    The result is used to ignore lines that fall within the header area.
    (A more accurate header position is found using Tesseract.)

    A line counts if its lowest point is above y = 1000 and the angle from its
    second end point to its first is between 179 and 181 degrees.

    Returns the largest y value amongst the lowest points of those lines,
    or 0 if there are none.
    '''
    y_values = []
    for line in lines:
        for x1, y1, x2, y2 in line:
            lowest = max(y1, y2)
            # Only interested in lines near the top of the page
            if lowest >= 1000:
                continue
            degrees = math.degrees(math.atan2(y1 - y2, x1 - x2))
            if 179 < degrees < 181:
                y_values.append(lowest)
    return max(y_values, default=0)
def find_columns(lines, h, w, column_top, col_width):
    '''
    Identifies most likely column values from within the set of straight lines in an image.
    This could do with some cleaning up, but it's working well at the moment, so I don't really want to fiddle any more.
    Note that this does depend on some knowledge of the images to define ranges of expected values.

    Parameters:
        lines – detected line segments, each entry holding (x1, y1, x2, y2) end points
        h, w – height and width of the image
        column_top – currently ignored: it is overwritten with the result of find_top(lines)
        col_width – x position of the column headings found by OCR (0 if unknown),
                    used to set the range of column widths to look for

    Returns a tuple (gutter, columns). gutter is the x position of the left-hand gutter
    (200 if none was detected). columns is a list of the x positions of the detected
    column boundaries, always ending with w.
    '''
    x_values = []
    # Get the approximate position of the header so we can ignore lines above this
    # NOTE(review): this replaces the column_top value supplied by the caller – confirm that's intended
    column_top = find_top(lines)
    # Find the x values of vertical lines
    for line in lines:
        for x1,y1,x2,y2 in line:
            # Find the top
            top = y1 if y1 < y2 else y2
            # Ignore column lines at the top & bottom of the image
            if top > column_top and top < (h - 600):
                # Find the leftmost point
                first = x1 if x1 < x2 else x2
                # Find the angle of the line
                radians = math.atan2((y1 - y2), (x1 - x2))
                degrees = abs(math.degrees(radians))
                # If the line is (close to) vertical, we'll save the left-most x value
                if degrees >= 89 and degrees <= 91:
                    x_values.append(first)
    # Sort the x_values
    x_values = sorted(x_values)
    # Cluster together values within the specified distance
    clusters = []
    start = 0
    # Lines less than this distance apart will be clustered
    distance = 10
    cluster = []
    # Loop through x values
    for x in x_values:
        # If the x value is less than the specified distance from the previous point,
        # we'll add it to the current cluster
        if x < start + distance:
            cluster.append(x)
        # If not we'll save the current cluster, and start a new one
        else:
            if cluster:
                # Add the current cluster to the list of clusters
                clusters.append(cluster)
            # Start a new cluster at the current point
            cluster = [x]
        # Set the current position
        start = x
    # Add the last cluster once we've finished the loop.
    # If no vertical lines were found the cluster is empty – adding it would make
    # the cluster[0] lookup below fail with an IndexError, so leave it out.
    if cluster:
        clusters.append(cluster)
    # Now we have a list of clustered x values
    # We'll compare nearby clusters and keep the ones with the most values (most likely to be columns)
    best_clusters = [[0]]
    # Loop through clusters
    for cluster in clusters:
        # If the current cluster is within 200px of the previous one
        if cluster[0] < best_clusters[-1][-1] + 200:
            # Check to see which cluster contains the most values
            # If it's the current one we'll add it to our best clusters
            if len(cluster) > len(best_clusters[-1]):
                # Remove the previous cluster from best clusters
                best_clusters.pop()
                # Add this one
                best_clusters.append(cluster)
        # If this cluster isn't near the previous one, add it to best clusters
        else:
            best_clusters.append(cluster)
    # Now we have our best candidates for columns in best clusters
    # We'll do some further filtering by checking the clusters against our expectations of column positions
    # The pixel values used below are based on trial and error with the Stock Exchange images
    # Obviously if you were using this on other images you'd want to adjust them accordingly
    columns = []
    start = 0
    gutter = 0
    gap = None
    max_col_width = 2000
    # Loop through our best clusters
    for cluster in best_clusters:
        min_col_width = 950
        # If the leftmost point in this cluster is less than 600 then it's the gutter
        if cluster and cluster[0] < 600:
            # Set the gutter value to a mean of the clustered points
            gutter = mean(cluster)
            # Sometimes the gutter isn't detected, so we'll set a reasonable start position
            if gutter == 0:
                gutter = 200
            if gutter <= 200:
                start = 250
            else:
                start = gutter + 50
            if col_width:
                # Scale the expected range of column widths using the position of the headings
                min_col_width = max(min_col_width, int(round((col_width - start) * 1.65)))
                max_col_width = int(round((col_width - start) * 2.35))
        else:
            # Checking the gap between this cluster and the previous one
            if gap:
                this_gap = gap
            else:
                # Current gap is the leftmost point of this cluster minus the previous column position
                this_gap = cluster[0] - start
            # This range represents approximate min/max column widths
            # We'll look for columns at 100 px intervals starting from the max value until we hit the min value
            for width in reversed(range(min_col_width, max_col_width + 100, 100)):
                cluster_mean = mean(cluster)
                # Try to make sure columns are roughly the same width
                if (cluster_mean - start) > (this_gap - 500) and (cluster_mean - start) < (this_gap + 500):
                    # If cluster falls within expected values, we'll assume it's a column
                    if cluster and cluster_mean >= (start + width) and cluster_mean <= (w - 900) and this_gap < 2600:
                        # Save mean of clustered values as column
                        columns.append(cluster_mean)
                        # Set the next start value to the mean of this cluster
                        start = cluster_mean
                        gap = this_gap
                        # Don't look for any more columns in this cluster
                        break
    # The right-hand edge of the image closes off the last column
    columns.append(w)
    return (gutter, columns)
def resize(img, h, w):
    '''
    Scale an image so that it is 5000 px wide, keeping its proportions.

    The scale factor is 5000 divided by w, so an image narrower than 5000 px
    is enlarged. The h argument is not used.

    Returns the resized image.
    '''
    # The same factor is applied to both axes
    factor = 5000 / float(w)
    return cv2.resize(img, None, fx=factor, fy=factor, interpolation=cv2.INTER_AREA)
def save_header(img, header, w, image_name, output_dir):
    '''
    Crop the header out of an image and save it as a JPEG.

    The crop runs from the top of the image down to 20 px below the header value.
    It's saved in the 'headers' directory inside output_dir, named after image_name
    with its last 4 characters replaced by '-header.jpg'.
    '''
    header_dir = os.path.join(output_dir, 'headers')
    # Numpy slicing takes rows first, then columns – im[y1:y2, x1:x2]
    crop = img[0:header+20, 0:w]
    target = '{}/{}-header.jpg'.format(header_dir, image_name[:-4])
    cv2.imwrite(target, crop)
def save_columns(img, columns, header, h, image_name, output_dir):
    '''
    Crop each column out of an image and save it as a JPEG.

    columns should list the gutter position first and the image width last –
    each consecutive pair of values marks the left and right edges of one column.
    Crops start 20 px above the header value (but not above the top of the image)
    and run down to h.

    Files are saved in the 'columns' directory inside output_dir, numbered from 1
    in column order.
    '''
    col_dir = os.path.join(output_dir, 'columns')
    # Pairing each value with the one after it means the last value is only ever a right edge
    for number, (left, right) in enumerate(zip(columns, columns[1:]), start=1):
        # Add a little extra to the left margin where there's room
        if left > 20:
            left = left - 20
        col_img = img[max(0, header-20):h, left:right]
        cv2.imwrite('{}/{}-col-{}.jpg'.format(col_dir, image_name[:-4], number), col_img)
def display_lines(image_name, output_dir, img, lines):
    '''
    For testing / debugging – draws ALL of the detected lines onto img.

    The image is modified in place; nothing is saved or returned.
    image_name and output_dir are not currently used.
    '''
    # Each entry in lines wraps a single set of end points
    for x1, y1, x2, y2 in (line[0] for line in lines):
        cv2.line(img, (x1, y1), (x2, y2), (0, 0, 255), 8)
def process_image(image_name, image_path, output_dir='test', markup=False, grid=False, save_derivs=True):
    '''
    Detect the columns and header in an image.

    Parameters:
        image_name – file name, used (minus its last 4 characters) to name the output files
        image_path – path of the image to process
        output_dir – where results are saved (must exist, along with its 'columns' subdirectory)
        markup – if True, draw the detected gutter, columns and header on the image
                 and save it in output_dir
        grid – if True, draw a 100 px grid on the image
        save_derivs – if True, crop and save each detected column

    If the image can't be read a message is printed and nothing else is done.
    '''
    img = cv2.imread(image_path)
    # Weed out dodgy images – if the file couldn't be read there's no array to get a shape from
    try:
        h, w = img.shape[:2]
    except AttributeError:
        print('Not a valid image: {}'.format(image_path))
        return
    # Standardise things a little by scaling down anything wider than 5000 px
    if w > 5000:
        img = resize(img, h, w)
        h, w = img.shape[:2]
    # Look for straight lines, and use them to check for skew
    # (I'm not actually sure if the deskewing steps are useful)
    lines = find_lines(img)
    angle = check_for_skew(lines)
    if angle != 0:
        img = deskew(img, angle)
        # Rotating the image moves everything, so the lines have to be detected again
        lines = find_lines(img)
    # OCR the top of the page to find the bottom of the header
    col_width, header = find_header(image_path)
    # Work out which of the detected lines are columns
    gutter, columns = find_columns(lines, h, w, header, col_width)
    # Grid for debugging
    if grid:
        img = add_grid(img)
    if save_derivs:
        # The gutter goes at the front of the list so it becomes the left edge of the first column
        columns = [gutter] + columns
        save_columns(img, columns, header, h, image_name, output_dir)
    # For testing – draw the results on the image and save it
    if markup:
        green = (0, 255, 0)
        cv2.line(img, (gutter, 0), (gutter, h), green, 5)
        for column in columns:
            cv2.line(img, (column, 0), (column, h), green, 5)
        cv2.line(img, (0, header), (w, header), (255, 0, 0), 3)
        cv2.imwrite('{}/{}.jpg'.format(output_dir, image_name[:-4]), img)
# Try out column detection on a single page, saving a marked-up copy to the 'testing' directory
process_image(
    image_name='testing.jpg',
    image_path='/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-077/N193-077_0001.tif',
    output_dir='testing',
    markup=True,
)
# Try out header detection on a single page
find_header(
    img_path='/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-001/N193-001_0006.tif'
)
# Directory to process
dir_path = '/Volumes/Sydney Stock Exchange Vol 1/Sydney Stock Exchange 001-109/Transferred AU NBAC N193-001'
# This is where the processed images should go
output_dir = 'fullsize-processed/AU NBAC N193-001'
os.makedirs(os.path.join(output_dir, 'columns'), exist_ok=True)
os.makedirs(os.path.join(output_dir, 'headers'), exist_ok=True)
# Loop through images with a .tif or .tiff extension (in upper or lower case)
for img_name in tqdm([i for i in os.listdir(dir_path) if i.lower().endswith(('.tif', '.tiff'))]):
    img_path = os.path.join(dir_path, img_name)
    try:
        process_image(img_name, img_path, output_dir, markup=True)
    except (TypeError, statistics.StatisticsError) as err:
        # Line / skew detection came up empty for this image. Report which one it was
        # and carry on with the rest – tqdm.write() keeps the progress bar intact.
        tqdm.write('ERROR processing {}: {!r}'.format(img_name, err))
# Directories with a volume number lower than this are skipped
start_vol = 188
# Directory of directories
input_path = Path('/Volumes/Sydney Stock Exchange Vol 2/Sydney Stock Exchange Vol 2 110-199/')
# Loop through directories
for img_dir in tqdm([d for d in input_path.glob('*') if d.is_dir()], desc='Directories'):
    # The volume number is the run of digits at the end of the directory name
    vol_match = re.search(r'(\d+)$', str(img_dir))
    if vol_match is None:
        # Not a volume directory – skip it rather than crash on the missing match
        tqdm.write('Skipping {} (no volume number)'.format(img_dir))
        continue
    vol_num = int(vol_match.group(1))
    if vol_num < start_vol:
        continue
    # This is where the processed images should go
    output_path = Path('/Volumes/bigdata/mydata/stockexchange/processed', img_dir.name.replace('Transferred ', ''))
    Path(output_path, 'columns').mkdir(parents=True, exist_ok=True)
    # Loop through images with a .tif or .tiff extension (in upper or lower case)
    for img_path in tqdm([i for i in img_dir.glob('*') if i.name.lower().endswith(('.tif', '.tiff'))], leave=False, desc='Images'):
        img_name = img_path.name
        try:
            process_image(str(img_name), str(img_path), str(output_path), markup=True)
        except (TypeError, statistics.StatisticsError) as err:
            # Line / skew detection came up empty for this image. Keep going, but leave
            # a record of what was skipped so that it can be checked later.
            tqdm.write('ERROR processing {}: {!r}'.format(img_path, err))
%%time
# IPython cell magic (notebook only) – reports how long a single run of this cell takes.
# The marked-up image is written to the 'data' directory.
process_image('N193-022_0184.tif', '/Users/tim/Dropbox/working_code/stockexchange/src/notebooks/samples/AU-NBAC-N193-022/N193-022_0184.tif', 'data', markup=True)
%%timeit
# IPython cell magic (notebook only) – runs the cell repeatedly to get an average timing.
process_image('N193-022_0184.tif', '/Users/tim/Dropbox/working_code/stockexchange/src/notebooks/samples/AU-NBAC-N193-022/N193-022_0184.tif', 'data', markup=True)
%%time
# NOTE(review): the image_name here ('N193-022_0184.tif') doesn't match the file being read
# (N193-007_0012.tif), so the output files will be named after N193-022_0184 – confirm intended.
process_image('N193-022_0184.tif', '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-007/N193-007_0012.tif', 'data', markup=True)
%%timeit
# Same image as the first two cells, read from the shared cloudstor copy.
process_image('N193-022_0184.tif', '/scratch/cloudstor/Shared/ANU-Library/Sydney Stock Exchange 1901-1950/AU NBAC N193-022/N193-022_0184.tif', 'data', markup=True)