import os
WORKDIR = os.path.join('/home', 'htr')
%pylab inline
Populating the interactive namespace from numpy and matplotlib
mpl.rc('figure', figsize=(5, 20))
mpl.rc('xtick', labelsize=16)
mpl.rc('ytick', labelsize=16)
mpl.rc('font', size=16)
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
import seaborn as sns
import pandas as pd
import json
import glob
import string
from time import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2
import imgaug as ia
import imgaug.augmenters as iaa
import imageio
ann_path = os.path.join(WORKDIR, 'HKR_Dataset_Words_Public', 'ann')
img_path = os.path.join(WORKDIR, 'HKR_Dataset_Words_Public', 'img')
meta_path = os.path.join(WORKDIR, 'metadata')
def read_image(path: str, ax: matplotlib.axes.Axes,
               title: str='', img_path: str=img_path) -> None:
    '''Utility function for displaying the image at "path" on axis "ax"'''
    image = imageio.imread(os.path.join(img_path, path))
    ax.imshow(image, cmap='gray')
    ax.axis("off")
    title = title if title else path
    ax.set_title(title)
def counts_to_df(df: pd.DataFrame, column: str='description') -> pd.DataFrame:
    '''Return a dataframe of per-symbol counts for "column"'''
    counts = pd.DataFrame(df[column].map(list).explode())
    counts = counts.join(counts[column].value_counts(), on=column, rsuffix='1')
    counts.columns = ['symbols', 'counts']
    # duplicates are kept so rows stay aligned with the original df index
    counts = counts[~(counts.symbols == '') & ~(counts.symbols == ' ')]
    return counts
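# Tiny illustration of counts_to_df on a hypothetical toy frame: each label is
# exploded into one row per symbol (original index preserved), joined with the
# symbol's total count over the whole column.
toy = pd.DataFrame({'description': ['аб', 'а']}, index=['i0', 'i1'])
print(counts_to_df(toy))
#    symbols  counts
# i0       а       2
# i0       б       1
# i1       а       2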
def meta_collect(ann_path: str, result_file: str, sep: str='\t') -> None:
    '''Collect metadata for all images into "result_file"
    from the json files in "ann_path" (execution time: about 5 mins)'''
    start = time()
    with open(result_file, 'w', encoding='utf-8') as f:
        # the header has no column for the image name: pandas will read it as the index
        f.write(sep.join(['width', 'height', 'description',
                          'isModerated', 'moderatedBy', 'predicted']) + '\n')
        for file in tqdm(glob.glob(os.path.join(ann_path, '*.json'))):
            with open(file, encoding='utf-8') as js:
                tmp = json.load(js)
            try:
                f.write(sep.join([tmp['name'], str(tmp['size']['width']), str(tmp['size']['height']),
                                  tmp['description'], str(tmp['moderation']['isModerated']),
                                  tmp['moderation']['moderatedBy'], str(tmp['moderation']['predicted'])]) + '\n')
            except Exception:
                print(tmp['description'])
    print('execution time:', (time() - start), 'secs')
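# For reference, the parser above expects each annotation JSON to look roughly
# like this (a minimal sketch inferred from the keys accessed; values are
# illustrative, taken from the first row of the resulting metadata):
# {
#     "name": "0_0_0",
#     "size": {"width": 495, "height": 64},
#     "description": "Шёл человек.",
#     "moderation": {"isModerated": 1, "moderatedBy": "Norlist", "predicted": null}
# }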
# meta_collect(ann_path, os.path.join(meta_path, 'metadata.tsv'))
df = pd.read_csv(os.path.join(meta_path, 'metadata.tsv'), sep='\t', index_col=0)
print(df.shape)
df.head()
(64943, 6)
 | width | height | description | isModerated | moderatedBy | predicted |
---|---|---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 1 | Norlist | NaN |
0_0_1 | 494 | 65 | Шёл человек | 1 | Norlist | NaN |
0_0_10 | 489 | 73 | Шёл человек | 1 | Norlist | NaN |
0_0_11 | 406 | 46 | Шёл человек. | 1 | Norlist | NaN |
0_0_12 | 379 | 76 | Шёл человек | 1 | Norlist | NaN |
df['description_length'] = df.description.apply(len)
df
 | width | height | description | isModerated | moderatedBy | predicted | description_length |
---|---|---|---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 1 | Norlist | NaN | 12 |
0_0_1 | 494 | 65 | Шёл человек | 1 | Norlist | NaN | 11 |
0_0_10 | 489 | 73 | Шёл человек | 1 | Norlist | NaN | 11 |
0_0_11 | 406 | 46 | Шёл человек. | 1 | Norlist | NaN | 12 |
0_0_12 | 379 | 76 | Шёл человек | 1 | Norlist | NaN | 11 |
... | ... | ... | ... | ... | ... | ... | ... |
9_9_875 | 543 | 94 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_877 | 462 | 73 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_878 | 595 | 83 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_879 | 532 | 72 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_880 | 538 | 72 | Вид постоялого | 1 | Norlist | NaN | 14 |
64943 rows × 7 columns
df.describe().round(2)
 | width | height | isModerated | predicted | description_length |
---|---|---|---|---|---|
count | 64943.00 | 64943.00 | 64943.0 | 0.0 | 64943.00 |
mean | 443.28 | 75.70 | 1.0 | NaN | 11.02 |
std | 176.95 | 17.39 | 0.0 | NaN | 4.39 |
min | 44.00 | 14.00 | 1.0 | NaN | 2.00 |
25% | 305.00 | 64.00 | 1.0 | NaN | 7.00 |
50% | 430.00 | 75.00 | 1.0 | NaN | 11.00 |
75% | 565.00 | 87.00 | 1.0 | NaN | 14.00 |
max | 1697.00 | 150.00 | 1.0 | NaN | 42.00 |
df.describe(include=object)
 | description | moderatedBy |
---|---|---|
count | 64943 | 64943 |
unique | 2808 | 2 |
top | Актау | Norlist |
freq | 484 | 59421 |
df[~df.predicted.isna()]
 | width | height | description | isModerated | moderatedBy | predicted | description_length |
---|---|---|---|---|---|---|---|
(0 rows)
df.isModerated.value_counts()
1    64943
Name: isModerated, dtype: int64
cou = df.moderatedBy.value_counts()
cou
Norlist               59421
Daniyar Borisovich     5522
Name: moderatedBy, dtype: int64
df.drop(['predicted', 'moderatedBy', 'isModerated'], axis=1, inplace=True)
n = 10
img_names = random.choice(df.index, n)
fig, axes = subplots(n, 1)
for img_name, ax in zip(img_names, axes):
read_image(img_name + '.jpg', ax=ax, title=img_name + f' ({df.loc[img_name].description})')
tight_layout()
# Create a dataframe of symbol counts, indexed like the original df
counts = counts_to_df(df, 'description')  # duplicates kept to preserve the original df indexes
# Barplot with symbol counts in dataset
fig, ax = subplots(figsize=(20, 10))
ax.set_title('Frequency of symbols')
color = '#2ca3db'
ax = sns.barplot(data=counts.sort_values('counts', ascending=False),
x='symbols', y='counts', ax=ax, color=color)
ax.set(xlabel='letter', ylabel='frequency')
tight_layout()
print(f'All characters:\n {counts.symbols.unique()}')
All characters:
 ['Ш' 'ё' 'л' 'ч' 'е' 'о' 'в' 'к' '.' ',' 'С' 'п' 'ы' 'м' 'а' 'т' 'О' 'н' 'и' 'я' 'с' 'ь' 'Б' 'ш' 'у' 'ю' 'Г' 'д' 'К' 'х' 'р' 'ж' 'щ' 'г' '?' 'Л' 'б' 'И' 'Н' 'Д' 'З' ';' 'Р' 'з' 'э' 'А' 'В' 'й' '–' 'ц' 'Т' 'o' 'Ч' 'Қ' 'қ' 'Х' 'ғ' 'Й' 'Ы' 'П' 'Е' 'М' 'У' 'Ь' 'ө' 'Я' '(' ')' 'Ю' '-' 'Ж' 'Ө' 'Щ' 'Э' 'H' 'Ү' 'Ф' '!' ':' 'ф' '…' '—' 'ъ']
# Creating reference alphabet with Russian (lower- and uppercase) and punctuation symbols
alphabet_lower = [chr(ord("а") + i) for i in range(32)] + [chr(ord("а") + 33)] # Last is "ё"
alphabet_upper = [chr(ord("А") + i) for i in range(32)]
punctuation = list(string.punctuation)
ref_alphabet = set(alphabet_lower + alphabet_upper + punctuation)
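# Why "ё" needs the special case above (assuming the standard Unicode Cyrillic
# block): "а".."я" are contiguous code points, while "ё" (U+0451) and "Ё"
# (U+0401) sit outside that range. The uppercase range therefore omits "Ё",
# which is fine here since "Ё" never occurs in the dataset labels.
print(hex(ord('а')), hex(ord('я')), hex(ord('ё')))  # 0x430 0x44f 0x451
print(hex(ord('А')), hex(ord('Я')), hex(ord('Ё')))  # 0x410 0x42f 0x401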
# Creating alphabet from dataset
alphabet = set(counts.symbols)
print(f'Reference alphabet length: {len(ref_alphabet)}')
print(f'Actual alphabet length: {len(alphabet)}\n')
# difference between dataset and reference alphabet
diff_symbols = alphabet - ref_alphabet
diff_counts = counts[counts.symbols.isin(diff_symbols)]
print(f'Non-ordinary symbols: {diff_symbols}')
print(f'Number of labels with non-ordinary symbols: {diff_counts.index.unique().shape[0]}')
print(f'Percentage of labels with non-ordinary symbols: {round(diff_counts.index.unique().shape[0]/df.shape[0]*100, 2)}%')
Reference alphabet length: 97
Actual alphabet length: 83

Non-ordinary symbols: {'қ', '–', 'Ө', 'o', '—', 'H', '…', 'ғ', 'ө', 'Қ', 'Ү'}
Number of labels with non-ordinary symbols: 815
Percentage of labels with non-ordinary symbols: 1.25%
fig, axes = subplots(len(diff_symbols), 1)
for sym, ax in zip(diff_symbols, axes):
ind = counts[counts.symbols == sym].index[0]
read_image(ind + '.jpg', ax, df.loc[ind].description + f' ({sym})')
tight_layout()
pd.options.display.max_rows = 100
df.loc[counts[counts.symbols.isin(diff_symbols)].index.drop_duplicates()].drop_duplicates('description')
 | width | height | description | description_length |
---|---|---|---|---|
0_46_1 | 530 | 70 | А встретятся – | 14 |
0_46_46 | 502 | 81 | А, встретятся – | 15 |
0_9_0 | 704 | 66 | Кем? Кем? Волкoм? | 17 |
0_9_15 | 734 | 43 | Кем? Кем? Волкoм | 16 |
0_9_16 | 682 | 65 | кем? Кем? Волкoв? | 17 |
0_9_28_ | 869 | 59 | Кем? Кем? волкoм? | 17 |
0_9_626 | 712 | 76 | Кем? Чем? Волкoм? | 17 |
10_0_107 | 377 | 112 | Қазахстан | 9 |
10_0_126 | 303 | 85 | Қазақстан | 9 |
10_10_123 | 423 | 106 | Карағанда | 9 |
10_11_123 | 412 | 73 | Қостанай | 8 |
10_12_107 | 371 | 105 | Қызылорда | 9 |
10_19_418 | 324 | 116 | Көкшетау | 8 |
10_31_88 | 615 | 107 | ҚАРАГАНДИНСКАЯ | 14 |
10_32_88 | 483 | 106 | Қостанайская | 12 |
10_33_111 | 542 | 94 | Қызылординская | 14 |
10_35_88 | 658 | 101 | Северо-Қазахстанская | 20 |
10_7_12 | 208 | 75 | Ақтау | 5 |
10_7_278 | 216 | 69 | АҚТАУ | 5 |
10_8_123 | 228 | 78 | Ақтобе | 6 |
10_8_259 | 193 | 44 | АКТӨБЕ | 6 |
10_8_292 | 211 | 66 | Актөбе | 6 |
10_8_88 | 233 | 96 | Ақтөбе | 6 |
11_45_12 | 323 | 67 | Аркалық | 7 |
12_17_80 | 355 | 56 | Қаражал | 7 |
13_17_138 | 521 | 66 | Hоводолинский | 13 |
13_17_139 | 334 | 75 | Hоводолинск | 11 |
13_17_141_ | 483 | 117 | Hоводомнский | 12 |
13_17_156 | 520 | 46 | Hоводолинскии | 13 |
13_20_140 | 166 | 70 | Ақсу | 4 |
13_25_180 | 335 | 67 | Балпық | 6 |
13_27_202 | 318 | 76 | Бестөбе | 7 |
13_45_171 | 258 | 52 | Мақат | 5 |
13_48_156 | 483 | 46 | Өтеген батыра | 13 |
13_4_156 | 249 | 58 | Үштобе | 6 |
2_45_133 | 624 | 96 | святых видений – | 16 |
3_50_185 | 257 | 66 | душои… | 6 |
3_50_188 | 339 | 88 | душой… | 6 |
3_50_238 | 315 | 92 | душой…. | 7 |
3_50_706 | 282 | 72 | дущой… | 6 |
4_12_250 | 453 | 76 | Глушь и снег… | 13 |
4_12_254 | 588 | 78 | Глуш и снег… | 12 |
4_12_292 | 438 | 83 | Глушь снег… | 11 |
4_14_252 | 578 | 78 | Скучно, грустно… | 16 |
4_36_246 | 694 | 92 | То сердечная тоска… | 19 |
4_39_248 | 699 | 108 | Попадаются одне… | 16 |
5_0_314 | 687 | 48 | Прошло пять лет,— | 17 |
5_0_339 | 573 | 52 | прошло пять лет,— | 17 |
5_10_311 | 501 | 67 | Где выл пожар — | 15 |
5_14_456 | 612 | 87 | Взывали к мцению — | 18 |
5_34_305 | 331 | 59 | гремел — | 8 |
5_41_307 | 612 | 61 | ныло от разлуки,— | 17 |
6_34_362 | 592 | 54 | проклятым роем… | 15 |
6_34_407 | 383 | 62 | проклятьем роем… | 16 |
6_35_362 | 691 | 58 | терзали жизнь мою… | 18 |
6_38_362 | 205 | 62 | свою… | 5 |
6_44_362 | 497 | 50 | нежная, нашла… | 14 |
6_44_363 | 737 | 64 | нежная, нашла…. | 15 |
6_44_780 | 597 | 43 | нежноя, нашла… | 14 |
6_47_363 | 775 | 68 | в сырую ночь ушла… | 18 |
7_47_420 | 401 | 63 | за веком — | 10 |
7_47_426 | 398 | 52 | ЗА ВЕКОМ — | 10 |
7_47_453 | 408 | 103 | за Веком — | 10 |
7_48_420 | 339 | 77 | Господь? — | 10 |
7_48_457 | 271 | 118 | Господ? — | 9 |
# Replace Latin homoglyphs with their Cyrillic counterparts ('o' -> 'о', 'H' -> 'Н')
df.description = df.description.str.replace('o', 'о').str.replace('H', 'Н')
# Normalize dashes and the ellipsis character to plain ASCII equivalents
df.description = df.description.str.replace('–', '-').str.replace('—', '-').str.replace('…', '...')
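# A quick look at why the homoglyph replacement matters: Latin 'o'/'H' are
# visual twins of Cyrillic 'о'/'Н' but different code points, so without the
# fix they would count as separate alphabet symbols.
print(ord('o'), ord('о'))  # 111 1086
print(ord('H'), ord('Н'))  # 72 1053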
counts = counts_to_df(df, 'description')
counts_dict = counts.set_index('symbols')['counts'].to_dict()
kazakh_symbols = set(counts_dict) - ref_alphabet
kazakh_counts = counts[counts.symbols.isin(kazakh_symbols)]
print(f'Kazakh symbols: {kazakh_symbols}')
print(f'Number of Kazakh labels: {kazakh_counts.index.unique().shape[0]}')
print(f'Percentage of Kazakh labels: {round(kazakh_counts.index.unique().shape[0]/df.shape[0]*100, 2)}%')
Kazakh symbols: {'қ', 'Ө', 'ғ', 'ө', 'Қ', 'Ү'}
Number of Kazakh labels: 98
Percentage of Kazakh labels: 0.15%
df = df.drop(counts[counts.symbols.isin(kazakh_symbols)].index.drop_duplicates(), axis=0)
df
 | width | height | description | description_length |
---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 12 |
0_0_1 | 494 | 65 | Шёл человек | 11 |
0_0_10 | 489 | 73 | Шёл человек | 11 |
0_0_11 | 406 | 46 | Шёл человек. | 12 |
0_0_12 | 379 | 76 | Шёл человек | 11 |
... | ... | ... | ... | ... |
9_9_875 | 543 | 94 | Вид постоялого | 14 |
9_9_877 | 462 | 73 | Вид постоялого | 14 |
9_9_878 | 595 | 83 | Вид постоялого | 14 |
9_9_879 | 532 | 72 | Вид постоялого | 14 |
9_9_880 | 538 | 72 | Вид постоялого | 14 |
64845 rows × 4 columns
fig, axes = subplots(2, 1, figsize=(15, 15))
tmp = counts_to_df(df)
tmp = tmp[~tmp.symbols.isin(punctuation)].drop_duplicates('symbols')
# Merge upper- and lowercase counts per letter
letter_counts = {}
for _, (sym, cnt) in tmp.iterrows():
    letter_counts[sym.lower()] = letter_counts.get(sym.lower(), 0) + cnt
tmp = pd.DataFrame(letter_counts, index=['counts']
                   ).T.reset_index().sort_values('counts', ascending=False)
sns.barplot(data=tmp, x='index', y='counts', ax=axes[0], color=color)
axes[0].set(xlabel='letter', ylabel='frequency')
axes[0].set_title('Dataset alphabet')
# Reference Russian letter frequencies; columns: 'Буква' = letter, 'Частотность' = frequency
ls = pd.read_csv(os.path.join('..', 'metadata', 'alphabet.tsv'), sep='\t',
                 index_col=1).sort_values('Частотность', ascending=False)
sns.barplot(data=ls, x='Буква', y='Частотность', ax=axes[1], color=color)
axes[1].set(xlabel='letter', ylabel='frequency')
axes[1].set_title('Russian alphabet')
tight_layout()
plt.figure(figsize=(15, 8))
ax = sns.kdeplot(df.description_length, bw=0.3)
ax.set(xlabel='description length', ylabel='density', title='Length distribution of text');
def print_imgs_with_hist(df, size=5, width=20, height=25):
    '''Show "size" random images next to their per-channel color histograms'''
    num_cols = 2
    _, ax = plt.subplots(size, num_cols, figsize=(width, height))
    i = 0
    for path, label in df.sample(size).iterrows():
        path = os.path.join(img_path, path + '.jpg')
        img = cv2.imread(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ax[i // num_cols, i % num_cols].imshow(img)
        ax[i // num_cols, i % num_cols].set_title(label.description)
        color = ('r', 'g', 'b')  # channel order after the BGR -> RGB conversion
        i += 1
        for j, col in enumerate(color):
            histr = cv2.calcHist([img], [j], None, [256], [0, 256])
            ax[i // num_cols, i % num_cols].plot(histr, color=col)
            ax[i // num_cols, i % num_cols].set_xlabel('color value')
            ax[i // num_cols, i % num_cols].set_ylabel('pixel count')
            ax[i // num_cols, i % num_cols].set_xlim([0, 255])
            ax[i // num_cols, i % num_cols].set_ylim([0, 2000])
        i += 1
    plt.show()
print('Color format: RGB')
print_imgs_with_hist(df[['description']])
Color format: RGB
plt.figure(figsize=(15, 8))
ax = sns.scatterplot(x=df.width, y=df.height, alpha=0.4)
ax.set(xlabel='width', ylabel='height', title='Size distribution of pictures');
print(f"width: max = {df.width.max()}, min = {df.width.min()}, mean = {df.width.mean()}")
print(f"height: max = {df.height.max()}, min = {df.height.min()}, mean = {df.height.mean()}\n")
mheight = df.height.value_counts().sort_values(ascending=False)
mwidth = df.width.value_counts().sort_values(ascending=False)
print(f'most common widths (of {mwidth.shape[0]} distinct values):\nwidth\tcount\n{mwidth.head(10)}\nand their mean = {mwidth.head(10).index.to_series().mean()}\n')
print(f'most common heights (of {mheight.shape[0]} distinct values):\nheight\tcount\n{mheight.head(10)}\nand their mean = {mheight.head(10).index.to_series().mean()}')
width: max = 1697, min = 44, mean = 443.46174724342666
height: max = 150, min = 14, mean = 75.69303724265556

most common widths (of 987 distinct values):
width	count
349    166
307    164
310    162
363    157
330    157
358    153
312    151
365    151
317    150
442    148
Name: width, dtype: int64
and their mean = 345.3

most common heights (of 131 distinct values):
height	count
75    1524
71    1507
72    1505
74    1498
73    1495
79    1470
70    1451
68    1444
76    1439
78    1414
Name: height, dtype: int64
and their mean = 73.6
fig, axes = subplots(4, 1, figsize=(10, 15))
df_list = [df[df.height == mheight.index[0]],
df[df.height == mheight.index[-1]],
df[df.width == mwidth.index[0]],
df[df.width == mwidth.index[-1]]]
titles = ['most common height example',
'least common height example',
'most common width example',
'least common width example']
for tmp, title, ax in zip(df_list, titles, axes):
    read_image(tmp.index[0] + '.jpg', ax, title=title + f' ({tmp.description.iloc[0]})')
    ax.axis('on')
tight_layout()
fig, axes = subplots(6, 1, figsize=(10, 15))
df_list = [df[df.height == df.height.max()],
df[df.height == df.height.min()],
df[df.width == df.width.max()],
df[df.width == df.width.min()],
df[df.description.apply(len) == df.description.apply(len).max()],
df[df.description.apply(len) == df.description.apply(len).min()]]
titles = ['max height example',
          'min height example',
          'max width example',
          'min width example',
          'max len description',
          'min len description']
for tmp, title, ax in zip(df_list, titles, axes):
    read_image(tmp.index[0] + '.jpg', ax, title=title + f' ({tmp.description.iloc[0]})')
    ax.axis('on')
tight_layout()
img_height, img_width = 100, 600
df = df[(df.width <= img_width) & (df.height <= img_height)]
max_length = df.description.str.len().max()
print(max_length)
df
22
 | width | height | description | description_length |
---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 12 |
0_0_1 | 494 | 65 | Шёл человек | 11 |
0_0_10 | 489 | 73 | Шёл человек | 11 |
0_0_11 | 406 | 46 | Шёл человек. | 12 |
0_0_12 | 379 | 76 | Шёл человек | 11 |
... | ... | ... | ... | ... |
9_9_875 | 543 | 94 | Вид постоялого | 14 |
9_9_877 | 462 | 73 | Вид постоялого | 14 |
9_9_878 | 595 | 83 | Вид постоялого | 14 |
9_9_879 | 532 | 72 | Вид постоялого | 14 |
9_9_880 | 538 | 72 | Вид постоялого | 14 |
48419 rows × 4 columns
# Indicator frame: a cell is 1.0 if the symbol occurs in that picture's label, otherwise 0
counts = counts_to_df(df, 'description')
counts.counts = 1
splitter = counts.reset_index().drop_duplicates().pivot(index='index', columns='symbols').fillna(0)
splitter
(column header: top level 'counts' spans all symbol columns; rows indexed by image name)
symbols | ! | ( | ) | , | - | . | : | ; | ? | А | ... | ч | ш | щ | ъ | ы | ь | э | ю | я | ё |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0_0_0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9_9_875 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_877 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_878 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_879 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_880 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
48419 rows × 72 columns
# Treat the splitter df as a multilabel class signature so we can easily split the original df into train and test
train, test, _, ls = train_test_split(df, splitter, shuffle=True,
test_size=0.15, random_state=12)
# Then split test into the final test (~10% of all data) and val (~5%, i.e. 0.33 of the 15%) dfs
test, val, _, _ = train_test_split(test, ls, shuffle=True,
test_size=0.33, random_state=17)
train_counts = counts_to_df(train, 'description')
test_counts = counts_to_df(test, 'description')
val_counts = counts_to_df(val, 'description')
print('Set differences between the symbols present in train, test and val data\n')
print('train_counts - test_counts: ', set(train_counts.symbols) - set(test_counts.symbols))
print('train_counts - val_counts: ', set(train_counts.symbols) - set(val_counts.symbols))
print('test_counts - train_counts: ', set(test_counts.symbols) - set(train_counts.symbols))
print('test_counts - val_counts: ', set(test_counts.symbols) - set(val_counts.symbols))
print('val_counts - train_counts: ', set(val_counts.symbols) - set(train_counts.symbols))
print('val_counts - test_counts: ', set(val_counts.symbols) - set(test_counts.symbols))
# Plot frequencies of symbols in three new dataframes
fig, axes = subplots(3, 1, figsize=(15, 15))
for tmp, ax, name in zip((train_counts, test_counts, val_counts), axes, ['train', 'test', 'val']):
sns.barplot(data=tmp.sort_values('counts', ascending=False),
x='symbols', y='counts', ax=ax)
ax.set_title(name)
tight_layout()
Set differences between the symbols present in train, test and val data

train_counts - test_counts:  {')', '('}
train_counts - val_counts:  {'ъ', ')', '('}
test_counts - train_counts:  set()
test_counts - val_counts:  {'ъ'}
val_counts - train_counts:  set()
val_counts - test_counts:  set()
batch_size = 16
# Mapping characters to integers
counts = counts_to_df(df)
# Vocabulary: every dataset symbol plus space and '#' (used as the padding/blank token)
counts = counts[~counts.isin(['', ' '])].symbols.unique().tolist() + [' ', '#']
vocab = pd.Series(counts).str.encode('utf8')
char_to_num = layers.experimental.preprocessing.StringLookup(
vocabulary=vocab,
mask_token=None,
)
# Mapping integers back to original characters
num_to_char = layers.experimental.preprocessing.StringLookup(
vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True,
)
blank_index = char_to_num(tf.strings.unicode_split('#', input_encoding="UTF-8")).numpy()[0]
blank_index  # index of the '#' padding/blank token
74
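# Round-trip sanity check for the two lookup layers above (a small sketch;
# 'Шёл человек' is just an example string from the dataset labels):
example = tf.strings.unicode_split('Шёл человек', input_encoding='UTF-8')
encoded = char_to_num(example)
decoded = tf.strings.reduce_join(num_to_char(encoded)).numpy().decode('utf-8')
print(decoded)  # should print 'Шёл человек' back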
def encode_single_sample(img_path, label):
    """Process one (image path, label) pair for the tf dataset"""
    # 1. Read the file
    img = tf.io.read_file(img_path)
    # 2. Decode and convert to grayscale (the dataset images are JPEGs)
    img = tf.io.decode_jpeg(img, channels=1)
    # 3. Convert to float32 in the [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 4. Invert so the background is 0, pad/crop to the fixed size
    #    (padding adds zeros, i.e. background), then shift to [-0.5, 0.5]
    img = 1 - img
    img = tf.image.resize_with_crop_or_pad(img, img_height, img_width)
    img = 0.5 - img
    # 5. Transpose the image because we want the time
    #    dimension to correspond to the width of the image
    img = tf.transpose(img, perm=[1, 0, 2])
    # 6. Map the characters in the label to numbers and pad to max_length with the blank token
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    label = tf.pad(label, [[0, max_length - len(label)]], constant_values=blank_index)
    # 7. Return a dict as our model expects two inputs
    return {"image": img, "label": label}
def get_dataset(samples: pd.DataFrame, batch_size: int = batch_size,
                prefetch: int = tf.data.experimental.AUTOTUNE) -> tf.data.Dataset:
    """Create a tf dataset of {'image', 'label'} dicts from a metadata frame"""
    dataset = tf.data.Dataset.from_tensor_slices(
        (samples.index.to_series().apply(lambda x: os.path.join(img_path, x) + '.jpg').tolist(),
         samples.description.tolist())
    )
    dataset = (
        dataset.map(
            encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
        )
        .batch(batch_size)
        .prefetch(prefetch)
    )
    return dataset
def show_batch(batch, batch_size):
    """Utility function to imshow one batch"""
    _, ax = plt.subplots(batch_size, 1, figsize=(10, batch_size * 2))
    images = batch['image']
    labels = batch['label']
    for i in range(batch_size):
        # undo the [-0.5, 0.5] shift, then transpose back to (height, width)
        img = ((images[i] + 0.5) * 255).numpy().astype('uint8')
        label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode('utf-8').replace('#', '')
        ax[i].imshow(img[:, :, 0].T, cmap='gray')
        ax[i].set_title(label)
    tight_layout()
    plt.show()
ind = df.index.tolist()
# random.shuffle(ind)
ls = get_dataset(df.loc[ind].iloc[:16])
for batch in ls.take(1):
show_batch(batch, batch_size=batch_size)
paths = df.index.to_series().apply(lambda x: os.path.join(img_path, x) + '.jpg')
aug_1 = os.path.join(img_path, 'aug_1')
if not os.path.exists(aug_1):
os.mkdir(aug_1)
sometimes = lambda aug: iaa.Sometimes(0.5, aug)  # helper, unused below (the pipeline uses explicit iaa.Sometimes calls)
seq = iaa.Sequential(
[
iaa.Sometimes(0.1, iaa.GaussianBlur(3.0)),
iaa.Sometimes(0.1, iaa.AveragePooling(2)),
iaa.Sometimes(0.1, iaa.Emboss(alpha=(0.0, 1.0), strength=(0.75, 1.25))),
iaa.Sometimes(0.1, iaa.GammaContrast((0.5, 1.0))),
iaa.Invert(0.05, per_channel=True),
iaa.Sometimes(0.1, iaa.CoarseDropout((0.0, 0.05), size_percent=(0.02, 0.25))),
iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25),
iaa.PerspectiveTransform(scale=(0.02, 0.05)),
iaa.Sometimes(0.1, iaa.SaltAndPepper(0.05)),
],
random_order=True
)
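# Quick preview of the augmentation pipeline (a sketch): apply it to one image
# a few times and display the variants before committing to the full slow run.
sample = imageio.imread(paths.iloc[0])
variants = seq(images=[copy(sample) for _ in range(4)])
fig, axes = subplots(4, 1, figsize=(10, 8))
for im, ax in zip(variants, axes):
    ax.imshow(im)
    ax.axis('off')
tight_layout()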
for path in tqdm(paths):
    break  # remove this break to actually run the (slow) augmentation pass
    print('start')
    img = imageio.imread(path)
    images = [copy(img) for _ in range(30)]
    ls = seq(images=images)
    print(len(ls))
    for i in range(30):
        _, name = os.path.split(path)
        name = os.path.join(aug_1, f'{i}_aug_' + name)
        # imageio reads RGB while cv2.imwrite expects BGR, so convert back
        cv2.imwrite(name, cv2.cvtColor(ls[i], cv2.COLOR_RGB2BGR))
0%| | 0/48419 [00:00<?, ?it/s]
from preprocess import PreprocessFrame, Dataset, make_augments
img_width = 600
img_height = 100
# parameters of resized images
new_img_width = 300
new_img_height = 50
# default paths
WORKING_DIR = os.path.join('/home', 'htr')
ann_path = os.path.join(WORKING_DIR, 'HKR_Dataset_Words_Public', 'ann')
img_path = os.path.join(WORKING_DIR, 'HKR_Dataset_Words_Public', 'img')
# collect metadata
# meta_collect(ann_path, os.path.join(WORKING_DIR, 'metadata', 'metadata.tsv'))
# get preprocessed metadata dataframe
df = PreprocessFrame(metadata=os.path.join(WORKING_DIR, 'metadata', 'metadata.tsv'),
img_height=img_height, img_width=img_width)
print(df.shape)
# Make the augments (if they already exist, leave the next call commented out)
aug_df = None
# aug_df = make_augments(df=df, img_path=img_path, WORKING_DIR=WORKING_DIR,
# img_height=img_height, img_width=img_width)
# Derive the augments metadata dataframe from the original one if make_augments was not run
if not isinstance(aug_df, pd.DataFrame):
aug_df = df.copy()
aug_df.index = aug_df.index.to_series().apply(lambda x: os.path.join('aug_1', 'aug_' + x))
train, test, val = list(Dataset(df, aug_df=aug_df,
test_size=0.1,
val_size=0.05,
img_path=img_path,
img_height=img_height,
img_width=img_width,
new_img_height=new_img_height,
new_img_width=new_img_width,
WORKING_DIR=WORKING_DIR,
shuffle=True,
random_state=12))
train
(48419, 4)
<PrefetchDataset shapes: {image: (None, 300, 50, 1), label: (None, None)}, types: {image: tf.float32, label: tf.int64}>
for batch in train.take(1):
show_batch(batch, batch_size=batch_size)
for batch in test.take(1):
show_batch(batch, batch_size=batch_size)
for batch in val.take(1):
show_batch(batch, batch_size=batch_size)