import os
WORKDIR = os.path.join('/home', 'htr')
%pylab inline
Populating the interactive namespace from numpy and matplotlib
mpl.rc('figure', figsize=(5, 20))
mpl.rc('xtick', labelsize=16)
mpl.rc('ytick', labelsize=16)
mpl.rc('font', size=16)
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
import seaborn as sns
import pandas as pd
import json
import glob
import string
from time import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2
import imgaug as ia
import imgaug.augmenters as iaa
import imageio
ann_path = os.path.join(WORKDIR, 'HKR_Dataset_Words_Public', 'ann')
img_path = os.path.join(WORKDIR, 'HKR_Dataset_Words_Public', 'img')
meta_path = os.path.join(WORKDIR, 'metadata')
def read_image(path: str, ax: matplotlib.axes.Axes,
               title: str='', img_path: str=img_path) -> None:
    '''Utility function for displaying the image at "path" on axis "ax"'''
    image = imageio.imread(os.path.join(img_path, path))
    ax.imshow(image, cmap='gray')
    ax.axis("off")
    title = title if title else path
    ax.set_title(title)
def counts_to_df(df: pd.DataFrame, column: str='description') -> pd.DataFrame:
    '''Return a dataframe of per-symbol counts for "column"'''
    counts = pd.DataFrame(df[column].map(list).explode())
    counts = counts.join(counts[column].value_counts(), on=column, rsuffix='1')
    counts.columns = ['symbols', 'counts']
    # duplicates are kept so rows stay aligned with the original df index
    counts = counts[~(counts.symbols == '') & ~(counts.symbols == ' ')]
    return counts
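# Tiny illustration of counts_to_df on a hypothetical toy frame: each label is
# exploded into one row per symbol (original index preserved), joined with the
# symbol's total count over the whole column.
toy = pd.DataFrame({'description': ['аб', 'а']}, index=['i0', 'i1'])
print(counts_to_df(toy))
#    symbols  counts
# i0       а       2
# i0       б       1
# i1       а       2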
def meta_collect(ann_path: str, result_file: str, sep: str='\t') -> None:
    '''Collect metadata for all images into "result_file"
    from the json files in "ann_path" (execution time: about 5 mins)'''
    start = time()
    with open(result_file, 'w', encoding='utf-8') as f:
        # the header has no column for the image name: pandas will read it as the index
        f.write(sep.join(['width', 'height', 'description',
                          'isModerated', 'moderatedBy', 'predicted']) + '\n')
        for file in tqdm(glob.glob(os.path.join(ann_path, '*.json'))):
            with open(file, encoding='utf-8') as js:
                tmp = json.load(js)
            try:
                f.write(sep.join([tmp['name'], str(tmp['size']['width']), str(tmp['size']['height']),
                                  tmp['description'], str(tmp['moderation']['isModerated']),
                                  tmp['moderation']['moderatedBy'], str(tmp['moderation']['predicted'])]) + '\n')
            except Exception:
                print(tmp['description'])
    print('execution time:', (time() - start), 'secs')
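# For reference, the parser above expects each annotation JSON to look roughly
# like this (a minimal sketch inferred from the keys accessed; values are
# illustrative, taken from the first row of the resulting metadata):
# {
#     "name": "0_0_0",
#     "size": {"width": 495, "height": 64},
#     "description": "Шёл человек.",
#     "moderation": {"isModerated": 1, "moderatedBy": "Norlist", "predicted": null}
# }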
# meta_collect(ann_path, os.path.join(meta_path, 'metadata.tsv'))
df = pd.read_csv(os.path.join(meta_path, 'metadata.tsv'), sep='\t', index_col=0)
print(df.shape)
df.head()
(64943, 6)
 | width | height | description | isModerated | moderatedBy | predicted |
---|---|---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 1 | Norlist | NaN |
0_0_1 | 494 | 65 | Шёл человек | 1 | Norlist | NaN |
0_0_10 | 489 | 73 | Шёл человек | 1 | Norlist | NaN |
0_0_11 | 406 | 46 | Шёл человек. | 1 | Norlist | NaN |
0_0_12 | 379 | 76 | Шёл человек | 1 | Norlist | NaN |
df['description_length'] = df.description.apply(len)
df
 | width | height | description | isModerated | moderatedBy | predicted | description_length |
---|---|---|---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 1 | Norlist | NaN | 12 |
0_0_1 | 494 | 65 | Шёл человек | 1 | Norlist | NaN | 11 |
0_0_10 | 489 | 73 | Шёл человек | 1 | Norlist | NaN | 11 |
0_0_11 | 406 | 46 | Шёл человек. | 1 | Norlist | NaN | 12 |
0_0_12 | 379 | 76 | Шёл человек | 1 | Norlist | NaN | 11 |
... | ... | ... | ... | ... | ... | ... | ... |
9_9_875 | 543 | 94 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_877 | 462 | 73 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_878 | 595 | 83 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_879 | 532 | 72 | Вид постоялого | 1 | Norlist | NaN | 14 |
9_9_880 | 538 | 72 | Вид постоялого | 1 | Norlist | NaN | 14 |
64943 rows × 7 columns
df.describe().round(2)
 | width | height | isModerated | predicted | description_length |
---|---|---|---|---|---|
count | 64943.00 | 64943.00 | 64943.0 | 0.0 | 64943.00 |
mean | 443.28 | 75.70 | 1.0 | NaN | 11.02 |
std | 176.95 | 17.39 | 0.0 | NaN | 4.39 |
min | 44.00 | 14.00 | 1.0 | NaN | 2.00 |
25% | 305.00 | 64.00 | 1.0 | NaN | 7.00 |
50% | 430.00 | 75.00 | 1.0 | NaN | 11.00 |
75% | 565.00 | 87.00 | 1.0 | NaN | 14.00 |
max | 1697.00 | 150.00 | 1.0 | NaN | 42.00 |
df.describe(include=object)
 | description | moderatedBy |
---|---|---|
count | 64943 | 64943 |
unique | 2808 | 2 |
top | Актау | Norlist |
freq | 484 | 59421 |
df[~df.predicted.isna()]
 | width | height | description | isModerated | moderatedBy | predicted | description_length |
---|---|---|---|---|---|---|---|
(0 rows)
df.isModerated.value_counts()
1    64943
Name: isModerated, dtype: int64
cou = df.moderatedBy.value_counts()
cou
Norlist               59421
Daniyar Borisovich     5522
Name: moderatedBy, dtype: int64
df.drop(['predicted', 'moderatedBy', 'isModerated'], axis=1, inplace=True)
n = 10
img_names = random.choice(df.index, n)
fig, axes = subplots(n, 1)
for img_name, ax in zip(img_names, axes):
read_image(img_name + '.jpg', ax=ax, title=img_name + f' ({df.loc[img_name].description})')
tight_layout()
# Create a dataframe of symbol counts, indexed like the original df
counts = counts_to_df(df, 'description')  # duplicates kept to preserve the original df indexes
# Barplot with symbol counts in dataset
fig, ax = subplots(figsize=(20, 10))
ax.set_title('Frequency of symbols')
color = '#2ca3db'
ax = sns.barplot(data=counts.sort_values('counts', ascending=False),
x='symbols', y='counts', ax=ax, color=color)
ax.set(xlabel='letter', ylabel='frequency')
tight_layout()
print(f'All characters:\n {counts.symbols.unique()}')
All characters:
 ['Ш' 'ё' 'л' 'ч' 'е' 'о' 'в' 'к' '.' ',' 'С' 'п' 'ы' 'м' 'а' 'т' 'О' 'н' 'и' 'я' 'с' 'ь' 'Б' 'ш' 'у' 'ю' 'Г' 'д' 'К' 'х' 'р' 'ж' 'щ' 'г' '?' 'Л' 'б' 'И' 'Н' 'Д' 'З' ';' 'Р' 'з' 'э' 'А' 'В' 'й' '–' 'ц' 'Т' 'o' 'Ч' 'Қ' 'қ' 'Х' 'ғ' 'Й' 'Ы' 'П' 'Е' 'М' 'У' 'Ь' 'ө' 'Я' '(' ')' 'Ю' '-' 'Ж' 'Ө' 'Щ' 'Э' 'H' 'Ү' 'Ф' '!' ':' 'ф' '…' '—' 'ъ']
# Creating reference alphabet with Russian (lower- and uppercase) and punctuation symbols
alphabet_lower = [chr(ord("а") + i) for i in range(32)] + [chr(ord("а") + 33)] # Last is "ё"
alphabet_upper = [chr(ord("А") + i) for i in range(32)]
punctuation = list(string.punctuation)
ref_alphabet = set(alphabet_lower + alphabet_upper + punctuation)
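# Why "ё" needs the special case above (assuming the standard Unicode Cyrillic
# block): "а".."я" are contiguous code points, while "ё" (U+0451) and "Ё"
# (U+0401) sit outside that range. The uppercase range therefore omits "Ё",
# which is fine here since "Ё" never occurs in the dataset labels.
print(hex(ord('а')), hex(ord('я')), hex(ord('ё')))  # 0x430 0x44f 0x451
print(hex(ord('А')), hex(ord('Я')), hex(ord('Ё')))  # 0x410 0x42f 0x401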
# Creating alphabet from dataset
alphabet = set(counts.symbols)
print(f'Reference alphabet length: {len(ref_alphabet)}')
print(f'Actual alphabet length: {len(alphabet)}\n')
# difference between dataset and reference alphabet
diff_symbols = alphabet - ref_alphabet
diff_counts = counts[counts.symbols.isin(diff_symbols)]
print(f'Non-ordinary symbols: {diff_symbols}')
print(f'Number of labels with non-ordinary symbols: {diff_counts.index.unique().shape[0]}')
print(f'Percentage of labels with non-ordinary symbols: {round(diff_counts.index.unique().shape[0]/df.shape[0]*100, 2)}%')
Reference alphabet length: 97
Actual alphabet length: 83

Non-ordinary symbols: {'қ', '–', 'Ө', 'o', '—', 'H', '…', 'ғ', 'ө', 'Қ', 'Ү'}
Number of labels with non-ordinary symbols: 815
Percentage of labels with non-ordinary symbols: 1.25%
fig, axes = subplots(len(diff_symbols), 1)
for sym, ax in zip(diff_symbols, axes):
ind = counts[counts.symbols == sym].index[0]
read_image(ind + '.jpg', ax, df.loc[ind].description + f' ({sym})')
tight_layout()
pd.options.display.max_rows = 100
df.loc[counts[counts.symbols.isin(diff_symbols)].index.drop_duplicates()].drop_duplicates('description')
 | width | height | description | description_length |
---|---|---|---|---|
0_46_1 | 530 | 70 | А встретятся – | 14 |
0_46_46 | 502 | 81 | А, встретятся – | 15 |
0_9_0 | 704 | 66 | Кем? Кем? Волкoм? | 17 |
0_9_15 | 734 | 43 | Кем? Кем? Волкoм | 16 |
0_9_16 | 682 | 65 | кем? Кем? Волкoв? | 17 |
0_9_28_ | 869 | 59 | Кем? Кем? волкoм? | 17 |
0_9_626 | 712 | 76 | Кем? Чем? Волкoм? | 17 |
10_0_107 | 377 | 112 | Қазахстан | 9 |
10_0_126 | 303 | 85 | Қазақстан | 9 |
10_10_123 | 423 | 106 | Карағанда | 9 |
10_11_123 | 412 | 73 | Қостанай | 8 |
10_12_107 | 371 | 105 | Қызылорда | 9 |
10_19_418 | 324 | 116 | Көкшетау | 8 |
10_31_88 | 615 | 107 | ҚАРАГАНДИНСКАЯ | 14 |
10_32_88 | 483 | 106 | Қостанайская | 12 |
10_33_111 | 542 | 94 | Қызылординская | 14 |
10_35_88 | 658 | 101 | Северо-Қазахстанская | 20 |
10_7_12 | 208 | 75 | Ақтау | 5 |
10_7_278 | 216 | 69 | АҚТАУ | 5 |
10_8_123 | 228 | 78 | Ақтобе | 6 |
10_8_259 | 193 | 44 | АКТӨБЕ | 6 |
10_8_292 | 211 | 66 | Актөбе | 6 |
10_8_88 | 233 | 96 | Ақтөбе | 6 |
11_45_12 | 323 | 67 | Аркалық | 7 |
12_17_80 | 355 | 56 | Қаражал | 7 |
13_17_138 | 521 | 66 | Hоводолинский | 13 |
13_17_139 | 334 | 75 | Hоводолинск | 11 |
13_17_141_ | 483 | 117 | Hоводомнский | 12 |
13_17_156 | 520 | 46 | Hоводолинскии | 13 |
13_20_140 | 166 | 70 | Ақсу | 4 |
13_25_180 | 335 | 67 | Балпық | 6 |
13_27_202 | 318 | 76 | Бестөбе | 7 |
13_45_171 | 258 | 52 | Мақат | 5 |
13_48_156 | 483 | 46 | Өтеген батыра | 13 |
13_4_156 | 249 | 58 | Үштобе | 6 |
2_45_133 | 624 | 96 | святых видений – | 16 |
3_50_185 | 257 | 66 | душои… | 6 |
3_50_188 | 339 | 88 | душой… | 6 |
3_50_238 | 315 | 92 | душой…. | 7 |
3_50_706 | 282 | 72 | дущой… | 6 |
4_12_250 | 453 | 76 | Глушь и снег… | 13 |
4_12_254 | 588 | 78 | Глуш и снег… | 12 |
4_12_292 | 438 | 83 | Глушь снег… | 11 |
4_14_252 | 578 | 78 | Скучно, грустно… | 16 |
4_36_246 | 694 | 92 | То сердечная тоска… | 19 |
4_39_248 | 699 | 108 | Попадаются одне… | 16 |
5_0_314 | 687 | 48 | Прошло пять лет,— | 17 |
5_0_339 | 573 | 52 | прошло пять лет,— | 17 |
5_10_311 | 501 | 67 | Где выл пожар — | 15 |
5_14_456 | 612 | 87 | Взывали к мцению — | 18 |
5_34_305 | 331 | 59 | гремел — | 8 |
5_41_307 | 612 | 61 | ныло от разлуки,— | 17 |
6_34_362 | 592 | 54 | проклятым роем… | 15 |
6_34_407 | 383 | 62 | проклятьем роем… | 16 |
6_35_362 | 691 | 58 | терзали жизнь мою… | 18 |
6_38_362 | 205 | 62 | свою… | 5 |
6_44_362 | 497 | 50 | нежная, нашла… | 14 |
6_44_363 | 737 | 64 | нежная, нашла…. | 15 |
6_44_780 | 597 | 43 | нежноя, нашла… | 14 |
6_47_363 | 775 | 68 | в сырую ночь ушла… | 18 |
7_47_420 | 401 | 63 | за веком — | 10 |
7_47_426 | 398 | 52 | ЗА ВЕКОМ — | 10 |
7_47_453 | 408 | 103 | за Веком — | 10 |
7_48_420 | 339 | 77 | Господь? — | 10 |
7_48_457 | 271 | 118 | Господ? — | 9 |
# Replace Latin homoglyphs with their Cyrillic counterparts ('o' -> 'о', 'H' -> 'Н')
df.description = df.description.str.replace('o', 'о').str.replace('H', 'Н')
# Normalize dashes and the ellipsis character to plain ASCII equivalents
df.description = df.description.str.replace('–', '-').str.replace('—', '-').str.replace('…', '...')
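# A quick look at why the homoglyph replacement matters: Latin 'o'/'H' are
# visual twins of Cyrillic 'о'/'Н' but different code points, so without the
# fix they would count as separate alphabet symbols.
print(ord('o'), ord('о'))  # 111 1086
print(ord('H'), ord('Н'))  # 72 1053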
counts = counts_to_df(df, 'description')
counts_dict = counts.set_index('symbols')['counts'].to_dict()
kazakh_symbols = set(counts_dict) - ref_alphabet
kazakh_counts = counts[counts.symbols.isin(kazakh_symbols)]
print(f'Kazakh symbols: {kazakh_symbols}')
print(f'Number of Kazakh labels: {kazakh_counts.index.unique().shape[0]}')
print(f'Percentage of Kazakh labels: {round(kazakh_counts.index.unique().shape[0]/df.shape[0]*100, 2)}%')
Kazakh symbols: {'қ', 'Ө', 'ғ', 'ө', 'Қ', 'Ү'}
Number of Kazakh labels: 98
Percentage of Kazakh labels: 0.15%
df = df.drop(counts[counts.symbols.isin(kazakh_symbols)].index.drop_duplicates(), axis=0)
df
 | width | height | description | description_length |
---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 12 |
0_0_1 | 494 | 65 | Шёл человек | 11 |
0_0_10 | 489 | 73 | Шёл человек | 11 |
0_0_11 | 406 | 46 | Шёл человек. | 12 |
0_0_12 | 379 | 76 | Шёл человек | 11 |
... | ... | ... | ... | ... |
9_9_875 | 543 | 94 | Вид постоялого | 14 |
9_9_877 | 462 | 73 | Вид постоялого | 14 |
9_9_878 | 595 | 83 | Вид постоялого | 14 |
9_9_879 | 532 | 72 | Вид постоялого | 14 |
9_9_880 | 538 | 72 | Вид постоялого | 14 |
64845 rows × 4 columns
fig, axes = subplots(2, 1, figsize=(15, 15))
tmp = counts_to_df(df)
tmp = tmp[~tmp.symbols.isin(punctuation)].drop_duplicates('symbols')
# Merge upper- and lowercase counts per letter
letter_counts = {}
for _, (sym, cnt) in tmp.iterrows():
    letter_counts[sym.lower()] = letter_counts.get(sym.lower(), 0) + cnt
tmp = pd.DataFrame(letter_counts, index=['counts']
                   ).T.reset_index().sort_values('counts', ascending=False)
sns.barplot(data=tmp, x='index', y='counts', ax=axes[0], color=color)
axes[0].set(xlabel='letter', ylabel='frequency')
axes[0].set_title('Dataset alphabet')
# Reference Russian letter frequencies; columns: 'Буква' = letter, 'Частотность' = frequency
ls = pd.read_csv(os.path.join('..', 'metadata', 'alphabet.tsv'), sep='\t',
                 index_col=1).sort_values('Частотность', ascending=False)
sns.barplot(data=ls, x='Буква', y='Частотность', ax=axes[1], color=color)
axes[1].set(xlabel='letter', ylabel='frequency')
axes[1].set_title('Russian alphabet')
tight_layout()
plt.figure(figsize=(15, 8))
ax = sns.kdeplot(df.description_length, bw=0.3)
ax.set(xlabel='description length', ylabel='density', title='Length distribution of text');
def print_imgs_with_hist(df, size=5, width=20, height=25):
    '''Show "size" random images next to their per-channel color histograms'''
    num_cols = 2
    _, ax = plt.subplots(size, num_cols, figsize=(width, height))
    i = 0
    for path, label in df.sample(size).iterrows():
        path = os.path.join(img_path, path + '.jpg')
        img = cv2.imread(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ax[i // num_cols, i % num_cols].imshow(img)
        ax[i // num_cols, i % num_cols].set_title(label.description)
        color = ('r', 'g', 'b')  # channel order after the BGR -> RGB conversion
        i += 1
        for j, col in enumerate(color):
            histr = cv2.calcHist([img], [j], None, [256], [0, 256])
            ax[i // num_cols, i % num_cols].plot(histr, color=col)
            ax[i // num_cols, i % num_cols].set_xlabel('color value')
            ax[i // num_cols, i % num_cols].set_ylabel('pixel count')
            ax[i // num_cols, i % num_cols].set_xlim([0, 255])
            ax[i // num_cols, i % num_cols].set_ylim([0, 2000])
        i += 1
    plt.show()
print('Color format: RGB')
print_imgs_with_hist(df[['description']])
Color format: RGB
plt.figure(figsize=(15, 8))
ax = sns.scatterplot(x=df.width, y=df.height, alpha=0.4)
ax.set(xlabel='width', ylabel='height', title='Size distribution of pictures');
print(f"width: max = {df.width.max()}, min = {df.width.min()}, mean = {df.width.mean()}")
print(f"height: max = {df.height.max()}, min = {df.height.min()}, mean = {df.height.mean()}\n")
mheight = df.height.value_counts().sort_values(ascending=False)
mwidth = df.width.value_counts().sort_values(ascending=False)
print(f'most common widths (of {mwidth.shape[0]} distinct values):\nwidth\tcount\n{mwidth.head(10)}\nand their mean = {mwidth.head(10).index.to_series().mean()}\n')
print(f'most common heights (of {mheight.shape[0]} distinct values):\nheight\tcount\n{mheight.head(10)}\nand their mean = {mheight.head(10).index.to_series().mean()}')
width: max = 1697, min = 44, mean = 443.46174724342666
height: max = 150, min = 14, mean = 75.69303724265556

most common widths (of 987 distinct values):
width	count
349    166
307    164
310    162
363    157
330    157
358    153
312    151
365    151
317    150
442    148
Name: width, dtype: int64
and their mean = 345.3

most common heights (of 131 distinct values):
height	count
75    1524
71    1507
72    1505
74    1498
73    1495
79    1470
70    1451
68    1444
76    1439
78    1414
Name: height, dtype: int64
and their mean = 73.6
fig, axes = subplots(4, 1, figsize=(10, 15))
df_list = [df[df.height == mheight.index[0]],
df[df.height == mheight.index[-1]],
df[df.width == mwidth.index[0]],
df[df.width == mwidth.index[-1]]]
titles = ['most common height example',
'least common height example',
'most common width example',
'least common width example']
for tmp, title, ax in zip(df_list, titles, axes):
    read_image(tmp.index[0] + '.jpg', ax, title=title + f' ({tmp.description.iloc[0]})')
    ax.axis('on')
tight_layout()
fig, axes = subplots(6, 1, figsize=(10, 15))
df_list = [df[df.height == df.height.max()],
df[df.height == df.height.min()],
df[df.width == df.width.max()],
df[df.width == df.width.min()],
df[df.description.apply(len) == df.description.apply(len).max()],
df[df.description.apply(len) == df.description.apply(len).min()]]
titles = ['max height example',
          'min height example',
          'max width example',
          'min width example',
          'max len description',
          'min len description']
for tmp, title, ax in zip(df_list, titles, axes):
    read_image(tmp.index[0] + '.jpg', ax, title=title + f' ({tmp.description.iloc[0]})')
    ax.axis('on')
tight_layout()
img_height, img_width = 100, 600
df = df[(df.width <= img_width) & (df.height <= img_height)]
max_length = df.description.str.len().max()
print(max_length)
df
22
 | width | height | description | description_length |
---|---|---|---|---|
0_0_0 | 495 | 64 | Шёл человек. | 12 |
0_0_1 | 494 | 65 | Шёл человек | 11 |
0_0_10 | 489 | 73 | Шёл человек | 11 |
0_0_11 | 406 | 46 | Шёл человек. | 12 |
0_0_12 | 379 | 76 | Шёл человек | 11 |
... | ... | ... | ... | ... |
9_9_875 | 543 | 94 | Вид постоялого | 14 |
9_9_877 | 462 | 73 | Вид постоялого | 14 |
9_9_878 | 595 | 83 | Вид постоялого | 14 |
9_9_879 | 532 | 72 | Вид постоялого | 14 |
9_9_880 | 538 | 72 | Вид постоялого | 14 |
48419 rows × 4 columns
# Indicator frame: a cell is 1.0 if the symbol occurs in that picture's label, otherwise 0
counts = counts_to_df(df, 'description')
counts.counts = 1
splitter = counts.reset_index().drop_duplicates().pivot(index='index', columns='symbols').fillna(0)
splitter
(column header: top level 'counts' spans all symbol columns; rows indexed by image name)
symbols | ! | ( | ) | , | - | . | : | ; | ? | А | ... | ч | ш | щ | ъ | ы | ь | э | ю | я | ё |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0_0_0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
0_0_12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9_9_875 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_877 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_878 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_879 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
9_9_880 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
48419 rows × 72 columns
# Treat the splitter df as a multilabel class signature so we can easily split the original df into train and test
train, test, _, ls = train_test_split(df, splitter, shuffle=True,
test_size=0.15, random_state=12)
# Then split test into the final test (~10% of all data) and val (~5%, i.e. 0.33 of the 15%) dfs
test, val, _, _ = train_test_split(test, ls, shuffle=True,
test_size=0.33, random_state=17)
train_counts = counts_to_df(train, 'description')
test_counts = counts_to_df(test, 'description')
val_counts = counts_to_df(val, 'description')
print('Set differences between the symbols present in train, test and val data\n')
print('train_counts - test_counts: ', set(train_counts.symbols) - set(test_counts.symbols))
print('train_counts - val_counts: ', set(train_counts.symbols) - set(val_counts.symbols))
print('test_counts - train_counts: ', set(test_counts.symbols) - set(train_counts.symbols))
print('test_counts - val_counts: ', set(test_counts.symbols) - set(val_counts.symbols))
print('val_counts - train_counts: ', set(val_counts.symbols) - set(train_counts.symbols))
print('val_counts - test_counts: ', set(val_counts.symbols) - set(test_counts.symbols))
# Plot frequencies of symbols in three new dataframes
fig, axes = subplots(3, 1, figsize=(15, 15))
for tmp, ax, name in zip((train_counts, test_counts, val_counts), axes, ['train', 'test', 'val']):
sns.barplot(data=tmp.sort_values('counts', ascending=False),
x='symbols', y='counts', ax=ax)
ax.set_title(name)
tight_layout()
Set differences between the symbols present in train, test and val data

train_counts - test_counts:  {')', '('}
train_counts - val_counts:  {'ъ', ')', '('}
test_counts - train_counts:  set()
test_counts - val_counts:  {'ъ'}
val_counts - train_counts:  set()
val_counts - test_counts:  set()
batch_size = 16
# Mapping characters to integers
counts = counts_to_df(df)
# Vocabulary: every dataset symbol plus space and '#' (used as the padding/blank token)
counts = counts[~counts.isin(['', ' '])].symbols.unique().tolist() + [' ', '#']
vocab = pd.Series(counts).str.encode('utf8')
char_to_num = layers.experimental.preprocessing.StringLookup(
vocabulary=vocab,
mask_token=None,
)
# Mapping integers back to original characters
num_to_char = layers.experimental.preprocessing.StringLookup(
vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True,
)
blank_index = char_to_num(tf.strings.unicode_split('#', input_encoding="UTF-8")).numpy()[0]
blank_index  # index of the '#' padding/blank token
74
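# Round-trip sanity check for the two lookup layers above (a small sketch;
# 'Шёл человек' is just an example string from the dataset labels):
example = tf.strings.unicode_split('Шёл человек', input_encoding='UTF-8')
encoded = char_to_num(example)
decoded = tf.strings.reduce_join(num_to_char(encoded)).numpy().decode('utf-8')
print(decoded)  # should print 'Шёл человек' back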
def encode_single_sample(img_path, label):
    """Process one (image path, label) pair for the tf dataset"""
    # 1. Read the file
    img = tf.io.read_file(img_path)
    # 2. Decode and convert to grayscale (the dataset images are JPEGs)
    img = tf.io.decode_jpeg(img, channels=1)
    # 3. Convert to float32 in the [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 4. Invert so the background is 0, pad/crop to the fixed size
    #    (padding adds zeros, i.e. background), then shift to [-0.5, 0.5]
    img = 1 - img
    img = tf.image.resize_with_crop_or_pad(img, img_height, img_width)
    img = 0.5 - img
    # 5. Transpose the image because we want the time
    #    dimension to correspond to the width of the image
    img = tf.transpose(img, perm=[1, 0, 2])
    # 6. Map the characters in the label to numbers and pad to max_length with the blank token
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    label = tf.pad(label, [[0, max_length - len(label)]], constant_values=blank_index)
    # 7. Return a dict as our model expects two inputs
    return {"image": img, "label": label}
def get_dataset(samples: pd.DataFrame, batch_size: int = batch_size,
                prefetch: int = tf.data.experimental.AUTOTUNE) -> tf.data.Dataset:
    """Create a tf dataset of {'image', 'label'} dicts from a metadata frame"""
    dataset = tf.data.Dataset.from_tensor_slices(
        (samples.index.to_series().apply(lambda x: os.path.join(img_path, x) + '.jpg').tolist(),
         samples.description.tolist())
    )
    dataset = (
        dataset.map(
            encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
        )
        .batch(batch_size)
        .prefetch(prefetch)
    )
    return dataset
def show_batch(batch, batch_size):
    """Utility function to imshow one batch"""
    _, ax = plt.subplots(batch_size, 1, figsize=(10, batch_size * 2))
    images = batch['image']
    labels = batch['label']
    for i in range(batch_size):
        # undo the [-0.5, 0.5] shift, then transpose back to (height, width)
        img = ((images[i] + 0.5) * 255).numpy().astype('uint8')
        label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode('utf-8').replace('#', '')
        ax[i].imshow(img[:, :, 0].T, cmap='gray')
        ax[i].set_title(label)
    tight_layout()
    plt.show()
ind = df.index.tolist()
# random.shuffle(ind)
ls = get_dataset(df.loc[ind].iloc[:16])
for batch in ls.take(1):
show_batch(batch, batch_size=batch_size)
paths = df.index.to_series().apply(lambda x: os.path.join(img_path, x) + '.jpg')
aug_1 = os.path.join(img_path, 'aug_1')
if not os.path.exists(aug_1):
os.mkdir(aug_1)
sometimes = lambda aug: iaa.Sometimes(0.5, aug)  # helper, unused below (the pipeline uses explicit iaa.Sometimes calls)
seq = iaa.Sequential(
[
iaa.Sometimes(0.1, iaa.GaussianBlur(3.0)),
iaa.Sometimes(0.1, iaa.AveragePooling(2)),
iaa.Sometimes(0.1, iaa.Emboss(alpha=(0.0, 1.0), strength=(0.75, 1.25))),
iaa.Sometimes(0.1, iaa.GammaContrast((0.5, 1.0))),
iaa.Invert(0.05, per_channel=True),
iaa.Sometimes(0.1, iaa.CoarseDropout((0.0, 0.05), size_percent=(0.02, 0.25))),
iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25),
iaa.PerspectiveTransform(scale=(0.02, 0.05)),
iaa.Sometimes(0.1, iaa.SaltAndPepper(0.05)),
],
random_order=True
)
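# Quick preview of the augmentation pipeline (a sketch): apply it to one image
# a few times and display the variants before committing to the full slow run.
sample = imageio.imread(paths.iloc[0])
variants = seq(images=[copy(sample) for _ in range(4)])
fig, axes = subplots(4, 1, figsize=(10, 8))
for im, ax in zip(variants, axes):
    ax.imshow(im)
    ax.axis('off')
tight_layout()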
for path in tqdm(paths):
    break  # remove this break to actually run the (slow) augmentation pass
    print('start')
    img = imageio.imread(path)
    images = [copy(img) for _ in range(30)]
    ls = seq(images=images)
    print(len(ls))
    for i in range(30):
        _, name = os.path.split(path)
        name = os.path.join(aug_1, f'{i}_aug_' + name)
        # imageio reads RGB while cv2.imwrite expects BGR, so convert back
        cv2.imwrite(name, cv2.cvtColor(ls[i], cv2.COLOR_RGB2BGR))
0%| | 0/48419 [00:00<?, ?it/s]
from preprocess import PreprocessFrame, Dataset, make_augments
img_width = 600
img_height = 100
# parameters of resized images
new_img_width = 300
new_img_height = 50
# default paths
WORKING_DIR = os.path.join('/home', 'htr')
ann_path = os.path.join(WORKING_DIR, 'HKR_Dataset_Words_Public', 'ann')
img_path = os.path.join(WORKING_DIR, 'HKR_Dataset_Words_Public', 'img')
# collect metadata
# meta_collect(ann_path, os.path.join(WORKING_DIR, 'metadata', 'metadata.tsv'))
# get preprocessed metadata dataframe
df = PreprocessFrame(metadata=os.path.join(WORKING_DIR, 'metadata', 'metadata.tsv'),
img_height=img_height, img_width=img_width)
print(df.shape)
# Make the augments (if they already exist, leave the next call commented out)
aug_df = None
# aug_df = make_augments(df=df, img_path=img_path, WORKING_DIR=WORKING_DIR,
# img_height=img_height, img_width=img_width)
# Derive the augments metadata dataframe from the original one if make_augments was not run
if not isinstance(aug_df, pd.DataFrame):
aug_df = df.copy()
aug_df.index = aug_df.index.to_series().apply(lambda x: os.path.join('aug_1', 'aug_' + x))
train, test, val = list(Dataset(df, aug_df=aug_df,
test_size=0.1,
val_size=0.05,
img_path=img_path,
img_height=img_height,
img_width=img_width,
new_img_height=new_img_height,
new_img_width=new_img_width,
WORKING_DIR=WORKING_DIR,
shuffle=True,
random_state=12))
train
(48419, 4)
<PrefetchDataset shapes: {image: (None, 300, 50, 1), label: (None, None)}, types: {image: tf.float32, label: tf.int64}>
for batch in train.take(1):
show_batch(batch, batch_size=batch_size)
for batch in test.take(1):
show_batch(batch, batch_size=batch_size)
for batch in val.take(1):
show_batch(batch, batch_size=batch_size)