# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
# Configure matplotlib to plot inline in IPython
%matplotlib inline
data_root = 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data' # Change me to store data elsewhere
array_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
filename_for_log_regr = data_root + '\\finalized_model_log_regr_500K_samples.sav'
url = 'https://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 5% change in download progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)
    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()
        last_percent_reported = percent
def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest_filename = os.path.join(data_root, filename)
    if force or not os.path.exists(dest_filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(dest_filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', dest_filename)
    else:
        raise Exception(
            'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
    return dest_filename
train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)
Found and verified D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large.tar.gz
Found and verified D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small.tar.gz
num_classes = 10
np.random.seed(133)
def maybe_extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(data_root)
        tar.close()
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large already present - Skipping extraction of D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large.tar.gz.
['D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\A', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\B', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\C', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\D', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\E', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\F', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\G', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\H', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\I', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_large\\J']
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small already present - Skipping extraction of D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small.tar.gz.
['D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\A', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\B', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\C', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\D', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\E', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\F', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\G', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\H', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\I', 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data\\notMNIST_small\\J']
image_size = 28 # Pixel width and height.
pixel_depth = 255.0 # Number of levels per pixel.
def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)  # all image files in the letter's folder
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)  # a 3D array: one 28x28 slice per image
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            # Normalize pixel values to approximately [-0.5, 0.5].
            image_data = (imageio.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except (IOError, ValueError) as e:
            print('Could not read:', image_file, ':', e, "- it's ok, skipping.")
    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))
    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
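# A quick check of the normalization above (a sketch, not part of the original run):
# a raw pixel value p in [0, 255] maps to (p - 127.5) / 255, i.e. roughly [-0.5, 0.5].
print((0 - pixel_depth / 2) / pixel_depth, (pixel_depth - pixel_depth / 2) / pixel_depth)  # -0.5 0.5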
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:  # for each letter's folder
        set_filename = folder + '.pickle'  # a pickle file named after the letter
        dataset_names.append(set_filename)  # collect the names to return
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)  # load the letter's folder into a 3D array
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)  # dump the 3D array to a file
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)
    return dataset_names
train_datasets = maybe_pickle(train_folders, 45000)  # pickles a 3D array for each letter in the train dataset
test_datasets = maybe_pickle(test_folders, 1800)  # pickles a 3D array for each letter in the test dataset
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\A.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\B.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\C.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\D.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\E.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\F.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\G.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\H.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\I.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_large\J.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\A.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\B.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\C.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\D.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\E.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\F.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\G.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\H.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\I.pickle already present - Skipping pickling.
D:\1_Workspaces\UNDER_VCS\github\1_ML_NN\python_with_math\data\notMNIST_small\J.pickle already present - Skipping pickling.
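# A quick sanity check (a sketch, not part of the original run): reload one of the
# pickles just created and display a random letter to verify the data looks right.
with open(train_datasets[0], 'rb') as f:  # the 'A' pickle
    sample_set = pickle.load(f)
plt.imshow(sample_set[np.random.randint(len(sample_set))], cmap='gray')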
# Create empty n-dimensional arrays for the data and labels.
def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels
# Merge all GIVEN pickle files into a single 3D array plus labels,
# split into a training part and an optional validation part.
def merge_datasets(pickle_files, train_size, valid_size=0):
    print('====================== merging =======================================')
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)  # empty arrays
    train_dataset, train_labels = make_arrays(train_size, image_size)  # empty arrays
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes
    print('valid_size ', valid_size, '\n')
    print('train_size ', train_size, '\n', 'num_classes', num_classes, '\n')
    print('vsize_per_class ', vsize_per_class, '\n', 'tsize_per_class', tsize_per_class, '\n\n')
    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # Shuffle the letters to get random validation and training sets.
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class
                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise
    # Note: returns 4 objects.
    return valid_dataset, valid_labels, train_dataset, train_labels
# SUBJECT TO CHANGE according to task 6
train_size = 200000
valid_size = 10000
test_size = 10000
# uses all 4 returned objects
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(train_datasets, train_size, valid_size)
# uses only the last 2 returned objects
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
====================== merging =======================================
valid_size  10000
train_size  200000  num_classes 10
vsize_per_class  1000  tsize_per_class 20000
====================== merging =======================================
valid_size  0
train_size  10000  num_classes 10
vsize_per_class  0  tsize_per_class 1000
Training: (200000, 28, 28) (200000,)
Validation: (10000, 28, 28) (10000,)
Testing: (10000, 28, 28) (10000,)
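# Note that merge_datasets fills each class block contiguously, so the labels come
# out ordered (all 0s, then all 1s, ...). A minimal sketch of shuffling data and
# labels in unison before training (a suggested step; the helper name is an assumption):
def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    return dataset[permutation, :, :], labels[permutation]
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)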
# TODO - instead of the steps above, just dump valid_dataset_reshaped once and reload it
# Reshape the validation dataset from (n, 28, 28) to (n, 784).
nsamples, nx, ny = valid_dataset.shape
valid_dataset_reshaped = valid_dataset.reshape((nsamples, nx * ny))
print(valid_dataset_reshaped.shape)
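# A minimal sketch for the TODO above, assuming a hypothetical file name
# (valid_dataset_reshaped.pickle): dump the reshaped validation set once,
# then reload it instead of re-running the merge/reshape steps.
reshaped_path = os.path.join(data_root, 'valid_dataset_reshaped.pickle')  # hypothetical path
with open(reshaped_path, 'wb') as f:
    pickle.dump((valid_dataset_reshaped, valid_labels), f, pickle.HIGHEST_PROTOCOL)
with open(reshaped_path, 'rb') as f:
    valid_dataset_reshaped, valid_labels = pickle.load(f)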
# Load the model from disk.
with open(filename_for_log_regr, 'rb') as f:
    loaded_model = pickle.load(f)
result = loaded_model.score(valid_dataset_reshaped, valid_labels)
print(result)
(10000, 784)
0.8273
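# For reference, a sketch of how a model like the one loaded above could have been
# trained and saved. This is an assumption based on the imports and the file name
# (finalized_model_log_regr_500K_samples.sav); the actual training run, sample count,
# and hyperparameters are not shown in this notebook. A scratch path is used here
# so the existing saved model is not overwritten.
model = LogisticRegression()
n_train = train_dataset.shape[0]
train_flat = train_dataset.reshape((n_train, image_size * image_size))
model.fit(train_flat, train_labels)  # can take a while on 200K samples
scratch_model_path = os.path.join(data_root, 'log_regr_scratch.sav')  # hypothetical path
with open(scratch_model_path, 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)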
# Reshape the test dataset the same way.
nsamples, nx, ny = test_dataset.shape
test_dataset_reshaped = test_dataset.reshape((nsamples, nx * ny))
print(test_dataset_reshaped.shape)
score = loaded_model.score(test_dataset_reshaped, test_labels)
print(score)
(10000, 784)
0.8949
# Read a real image of a letter that I prepared by hand:
letter_root_name = '\\my_letters\\my_H_28_28_again'
my_letter = data_root + letter_root_name + '.png'
my_letter_gray = data_root + letter_root_name + '_gray' + '.png'
image_data = imageio.imread(my_letter)
print(image_data.shape)
(28, 28, 4)
# Keep only the first channel of the RGBA image as a grayscale version.
imageio.imwrite(my_letter_gray, image_data[:, :, 0])
image_data_gray = imageio.imread(my_letter_gray).astype(float)
print(image_data_gray.shape)
(28, 28)
C:\Other_IT\Anaconda\lib\site-packages\imageio\core\util.py:104: UserWarning: Conversion from float64 to uint8, range [0.0, 255.0]
nx_img, ny_img = image_data_gray.shape
reshaped_image_gray = image_data_gray.reshape(nx_img * ny_img)
plt.imshow(image_data_gray)
print(reshaped_image_gray.shape)
(784,)
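# Note: load_letter normalized the training pixels to roughly [-0.5, 0.5], while
# image_data_gray holds raw values in [0, 255]. A sketch of applying the same
# normalization (a suggested step, not part of the original run); the normalized
# array could be fed to predict below in place of the raw values.
normalized_image_gray = (reshaped_image_gray - pixel_depth / 2) / pixel_depth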
reshaped_2d_array = reshaped_image_gray.reshape(1, 784)  # one sample with 784 features
print(reshaped_2d_array.shape)
predicted = loaded_model.predict(reshaped_2d_array)
index_of_letter = predicted[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])
(1, 784)
index_of_letter: 7
letter is: h