#!/usr/bin/env python
# coding: utf-8

# In[1]:

# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

url = 'https://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
data_root = 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data'  # Change me to store data elsewhere


def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 5% change in download progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

        last_percent_reported = percent


def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest_filename = os.path.join(data_root, filename)
    if force or not os.path.exists(dest_filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(dest_filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', dest_filename)
    else:
        raise Exception(
            'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
    return dest_filename


train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)


# In[3]:

num_classes = 10
np.random.seed(133)


def maybe_extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(data_root)
        tar.close()
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders


train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)


# In[4]:

Image(filename=data_root + '\\notMNIST_large\\A\\a2F6b28udHRm.png')


# In[5]:

Image(filename=data_root + '\\notMNIST_large\\A\\a3JvZWdlciAwNl81NS50dGY=.png')


# In[6]:

image_size = 28      # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.
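# Quick sanity check (illustrative only, not part of the original pipeline):
# the normalization used by load_letter() below, (x - pixel_depth / 2) / pixel_depth,
# maps raw pixel values from [0, 255] to roughly [-0.5, 0.5].
_example_pixels = np.array([0.0, 127.5, 255.0])
print((_example_pixels - pixel_depth / 2) / pixel_depth)  # -> [-0.5  0.   0.5]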
def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)  # list all image files in the folder
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)  # pre-allocate a 3-D array: (num_images, 28, 28)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (imageio.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except (IOError, ValueError) as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset


def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:            # for each letter's folder
        set_filename = folder + '.pickle'  # the pickle file gets the letter's name
        dataset_names.append(set_filename)  # collect the file names to return
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)  # load the letter's images into a 3-D array
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)  # dump the 3-D array to a file
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names


train_datasets = maybe_pickle(train_folders, 45000)  # pickles a 3-D array for each letter of the train dataset
test_datasets = maybe_pickle(test_folders, 1800)     # pickles a 3-D array for each letter of the test dataset


# In[7]:

# Show the inner structure of the dataset.
a_pickle_filename = train_datasets[0]
print(a_pickle_filename)

with open(a_pickle_filename, 'rb') as f:
    _3Darray_A = pickle.load(f)

print('============================================= about the n-dimensional array for letter A =============================')
print(_3Darray_A.ndim, '\t', _3Darray_A.shape)
print('============================================= a single pixel of the image at index 0 =================================')
print(_3Darray_A[0][0][4])
print('============================================= row [0] of pixels of the image at index 0 ==============================')
print(_3Darray_A[0][0])
print('============================================= row [1] of pixels of the image at index 0 ==============================')
print(_3Darray_A[0][1])
print('============================================= image at index 0 =======================================================')
print(_3Darray_A[0].shape, '\n', _3Darray_A[0])  # 28 * 28 features
print('============================================= full set of images =====================================================')
# print(_3Darray_A)


# In[8]:

# Write the first pictures from the A pickle to a folder.
for i in range(10):
    x = _3Darray_A[i]
    print(i, " = ", x.shape)
    imageio.imwrite(data_root + '\\imageio\\pickle_A_' + str(i) + '.png', x[:])
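# Illustrative sketch only (not part of the original flow): the same export is
# repeated below for letter B, so it could be wrapped in a small helper.
# It assumes the '<data_root>\\imageio' output folder already exists.
def export_first_images(pickle_filename, letter, count=10):
    with open(pickle_filename, 'rb') as f:
        letter_set = pickle.load(f)
    for i in range(count):
        imageio.imwrite(data_root + '\\imageio\\pickle_' + letter + '_' + str(i) + '.png',
                        letter_set[i])

# Example: export_first_images(train_datasets[1], 'B')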
# Write the first pictures from the B pickle to a folder.
b_pickle_filename = train_datasets[1]
print(b_pickle_filename)

with open(b_pickle_filename, 'rb') as f:
    _3Darray_B = pickle.load(f)

for i in range(10):
    x = _3Darray_B[i]
    print(i, " = ", x.shape)
    imageio.imwrite(data_root + '\\imageio\\pickle_B_' + str(i) + '.png', x[:])


# In[9]:

# Display a picture straight from the pickle:
y = _3Darray_A[0]
plt.imshow(y)


# In[10]:

# The same image read from disk, normalized exactly as it was when it was written
# to the 3-D array. Notice it looks the same as the previous one.
image_data = (imageio.imread(data_root + '\\notMNIST_large\\A\\a29ydW5pc2hpLnR0Zg==.png').astype(float) -
              pixel_depth / 2) / pixel_depth
plt.imshow(image_data)


# In[11]:

# Create empty n-dimensional arrays for a dataset and its labels.
def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels


# Merge all GIVEN pickle files into a single 3-D array (plus labels).
def merge_datasets(pickle_files, train_size, valid_size=0):
    print('====================== merging =======================================')
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)  # empty arrays
    train_dataset, train_labels = make_arrays(train_size, image_size)  # empty arrays
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    print('valid_size ', valid_size, '\n')
    print('train_size ', train_size, '\n', 'num_classes', num_classes, '\n')
    print('vsize_per_class ', vsize_per_class, '\n', 'tsize_per_class', tsize_per_class, '\n\n')

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    # Note: it returns 4 objects!
    return valid_dataset, valid_labels, train_dataset, train_labels
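# Minimal sketch (illustrative only) of the per-class slicing done above: each
# shuffled letter_set is split so that the first vsize_per_class images go to
# validation and the next tsize_per_class go to training.
_demo_valid_size, _demo_train_size, _demo_classes = 10000, 200000, 10
_v_per_class = _demo_valid_size // _demo_classes   # 1000
_t_per_class = _demo_train_size // _demo_classes   # 20000
print('validation slice per letter: [0:%d]' % _v_per_class)
print('training slice per letter:   [%d:%d]' % (_v_per_class, _v_per_class + _t_per_class))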
# SUBJECT TO CHANGE according to task 6
train_size = 200000
valid_size = 10000
test_size = 10000

# Here all 4 returned objects are used...
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
    train_datasets, train_size, valid_size)
# ...and here only the last 2 are used.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)


# In[12]:

def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation, :, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels


train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)


# In[13]:

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)


# In[14]:

# Dump the big arrays to one file. Note: each run produces RANDOMLY SHUFFLED arrays,
# so the dump is done unconditionally here.
pickle_file = os.path.join(data_root, 'notMNIST.pickle')


def dumpBigShuffledArray():
    try:
        f = open(pickle_file, 'wb')
        save = {
            'train_dataset': train_dataset,
            'train_labels': train_labels,
            'valid_dataset': valid_dataset,
            'valid_labels': valid_labels,
            'test_dataset': test_dataset,
            'test_labels': test_labels,
        }
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
        f.close()
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        raise


# In[15]:

dumpBigShuffledArray()  # if you want to skip this step, just jump to the next cell


# In[16]:

# You can check whether the file exists.
# This cell is safe to re-run because it checks for the file first.
if os.path.exists(pickle_file):
    statinfo = os.stat(pickle_file)
    print('Compressed pickle size:', statinfo.st_size)
    print('%s already present - Skipping pickling.' % pickle_file)
else:
    dumpBigShuffledArray()


# In[17]:

# Check a picture from the big 3-D array. IT WILL BE A DIFFERENT PICTURE EVERY TIME,
# because the arrays are reshuffled on each run.
with open(pickle_file, 'rb') as f:
    _3Darray_BIG = pickle.load(f)

train_dataset_my = _3Darray_BIG['train_dataset']
img_overlapped = train_dataset_my[10]
plt.imshow(img_overlapped)  # HINT - use it as a picture to predict


# In[18]:

# log_regr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
#                               intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear',
#                               max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
log_regr = LogisticRegression(solver='sag', n_jobs=1)
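# In[ ]:

# Why the reshape in the next cells (illustrative sketch only): scikit-learn
# estimators expect a 2-D matrix of shape (n_samples, n_features), so every
# 28x28 image has to be flattened into a 784-element row.
_demo_images = np.zeros((3, image_size, image_size), dtype=np.float32)  # 3 fake images
print(_demo_images.reshape(3, image_size * image_size).shape)           # -> (3, 784)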
# In[19]:

# Reshape the 3-D array into a 2-D array with 784 features per sample.
nsamples, nx, ny = train_dataset.shape
train_dataset_reshaped = train_dataset.reshape((nsamples, nx * ny))  # image 28x28 pixels -> 784 pixels
print(train_dataset_reshaped.shape)


# In[20]:

nsamples_t, nx_t, ny_t = test_dataset.shape
test_dataset_reshaped = test_dataset.reshape((nsamples_t, nx_t * ny_t))
print(test_dataset_reshaped.shape)


# In[21]:

# Training can take a long time on the full dataset.
log_regr.fit(train_dataset_reshaped, train_labels)
print("Logistic regression model was trained")
# TODO: measure time and other resources


# In[23]:

# Now we can check how well the model was trained.
# This list maps a predicted class index back to a letter.
array_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# Reshape the validation dataset.
nsamples, nx, ny = valid_dataset.shape
valid_dataset_reshaped = valid_dataset.reshape((nsamples, nx * ny))
print(valid_dataset_reshaped.shape)

score = log_regr.score(valid_dataset_reshaped, valid_labels)
print(score)
# For the default configuration of LogisticRegression:
#   0.804 for 10000 samples, 0.8273 for 200000 samples, 0.8273 for 500000 samples
# For LogisticRegression(solver='sag'): 0.8244 for 200000 samples


# In[25]:

nsamples_t, nx_t, ny_t = test_dataset.shape
test_dataset_reshaped = test_dataset.reshape((nsamples_t, nx_t * ny_t))
print(test_dataset_reshaped.shape)

score = log_regr.score(test_dataset_reshaped, test_labels)
print(score)


# In[26]:

# Read a real image of the letter A and reshape it for prediction.
# Apply the same normalization that load_letter() used, so the input matches
# what the model was trained on.
image_data_1 = (imageio.imread(data_root + '\\notMNIST_large\\A\\a29ydW5pc2hpLnR0Zg==.png').astype(float) -
                pixel_depth / 2) / pixel_depth
nx_img, ny_img = image_data_1.shape
reshaped_image = image_data_1.reshape(nx_img * ny_img)  # reshape the 2-D array to a 1-D array
plt.imshow(image_data_1)
print(reshaped_image.shape)


# In[27]:

res = reshaped_image.reshape(1, 784)  # a single sample must be a 2-D array of shape (1, 784)
predicted = log_regr.predict(res)
index_of_letter = predicted[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[28]:

# Read a real image of the letter B and reshape it for prediction (same normalization as above).
image_data_2 = (imageio.imread(data_root + '\\imageio\\pickle_B_9.png').astype(float) -
                pixel_depth / 2) / pixel_depth
nx_img, ny_img = image_data_2.shape
reshaped_image_2 = image_data_2.reshape(nx_img * ny_img)
plt.imshow(image_data_2)
print(reshaped_image_2.shape)


# In[29]:

res_2 = reshaped_image_2.reshape(1, 784)
predicted_2 = log_regr.predict(res_2)
index_of_letter = predicted_2[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[30]:

# Check it with img_overlapped (already normalized, it comes straight from the big pickle).
res_3 = img_overlapped.reshape(1, 784)
predicted_3 = log_regr.predict(res_3)
index_of_letter = predicted_3[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[31]:

# Check it with another image from the dataset (letter F), normalized the same way.
image_data_4 = (imageio.imread(data_root + '\\notMNIST_large\\F\\a3JvZWdlciAwNV81NS50dGY=.png').astype(float) -
                pixel_depth / 2) / pixel_depth
nx_img4, ny_img4 = image_data_4.shape
reshaped_image_4 = image_data_4.reshape(nx_img4 * ny_img4)
plt.imshow(image_data_4)
print(reshaped_image_4.shape)


# In[32]:

res_4 = reshaped_image_4.reshape(1, 784)
predicted_4 = log_regr.predict(res_4)
index_of_letter = predicted_4[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[33]:

# Save the model to disk.
filename_for_log_regr = data_root + '\\finalized_model_log_regr_200K_samples_solver_sag.sav'
pickle.dump(log_regr, open(filename_for_log_regr, 'wb'))
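# In[ ]:

# Illustrative sketch only (not part of the original pipeline): the
# read -> normalize -> flatten -> predict steps above are repeated for every
# single image, including the hand-made letter further below, so they could be
# wrapped in a small helper. It assumes the trained `log_regr`, `array_letters`,
# `image_size` and `pixel_depth` defined earlier in this notebook.
def predict_letter_from_file(path, model=None):
    model = model if model is not None else log_regr
    img = (imageio.imread(path).astype(float) - pixel_depth / 2) / pixel_depth
    assert img.shape == (image_size, image_size), 'expected a 28x28 grayscale image'
    index = model.predict(img.reshape(1, image_size * image_size))[0]
    return array_letters[index]

# Example (uses a file path from the cells above):
# print(predict_letter_from_file(data_root + '\\notMNIST_large\\A\\a29ydW5pc2hpLnR0Zg==.png'))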
# In[34]:

# Load the model from disk.
loaded_model = pickle.load(open(filename_for_log_regr, 'rb'))
result = loaded_model.score(valid_dataset_reshaped, valid_labels)
print(result)


# In[39]:

# Read a real image of a letter that I prepared myself:
letter_root_name = '\\my_letters\\my_H_28_28_again'
my_letter = data_root + letter_root_name + '.png'
my_letter_gray = data_root + letter_root_name + '_gray' + '.png'

image_data = imageio.imread(my_letter)
print(image_data.shape)


# In[40]:

# Keep only one channel to get a 28x28 grayscale image, save it and read it back.
imageio.imwrite(my_letter_gray, image_data[:, :, 0])
image_data_gray = imageio.imread(my_letter_gray).astype(float)
print(image_data_gray.shape)


# In[41]:

nx_img, ny_img = image_data_gray.shape
reshaped_image_gray = image_data_gray.reshape(nx_img * ny_img)
plt.imshow(image_data_gray)
print(reshaped_image_gray.shape)


# In[42]:

# Normalize the same way as the training data before predicting.
reshaped_2d_array = ((reshaped_image_gray - pixel_depth / 2) / pixel_depth).reshape(1, 784)
print(reshaped_2d_array.shape)
predicted = log_regr.predict(reshaped_2d_array)
index_of_letter = predicted[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[ ]:
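# Optional follow-up sketch (illustrative only, not part of the original notebook):
# a per-class view of the test-set errors with scikit-learn's confusion matrix.
# It assumes `log_regr`, `test_dataset_reshaped`, `test_labels` and
# `array_letters` from the cells above.
from sklearn.metrics import confusion_matrix

test_predictions = log_regr.predict(test_dataset_reshaped)
cm = confusion_matrix(test_labels, test_predictions)
print(array_letters)
print(cm)  # rows = true letters, columns = predicted letters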