#!/usr/bin/env python
# coding: utf-8

# In[1]:

# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

url = 'https://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
data_root = 'D:\\1_Workspaces\\UNDER_VCS\\github\\1_ML_NN\\python_with_math\\data'  # Change me to store data elsewhere


def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 5% change in download progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

        last_percent_reported = percent


def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest_filename = os.path.join(data_root, filename)
    if force or not os.path.exists(dest_filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(dest_filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', dest_filename)
    else:
        raise Exception(
            'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
    return dest_filename


train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)


# In[3]:

num_classes = 10
np.random.seed(133)


def maybe_extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(data_root)
        tar.close()
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders


train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)


# In[4]:

Image(filename=data_root + '\\notMNIST_large\\A\\a2F6b28udHRm.png')


# In[5]:

Image(filename=data_root + '\\notMNIST_large\\A\\a3JvZWdlciAwNl81NS50dGY=.png')


# In[6]:

image_size = 28      # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.
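# Quick sanity check (illustrative only, not part of the original pipeline):
# the normalization used by load_letter() below, (x - pixel_depth / 2) / pixel_depth,
# maps raw pixel values from [0, 255] to roughly [-0.5, 0.5].
_example_pixels = np.array([0.0, 127.5, 255.0])
print((_example_pixels - pixel_depth / 2) / pixel_depth)  # -> [-0.5  0.   0.5]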
def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)  # list all image files in the folder
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)  # pre-allocate a 3-D array: (num_images, 28, 28)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (imageio.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except (IOError, ValueError) as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset


def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:            # for each letter's folder
        set_filename = folder + '.pickle'  # the pickle file gets the letter's name
        dataset_names.append(set_filename)  # collect the file names to return
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)  # load the letter's images into a 3-D array
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)  # dump the 3-D array to a file
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names


train_datasets = maybe_pickle(train_folders, 45000)  # pickles a 3-D array for each letter of the train dataset
test_datasets = maybe_pickle(test_folders, 1800)     # pickles a 3-D array for each letter of the test dataset


# In[7]:

# Show the inner structure of the dataset.
a_pickle_filename = train_datasets[0]
print(a_pickle_filename)

with open(a_pickle_filename, 'rb') as f:
    _3Darray_A = pickle.load(f)

print('============================================= about the n-dimensional array for letter A =============================')
print(_3Darray_A.ndim, '\t', _3Darray_A.shape)
print('============================================= a single pixel of the image at index 0 =================================')
print(_3Darray_A[0][0][4])
print('============================================= row [0] of pixels of the image at index 0 ==============================')
print(_3Darray_A[0][0])
print('============================================= row [1] of pixels of the image at index 0 ==============================')
print(_3Darray_A[0][1])
print('============================================= image at index 0 =======================================================')
print(_3Darray_A[0].shape, '\n', _3Darray_A[0])  # 28 * 28 features
print('============================================= full set of images =====================================================')
# print(_3Darray_A)


# In[8]:

# Write the first pictures from the A pickle to a folder.
for i in range(10):
    x = _3Darray_A[i]
    print(i, " = ", x.shape)
    imageio.imwrite(data_root + '\\imageio\\pickle_A_' + str(i) + '.png', x[:])
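# Illustrative sketch only (not part of the original flow): the same export is
# repeated below for letter B, so it could be wrapped in a small helper.
# It assumes the '<data_root>\\imageio' output folder already exists.
def export_first_images(pickle_filename, letter, count=10):
    with open(pickle_filename, 'rb') as f:
        letter_set = pickle.load(f)
    for i in range(count):
        imageio.imwrite(data_root + '\\imageio\\pickle_' + letter + '_' + str(i) + '.png',
                        letter_set[i])

# Example: export_first_images(train_datasets[1], 'B')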
# Write the first pictures from the B pickle to a folder.
b_pickle_filename = train_datasets[1]
print(b_pickle_filename)

with open(b_pickle_filename, 'rb') as f:
    _3Darray_B = pickle.load(f)

for i in range(10):
    x = _3Darray_B[i]
    print(i, " = ", x.shape)
    imageio.imwrite(data_root + '\\imageio\\pickle_B_' + str(i) + '.png', x[:])


# In[9]:

# Display a picture straight from the pickle:
y = _3Darray_A[0]
plt.imshow(y)


# In[10]:

# The same image read from disk, normalized exactly as it was when it was written
# to the 3-D array. Notice it looks the same as the previous one.
image_data = (imageio.imread(data_root + '\\notMNIST_large\\A\\a29ydW5pc2hpLnR0Zg==.png').astype(float) -
              pixel_depth / 2) / pixel_depth
plt.imshow(image_data)


# In[11]:

# Create empty n-dimensional arrays for a dataset and its labels.
def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels


# Merge all GIVEN pickle files into a single 3-D array (plus labels).
def merge_datasets(pickle_files, train_size, valid_size=0):
    print('====================== merging =======================================')
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)  # empty arrays
    train_dataset, train_labels = make_arrays(train_size, image_size)  # empty arrays
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    print('valid_size ', valid_size, '\n')
    print('train_size ', train_size, '\n', 'num_classes', num_classes, '\n')
    print('vsize_per_class ', vsize_per_class, '\n', 'tsize_per_class', tsize_per_class, '\n\n')

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    # Note: it returns 4 objects!
    return valid_dataset, valid_labels, train_dataset, train_labels
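# Minimal sketch (illustrative only) of the per-class slicing done above: each
# shuffled letter_set is split so that the first vsize_per_class images go to
# validation and the next tsize_per_class go to training.
_demo_valid_size, _demo_train_size, _demo_classes = 10000, 200000, 10
_v_per_class = _demo_valid_size // _demo_classes   # 1000
_t_per_class = _demo_train_size // _demo_classes   # 20000
print('validation slice per letter: [0:%d]' % _v_per_class)
print('training slice per letter:   [%d:%d]' % (_v_per_class, _v_per_class + _t_per_class))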
# SUBJECT TO CHANGE according to task 6
train_size = 200000
valid_size = 10000
test_size = 10000

# Here all 4 returned objects are used...
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
    train_datasets, train_size, valid_size)
# ...and here only the last 2 are used.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)


# In[12]:

def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation, :, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels


train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)


# In[13]:

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)


# In[14]:

# Dump the big arrays to one file. Note: each run produces RANDOMLY SHUFFLED arrays,
# so the dump is done unconditionally here.
pickle_file = os.path.join(data_root, 'notMNIST.pickle')


def dumpBigShuffledArray():
    try:
        f = open(pickle_file, 'wb')
        save = {
            'train_dataset': train_dataset,
            'train_labels': train_labels,
            'valid_dataset': valid_dataset,
            'valid_labels': valid_labels,
            'test_dataset': test_dataset,
            'test_labels': test_labels,
        }
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
        f.close()
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        raise


# In[15]:

dumpBigShuffledArray()  # if you want to skip this step, just jump to the next cell


# In[16]:

# You can check whether the file exists.
# This cell is safe to re-run because it checks for the file first.
if os.path.exists(pickle_file):
    statinfo = os.stat(pickle_file)
    print('Compressed pickle size:', statinfo.st_size)
    print('%s already present - Skipping pickling.' % pickle_file)
else:
    dumpBigShuffledArray()


# In[17]:

# Check a picture from the big 3-D array. IT WILL BE A DIFFERENT PICTURE EVERY TIME,
# because the arrays are reshuffled on each run.
with open(pickle_file, 'rb') as f:
    _3Darray_BIG = pickle.load(f)

train_dataset_my = _3Darray_BIG['train_dataset']
img_overlapped = train_dataset_my[10]
plt.imshow(img_overlapped)  # HINT - use it as a picture to predict


# In[18]:

# log_regr = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
#                               intercept_scaling=1, class_weight=None, random_state=None, solver='liblinear',
#                               max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)
log_regr = LogisticRegression(solver='sag', n_jobs=1)
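# In[ ]:

# Why the reshape in the next cells (illustrative sketch only): scikit-learn
# estimators expect a 2-D matrix of shape (n_samples, n_features), so every
# 28x28 image has to be flattened into a 784-element row.
_demo_images = np.zeros((3, image_size, image_size), dtype=np.float32)  # 3 fake images
print(_demo_images.reshape(3, image_size * image_size).shape)           # -> (3, 784)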
# In[19]:

# Reshape the 3-D array into a 2-D array with 784 features per sample.
nsamples, nx, ny = train_dataset.shape
train_dataset_reshaped = train_dataset.reshape((nsamples, nx * ny))  # image 28x28 pixels -> 784 pixels
print(train_dataset_reshaped.shape)


# In[20]:

nsamples_t, nx_t, ny_t = test_dataset.shape
test_dataset_reshaped = test_dataset.reshape((nsamples_t, nx_t * ny_t))
print(test_dataset_reshaped.shape)


# In[21]:

# Training can take a long time on the full dataset.
log_regr.fit(train_dataset_reshaped, train_labels)
print("Logistic regression model was trained")
# TODO: measure time and other resources


# In[23]:

# Now we can check how well the model was trained.
# This list maps a predicted class index back to a letter.
array_letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# Reshape the validation dataset.
nsamples, nx, ny = valid_dataset.shape
valid_dataset_reshaped = valid_dataset.reshape((nsamples, nx * ny))
print(valid_dataset_reshaped.shape)

score = log_regr.score(valid_dataset_reshaped, valid_labels)
print(score)
# For the default configuration of LogisticRegression:
#   0.804 for 10000 samples, 0.8273 for 200000 samples, 0.8273 for 500000 samples
# For LogisticRegression(solver='sag'): 0.8244 for 200000 samples


# In[25]:

nsamples_t, nx_t, ny_t = test_dataset.shape
test_dataset_reshaped = test_dataset.reshape((nsamples_t, nx_t * ny_t))
print(test_dataset_reshaped.shape)

score = log_regr.score(test_dataset_reshaped, test_labels)
print(score)


# In[26]:

# Read a real image of the letter A and reshape it for prediction.
# Apply the same normalization that load_letter() used, so the input matches
# what the model was trained on.
image_data_1 = (imageio.imread(data_root + '\\notMNIST_large\\A\\a29ydW5pc2hpLnR0Zg==.png').astype(float) -
                pixel_depth / 2) / pixel_depth
nx_img, ny_img = image_data_1.shape
reshaped_image = image_data_1.reshape(nx_img * ny_img)  # reshape the 2-D array to a 1-D array
plt.imshow(image_data_1)
print(reshaped_image.shape)


# In[27]:

res = reshaped_image.reshape(1, 784)  # a single sample must be a 2-D array of shape (1, 784)
predicted = log_regr.predict(res)
index_of_letter = predicted[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[28]:

# Read a real image of the letter B and reshape it for prediction (same normalization as above).
image_data_2 = (imageio.imread(data_root + '\\imageio\\pickle_B_9.png').astype(float) -
                pixel_depth / 2) / pixel_depth
nx_img, ny_img = image_data_2.shape
reshaped_image_2 = image_data_2.reshape(nx_img * ny_img)
plt.imshow(image_data_2)
print(reshaped_image_2.shape)


# In[29]:

res_2 = reshaped_image_2.reshape(1, 784)
predicted_2 = log_regr.predict(res_2)
index_of_letter = predicted_2[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[30]:

# Check it with img_overlapped (already normalized, it comes straight from the big pickle).
res_3 = img_overlapped.reshape(1, 784)
predicted_3 = log_regr.predict(res_3)
index_of_letter = predicted_3[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[31]:

# Check it with another image from the dataset (letter F), normalized the same way.
image_data_4 = (imageio.imread(data_root + '\\notMNIST_large\\F\\a3JvZWdlciAwNV81NS50dGY=.png').astype(float) -
                pixel_depth / 2) / pixel_depth
nx_img4, ny_img4 = image_data_4.shape
reshaped_image_4 = image_data_4.reshape(nx_img4 * ny_img4)
plt.imshow(image_data_4)
print(reshaped_image_4.shape)


# In[32]:

res_4 = reshaped_image_4.reshape(1, 784)
predicted_4 = log_regr.predict(res_4)
index_of_letter = predicted_4[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[33]:

# Save the model to disk.
filename_for_log_regr = data_root + '\\finalized_model_log_regr_200K_samples_solver_sag.sav'
pickle.dump(log_regr, open(filename_for_log_regr, 'wb'))
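# In[ ]:

# Illustrative sketch only (not part of the original pipeline): the
# read -> normalize -> flatten -> predict steps above are repeated for every
# single image, including the hand-made letter further below, so they could be
# wrapped in a small helper. It assumes the trained `log_regr`, `array_letters`,
# `image_size` and `pixel_depth` defined earlier in this notebook.
def predict_letter_from_file(path, model=None):
    model = model if model is not None else log_regr
    img = (imageio.imread(path).astype(float) - pixel_depth / 2) / pixel_depth
    assert img.shape == (image_size, image_size), 'expected a 28x28 grayscale image'
    index = model.predict(img.reshape(1, image_size * image_size))[0]
    return array_letters[index]

# Example (uses a file path from the cells above):
# print(predict_letter_from_file(data_root + '\\notMNIST_large\\A\\a29ydW5pc2hpLnR0Zg==.png'))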
# In[34]:

# Load the model from disk.
loaded_model = pickle.load(open(filename_for_log_regr, 'rb'))
result = loaded_model.score(valid_dataset_reshaped, valid_labels)
print(result)


# In[39]:

# Read a real image of a letter that I prepared myself:
letter_root_name = '\\my_letters\\my_H_28_28_again'
my_letter = data_root + letter_root_name + '.png'
my_letter_gray = data_root + letter_root_name + '_gray' + '.png'

image_data = imageio.imread(my_letter)
print(image_data.shape)


# In[40]:

# Keep only one channel to get a 28x28 grayscale image, save it and read it back.
imageio.imwrite(my_letter_gray, image_data[:, :, 0])
image_data_gray = imageio.imread(my_letter_gray).astype(float)
print(image_data_gray.shape)


# In[41]:

nx_img, ny_img = image_data_gray.shape
reshaped_image_gray = image_data_gray.reshape(nx_img * ny_img)
plt.imshow(image_data_gray)
print(reshaped_image_gray.shape)


# In[42]:

# Normalize the same way as the training data before predicting.
reshaped_2d_array = ((reshaped_image_gray - pixel_depth / 2) / pixel_depth).reshape(1, 784)
print(reshaped_2d_array.shape)
predicted = log_regr.predict(reshaped_2d_array)
index_of_letter = predicted[0]
print(' index_of_letter: ', index_of_letter, '\n', 'letter is: ', array_letters[index_of_letter])


# In[ ]:
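# Optional follow-up sketch (illustrative only, not part of the original notebook):
# a per-class view of the test-set errors with scikit-learn's confusion matrix.
# It assumes `log_regr`, `test_dataset_reshaped`, `test_labels` and
# `array_letters` from the cells above.
from sklearn.metrics import confusion_matrix

test_predictions = log_regr.predict(test_dataset_reshaped)
cm = confusion_matrix(test_labels, test_predictions)
print(array_letters)
print(cm)  # rows = true letters, columns = predicted letters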