#!/usr/bin/env python
# coding: utf-8

# # Miscellaneous benchmark data sets
#
# Here we house the basic data preparation routines used for all the data
# sets used in these educational materials, excluding those data sets that
# have their own notebook (like MNIST digits, CIFAR-10, vim-2, etc.).
#
# __Contents:__
# - Iris data set
# ___
#
# ## Fisher's Iris data set
#
# This is a classic data set, perfect for simple prototyping. Let's examine
# the first and last few lines of the CSV file.

# In[1]:

import os
import csv
import numpy as np

# In[2]:

get_ipython().system(' cat data/iris/iris.data | head -n 5')
get_ipython().system(' cat data/iris/iris.data | tail -n 5')

# Note that there are only four lines of data from the `tail` command, where
# we might have expected five. This is because there is an empty line there.
# __Remove this line__ manually or using a shell command, and save the result
# as `iris_rev.data`. (The reader loop below also skips blank rows
# defensively, so a stray empty line will not crash it.) Checking the
# revised file:

# In[3]:

get_ipython().system(' cat data/iris/iris_rev.data | head -n 5')
get_ipython().system(' cat data/iris/iris_rev.data | tail -n 5')

# In[4]:

get_ipython().system(' wc -l data/iris/iris_rev.data')

# Great, with that minor fix in place, we may now proceed. As just noted,
# we have 150 data points.

# In[5]:

# Data-set dimensions and the string-label -> integer-code mapping.
NUM_DATA = 150
NUM_TRAIN = 100  # Set manually.
NUM_TEST = NUM_DATA - NUM_TRAIN
NUM_FEATURES = 4
NUM_CLASSES = 3
NUM_LABELS = 1
LABEL_DICT = {"Iris-setosa": 0,
              "Iris-versicolor": 1,
              "Iris-virginica": 2}

# In[6]:

# Read the CSV into fixed-size arrays: four float32 features per row plus
# one int8 class label (encoded via LABEL_DICT).
toread = os.path.join("data", "iris", "iris_rev.data")

data_X = np.zeros((NUM_DATA, NUM_FEATURES), dtype=np.float32)
data_y = np.zeros((NUM_DATA, 1), dtype=np.int8)

with open(toread, newline="") as f_table:
    f_reader = csv.reader(f_table, delimiter=",")
    i = 0
    for line in f_reader:
        if not line:
            # Skip blank rows (the raw download ends with one) rather than
            # crashing on an empty `line[-1]` lookup.
            continue
        data_X[i, :] = np.array(line[0:-1], dtype=data_X.dtype)
        data_y[i, :] = LABEL_DICT[line[-1]]  # scalar assigns fine; no np.array wrap needed
        i += 1

# We've read the training data from disk, but would like to store it, along
# with the testing data, in a hierarchical data file.
# We use __PyTables__ to do this.

# In[7]:

import tables

# In[8]:

# Open file connection, writing a new file to disk.
myh5 = tables.open_file("data/iris/data.h5", mode="w", title="Iris data")
print(myh5)  # currently empty.

# In[9]:

myh5.create_group(myh5.root, "train", "Training data")
myh5.create_group(myh5.root, "test", "Testing data")
print(myh5)

# In[10]:

# Extendable arrays (one labels array, one inputs array) for each split.
# NOTE: the titles now say "Input features" -- the previous "Input images"
# was a copy-paste slip from an image data set; iris rows are feature vectors.
for group in (myh5.root.train, myh5.root.test):
    myh5.create_earray(group, name="labels",
                       atom=tables.Int8Atom(),
                       shape=(0, NUM_LABELS),
                       title="Label values")
    myh5.create_earray(group, name="inputs",
                       atom=tables.Float32Atom(),
                       shape=(0, NUM_FEATURES),
                       title="Input features")
print(myh5)

# Shuffle the data set before splitting it into training/testing sets.

# In[11]:

shufidx = np.random.choice(a=NUM_DATA, size=NUM_DATA, replace=False)
idx_tr = shufidx[0:NUM_TRAIN]
idx_te = shufidx[NUM_TRAIN:]

# In[12]:

# Training data. EArray.append accepts a whole 2-D array, so one bulk
# append replaces the per-row loop (identical contents and row order,
# one HDF5 write instead of NUM_TRAIN).
myh5.root.train.inputs.append(data_X[idx_tr, :])
myh5.root.train.labels.append(data_y[idx_tr, :])
print(myh5)

# In[13]:

# Testing data, appended in bulk the same way.
myh5.root.test.inputs.append(data_X[idx_te, :])
myh5.root.test.labels.append(data_y[idx_te, :])
print(myh5)

# Finally, close the file connection.

# In[14]:

myh5.close()

# ___