This notebook walks through a simple bag of words based image classifier. In this technique, small patches are extracted from an image and the number of each patch "type" is used as a feature to a classifier such as an SVM. The patch types are automatically discovered via clustering of all patches found in the training set.
For a more thorough exposition, see the literature on the bag-of-visual-words model (e.g. Csurka et al., "Visual Categorization with Bags of Keypoints"); the hyperlink originally attached to this sentence was lost in export.
This notebook requires sklearn. You can install something like Enthought Canopy to get all of the packages without any hassle.
This approach is fairly naive (and extremely slow) as is, but it gives an idea of how to implement your own classifiers for use in a pipeline.
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Display up to 30 patches (8x8 grayscale) in a fixed 3 x 10 grid
def show_patches(ims, grid_size=(16, 6)):
    """Render the first (at most 30) flattened 8x8 patches in `ims`.

    grid_size is the matplotlib figure size in inches, not the grid shape.
    """
    fig = plt.figure(figsize=grid_size)
    count = min(len(ims), 30)
    for idx in range(count):
        axis = fig.add_subplot(3, 10, idx + 1, xticks=[], yticks=[])
        axis.imshow(ims[idx].reshape((8, 8)),
                    cmap=plt.cm.bone, interpolation='nearest')
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import svm
We are using the CIFAR 10 dataset - a dataset of 32x32 images from 10 classes
# Load data
try:
    import cPickle  # Python 2
except ImportError:
    # Python 3: cPickle was merged into the stdlib pickle module
    import pickle as cPickle

def load_data(file):
    """Unpickle one CIFAR-10 batch file and return the stored dict.

    file: path to a batch such as 'cifar-10-batches-py/data_batch_1'.
          (Name kept for backward compatibility; note it shadows the
          Python 2 builtin `file`.)
    """
    # Binary mode is required: the batches are pickled byte streams.
    with open(file, 'rb') as f:
        data = cPickle.load(f)
    return data
# Load the first CIFAR-10 training batch from disk.
data = load_data("cifar-10-batches-py/data_batch_1")
# 'data' is an N x 3072 uint8 array: one flattened 32x32x3 image per row.
images = data['data']
# Reshape to go from length 3072 vector to 32x32 rgb images
# order='F' deals with specifics of how the data is laid out
images = images.reshape((-1, 32, 32, 3), order='F')
# Integer class labels (0-9), one per image.
labels = np.array(data['labels'])
def get_classes(classes=(0, 1, 2), per_class=100):
    """Return (images, labels) restricted to the given CIFAR class ids.

    classes: iterable of class ids to keep (default was a mutable list
             literal — replaced with an equivalent tuple, the safe idiom
             for default arguments).
    per_class: take at most this many images from each class.
    """
    # For each class, indices of its first `per_class` occurrences,
    # concatenated into one flat index array.
    indices = np.concatenate([np.where(labels == c)[0][:per_class] for c in classes])
    return images[indices], labels[indices]
# For speed, let's consider only 4 classes, 100 images per class for now
classes = [0,1,2,3]
X, Y = get_classes(classes, 100)
# That's airplane, automobile, bird, and cat in CIFAR-10's label order
# Show one example image from each selected class
for c in classes:
    plt.imshow(images[labels == c][0], interpolation='nearest')
    plt.show()
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
def score(clf, X, Y, folds=2, verbose=False):
    """Cross-validated accuracy of `clf` on (X, Y) with `folds` folds.

    Pools the out-of-fold predictions and scores them against Y once,
    optionally printing each fold's individual accuracy.
    """
    preds = np.zeros(len(Y))
    fold_iter = KFold(len(X), n_folds=folds, shuffle=True)
    for fold_num, (train_idx, test_idx) in enumerate(fold_iter, start=1):
        clf.fit(X[train_idx], Y[train_idx])
        preds[test_idx] = clf.predict(X[test_idx])
        if verbose:
            fold_acc = accuracy_score(Y[test_idx], preds[test_idx])
            print("Fold {}: {}".format(fold_num, fold_acc))
    # Accuracy over all held-out predictions combined.
    return accuracy_score(Y, preds)
from sklearn.feature_extraction import image
def rgb2gray(rgb):
    """Convert an RGB image (H x W x 3, channels last) to grayscale.

    Uses the ITU-R BT.601 luma weights; any extra (alpha) channel is
    ignored via the [..., :3] slice.
    """
    # Bug fix: the blue weight was 0.144; the correct BT.601 value is
    # 0.114 (the three weights must sum to 1).
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
# Used to whiten patches - reduces variance, helps with classification
from sklearn.decomposition import RandomizedPCA
class PatchExtractor(sklearn.base.BaseEstimator):
    """Transformer mapping each RGB image to a matrix of flattened
    grayscale patches (one row of length patch_h * patch_w per patch)."""

    def __init__(self, patch_size=(8, 8)):
        self.patch_size = patch_size

    def _extract_patches(self, x):
        """ Extracts patches from given H x W image """
        # Patches are taken on a grid with stride equal to the patch
        # height (non-overlapping for square patches), then flattened.
        # Written this way for speed.
        return image.extract_patches(
            x, self.patch_size, self.patch_size[0]
        ).reshape((-1, self.patch_size[0] * self.patch_size[1]))

    def fit(self, X, Y=None):
        # Stateless: nothing to learn. The original extracted every
        # training patch here only to print its shape (debug leftover,
        # pure wasted work since its PCA-whitening step was commented
        # out). A whitening PCA fit over all training patches could be
        # reinstated here to reduce patch variance.
        return self

    def transform(self, X, Y=None):
        # One patch matrix per image, grayscaled first.
        return np.array([self._extract_patches(rgb2gray(x)) for x in X])
Patch types are automatically discovered during training via KMeans clustering. For feature transformation of a single image, each patch is assigned to its nearest cluster and the feature vector of an image is the number of patches from each type (a histogram).
from sklearn.cluster import KMeans
class Codebook(sklearn.base.BaseEstimator):
    """Learns a vocabulary of patch "codewords" via KMeans, then maps an
    image's patches to a histogram of codeword counts."""

    def __init__(self, size=10):
        self.size = size
        self.clusterer = KMeans(n_clusters=size)

    def _get_histogram(self, x):
        """ Returns histogram of codewords for given features """
        # Alternative method: return distance of each patch to cluster
        # centers via self.clusterer.transform(x).ravel()
        # Assign every patch to its nearest cluster center.
        assignments = self.clusterer.predict(x)
        # minlength keeps the feature vector a fixed length even when
        # some codewords never occur in this image.
        return np.bincount(assignments, minlength=self.size)

    def fit(self, X, Y=None):
        # Pool the patches of all training images and cluster them.
        all_patches = np.concatenate(X)
        self.clusterer.fit(all_patches)
        return self

    def transform(self, X, Y=None):
        return np.array([self._get_histogram(patches) for patches in X])
# Full pipeline: patches -> codeword histograms -> linear SVM,
# evaluated on 5 classes, 100 images each, with 2-fold CV.
X, Y = get_classes([1, 3, 5, 7, 9], 100)
patcher = PatchExtractor(patch_size=(8, 8))
codebook = Codebook(size=50)
clf = svm.SVC(kernel='linear')
pipeline = Pipeline([("Patch_extractor", patcher), ("Codebook", codebook), ("svm", clf)])
score(pipeline, X, Y, 2, verbose=True)
(4000, 64) Fold 1: 0.304 (4000, 64) Fold 2: 0.328
0.316
We end up getting about a 30% accuracy on these 5 classes, which is significantly better than the 20% chance level but can definitely be improved. The next cell scales the same pipeline up to all 10 classes.
# Same pipeline scaled up to all 10 classes, 900 images each.
X, Y = get_classes([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 900)
patcher = PatchExtractor(patch_size=(8, 8))
codebook = Codebook(size=50)
clf = svm.SVC(kernel='linear')
pipeline = Pipeline([("Patch_extractor", patcher), ("Codebook", codebook), ("svm", clf)])
score(pipeline, X, Y, 2, verbose=True)
(72000, 64) Fold 1: 0.316444444444 (72000, 64) Fold 2: 0.313777777778
0.31511111111111112
# You end up with edge like features
# Each KMeans cluster center is a learned 8x8 patch prototype.
show_patches(codebook.clusterer.cluster_centers_)
If you wanted to try different codebook sizes, GridSearchCV is a good approach:
X, Y = get_classes([5, 7], 100)
from sklearn.grid_search import GridSearchCV
# The 'Codebook__' prefix targets the pipeline step named "Codebook".
param_grid = [
    {'Codebook__size': [3, 10]},
]
# `score` matches GridSearchCV's scoring signature (estimator, X, y).
grid_search = GridSearchCV(pipeline, param_grid, scoring=score)
grid_search.fit(X, Y)
While this may be useful on some datasets with characteristic colors per category, that doesn't appear to be the case with CIFAR. We end up doing worse than chance.
class RGBFeature(sklearn.base.BaseEstimator):
    """ Maps an image to its RGB color averages """

    def fit(self, X, Y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, Y=None):
        # Get average of each color channel, per image.
        # Bug fix: the original averaged over the whole batch
        # (X[:, :, i]) instead of each image (x[:, :, i]), so every
        # image received an identical feature vector — which explains
        # the at/below-chance accuracy reported for this feature.
        return np.array([[np.average(x[:, :, i]) for i in range(3)] for x in X])
# Evaluate the RGB-average feature alone: 5 classes, 500 images each,
# 5-fold CV with a linear SVM.
X, Y = get_classes([1, 3, 5, 7, 9], 500)
rgb = RGBFeature()
clf = svm.SVC(kernel='linear')
pipeline = Pipeline([("RGB Average", rgb), ("svm", clf)])
score(pipeline, X, Y, 5, verbose=True)
Fold 1: 0.164 Fold 2: 0.168 Fold 3: 0.18 Fold 4: 0.16 Fold 5: 0.178
0.17000000000000001