import os import multiprocessing import re import time import random from glob import glob import itertools import pickle import numpy as np import skimage from skimage import io from sklearn import cross_validation from sklearn import svm from sklearn import preprocessing from sklearn.linear_model.logistic import LogisticRegression def build_file_list(dir): """ Given a directory, it builds a shuffled list of the file """ random.seed(42) image_filenames = glob('{}/*.jpg'.format(dir)) image_filenames.sort() # make the function independent of the order your operating system returns the files random.shuffle(image_filenames) return image_filenames def build_labels(file_list,n_samples=None): """ build the labels from the filenames: cats corresponds to a 1, dogs corresonds to a -1 """ if(n_samples==None): n_samples=len(file_list) n_samples=max(n_samples,len(file_list)) file_list = file_list[:n_samples] y = np.zeros(n_samples,dtype=np.int32) for (i,f) in enumerate(file_list): if "dog" in str(f): y[i]=-1 else: y[i]=1 assert("cat" in str(f)) return y file_list = build_file_list("data/train_resized") pickle.dump(file_list, open("file_list.pkl","wb")) y=build_labels(file_list,n_samples=None) np.save('y',y) def file_to_rgb(filename): """ return an image in rgb format: a gray scale image will be converted, a rgb image will be left untouched""" bild = io.imread(filename) if (bild.ndim==2): rgb_bild= skimage.color.gray2rgb(bild) else: rgb_bild = bild return rgb_bild def hsv_to_feature(hsv,N,C_h,C_s,C_v): """ Takes an hsv picture and returns a feature vector for it. The vector is built as described in the paper 'Machine Learning Attacks Against the Asirra CAPTCHA' """ res = np.zeros((N,N,C_h,C_s,C_v)) cell_size= 250/N h_range = np.arange(0.0,1.0,1.0/C_h) h_range = np.append(h_range,1.0) s_range = np.arange(0.0,1.0,1.0/C_s) s_range = np.append(s_range,1.0) v_range = np.arange(0.0,1.0,1.0/C_v) v_range = np.append(v_range,1.0) for i in range(N): for j in range(N): cell= hsv[i*cell_size:i*cell_size+cell_size,j*cell_size:j*cell_size+cell_size,:] # check for h for h in range(C_h): h_cell = np.logical_and(cell[:,:,0]>=h_range[h],cell[:,:,0]=s_range[s],cell[:,:,1]=v_range[v],cell[:,:,2]