import os
import multiprocessing
import re
import time
import random
from glob import glob
import itertools
import pickle

import numpy as np

import skimage
from skimage import io

from sklearn import cross_validation
from sklearn import svm
from sklearn import preprocessing
from sklearn.linear_model.logistic import LogisticRegression


def build_file_list(dir):
  """Return the ``*.jpg`` paths directly inside `dir` in a reproducible shuffled order.

  The module-level `random` generator is re-seeded with 42 on every call, so
  the same set of files always produces the same ordering.
  """
  random.seed(42)
  # Sort before shuffling: glob's ordering depends on the operating system and
  # would otherwise leak into the shuffled result.
  jpg_paths = sorted(glob('{}/*.jpg'.format(dir)))
  random.shuffle(jpg_paths)
  return jpg_paths

def build_labels(file_list,n_samples=None):
  """Build the label vector from the filenames.

  A name containing "dog" is labelled -1; any other name must contain "cat"
  and is labelled 1.

  Args:
    file_list: sequence of file names (anything `str()` can render).
    n_samples: maximum number of leading entries to label. ``None`` (the
      default) labels every entry. Requests larger than the list are clamped
      to its length.

  Returns:
    A 1-D ``np.int32`` array with one label per processed file.

  Raises:
    ValueError: if a name contains neither "dog" nor "cat".
  """
  if n_samples is None:
    n_samples = len(file_list)
  # Clamp with min(): never label more files than exist (which would leave
  # meaningless 0 labels at the tail) and honour requests for fewer files.
  n_samples = min(n_samples, len(file_list))
  y = np.zeros(n_samples, dtype=np.int32)
  for i, f in enumerate(file_list[:n_samples]):
    name = str(f)
    if "dog" in name:
      y[i] = -1
    elif "cat" in name:
      y[i] = 1
    else:
      # An explicit exception survives `python -O`, unlike an assert, so an
      # unlabeled file can never silently become a cat.
      raise ValueError("cannot derive a label from file name: {!r}".format(name))
  return y


file_list = build_file_list("data/train_resized")
# Persist the shuffled order; it is read back from file_list.pkl further down.
# The with-block guarantees the pickle is flushed and the handle closed.
with open("file_list.pkl","wb") as pkl_file:
  pickle.dump(file_list, pkl_file)

# Labels are stored in the same order as file_list (np.save writes y.npy).
y=build_labels(file_list,n_samples=None)
np.save('y',y)

def file_to_rgb(filename):
  """Read an image file and hand it back as an RGB array.

  A 2-D (grayscale) image is expanded with `skimage.color.gray2rgb`; an image
  with any other number of dimensions is returned exactly as read.
  """
  image = io.imread(filename)
  if image.ndim != 2:
    return image
  return skimage.color.gray2rgb(image)

def _channel_bins(values, edges):
  """Map every value to the index k of the bin [edges[k], edges[k+1]) containing it.

  The last bin is closed on the right, so a value equal to edges[-1] belongs
  to it. Values outside [edges[0], edges[-1]] (and NaN) are marked with -1.
  """
  n_bins = len(edges) - 1
  bins = np.searchsorted(edges, values, side='right') - 1
  # A fully saturated / fully bright pixel has a channel value of exactly 1.0;
  # it must count for the top bin instead of falling off the end.
  bins[values == edges[-1]] = n_bins - 1
  bins[(bins < 0) | (bins >= n_bins)] = -1
  return bins


def hsv_to_feature(hsv,N,C_h,C_s,C_v,image_size=250):
  """ Takes an hsv picture and returns a feature vector for it.
  The vector is built as described in the paper 'Machine Learning Attacks Against the Asirra CAPTCHA'.

  The top-left `image_size` x `image_size` region is divided into N x N square
  cells and each channel range [0, 1] into C_h / C_s / C_v equal bins. Entry
  (i, j, h, s, v) is 1.0 when cell (i, j) contains at least one pixel whose
  hue, saturation and value fall into bins h, s and v, and 0.0 otherwise.

  Args:
    hsv: array indexed as [row, column, channel] with channels (h, s, v);
      channel values are expected in [0, 1].
    N: number of cells per image side.
    C_h, C_s, C_v: number of bins for hue, saturation and value.
    image_size: side length in pixels of the region that is split into cells.
      Cells have side ``image_size // N``, so remainder pixels are ignored.

  Returns:
    A flat float array of length N*N*C_h*C_s*C_v.
  """
  res = np.zeros((N,N,C_h,C_s,C_v))
  # Floor division keeps the slice bounds integral under true division too.
  cell_size = image_size // N
  # linspace yields exactly C+1 edges and ends on exactly 1.0.
  h_edges = np.linspace(0.0,1.0,C_h+1)
  s_edges = np.linspace(0.0,1.0,C_s+1)
  v_edges = np.linspace(0.0,1.0,C_v+1)
  for i in range(N):
    rows = slice(i*cell_size,(i+1)*cell_size)
    for j in range(N):
      cols = slice(j*cell_size,(j+1)*cell_size)
      h_bin = _channel_bins(hsv[rows,cols,0],h_edges)
      s_bin = _channel_bins(hsv[rows,cols,1],s_edges)
      v_bin = _channel_bins(hsv[rows,cols,2],v_edges)
      # Only pixels that landed in a bin on all three channels mark a colour box.
      inside = (h_bin>=0) & (s_bin>=0) & (v_bin>=0)
      res[i,j,h_bin[inside],s_bin[inside],v_bin[inside]] = 1.0
  return res.reshape(-1)

def build_color_featurevector(pars):
  """Compute the colour feature vector of one image file.

  `pars` is a single 5-tuple ``(filename, N, C_h, C_s, C_v)`` so the function
  can be handed directly to `Pool.map`. The file is read as RGB, converted to
  HSV and passed to `hsv_to_feature` together with the grid parameters.
  """
  filename, N, C_h, C_s, C_v = pars
  rgb = file_to_rgb(filename)
  assert rgb.shape[2] == 3
  hsv = skimage.color.rgb2hsv(rgb)
  return hsv_to_feature(hsv, N, C_h, C_s, C_v)
	    
def build_color_featurematrix(file_list,N,C_h,C_s,C_v):
    """ Builds the colour feature matrix of the jpegs in file_list.

    Every file is processed by `build_color_featurevector` in a pool of worker
    processes (default size: `multiprocessing.Pool()`).

    Args:
      file_list: iterable of image file names.
      N, C_h, C_s, C_v: grid parameters forwarded with every file name.

    Returns:
      An array whose i-th row is the feature vector of the i-th file.
    """
    jobs = [(f,N,C_h,C_s,C_v) for f in file_list]
    pool = multiprocessing.Pool()
    try:
        rows = pool.map(build_color_featurevector,jobs)
    finally:
        # Always shut the workers down; otherwise every call leaves a set of
        # idle processes behind (this function is called once per grid).
        pool.close()
        pool.join()
    return np.array(rows)

def build_color_feature_matrices_or_load(file_list):
  """Return the three colour feature matrices ``(F1, F2, F3)``.

  Each matrix is loaded from its cache file in the working directory when
  `np.load` succeeds; only when that raises IOError is it computed from
  `file_list`. Nothing is written to disk here.
  """
  # (cache file, (N, C_h, C_s, C_v)) in return order
  variants = (
    ("F1.npy", (1, 10, 10, 10)),
    ("F2.npy", (3, 10, 8, 8)),
    ("F3.npy", (5, 10, 6, 6)),
  )
  matrices = []
  for cache_name, grid in variants:
    try:
      matrix = np.load(cache_name)
    except IOError:
      matrix = build_color_featurematrix(file_list, *grid)
    matrices.append(matrix)
  return tuple(matrices)

file_list = pickle.load(open("file_list.pkl","rb"))
%time F1,F2,F3 =build_color_feature_matrices_or_load(file_list[:10000])
np.save("F1",F1) 
np.save("F2",F2)
np.save("F3",F3)

def classify_color_feature(F,y):
  """Cross-validate an RBF-kernel SVM on colour features and print the result.

  Args:
    F: feature matrix, one row per sample.
    y: labels, one per row of F.

  Runs 5-fold cross validation of ``SVC(kernel='rbf', gamma=0.001)`` with
  ``n_jobs=-1`` and prints the mean and standard deviation of the fold scores
  (as percentages) together with the elapsed wall-clock time. Returns nothing.
  """
  start = time.time()
  clf = svm.SVC(kernel='rbf',gamma=0.001)
  scores = cross_validation.cross_val_score(clf, F, y, cv=5,n_jobs=-1)
  time_diff = time.time() - start
  # One parenthesised argument prints the same text as a Python 2 print
  # statement and is also a valid Python 3 function call.
  print("Accuracy: %.1f  +- %.1f   (calculated in %.1f seconds)"   % (np.mean(scores)*100,np.std(scores)*100,time_diff))


F1,F2,F3 = [np.load(_name) for _name in ("F1.npy","F2.npy","F3.npy")]
y=np.load("y.npy")
# All three colour grids side by side, one row per image.
union = np.hstack((F1,F2,F3))

# Same experiments, same order: each grid on 5000 samples, the finest grid on
# 10000, then the concatenated features on 5000 and 10000.
for _features,_n_rows in ((F1,5000),(F2,5000),(F3,5000),
                          (F3,10000),
                          (union,5000),(union,10000)):
  classify_color_feature(_features[:_n_rows],y[:_n_rows])

def texture_texture_distance(T1,T2):
  """Return the mean per-pixel Euclidean distance between two 5x5 tiles.

  The norm is taken along the last (channel) axis; the assertion requires the
  resulting distance map to be 5x5.
  """
  difference = T1 - T2
  per_pixel = np.linalg.norm(difference, axis=2)
  assert per_pixel.shape == (5, 5)
  return per_pixel.mean()

def build_tiles(number_of_tiles,files,threshold):
  """Randomly collect 5x5 RGB tiles that are pairwise at least `threshold` apart.

  A random file and a random tile position on a 50x50 grid of 5-pixel steps
  are drawn repeatedly; the tile is kept only when its
  `texture_texture_distance` to every tile kept so far is not below
  `threshold`.

  Args:
    number_of_tiles: number of tiles to collect.
    files: sequence of image file names to sample from.
    threshold: minimum allowed distance between any two kept tiles.

  Returns:
    A float array of shape (number_of_tiles, 5, 5, 3).

  Note:
    The loop only ends once enough tiles are found, so a threshold that the
    images cannot satisfy makes this run forever.
  """
  current=0
  textures = np.zeros((number_of_tiles,5,5,3))
  while current<number_of_tiles:
    # Draw order (file, row, column) is kept so a seeded run is reproducible.
    file_index = random.randint(0,len(files)-1)
    row = random.randint(0,49)
    col = random.randint(0,49)
    # Shared helper instead of a private copy of the grayscale handling.
    rgb_image = file_to_rgb(files[file_index])
    cell = rgb_image[row*5:row*5+5,col*5:col*5+5,:]
    # any() stops at the first tile that is too close, like an early break.
    too_close = any(texture_texture_distance(cell,textures[k])<threshold
                    for k in range(current))
    if not too_close:
      textures[current]=cell
      current+=1
  return textures
    
def build_textures_or_load(number_of_tiles,files,threshold):
    """Return the cached tiles from textures.npy, building them only if loading fails.

    The arguments are used solely on the fallback path, where they are
    forwarded to `build_tiles` after `np.load` raised IOError.
    """
    try:
        return np.load("textures.npy")
    except IOError:
        return build_tiles(number_of_tiles,files,threshold)
    

def texture_image_distance_simple(rgb,T):
  """ Returns the distance between an image and a tile.

  This is a simplified version of the distance described in the paper. Instead
  of using every possible upper-left corner, only corners that are multiples
  of five are used: the image is cut into a 50x50 grid of 5x5 blocks, each
  block is scored by the largest per-pixel Euclidean distance to the tile, and
  the smallest block score is returned.

  Args:
    rgb: image array of shape (250, 250, 3).
    T: tile array of shape (5, 5, 3).

  Returns:
    The distance as a float scalar.
  """
  assert(rgb.shape==(250,250,3))
  assert(T.shape==(5,5,3))
  # Subtract in floating point: two uint8 operands would wrap around
  # (0 - 1 == 255) and report near-identical pixels as far apart.
  bigtile = np.tile(np.asarray(T,dtype=np.float64),(50,50,1))
  distances = np.linalg.norm(np.asarray(rgb,dtype=np.float64)-bigtile,axis=2)
  assert(distances.shape==(250,250))
  # Axes after the reshape: (block row, row in block, block column, column in
  # block); reducing axes 1 and 3 gives one maximum per 5x5 block.
  block_max = distances.reshape(50,5,50,5).max(axis=(1,3))
  return block_max.min()

def build_texture_feature_vector(pars):
  """Compute the texture feature vector of one image file.

  Args:
    pars: a single ``(filename, textures)`` pair so the function can be handed
      directly to `Pool.map`; `textures` is an iterable of tiles.

  Returns:
    A list with one `texture_image_distance_simple` value per tile, in the
    order of `textures`.
  """
  filename,textures=pars
  # Shared helper instead of a private copy of the grayscale handling.
  rgb = file_to_rgb(filename)
  return [texture_image_distance_simple(rgb,t) for t in textures]

def build_texture_feature_matrix(file_list,texture):
  """ Builds the texture feature matrix of the jpegs in file_list.

  Every file is processed by `build_texture_feature_vector` in a pool of
  worker processes (default size: `multiprocessing.Pool()`).

  Args:
    file_list: iterable of image file names.
    texture: the tiles, sent along with every file name.

  Returns:
    An array whose i-th row holds the distances of the i-th file to each tile.
  """
  jobs = [(f,texture) for f in file_list]
  pool = multiprocessing.Pool()
  try:
    rows = pool.map(build_texture_feature_vector,jobs)
  finally:
    # Always shut the workers down instead of leaving idle processes behind.
    pool.close()
    pool.join()
  return np.array(rows)

def build_texture_feature_matrix_or_load(file_list,textures):
    """Return the cached texture feature matrix from G.npy, building it only if loading fails.

    The arguments are used solely on the fallback path, where they are
    forwarded to `build_texture_feature_matrix` after `np.load` raised
    IOError. Nothing is written to disk here.
    """
    try:
        G = np.load("G.npy")
    except IOError:
        # One parenthesised argument prints the same text as a Python 2 print
        # statement and is also a valid Python 3 function call.
        print("Building matrix")
        G = build_texture_feature_matrix(file_list,textures)
    return G

file_list = pickle.load(open("file_list.pkl","rb"))
%time textures = build_textures_or_load(5000,file_list,40)
np.save("textures",textures) 

textures=np.load("textures.npy")
file_list = pickle.load(open("file_list.pkl","rb"))
%time G=build_texture_feature_matrix_or_load(file_list[:10000],textures)
np.save("G",G)

def classify_texture_feature(G,y):
  """Cross-validate logistic regression on texture features and print the result.

  Args:
    G: feature matrix, one row per sample.
    y: labels, one per row of G.

  Runs 5-fold cross validation of a default `LogisticRegression` with
  ``n_jobs=-1`` and prints the mean and standard deviation of the fold scores
  (as percentages) together with the elapsed wall-clock time. Returns nothing.
  """
  start = time.time()
  scores = cross_validation.cross_val_score(LogisticRegression(), G, y, cv=5,n_jobs=-1)
  time_diff = time.time() - start
  # One parenthesised argument prints the same text as a Python 2 print
  # statement and is also a valid Python 3 function call.
  print("Accuracy: %.1f  +- %.1f   (calculated in %.1f seconds)"   % (np.mean(scores)*100,np.std(scores)*100,time_diff))

G=np.load("G.npy")
y=np.load("y.npy")
# (number of samples, number of texture columns) per experiment, in run order.
for _n_rows,_n_textures in ((5000,1000),(5000,5000),(10000,5000)):
  classify_texture_feature(G[:_n_rows,:_n_textures],y[:_n_rows])

class Combined:
  """Blend the class probabilities of two fitted classifiers.

  `clf1` is weighted 2/3 and `clf2` 1/3. Both must provide `predict_proba`.
  """

  def __init__(self,clf1,clf2):
    self.clf1=clf1
    self.clf2=clf2

  def predict(self,F,G):
    """Predict from `F` (input of clf1) and `G` (input of clf2).

    Returns the index of the most probable column of the blended
    probabilities for every row, with index 0 reported as -1; every other
    index is returned unchanged (so column 1 is reported as 1).
    """
    weighted_first = 2*self.clf1.predict_proba(F)/3
    weighted_second = self.clf2.predict_proba(G)/3
    winner = np.argmax(weighted_first+weighted_second, axis=1)
    return np.where(winner==0, -1, winner)

F1=np.load("F1.npy")
F2=np.load("F2.npy")
F3=np.load("F3.npy")
# All three colour grids side by side, one row per image.
union = np.hstack((F1,F2,F3))
G=np.load("G.npy")
y=np.load("y.npy")

# Both classifiers are fitted on the first 1000 samples only.
# probability=True is needed because Combined calls predict_proba.
clf_color = svm.SVC(kernel='rbf',gamma=0.001,probability=True)
clf_color.fit(union[:1000],y[:1000])

clf_texture = LogisticRegression()
clf_texture.fit(G[:1000],y[:1000])

combined=Combined(clf_color,clf_texture)
# Accuracy on the next 1000 samples, which were not used for fitting.
# The text is formatted into one argument so the call prints the same line
# under a Python 2 print statement and a Python 3 print function.
print("Accuracy:  %s" % np.mean(combined.predict(union[1000:2000],G[1000:2000])==y[1000:2000]))