import numpy as np
import matplotlib.pyplot as plt

# Set the random seed for reproducibility
np.random.seed(123)

# Generate the data
X1 = np.random.multivariate_normal(np.array([1, 1]), np.diag([.1, .1]), 10)  # 10 points from the first GMM component
X2 = np.random.multivariate_normal(np.array([3, 3]), np.diag([.1, .1]), 10)  # 10 points from the second GMM component
X = np.concatenate((X1, X2))

# Plot the data
%matplotlib inline
plt.plot(X[:, 0], X[:, 1], 'bo', markersize=10)
plt.xlim([0, 5]); plt.ylim([0, 5]);

def kmeans(data, centers):
    """One iteration of K-means: assign points to the given centers, then update the centers."""
    new_cent = np.zeros(centers.shape)
    # Assignment step: squared Euclidean distance from every point to every center
    dist = np.zeros((data.shape[0], centers.shape[0]))
    for i in range(centers.shape[0]):
        dist[:, i] = np.sum((data - centers[i, :])**2, 1)
    assign = np.argmin(dist, 1)
    # Update step: compute the new centers and the cost of the current assignment
    cost = 0
    for i in range(centers.shape[0]):
        new_cent[i, :] = np.mean(data[assign == i, :], 0)
        cost += np.sum(np.sum((data[assign == i, :] - centers[i, :])**2, 1))
    return new_cent, cost, assign

# We need a value of K
K = 2  # we assume there are two clusters

# Random starting points
centers = np.random.multivariate_normal(np.array([2, 2]), np.diag([1, 1]), K)

# Run one iteration of K-means
new_cent, cost, assign = kmeans(X, centers)

# Plot the assignment based on the input cluster centers
col = ['r', 'g']  # colour code for the two clusters
plt.plot(X[assign == 0, 0], X[assign == 0, 1], col[0] + 'o', markersize=10)
plt.plot(X[assign == 1, 0], X[assign == 1, 1], col[1] + 'o', markersize=10)
plt.plot(centers[0, 0], centers[0, 1], col[0] + 'x', markersize=10, mew=2)  # input center
plt.plot(centers[1, 0], centers[1, 1], col[1] + 'x', markersize=10, mew=2)  # input center
plt.xlim([0, 5]); plt.ylim([0, 5]);

# Plot the new centers
centers = new_cent  # these will be the centers in the next iteration
plt.plot(X[assign == 0, 0], X[assign == 0, 1], col[0] + 'o', markersize=10)
plt.plot(X[assign == 1, 0], X[assign == 1, 1], col[1] + 'o', markersize=10)
plt.plot(centers[0, 0], centers[0, 1], col[0] + 'x', markersize=10, mew=2)
plt.plot(centers[1, 0], centers[1, 1], col[1] + 'x', markersize=10, mew=2)
plt.xlim([0, 5]); plt.ylim([0, 5]);

## Code box for your solution

## Code box for your solution

from PIL import Image
from sklearn.cluster import KMeans
import matplotlib.cm as cm  # colour map to display the grayscale image

# Obtain the image
img = Image.open('data/jomo.jpg')

# Display the image
# img.show()
plt.imshow(np.asarray(img), cmap=cm.Greys_r)
A = np.asarray(img)

# Build the vectors to quantize: one 4-dimensional vector per 2x2 block of pixels
# (this assumes a square grayscale image)
px = img.size[0]
vq = np.zeros(((px * px) // 4, 4))
k = 0
for i in range(0, A.shape[0], 2):
    for j in range(0, A.shape[1], 2):
        vq[k, :] = A[i:i + 2, j:j + 2].flatten()
        k += 1

# Cluster the 2x2 blocks into two codewords
km_model = KMeans(init='k-means++', n_clusters=2, n_init=10)
km_model.fit(vq)
cent = km_model.cluster_centers_

# Quantized image: replace every 2x2 block with its nearest codeword
img_mtx2 = np.zeros(img.size)
k = 0
for i in range(0, img.size[0], 2):
    for j in range(0, img.size[1], 2):
        img_mtx2[i:i + 2, j:j + 2] = np.reshape(np.round(cent[km_model.predict(vq[k:k + 1, :])]), (2, 2))
        k += 1
plt.imshow(img_mtx2, cmap=cm.Greys_r)

from scipy.stats import norm

# A single univariate Gaussian
x = np.linspace(-5, 5, 100)
plt.plot(x, norm.pdf(x, 0, 1), linewidth=3)
plt.xlabel(r'$x$', fontsize=16)
plt.ylabel(r'$p(x)$', fontsize=16)
plt.title(r'Univariate Gaussian with $\mu=0$ and $\sigma=1$', fontsize=16)

# A mixture of three Gaussians: the weighted components (thin lines) and their sum (thick magenta line)
plt.plot(x, 0.4 * norm.pdf(x, -3, .5), 'r', linewidth=1)
plt.plot(x, 0.2 * norm.pdf(x, 1, 1), 'g', linewidth=1)
plt.plot(x, 0.4 * norm.pdf(x, 3, .5), 'b', linewidth=1)
plt.plot(x, 0.4 * norm.pdf(x, -3, .5) + 0.2 * norm.pdf(x, 1, 1) + 0.4 * norm.pdf(x, 3, .5), 'm', linewidth=2)
plt.xlabel(r'$x$', fontsize=16)
plt.ylabel(r'$p(x)$', fontsize=16)

import scipy.io.wavfile

# Open the wav file
freq, samples = scipy.io.wavfile.read('data/digit.wav')  # sampling frequency and raw samples
signal = samples / float(2**15)  # the samples are 16-bit signed integers; normalize to the range -1 to 1
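# A minimal sketch of running K-means to convergence (one possible answer for the
# code boxes above): apply the one-step kmeans() helper repeatedly until the cost
# stops decreasing. The tolerance and the iteration cap below are arbitrary choices,
# and we assume no cluster ever becomes empty, as in the demo above.
centers_km = np.random.multivariate_normal(np.array([2, 2]), np.diag([1, 1]), K)
prev_cost = np.inf
for it in range(100):                    # safety cap on the number of iterations
    centers_km, cost_km, assign_km = kmeans(X, centers_km)
    if prev_cost - cost_km < 1e-10:      # stop once the cost no longer improves
        break
    prev_cost = cost_km
print('K-means converged after %d iterations with cost %.3f' % (it + 1, cost_km))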
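# A small sketch for checking the quality of the vector-quantized image built above:
# the mean squared reconstruction error, and the implied bit rate of one codeword
# index per 2x2 block instead of 8 bits per pixel. Like the block loops above, this
# assumes a square grayscale image.
mse = np.mean((A.astype(float) - img_mtx2)**2)
bits_per_pixel = np.log2(len(cent)) / 4.0   # log2(#codewords) bits shared by 4 pixels
print('Reconstruction MSE: %.2f, rate: %.3f bits/pixel (original: 8)' % (mse, bits_per_pixel))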
# Plot the speech signal against time
plt.plot(np.arange(len(signal)) * (1. / freq), signal)
plt.xlabel('Time (seconds)')

# Split the signal into blocks and compute the energy of each block
blk_size = 256
num_blk = int(np.floor(len(signal) / blk_size))
energy = np.zeros(num_blk)
for i in range(num_blk):
    energy[i] = np.sum(signal[i * blk_size:(i + 1) * blk_size]**2)

# Histogram of the log block energies
plt.hist(np.log(energy), 50)
plt.xlabel('Logarithm of block energy')

# Fit a single Gaussian to the log energies and overlay it on the normalized histogram
mu, std = norm.fit(np.log(energy))
plt.hist(np.log(energy), bins=50, density=True)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'r', linewidth=3)
plt.xlabel('Logarithm of block energy')

# Fit a two-component Gaussian mixture to the log energies instead
from sklearn import mixture
g = mixture.GaussianMixture(n_components=2)
g.fit(np.log(energy).reshape(-1, 1))  # sklearn expects a 2-D array of samples
p2 = (g.weights_[0] * norm.pdf(x, g.means_[0, 0], np.sqrt(g.covariances_[0, 0, 0]))
      + g.weights_[1] * norm.pdf(x, g.means_[1, 0], np.sqrt(g.covariances_[1, 0, 0])))
plt.hist(np.log(energy), bins=50, density=True)
plt.plot(x, p2, 'r', linewidth=3)
plt.xlabel('Logarithm of block energy')

# The component with the larger mean corresponds to speech (higher energy)
speech_comp = np.argmax(g.means_)
pred_prob = g.predict_proba(np.log(energy).reshape(-1, 1))

# Set a threshold on the posterior probability of the speech component
thresh = 0.5

# Determine which blocks are speech
speech = pred_prob[:, speech_comp] > thresh

# Now plot the speech signal and indicate the regions classified as speech
plt.figure()
plt.plot(np.arange(len(signal)) * (1. / freq), signal)
for i in range(num_blk):
    if speech[i]:
        plt.plot(np.array([i * blk_size, (i + 1) * blk_size]) * (1. / freq), [0, 0], 'r-', linewidth=4)
plt.xlabel('Time (seconds)')

import scipy.cluster.hierarchy

# A smaller dataset for hierarchical clustering
np.random.seed(123)
X1 = np.random.multivariate_normal(np.array([1, 1]), np.diag([.1, .1]), 3)
X2 = np.random.multivariate_normal(np.array([3, 3]), np.diag([.1, .1]), 3)
X = np.concatenate((X1, X2))

%matplotlib inline
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(i), color='b', fontsize=16)
plt.xlim([0, 5]); plt.ylim([0, 5]);

# Hierarchical clustering with single linkage
Z = scipy.cluster.hierarchy.single(X)

# Print Z to show the steps in the clustering process
print(Z)
scipy.cluster.hierarchy.dendrogram(Z, leaf_font_size=16)
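# A minimal sketch of turning the dendrogram above into a flat clustering:
# scipy's fcluster cuts the linkage matrix Z into a requested number of clusters.
# Asking for two clusters should recover the two blobs the points were drawn from;
# the choice of two is an assumption, not part of the original exercise.
from scipy.cluster.hierarchy import fcluster
labels = fcluster(Z, t=2, criterion='maxclust')   # one cluster label (1 or 2) per point
print(labels)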