import numpy as np
import matplotlib.pyplot as plt

# Set the random seed for reproducibility
np.random.seed(123)

# Generate the data
X1 = np.random.multivariate_normal(np.array([1, 1]), np.diag([.1, .1]), 10)  # 10 points from the first GMM component
X2 = np.random.multivariate_normal(np.array([3, 3]), np.diag([.1, .1]), 10)  # 10 points from the second GMM component
X = np.concatenate((X1, X2))

# Plot the data
%matplotlib inline
plt.plot(X[:, 0], X[:, 1], 'bo', markersize=10)
plt.xlim([0, 5]); plt.ylim([0, 5]);

def kmeans(data, centers):
    """One iteration of K-means: assign points to the given centers, then update the centers."""
    new_cent = np.zeros(centers.shape)
    # Assignment step: squared Euclidean distance from every point to every center
    dist = np.zeros((data.shape[0], centers.shape[0]))
    for i in range(centers.shape[0]):
        dist[:, i] = np.sum((data - centers[i, :])**2, 1)
    assign = np.argmin(dist, 1)
    # Update step: compute the new centers and the cost of the current assignment
    cost = 0
    for i in range(centers.shape[0]):
        new_cent[i, :] = np.mean(data[assign == i, :], 0)
        cost += np.sum(np.sum((data[assign == i, :] - centers[i, :])**2, 1))
    return new_cent, cost, assign

# We need a value of K
K = 2  # we assume there are two clusters

# Random starting points
centers = np.random.multivariate_normal(np.array([2, 2]), np.diag([1, 1]), K)

# Run one iteration of K-means
new_cent, cost, assign = kmeans(X, centers)

# Plot the assignment based on the input cluster centers
col = ['r', 'g']  # colour code for the two clusters
plt.plot(X[assign == 0, 0], X[assign == 0, 1], col[0] + 'o', markersize=10)
plt.plot(X[assign == 1, 0], X[assign == 1, 1], col[1] + 'o', markersize=10)
plt.plot(centers[0, 0], centers[0, 1], col[0] + 'x', markersize=10, mew=2)  # input center
plt.plot(centers[1, 0], centers[1, 1], col[1] + 'x', markersize=10, mew=2)  # input center
plt.xlim([0, 5]); plt.ylim([0, 5]);

# Plot the new centers
centers = new_cent  # these will be the centers in the next iteration
plt.plot(X[assign == 0, 0], X[assign == 0, 1], col[0] + 'o', markersize=10)
plt.plot(X[assign == 1, 0], X[assign == 1, 1], col[1] + 'o', markersize=10)
plt.plot(centers[0, 0], centers[0, 1], col[0] + 'x', markersize=10, mew=2)
plt.plot(centers[1, 0], centers[1, 1], col[1] + 'x', markersize=10, mew=2)
plt.xlim([0, 5]); plt.ylim([0, 5]);

## Code box for your solution

## Code box for your solution

from PIL import Image
from sklearn.cluster import KMeans
import matplotlib.cm as cm  # colour map to display the grayscale image

# Obtain the image
img = Image.open('data/jomo.jpg')

# Display the image
# img.show()
plt.imshow(np.asarray(img), cmap=cm.Greys_r)
A = np.asarray(img)

# Build the vectors to quantize: one 4-dimensional vector per 2x2 block of pixels
# (this assumes a square grayscale image)
px = img.size[0]
vq = np.zeros(((px * px) // 4, 4))
k = 0
for i in range(0, A.shape[0], 2):
    for j in range(0, A.shape[1], 2):
        vq[k, :] = A[i:i + 2, j:j + 2].flatten()
        k += 1

# Cluster the 2x2 blocks into two codewords
km_model = KMeans(init='k-means++', n_clusters=2, n_init=10)
km_model.fit(vq)
cent = km_model.cluster_centers_

# Quantized image: replace every 2x2 block with its nearest codeword
img_mtx2 = np.zeros(img.size)
k = 0
for i in range(0, img.size[0], 2):
    for j in range(0, img.size[1], 2):
        img_mtx2[i:i + 2, j:j + 2] = np.reshape(np.round(cent[km_model.predict(vq[k:k + 1, :])]), (2, 2))
        k += 1
plt.imshow(img_mtx2, cmap=cm.Greys_r)

from scipy.stats import norm

# A single univariate Gaussian
x = np.linspace(-5, 5, 100)
plt.plot(x, norm.pdf(x, 0, 1), linewidth=3)
plt.xlabel(r'$x$', fontsize=16)
plt.ylabel(r'$p(x)$', fontsize=16)
plt.title(r'Univariate Gaussian with $\mu=0$ and $\sigma=1$', fontsize=16)

# A mixture of three Gaussians: the weighted components (thin lines) and their sum (thick magenta line)
plt.plot(x, 0.4 * norm.pdf(x, -3, .5), 'r', linewidth=1)
plt.plot(x, 0.2 * norm.pdf(x, 1, 1), 'g', linewidth=1)
plt.plot(x, 0.4 * norm.pdf(x, 3, .5), 'b', linewidth=1)
plt.plot(x, 0.4 * norm.pdf(x, -3, .5) + 0.2 * norm.pdf(x, 1, 1) + 0.4 * norm.pdf(x, 3, .5), 'm', linewidth=2)
plt.xlabel(r'$x$', fontsize=16)
plt.ylabel(r'$p(x)$', fontsize=16)

import scipy.io.wavfile

# Open the wav file
freq, samples = scipy.io.wavfile.read('data/digit.wav')  # sampling frequency and raw samples
signal = samples / float(2**15)  # the samples are 16-bit signed integers; normalize to the range -1 to 1
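# A minimal sketch of running K-means to convergence (one possible answer for the
# code boxes above): apply the one-step kmeans() helper repeatedly until the cost
# stops decreasing. The tolerance and the iteration cap below are arbitrary choices,
# and we assume no cluster ever becomes empty, as in the demo above.
centers_km = np.random.multivariate_normal(np.array([2, 2]), np.diag([1, 1]), K)
prev_cost = np.inf
for it in range(100):                    # safety cap on the number of iterations
    centers_km, cost_km, assign_km = kmeans(X, centers_km)
    if prev_cost - cost_km < 1e-10:      # stop once the cost no longer improves
        break
    prev_cost = cost_km
print('K-means converged after %d iterations with cost %.3f' % (it + 1, cost_km))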
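# A small sketch for checking the quality of the vector-quantized image built above:
# the mean squared reconstruction error, and the implied bit rate of one codeword
# index per 2x2 block instead of 8 bits per pixel. Like the block loops above, this
# assumes a square grayscale image.
mse = np.mean((A.astype(float) - img_mtx2)**2)
bits_per_pixel = np.log2(len(cent)) / 4.0   # log2(#codewords) bits shared by 4 pixels
print('Reconstruction MSE: %.2f, rate: %.3f bits/pixel (original: 8)' % (mse, bits_per_pixel))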
# Plot the speech signal against time
plt.plot(np.arange(len(signal)) * (1. / freq), signal)
plt.xlabel('Time (seconds)')

# Split the signal into blocks and compute the energy of each block
blk_size = 256
num_blk = int(np.floor(len(signal) / blk_size))
energy = np.zeros(num_blk)
for i in range(num_blk):
    energy[i] = np.sum(signal[i * blk_size:(i + 1) * blk_size]**2)

# Histogram of the log block energies
plt.hist(np.log(energy), 50)
plt.xlabel('Logarithm of block energy')

# Fit a single Gaussian to the log energies and overlay it on the normalized histogram
mu, std = norm.fit(np.log(energy))
plt.hist(np.log(energy), bins=50, density=True)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'r', linewidth=3)
plt.xlabel('Logarithm of block energy')

# Fit a two-component Gaussian mixture to the log energies instead
from sklearn import mixture
g = mixture.GaussianMixture(n_components=2)
g.fit(np.log(energy).reshape(-1, 1))  # sklearn expects a 2-D array of samples
p2 = (g.weights_[0] * norm.pdf(x, g.means_[0, 0], np.sqrt(g.covariances_[0, 0, 0]))
      + g.weights_[1] * norm.pdf(x, g.means_[1, 0], np.sqrt(g.covariances_[1, 0, 0])))
plt.hist(np.log(energy), bins=50, density=True)
plt.plot(x, p2, 'r', linewidth=3)
plt.xlabel('Logarithm of block energy')

# The component with the larger mean corresponds to speech (higher energy)
speech_comp = np.argmax(g.means_)
pred_prob = g.predict_proba(np.log(energy).reshape(-1, 1))

# Set a threshold on the posterior probability of the speech component
thresh = 0.5

# Determine which blocks are speech
speech = pred_prob[:, speech_comp] > thresh

# Now plot the speech signal and indicate the regions classified as speech
plt.figure()
plt.plot(np.arange(len(signal)) * (1. / freq), signal)
for i in range(num_blk):
    if speech[i]:
        plt.plot(np.array([i * blk_size, (i + 1) * blk_size]) * (1. / freq), [0, 0], 'r-', linewidth=4)
plt.xlabel('Time (seconds)')

import scipy.cluster.hierarchy

# A smaller dataset for hierarchical clustering
np.random.seed(123)
X1 = np.random.multivariate_normal(np.array([1, 1]), np.diag([.1, .1]), 3)
X2 = np.random.multivariate_normal(np.array([3, 3]), np.diag([.1, .1]), 3)
X = np.concatenate((X1, X2))

%matplotlib inline
for i in range(X.shape[0]):
    plt.text(X[i, 0], X[i, 1], str(i), color='b', fontsize=16)
plt.xlim([0, 5]); plt.ylim([0, 5]);

# Hierarchical clustering with single linkage
Z = scipy.cluster.hierarchy.single(X)

# Print Z to show the steps in the clustering process
print(Z)
scipy.cluster.hierarchy.dendrogram(Z, leaf_font_size=16)
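# A minimal sketch of turning the dendrogram above into a flat clustering:
# scipy's fcluster cuts the linkage matrix Z into a requested number of clusters.
# Asking for two clusters should recover the two blobs the points were drawn from;
# the choice of two is an assumption, not part of the original exercise.
from scipy.cluster.hierarchy import fcluster
labels = fcluster(Z, t=2, criterion='maxclust')   # one cluster label (1 or 2) per point
print(labels)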