# special IPython command to prepare the notebook for matplotlib %matplotlib inline import numpy as np import scipy import pandas as pd # pandas import matplotlib.pyplot as plt # module for plotting from mpl_toolkits.mplot3d import Axes3D #3D plotting import datetime as dt # module for manipulating dates and times import requests import scipy.stats as stats import statsmodels.api as sm from scipy.stats import binom from __future__ import division import re from StringIO import StringIO from zipfile import ZipFile from pandas import read_csv from urllib import urlopen import sklearn import sklearn.preprocessing import sklearn.datasets #nice defaults for matplotlib from matplotlib import rcParams dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = True rcParams['axes.facecolor'] = '#eeeeee' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv" X = pd.read_csv(url_exprs, index_col=0) url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv" sampleid = pd.read_csv(url_sampleinfo) a = list(sampleid.filename) b = list(X.columns) matchIndex = [b.index(x) for x in a] X = X[matchIndex] A = np.repeat(1/float(X.shape[0]), X.shape[0]).reshape(1, X.shape[0]) colmeans = np.dot(A, X) print "Column averages:",np.round(colmeans,2)[0] pd.DataFrame(np.array([X.columns.values, np.round(colmeans,2)[0]]).transpose(), columns = ["Gene", "Average"]).head(15) plt.hist(colmeans[0], bins = np.arange(5.60, 5.86, 0.02)) plt.xlim(5.60, 5.84) plt.xticks(np.arange(5.60, 5.86, 0.04)) plt.xlabel("Average") plt.ylabel("Frequency") plt.title("Histogram of colmeans") plt.show() sex = sampleid['sex']=='M' A = np.repeat(1/float(X.shape[1]), X.shape[1]).reshape(1, X.shape[1])[0] A = np.repeat(1/float(X.shape[1]), X.shape[1]).reshape(1, X.shape[1])[0] B = [1./sum(sex) if x == True else -1./sum(1-sex) for x in sex] pd.DataFrame([A,B], columns = sampleid.filename.values) zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content zip_files = StringIO() zip_files.write(zip_folder) csv_files = ZipFile(zip_files) teams = csv_files.open('Teams.csv') teams = read_csv(teams) teams = csv_files.open('Teams.csv') teams = read_csv(teams) dat = teams[(teams['G']==162) & (teams['yearID'] < 2002)] dat['Singles'] = dat['H']-dat['2B']-dat['3B']-dat['HR'] dat = dat[['R','Singles', 'HR', 'BB']] dat.head(5) y = dat.R X = np.column_stack((np.repeat(1, dat.shape[0]) , dat.BB)) print X beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y) print beta proj = X.dot(beta) print proj plt.scatter(X[:,1], y) plt.plot(X[:,1],proj) plt.xlabel('BB') plt.ylabel('R') plt.show() y = dat.R X = np.column_stack((np.repeat(1, dat.shape[0]) , dat.BB, dat.HR)) print X beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y) print beta beta.shape new_prox = X.dot(beta) len(new_prox) len(y) #this requires some 3D plotting fig = plt.figure() ax = fig.add_subplot(111, projection='3d') ax.scatter(X[:,1], X[:,2], y) ax.set_xlabel('BB') ax.set_ylabel('HR') ax.set_zlabel('R') plt.show() Y1 = np.random.normal(0, size = (100,1)) Y = np.column_stack((Y1,Y1,Y1,Y1,Y1)) Y.shape #Singular Value Decomposition #U are the left singular vectors #d are the singular values #U*d gives PCA scores #V are the right singular vectors -- PCA loadings U, d, Vh = scipy.linalg.svd(Y, full_matrices=False) print np.round(d, 6) U1 = U[:,0].reshape(100,1) dV1 = (d[0] * Vh[0,:].reshape(1,5)) newY = np.dot(U1, dV1) print dV1 np.round(np.abs(Y-newY).max(),6) url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv" exprs = pd.read_csv(url_exprs, index_col=0) url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv" sampleinfo = pd.read_csv(url_sampleinfo) ## Re-order the columns in the `exprs` DataFrame to match the order of the file names in the `sampleinfo` DataFrame. fns = list(sampleinfo.filename) cns = list(exprs.columns) matchIndex = [cns.index(x) for x in fns] exprs = exprs[matchIndex] ##Add a column to the `sampleinfo` DataFrame titled `days` containing the days since October 31, 2002. sampleinfo["date"] = pd.to_datetime(sampleinfo.date) oct31 = dt.datetime(2002,10,31,0,0) sampleinfo["days"] = map(lambda x: (x - oct31).days, sampleinfo.date) ##Subset for CEU only samples only. sampleinfoCEU = sampleinfo[sampleinfo.ethnicity == "CEU"] exprsCEU = exprs[sampleinfoCEU.filename] ###Remove the mean X = exprsCEU.apply(lambda x: x - exprsCEU.mean(axis=1), \ axis = 0) X=X.T X.shape #Singular Value Decomposition #U are the left singular vectors #d are the singular values #U*d gives PCA scores #V are the right singular vectors -- PCA loadings U, d, Vh = scipy.linalg.svd(X,full_matrices=False) Vh.shape plt.scatter(sampleinfoCEU.days, U.transpose()[0,:]) plt.xlabel('Date sample was processed (Number of days since Oct 31, 2012)') plt.ylabel('PC1') plt.title('Relationship between the PC1 and the date the samples were processed') plt.ylim() plt.show() rankDays = scipy.stats.mstats.rankdata(sampleinfoCEU.days) plt.scatter(rankDays, U.transpose()[0,:]) plt.xlabel('Ranked Date (Number of days since Oct 31, 2012)') plt.ylabel('PC1') plt.title('Relationship between the PC1 and the ranked date') plt.show() plt.hist(U.transpose()[0,:], bins = np.arange(-0.2,0.3,0.05)) plt.xlabel('PC1') plt.ylabel('Frequency') plt.title('Distribution of the values from PC1') plt.show() #Load the dataset from sklearn's library of example datasets iris = sklearn.datasets.load_iris() #data are the features (here, flower measurements) X = iris.data #target are the classes we wish to recover (here, flower species) Y = iris.target X = X[:,2:4] X = sklearn.preprocessing.scale(X, axis = 0) plt.scatter(X[:,0], X[:,1]) plt.axvline(x=0, color = 'black', ls='--') plt.axhline(y=0, color = 'black', ls='--') plt.gca().set_aspect('equal') plt.show() np.apply_along_axis(np.std, 0, X) plt.scatter(X[:,0], X[:,1]) plt.axvline(x=0, color = 'black', ls='--') plt.axhline(y=0, color = 'black', ls='--') plt.plot([-2,2], [2, -2], c='black') plt.gca().set_aspect('equal') plt.show() V = (-1,1)/np.sqrt(2) V = V.reshape(2,1) dat = np.dot(np.dot(X, V), V.T) plt.scatter(X[:,0], X[:,1]) plt.axvline(x=0, color = 'black', ls='--') plt.axhline(y=0, color = 'black', ls='--') plt.plot([-2,2], [2, -2], c="black") plt.scatter(dat[:,0], dat[:,1], c='red') plt.gca().set_aspect('equal') plt.show() np.std(np.dot(X, V)) V = (1,1)/np.sqrt(2) V = V.reshape(2,1) dat = np.dot(np.dot(X, V), V.T) plt.scatter(X[:,0], X[:,1]) plt.axvline(x=0, color = 'black', ls='--') plt.axhline(y=0, color = 'black', ls='--') plt.plot([-2,2], [-2, 2], c="black") plt.scatter(dat[:,0], dat[:,1], c='red') plt.gca().set_aspect('equal') plt.show() np.std(np.dot(X, V)) #Singular Value Decomposition #U are the left singular vectors #d are the singular values #U*d gives PCA scores #V are the right singular vectors -- PCA loadings U, d, V = scipy.linalg.svd(X, full_matrices=False) print V np.sqrt((d ** 2) / X.shape[0]) #Plot components of left singular vectors plt.scatter(U.transpose()[0],U.transpose()[1]) plt.xlabel('Projection to first PC') plt.ylabel('Projection to second PC') plt.axvline(x=0, color = 'black', ls='--') plt.axhline(y=0, color = 'black', ls='--') plt.show() xnew = np.dot(U.transpose()[0][None].T, [d[0]*V[0]]) plt.scatter(xnew[:, 0], X[:, 0]) plt.plot(xnew[:, 0],xnew[:, 0]) plt.xlabel("Original 1st column") plt.ylabel("Approximation") plt.show() plt.scatter(xnew[:, 1], X[:, 1]) plt.plot(xnew[:, 1],xnew[:, 1]) plt.xlabel("Original 2nd column") plt.ylabel("Approximation") plt.show() #d^2 is proportional to the variance explained by the dimension #For variance explained, normalize d^2 var_exp = d**2/sum(d**2) print "The first component explains %0.2f%% of the variance."%(var_exp[0]*100) #Load the dataset from sklearn's library of example datasets iris = sklearn.datasets.load_iris() #data are the features (here, flower measurements) X = iris.data #target are the classes we wish to recover (here, flower species) Y = iris.target train = np.array(range(0,35)+range(50,85)+range(100,135)) X = X[:, 0:4] colmeans = np.dot(np.matrix([np.zeros(len(train))+1/len(train)]),X[train]) colmeans = np.array(colmeans)[0] X = np.array([a-colmeans for a in X]) names = {0:'setosa', 1:'versicolor', 2:'virginica'} cols = np.zeros(len(X)) cols[train] = Y[train] U, d, V = scipy.linalg.svd(X[train,], full_matrices=False) plt.scatter(range(1,5), d**2/np.sum(d**2), s=70) plt.xlabel("Dimension") plt.ylabel("Percent explained") plt.xlim(0.9, 4.1) plt.ylim(-0.1,1) plt.show() colors = [dark2_colors[i] for i in Y[train]] plt.scatter(U[:,0], U[:,1], color = colors, s=50) x = np.array([-0.15, 0.15]) plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1) plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1) plt.xlabel("PC1") plt.ylabel("PC2") plt.ylim(-0.3,0.3) plt.show() newd = np.array(np.matrix([1/d]).T) newd = np.diag(newd[:,0]) a = list(set(range(0,len(X))) - set(train)) a.sort() newu = np.dot(X[a, ],V.T).dot(newd) colors = [dark2_colors[i] for i in Y[train]] plt.scatter(U[:,0], U[:,1], color = colors, s=50) x = np.array([-0.15, 0.15]) plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1) plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1) plt.xlabel("PC1") plt.ylabel("PC2") plt.ylim(-0.3,0.3) plt.show() colors = [dark2_colors[i] for i in Y[train]] plt.scatter(U[:,0], U[:,1], color = colors, s=50) colors = [dark2_colors[i] for i in Y[a]] plt.scatter(newu[:,0], newu[:,1], s=70, marker = '^', facecolors=colors, edgecolors='k') x = np.array([-0.15, 0.15]) plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1) plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1) plt.xlabel("PC1") plt.ylabel("PC2") plt.ylim(-0.3,0.3) plt.show()