# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import scipy
import pandas as pd # pandas
import matplotlib.pyplot as plt # module for plotting 
from mpl_toolkits.mplot3d import Axes3D #3D plotting
import datetime as dt # module for manipulating dates and times

import requests 
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import binom
from __future__ import division
import re
from StringIO import StringIO
from zipfile import ZipFile 
from pandas import read_csv
from urllib import urlopen

import sklearn
import sklearn.preprocessing
import sklearn.datasets

#nice defaults for matplotlib
from matplotlib import rcParams

dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv"
X = pd.read_csv(url_exprs, index_col=0)

url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv"
sampleid = pd.read_csv(url_sampleinfo)

a = list(sampleid.filename)
b = list(X.columns)
matchIndex = [b.index(x) for x in a]
X = X[matchIndex]

A = np.repeat(1/float(X.shape[0]), X.shape[0]).reshape(1, X.shape[0])
colmeans = np.dot(A, X)
print "Column averages:",np.round(colmeans,2)[0]
pd.DataFrame(np.array([X.columns.values, np.round(colmeans,2)[0]]).transpose(), 
             columns = ["Gene", "Average"]).head(15)

plt.hist(colmeans[0], bins = np.arange(5.60, 5.86, 0.02))
plt.xlim(5.60, 5.84)
plt.xticks(np.arange(5.60, 5.86, 0.04))
plt.xlabel("Average")
plt.ylabel("Frequency")
plt.title("Histogram of colmeans")
plt.show()

sex = sampleid['sex']=='M'
A = np.repeat(1/float(X.shape[1]), X.shape[1]).reshape(1, X.shape[1])[0]


A = np.repeat(1/float(X.shape[1]), X.shape[1]).reshape(1, X.shape[1])[0]
B = [1./sum(sex) if x == True else -1./sum(1-sex) for x in sex]
pd.DataFrame([A,B], columns = sampleid.filename.values)

zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content
zip_files = StringIO()
zip_files.write(zip_folder)
csv_files = ZipFile(zip_files)

teams = csv_files.open('Teams.csv')
teams = read_csv(teams)

teams = csv_files.open('Teams.csv')
teams = read_csv(teams)
dat = teams[(teams['G']==162) & (teams['yearID'] < 2002)]
dat['Singles'] = dat['H']-dat['2B']-dat['3B']-dat['HR']
dat = dat[['R','Singles', 'HR', 'BB']]
dat.head(5)

y = dat.R
X = np.column_stack((np.repeat(1, dat.shape[0]) , dat.BB))
print X

beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y)
print beta

proj = X.dot(beta)
print proj

plt.scatter(X[:,1], y)
plt.plot(X[:,1],proj)
plt.xlabel('BB')
plt.ylabel('R')
plt.show()

y = dat.R
X = np.column_stack((np.repeat(1, dat.shape[0]) , dat.BB, dat.HR))
print X

beta = np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(y)
print beta
beta.shape

new_prox = X.dot(beta)
len(new_prox)
len(y)

#this requires some 3D plotting
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,1], X[:,2], y)
ax.set_xlabel('BB')
ax.set_ylabel('HR')
ax.set_zlabel('R')
plt.show()

Y1 = np.random.normal(0, size = (100,1))
Y = np.column_stack((Y1,Y1,Y1,Y1,Y1))
Y.shape

#Singular Value Decomposition
#U are the left singular vectors
#d are the singular values
#U*d gives PCA scores
#V are the right singular vectors -- PCA loadings
U, d, Vh = scipy.linalg.svd(Y, full_matrices=False)

print np.round(d, 6)

U1 = U[:,0].reshape(100,1)
dV1 = (d[0] * Vh[0,:].reshape(1,5))
newY = np.dot(U1, dV1)
print dV1

np.round(np.abs(Y-newY).max(),6)

url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv"
exprs = pd.read_csv(url_exprs, index_col=0)

url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv"
sampleinfo = pd.read_csv(url_sampleinfo)
## Re-order the columns in the `exprs` DataFrame to match the order of the file names in the `sampleinfo` DataFrame.
fns = list(sampleinfo.filename)
cns = list(exprs.columns)
matchIndex = [cns.index(x) for x in fns]
exprs = exprs[matchIndex]

##Add a column to the `sampleinfo` DataFrame titled `days` containing the days since October 31, 2002.
sampleinfo["date"] = pd.to_datetime(sampleinfo.date)
oct31 = dt.datetime(2002,10,31,0,0)

sampleinfo["days"] = map(lambda x: (x - oct31).days, sampleinfo.date)

##Subset for CEU only samples only. 
sampleinfoCEU = sampleinfo[sampleinfo.ethnicity == "CEU"]
exprsCEU = exprs[sampleinfoCEU.filename]

###Remove the mean
X = exprsCEU.apply(lambda x: x - exprsCEU.mean(axis=1), \
                              axis = 0)
X=X.T
X.shape

#Singular Value Decomposition
#U are the left singular vectors
#d are the singular values
#U*d gives PCA scores
#V are the right singular vectors -- PCA loadings
U, d, Vh = scipy.linalg.svd(X,full_matrices=False)

Vh.shape

plt.scatter(sampleinfoCEU.days, U.transpose()[0,:])
plt.xlabel('Date sample was processed (Number of days since Oct 31, 2012)')
plt.ylabel('PC1')
plt.title('Relationship between the PC1 and the date the samples were processed')
plt.ylim()
plt.show()

rankDays = scipy.stats.mstats.rankdata(sampleinfoCEU.days)
plt.scatter(rankDays, U.transpose()[0,:])
plt.xlabel('Ranked Date (Number of days since Oct 31, 2012)')
plt.ylabel('PC1')
plt.title('Relationship between the PC1 and the ranked date')
plt.show()

plt.hist(U.transpose()[0,:], bins = np.arange(-0.2,0.3,0.05))
plt.xlabel('PC1')
plt.ylabel('Frequency')
plt.title('Distribution of the values from PC1')
plt.show()

#Load the dataset from sklearn's library of example datasets
iris = sklearn.datasets.load_iris()

#data are the features (here, flower measurements)
X = iris.data
#target are the classes we wish to recover (here, flower species)
Y = iris.target

X = X[:,2:4]
X = sklearn.preprocessing.scale(X, axis = 0)

plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.gca().set_aspect('equal')
plt.show()

np.apply_along_axis(np.std, 0, X)

plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.plot([-2,2], [2, -2], c='black')
plt.gca().set_aspect('equal')
plt.show()

V = (-1,1)/np.sqrt(2)
V = V.reshape(2,1)
dat = np.dot(np.dot(X, V), V.T)

plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.plot([-2,2], [2, -2], c="black")

plt.scatter(dat[:,0], dat[:,1], c='red')
plt.gca().set_aspect('equal')
plt.show()

np.std(np.dot(X, V))

V = (1,1)/np.sqrt(2)
V = V.reshape(2,1)
dat = np.dot(np.dot(X, V), V.T)

plt.scatter(X[:,0], X[:,1])
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.plot([-2,2], [-2, 2], c="black")

plt.scatter(dat[:,0], dat[:,1], c='red')
plt.gca().set_aspect('equal')
plt.show()

np.std(np.dot(X, V))

#Singular Value Decomposition
#U are the left singular vectors
#d are the singular values
#U*d gives PCA scores
#V are the right singular vectors -- PCA loadings
U, d, V = scipy.linalg.svd(X, full_matrices=False)

print V

np.sqrt((d ** 2) / X.shape[0])

#Plot components of left singular vectors
plt.scatter(U.transpose()[0],U.transpose()[1])
plt.xlabel('Projection to first PC')
plt.ylabel('Projection to second PC')
plt.axvline(x=0, color = 'black', ls='--')
plt.axhline(y=0, color = 'black', ls='--')
plt.show()

xnew = np.dot(U.transpose()[0][None].T, [d[0]*V[0]])

plt.scatter(xnew[:, 0], X[:, 0])
plt.plot(xnew[:, 0],xnew[:, 0])
plt.xlabel("Original 1st column")
plt.ylabel("Approximation")
plt.show()

plt.scatter(xnew[:, 1], X[:, 1])
plt.plot(xnew[:, 1],xnew[:, 1])
plt.xlabel("Original 2nd column")
plt.ylabel("Approximation")
plt.show()

#d^2 is proportional to the variance explained by the dimension
#For variance explained, normalize d^2

var_exp = d**2/sum(d**2)

print "The first component explains %0.2f%% of the variance."%(var_exp[0]*100)

#Load the dataset from sklearn's library of example datasets
iris = sklearn.datasets.load_iris()

#data are the features (here, flower measurements)
X = iris.data
#target are the classes we wish to recover (here, flower species)
Y = iris.target

train = np.array(range(0,35)+range(50,85)+range(100,135))
X = X[:, 0:4]

colmeans = np.dot(np.matrix([np.zeros(len(train))+1/len(train)]),X[train])
colmeans = np.array(colmeans)[0]
X = np.array([a-colmeans for a in X])

names = {0:'setosa', 1:'versicolor', 2:'virginica'}

cols = np.zeros(len(X))
cols[train] = Y[train]

U, d, V = scipy.linalg.svd(X[train,], full_matrices=False)

plt.scatter(range(1,5), d**2/np.sum(d**2), s=70)
plt.xlabel("Dimension")
plt.ylabel("Percent explained")
plt.xlim(0.9, 4.1)
plt.ylim(-0.1,1)
plt.show()

colors = [dark2_colors[i] for i in Y[train]]
plt.scatter(U[:,0], U[:,1], color = colors, s=50)
x = np.array([-0.15, 0.15])
plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1)
plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.ylim(-0.3,0.3)
plt.show()

newd = np.array(np.matrix([1/d]).T)
newd = np.diag(newd[:,0])
a = list(set(range(0,len(X))) - set(train))
a.sort()
newu = np.dot(X[a, ],V.T).dot(newd)

colors = [dark2_colors[i] for i in Y[train]]
plt.scatter(U[:,0], U[:,1], color = colors, s=50)
x = np.array([-0.15, 0.15])
plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1)
plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.ylim(-0.3,0.3)
plt.show()

colors = [dark2_colors[i] for i in Y[train]]
plt.scatter(U[:,0], U[:,1], color = colors, s=50)
colors = [dark2_colors[i] for i in Y[a]]
plt.scatter(newu[:,0], newu[:,1], s=70, marker = '^', facecolors=colors, edgecolors='k')
x = np.array([-0.15, 0.15])
plt.plot(x, -3*x-.165, color='k', linestyle='-', linewidth=1)
plt.plot(x, -3.2*x+0.2, color='k', linestyle='-', linewidth=1)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.ylim(-0.3,0.3)
plt.show()