Notebook

BayesC0¶

Rohan L. Fernando, Hao Cheng and Dorian Garrick¶

June 2015¶

You are free to use this code in your work, but please acknowledge the JWAS package and website until the associated paper, "JWAS: Julia Implementation of Whole-Genome Analyses Software", is published.

Simulating Genotypes and Phenotypes¶

In [1]:

using(Distributions)

In [2]:

nObs     = 100
nMarkers = 1000
X = sample([0,1,2],(nObs,nMarkers))
α  = randn(nMarkers)
a  = X*α
stdGen = std(a)
a = a/stdGen
y = a + randn(nObs)
saveAlpha = α
nothing

Centering Genotype Covariates¶

In [3]:

meanXCols = mean(X,1)
X = X - ones(nObs,1)*meanXCols;

Priors¶

In [4]:

seed            =   10    # set the seed for the random number generator
chainLength     =  2000   # number of iterations
probFixed       =    0    # parameter "pi" the probability SNP effect is zero
dfEffectVar     =    4    # hyper parameter (degrees of freedom) for locus effect variance          
nuRes           =    4    # hyper parameter (degrees of freedom) for residual variance
varGenotypic    =    1    # used to derive hyper parameter (scale) for locus effect variance 
varResidual     =    1    # used to derive hyper parameter (scale) for locus effect variance 
scaleVar        =   varGenotypic*(dfEffectVar-2)/dfEffectVar    # scale factor for locus effects
scaleRes        =   varResidual*(nuRes-2)/nuRes                 # scale factor for residual variance
nothing

Function for Sampling Marker Effects¶

First we make an array, xArray, that contains the columns of $\mathbf{X}$. This is done without duplicating each column of $\mathbf{X}$.

In [5]:

function get_column(X,nRows,j)
    indx = 1 + (j-1)*nRows
    ptr = pointer(X,indx)
    pointer_to_array(ptr,nRows)
end

Out[5]:

get_column (generic function with 1 method)

In [6]:

xpx = [(X[:,i]'X[:,i])[1]::Float64 for i=1:nMarkers]
xArray = Array(Array{Float64,1},nMarkers)
for i=1:nMarkers
    xArray[i] = get_column(X,nObs,i)
end

In [7]:

typeof(xArray[1])

Out[7]:

Array{Float64,1}

Computing the adjusted right-hand-side efficiently¶

We want to compute: $$ rhs = \mathbf{X}_j'(\mathbf{y}_{corr} + \mathbf{X}_j\mathbf{\alpha}_j), $$ where $\mathbf{y}_{corr} = \mathbf{y} - \mathbf{X}\mathbf{\alpha}.$ This is more efficiently obtained as $$ rhs = \mathbf{X}_j'\mathbf{y}_{corr} + \mathbf{X}_j'\mathbf{X}_j\mathbf{\alpha}_j, $$ using the diagonals of $\mathbf{X}'\mathbf{X}$ that have already been computed (line 4 of the function below).

In [8]:

function sampleEffects!(nMarkers,xArray,xpx,yCorr,α,meanAlpha,vare,varEffects)
    nObs = size(X,1)
    for j=1:nMarkers
        rhs::Float64      = dot(xArray[j],yCorr) + xpx[j]*α[j]   
        lhs::Float64      = xpx[j] + vare/varEffects
        invLhs::Float64   = 1.0/lhs
        mean::Float64     = invLhs*rhs
        oldAlpha::Float64 = α[j] 
        α[j] = mean + randn()*sqrt(invLhs*vare)
        BLAS.axpy!(oldAlpha-α[j],xArray[j],yCorr)  
    end
    nothing
end

Out[8]:

sampleEffects! (generic function with 1 method)

Function for BayesC0¶

The intercept is sampled first and the sampleEffects! function is called to sample the marker effects.

In [9]:

chi1=Chisq(nObs+nuRes)
chi2=Chisq(dfEffectVar+nMarkers)

function BayesC0!(numIter,nMarkers,X,xpx,yCorr,mu,meanMu,α,meanAlpha,vare,varEffects)
    for i=1:numIter
        # sample residula variance
        vare = (dot(yCorr,yCorr)+nuRes*scaleRes)/rand(chi1)
        
        # sample intercept
        yCorr  = yCorr+mu
        rhs    = sum(yCorr)                   
        invLhs = 1.0/(nObs)               
        mean   = rhs*invLhs   
        mu     = mean + randn()*sqrt(invLhs*vare)
        yCorr  = yCorr - mu                    
        meanMu = meanMu + (mu - meanMu)/i
        
        # sample effects
        sampleEffects!(nMarkers,xArray,xpx,yCorr,α,meanAlpha,vare,varEffects)
        meanAlpha = meanAlpha + (α - meanAlpha)/i
        
        #sameple locus effect variance
        varEffects = (scaleVar*dfEffectVar + dot(α,α))/rand(chi2)  
               
        if (i%1000)==0
            yhat = meanMu+X*meanAlpha
            resCorr = cor(a,yhat)
            println ("Correlation of between true and predicted breeding value: ", resCorr)
        end
    end
end

Out[9]:

BayesC0! (generic function with 1 method)

Run BayesC0¶

In [16]:

meanMu    = 0
meanAlpha = zeros(nMarkers)   

#initial valus
vare = 1
varEffects = 1
mu = mean(y)
yCorr = y - mu
alpha = fill(0.0,nMarkers)

#run it
@time BayesC0!(chainLength,nMarkers,X,xpx,yCorr,mu,meanMu,alpha,meanAlpha,vare,varEffects)

Correlation of between true and predicted breeding value: 0.6344163465804081
Correlation of between true and predicted breeding value: 0.6346902997156448
elapsed time: 0.216529693 seconds (53211800 bytes allocated, 11.48% gc time)

Compare Runtime with R Implementation¶

In [11]:

;Rscript RBayesC0/BayesC0.R

   user  system elapsed 
 48.341   1.328  49.707

In [18]:

48.34/.217

Out[18]:

222.76497695852535

In [12]:

;cat RBayesC0/BayesC0.R

#   This code is for illustrative purposes and not efficient for large problems
#   Real life data analysis (using the same file formats) is available at 
#   bigs.ansci.iastate.edu/login.html based on GenSel cpp software implementation
# 
#               Rohan Fernando      (rohan@iastate.edu)
#               Dorian Garrick      (dorian@iastate.edu) 
#               copyright August 2012

# Parameters
setwd("RBayesC0")
seed            =   10    # set the seed for the random number generator
chainLength     =  2000    # number of iterations
dfEffectVar     =    4    # hyper parameter (degrees of freedom) for locus effect variance          
nuRes           =    4    # hyper parameter (degrees of freedom) for residual variance
varGenotypic    =    1    # used to derive hyper parameter (scale) for locus effect variance 
varResidual     =    1    # used to derive hyper parameter (scale) for residual variance 
windowSize      =   10    # number of consecutive markers in a genomic window
outputFrequency =   100    # frequency for reporting performance and for computing genetic variances

markerFileName         = "genotypes.dat"
trainPhenotypeFileName = "trainPhenotypes.dat"
testPhenotypeFileName  = "testPhenotypes.dat"

set.seed(seed)

genotypeFile         = read.table(markerFileName, header=TRUE)                 # this is not efficient for large files!
trainPhenotypeFile   = read.table(trainPhenotypeFileName, skip=1)[,1:2]        # skip the header as R dislikes # character
testPhenotypeFile    = read.table(testPhenotypeFileName,  skip=1)[,1:2]        # skip the header as R dislikes # character
commonTrainingData   = merge(trainPhenotypeFile, genotypeFile, by.x=1, by.y=1) # Only use animals with genotype and phenotype
commonTestData       = merge(testPhenotypeFile,  genotypeFile, by.x=1, by.y=1) # Only use animals with genotype and phenotype


remove(genotypeFile)                                                # Free up space 
remove(trainPhenotypeFile)                                          # Free up space 
remove(testPhenotypeFile)                                           # Free up space 
animalID = unname(as.matrix(commonTrainingData[,1]))                # First field is animal identifier
y        = commonTrainingData[, 2]                                  # Second field is trait values
Z        = commonTrainingData[, 3: ncol(commonTrainingData)]        # Remaining fields are GenSel-coded genotypes 
Z        = unname(as.matrix((Z + 10)/10));                          # Recode genotypes to 0, 1, 2 (number of B alleles)
markerID = colnames(commonTrainingData)[3:ncol(commonTrainingData)] # Remember the marker locus identifiers
remove(commonTrainingData)

testID = unname(as.matrix(commonTestData[,1]))                  # First field is animal identifier
yTest        = commonTestData[, 2]                              # Second field is trait values
ZTest        = commonTestData[, 3: ncol(commonTestData)]        # Remaining fields are GenSel-coded genotypes 
ZTest        = unname(as.matrix((ZTest + 10)/10));              # Recode genotypes to 0, 1, 2 (number of B alleles)
remove(commonTestData)

nmarkers = ncol(Z)                                              # number of markers
nrecords = nrow(Z)                                              # number of animals

# center the genotype matrix to accelerate mixing 
markerMeans = colMeans(Z)                             # compute the mean for each marker
Z = t(t(Z) - markerMeans)                             # deviate covariate from its mean
p = markerMeans/2.0                                   # compute frequency B allele for each marker
mean2pq = mean(2*p*(1-p))                             # compute mean genotype variance

varEffects  = varGenotypic/(nmarkers*mean2pq)         # variance of locus effects is computed from genetic variance 
                                                      #(e.g. Fernando et al., Acta Agriculturae Scand Section A, 2007; 57: 192-195)
scaleVar    = varEffects*(dfEffectVar-2)/dfEffectVar; # scale factor for locus effects
scaleRes    = varResidual*(nuRes-2)/nuRes             # scale factor for residual variance

numberWindows = nmarkers/windowSize                   # number of genomic windows
numberSamples = chainLength/outputFrequency           # number of samples of genetic variances


alpha           = array(0.0, nmarkers) # reserve a vector to store sampled locus effects 
meanAlpha       = array(0.0, nmarkers) # reserve a vector to accumulate the posterior mean of locus effects
modelFreq       = array(0.0, nmarkers) # reserve a vector to store model frequency
mu              = mean(y)              # starting value for the location parameter 
meanMu          = 0                    # reserve a scalar to accumulate the posterior mean
geneticVar      = array(0,numberSamples) # reserve a vector to store sampled genetic variances
                                       # reserve a matrix to store sampled proportion proportion of variance due to window 
windowVarProp   = matrix(0,nrow=numberSamples,ncol=numberWindows)
sampleCount     = 0                    # initialize counter for number of samples of genetic variances
                  


# adjust y for the fixed effect (ie location parameter)
ycorr = y - mu


ZPZ=t(Z)%*%Z
zpz=diag(ZPZ)

ptime=proc.time()
# mcmc sampling
for (iter in 1:chainLength){
	
# sample residual variance
	vare = ( t(ycorr)%*%ycorr + nuRes*scaleRes )/rchisq(1,nrecords + nuRes)  
	
# sample intercept
	ycorr = ycorr + mu                    # Unadjust y for the previous sample of mu
	rhs    = sum(ycorr)                   # Form X'y
	invLhs = 1.0/nrecords                 # Form (X'X)-1
	mean = rhs*invLhs                     # Solve (X'X) mu = X'y                    
	mu = rnorm(1,mean,sqrt(invLhs*vare))  # Sample new location parameter                     
	ycorr = ycorr - mu                    # Adjust y for the new sample of mu
	meanMu = meanMu + mu                  # Accumulate the sum to compute posterior mean
	
# sample effect for each locus
	for (locus in 1:nmarkers){
		
		rhs=t(Z[,locus])%*%ycorr +zpz[locus]*alpha[locus]
		mmeLhs = zpz[locus] + vare/varEffects                        # Form the coefficient matrix of MME
		invLhs = 1.0/mmeLhs                                   # Invert the coefficient matrix 
		mean = invLhs*rhs                                     # Solve the MME for locus effect
		oldAlpha=alpha[locus]
		alpha[locus]= rnorm(1,mean,sqrt(invLhs*vare))         # Sample the locus effect from data
		ycorr = ycorr + Z[,locus]*(oldAlpha-alpha[locus]);               # Adjust the data for this locus effect
		meanAlpha[locus] = meanAlpha[locus] + alpha[locus];   # Accumulate the sum for posterior mean
	}
		
	# sample the common locus effect variance		
	varEffects = ( scaleVar*dfEffectVar + sum(alpha^2) )/rchisq(1,dfEffectVar+nmarkers)  
		
}

proc.time()-ptime