# Exercise 1: Read in population allele frequencies generated by Plink
# Use PLINK to obtain allele frequencies for each of the four HapMap Populations.
# Files with family and individual IDs for the populations:
# hapmap-CEU.txt, hapmap-YRI.txt, hapmap-CHB.txt, and hapmap-ASW.txt.
# You can use the following PLINK command to obtain allele frequencies for only the CEU individuals.
# ./plink --bfile plink_hapmap_CEU_YRI_ASW_CHB –-keep hapmap-CEU.txt --freq --out CEU
CEU.data=read.table("~/Dropbox/Teaching/2015/BMS262/Module1/Excercises/CEU.frq",header=TRUE)
YRI.data=read.table("~/Dropbox/Teaching/2015/BMS262/Module1/Excercises/YRI.frq",header=TRUE)
ASW.data=read.table("~/Dropbox/Teaching/2015/BMS262/Module1/Excercises/ASW.frq",header=TRUE)
CHB.data=read.table("~/Dropbox/Teaching/2015/BMS262/Module1/Excercises/CHB.frq",header=TRUE)
head(CEU.data)
head(YRI.data)
CHR | SNP | A1 | A2 | MAF | NCHROBS | |
---|---|---|---|---|---|---|
1 | 1 | rs9442372 | A | G | 0.4107 | 224 |
2 | 1 | rs2887286 | C | T | 0.1607 | 224 |
3 | 1 | rs3813199 | A | G | 0.09375 | 224 |
4 | 1 | rs6685064 | T | C | 0.07143 | 224 |
5 | 1 | rs9439462 | T | C | 0.04505 | 222 |
6 | 1 | rs3820011 | A | C | 0.3063 | 222 |
CHR | SNP | A1 | A2 | MAF | NCHROBS | |
---|---|---|---|---|---|---|
1 | 1 | rs9442372 | A | G | 0.4898 | 294 |
2 | 1 | rs2887286 | T | C | 0.4864 | 294 |
3 | 1 | rs3813199 | A | G | 0.1837 | 294 |
4 | 1 | rs6685064 | C | T | 0.4048 | 294 |
5 | 1 | rs9439462 | C | T | 0.2483 | 294 |
6 | 1 | rs3820011 | A | C | 0.1497 | 294 |
#### Use the same reference allele ####
CEU.frq=CEU.data$MAF
SWITCH1=CEU.data$A1!=YRI.data$A1
NewFreq=YRI.data$MAF
NewFreq[SWITCH1]=1-YRI.data$MAF[SWITCH1]
YRI.data$NewFreq=NewFreq
YRI.frq=NewFreq
head(YRI.data)
CHR | SNP | A1 | A2 | MAF | NCHROBS | NewFreq | |
---|---|---|---|---|---|---|---|
1 | 1 | rs9442372 | A | G | 0.4898 | 294 | 0.4898 |
2 | 1 | rs2887286 | T | C | 0.4864 | 294 | 0.5136 |
3 | 1 | rs3813199 | A | G | 0.1837 | 294 | 0.1837 |
4 | 1 | rs6685064 | C | T | 0.4048 | 294 | 0.5952 |
5 | 1 | rs9439462 | C | T | 0.2483 | 294 | 0.7517 |
6 | 1 | rs3820011 | A | C | 0.1497 | 294 | 0.1497 |
# Exercise 2: Obtain Fst measures for CEU versus YRI Hapmap Populations
FREQpop1=CEU.frq
FREQpop2=YRI.frq
FREQ=(FREQpop1+FREQpop2)/2
INCLUDE=(FREQ>0 & FREQ<1)
FREQpop1=FREQpop1[INCLUDE]
FREQpop2=FREQpop2[INCLUDE]
FREQ=FREQ[INCLUDE]
head(FREQpop1)
head(FREQpop2)
head(FREQ)
# Two ways to estimate FST
# FST Estimate 1
r=2 # r is the number of populations
s2=((FREQpop1-FREQ)^2+(FREQpop2-FREQ)^2)/((r-1))
FSTVEC=s2/(FREQ*(1-FREQ))
FST1=mean(FSTVEC)
# FST Estimate 2: Weir & Cockerham's ESTIMATOR (take into account sample size)
n1=max(CEU.data$NCHROBS)/2
n2=max(YRI.data$NCHROBS)/2
nc=sum(n1+n2)-sum(n1^(2)+n2^(2))/sum(n1+n2)
MSA= (1/(r-1))* (n1*(FREQpop1-FREQ)^2+n2*(FREQpop2-FREQ)^2)
MSW=(1/(n1-1+n2-1))*(n1* FREQpop1*(1-FREQpop1)+n2*FREQpop2*(1-FREQpop2))
numer=sum(MSA-MSW)
denom=(sum(MSA+(nc-1)*MSW))
FST2=numer/denom
FST1
FST2
# Exercise 3: Obtain Fst values for other HapMap population samples
SWITCH2=CEU.data$A1!=ASW.data$A1
NewFreq=ASW.data$MAF
NewFreq[SWITCH2]=1-ASW.data$MAF[SWITCH2]
ASW.data$NewFreq=NewFreq
ASW.frq=NewFreq
FREQpop1=CEU.frq
FREQpop2=ASW.frq
FREQ=(FREQpop1+FREQpop2)/2
INCLUDE=(FREQ>0 & FREQ<1)
FREQpop1=FREQpop1[INCLUDE]
FREQpop2=FREQpop2[INCLUDE]
FREQ=FREQ[INCLUDE]
r=2 # r is the number of populations
s2=((FREQpop1-FREQ)^2+(FREQpop2-FREQ)^2)/((r-1))
FSTVEC=s2/(FREQ*(1-FREQ))
FST1=mean(FSTVEC)
# FST Estimate 2: Weir & Cockerham's ESTIMATOR (take into account sample size)
n1=max(CEU.data$NCHROBS)/2
n2=max(ASW.data$NCHROBS)/2
nc=sum(n1+n2)-sum(n1^(2)+n2^(2))/sum(n1+n2)
MSA= (1/(r-1))* (n1*(FREQpop1-FREQ)^2+n2*(FREQpop2-FREQ)^2)
MSW=(1/(n1-1+n2-1))*(n1* FREQpop1*(1-FREQpop1)+n2*FREQpop2*(1-FREQpop2))
numer=sum(MSA-MSW)
denom=(sum(MSA+(nc-1)*MSW))
FST2=numer/denom
FST1
FST2
SWITCH3=CEU.data$A1!=CHB.data$A1
NewFreq=CHB.data$MAF
NewFreq[SWITCH3]=1-CHB.data$MAF[SWITCH3]
CHB.data$NewFreq=NewFreq
CHB.frq=NewFreq
FREQpop1=CEU.frq
FREQpop2=CHB.frq
FREQ=(FREQpop1+FREQpop2)/2
INCLUDE=(FREQ>0 & FREQ<1)
FREQpop1=FREQpop1[INCLUDE]
FREQpop2=FREQpop2[INCLUDE]
FREQ=FREQ[INCLUDE]
r=2 # r is the number of populations
s2=((FREQpop1-FREQ)^2+(FREQpop2-FREQ)^2)/((r-1))
FSTVEC=s2/(FREQ*(1-FREQ))
FST1=mean(FSTVEC)
# FST Estimate 2: Weir & Cockerham's ESTIMATOR (take into account sample size)
n1=max(CEU.data$NCHROBS)/2
n2=max(CHB.data$NCHROBS)/2
nc=sum(n1+n2)-sum(n1^(2)+n2^(2))/sum(n1+n2)
MSA= (1/(r-1))* (n1*(FREQpop1-FREQ)^2+n2*(FREQpop2-FREQ)^2)
MSW=(1/(n1-1+n2-1))*(n1* FREQpop1*(1-FREQpop1)+n2*FREQpop2*(1-FREQpop2))
numer=sum(MSA-MSW)
denom=(sum(MSA+(nc-1)*MSW))
FST2=numer/denom
FST1
FST2
# Which of the HapMap population pairs are the most divergent?
# Which populations are the most similar based on Fst?