# From Phasukijwattana et al. (2010), Leber Hereditary Optic
# Neuropathy (LHON) disease and genotypes for marker rs6767450
# Exercise 1: Read in the data
LHON=read.table("~/Dropbox/Teaching/2015/BMS262/Module1/Excercises/LHON/LHON.txt",header=TRUE)
### View the first few lines of the LHON data
head(LHON)
IID | GENO | PHENO | |
---|---|---|---|
1 | ID1 | TT | CONTROL |
2 | ID2 | CT | CONTROL |
3 | ID3 | TT | CASE |
4 | ID4 | CT | CONTROL |
5 | ID5 | TT | CONTROL |
6 | ID6 | TT | CONTROL |
### Could also use the View command
View(LHON)
### Take a closer look at the types of variables in the LHON data frame
str(LHON)
'data.frame': 328 obs. of 3 variables: $ IID : Factor w/ 328 levels "ID1","ID10","ID100",..: 1 112 223 263 274 285 296 307 318 2 ... $ GENO : Factor w/ 3 levels "CC","CT","TT": 3 2 3 2 3 3 1 3 3 3 ... $ PHENO: Factor w/ 2 levels "CASE","CONTROL": 2 2 1 2 2 2 2 2 2 2 ...
# Exercise 2: Perform logistic regression. Obtain odds ratios and confidence intervals
# Create a 0 and 1 phenotype variable indicating Case/Control Status to perform a logistic regression analysis
LHON$newpheno=with(LHON,ifelse(PHENO=="CASE",1,0))
## What would be the reference genotype for a logistic regression analysis?
## The first factor will be the reference genotype.
levels(LHON$GENO)
# Perform a logistic regression analysis
model1=glm(newpheno~GENO,family=binomial(link="logit"),data=LHON)
# View the summary results of the logistic regression model, including parameter estimates and standard errors
summary(model1)
# Note:
# AIC: Aikake Information Criterion:
# A relative estimate of the information lost when a given model is used to represent the process that generates
# the data. It deals with the trade-off between the goodness of fit of the model and the complexity of the model.
# Does not give an absolute goodness of fit of the model. Generally the model with the smallest
# AIC is preferred.
Call: glm(formula = newpheno ~ GENO, family = binomial(link = "logit"), data = LHON) Deviance Residuals: Min 1Q Median 3Q Max -0.9695 -0.8701 -0.8701 1.5197 2.1093 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -0.5108 0.5164 -0.989 0.3226 GENOCT -1.5994 0.6378 -2.508 0.0122 * GENOTT -0.2654 0.5349 -0.496 0.6197 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 383.49 on 327 degrees of freedom Residual deviance: 368.48 on 325 degrees of freedom AIC: 374.48 Number of Fisher Scoring iterations: 4
## What is the odds ratio for the CT genotype?
exp(-1.5994)
# Obtain a confidence interval for the odds ratio parameter for the CT genotype
myse=1.96*(.6378)
CI=c(-1.5994-myse,-1.5994+myse)
exp(CI)
# Similarly can obtain a confidence interval for the odds ratio for the TT genotype
exp(-.2654)
myse=1.96*(.5349)
CI=c(-.2654-myse,-.2654+myse)
exp(CI)
# Exercise 3: What happens if we change the reference genotype to be TT instead of CC?
LHON$NEWGENO=with(LHON,relevel(GENO, ref = "TT"))
levels(LHON$NEWGENO)
### Perform logistic regression
model2=glm(newpheno~NEWGENO,family=binomial(link="logit"),data=LHON)
summary(model2)
Call: glm(formula = newpheno ~ NEWGENO, family = binomial(link = "logit"), data = LHON) Deviance Residuals: Min 1Q Median 3Q Max -0.9695 -0.8701 -0.8701 1.5197 2.1093 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -0.7763 0.1395 -5.563 2.64e-08 *** NEWGENOCC 0.2654 0.5349 0.496 0.619739 NEWGENOCT -1.3340 0.3995 -3.339 0.000841 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 383.49 on 327 degrees of freedom Residual deviance: 368.48 on 325 degrees of freedom AIC: 374.48 Number of Fisher Scoring iterations: 4
# Get the odds ratio and confidence interval
exp(-1.3340 )
myse=1.96*(.3995)
CI=c(-1.3340-myse,-1.3340+myse)
exp(CI)
# Could also do the following to get 95% confidence interval of parameters from your model
MYCI=confint.default(model2)
exp(MYCI)
2.5 % | 97.5 % | |
---|---|---|
(Intercept) | 0.3500310 | 0.6048404 |
NEWGENOCC | 0.4570423 | 3.7204782 |
NEWGENOCT | 0.1203945 | 0.5764187 |
### Plot the data for a better understanding
plot(factor(PHENO)~factor(GENO), data=LHON)
# Excercise 4: Additive logistic regression model
LHON$genoadd <- with(LHON, 0 + 1*(GENO=="CT") + 2*(GENO=="TT"))
model3 <- glm(newpheno~genoadd,family=binomial(link="logit"),data=LHON)
summary(model3)
Call: glm(formula = newpheno ~ genoadd, family = binomial(link = "logit"), data = LHON) Deviance Residuals: Min 1Q Median 3Q Max -0.8436 -0.8436 -0.6854 1.5531 1.9797 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.8077 0.4554 -3.970 7.2e-05 *** genoadd 0.4787 0.2505 1.911 0.0559 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 383.49 on 327 degrees of freedom Residual deviance: 379.47 on 326 degrees of freedom AIC: 383.47 Number of Fisher Scoring iterations: 4
exp(0.4787)
MYCI2=confint.default(model3)
exp(MYCI2)
2.5 % | 97.5 % | |
---|---|---|
(Intercept) | 0.06718883 | 0.40046163 |
genoadd | 0.9879249 | 2.6369796 |