We will learn how to evaluate the prediction accuracy and demonstrate some pitfalls for genetic prediction.
Further information: https://cnsgenomics.com/data/teaching/SISG/module_10/Mod10_Session7Naomi/Practical7/
### the input data is store in this folder
= "/data/module5/sim/" infolder
### read phenotype value and prediction scores
= read.table(paste(infolder, "prac2_quan.pheno", sep=""), header=T)
pheno = read.table(paste(infolder, "prac2_quan.prs", sep=""), header=T)
prs = read.table(paste(infolder, "prac2_quan.covar", sep=""), header=T)
covar = merge(pheno, prs, by="ID")
data = merge(data, covar, by="ID")
data
### linear regression
= lm(pheno ~ age + sex, data=data) ### reduced module
lmR = lm(pheno ~ age + sex + prs, data=data) ### full module lmF
### look at the summary results for the two models
summary(lmR)
summary(lmF)
### incremental r-square
summary(lmF)$"r.square" - summary(lmR)$"r.square"
## [1] 0.09699013
### read phenotype value and prediction scores
= read.table(paste(infolder, "prac2_bin.pheno", sep=""), header=T)
pheno = read.table(paste(infolder, "prac2_bin.prs", sep=""), header=T)
prs = read.table(paste(infolder, "prac2_bin.covar", sep=""), header=T)
covar = merge(pheno, prs, by="ID")
data = merge(data, covar, by="ID")
data
### logistic regression
= glm(pheno ~ age + sex, data=data, family=binomial(logit)) ### reduced module
glmR = glm(pheno ~ age + sex + prs, data=data, family=binomial(logit)) ### full module glmF
### look at the summary results for the two models
summary(glmR)
summary(glmF)
### log-likelihood
= nrow(data)
N = logLik(glmF)
LLF = logLik(glmR)
LLR
### Cox&Snell R2
<- 1-exp((2/N)*(LLR[1]-LLF[1]))
CSv CSv
## [1] 0.065939
### Nagelkerke's R2
<- CSv/(1-exp((2/N)*LLR[1]))
NKv NKv
## [1] 0.08815463
### AUC
### install.packages('pROC')
library('pROC')
= auc(data$pheno, glmF$linear.predictors) ### AUC for full module
aucF = auc(data$pheno, glmR$linear.predictors) ### AUC for reduced module
aucR aucF; aucR
## Area under the curve: 0.6549
## Area under the curve: 0.5509
- aucR aucF
## [1] 0.104029
### draw the ROC
#install.packages("PredictABEL")
library(PredictABEL)
plotROC(data=data, cOutcome=2, predrisk=glmF$linear.predictors)
## AUC [95% CI] for the model 1 : 0.655 [ 0.644 - 0.666 ]
### cut into deciles
$decile = cut(data$prs, breaks=c(quantile(data$prs, probs=seq(0,1,by=0.1))), labels=1:10, include.lowest=T) data
### calculate manually the odds in each decile
#### install.packages("tidyverse")
library(dplyr)
%>% group_by(decile) %>%
data summarise(n_case=sum(pheno==1), n_control=sum(pheno==0)) %>%
mutate(odds = n_case/n_control) %>%
mutate(ORs = odds/odds[1])
### calculate ORs using logistic regression
<- glm(pheno ~ decile, data = data, family = binomial(logit))
glmD ### Odds for being a case compared to control in each decile
<- exp(glmD$coefficients)
ORD ORD
## (Intercept) decile2 decile3 decile4 decile5 decile6
## 0.3888889 1.4590237 2.0040671 2.0204082 2.6444509 2.7634855
## decile7 decile8 decile9 decile10
## 3.4086379 3.6098901 4.6314526 6.9877854
### Plot odds
<- exp(glmD$coefficients-1.96*summary(glmD)$coefficients[,2])
ORDL <- exp(glmD$coefficients+1.96*summary(glmD)$coefficients[,2])
ORDH plot(ORD,ylim=c(min(ORDL),max(ORDH)))
arrows(seq(1,10,1), ORD, seq(1,10,1), ORDH, angle=90,length=0.10) # Draw error bars
arrows(seq(1,10,1), ORD, seq(1,10,1), ORDL, angle=90,length=0.10) # Draw error bars
### function to convert R-square from 0-1 observed scale
### to liability scale
<- function(k, r2, p) {
h2l_R2 # k baseline disease risk
# r2 from a linear regression model of genomic profile risk score
# p proportion of sample that are cases
# calculates proportion of variance explained on the liability scale
# from ABC at http://www.complextraitgenomics.com/software/
# Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
= qnorm(1-k)
x = dnorm(x)
z = z/k
i = k*(1-k)*k*(1-k)/(z^2*p*(1-p))
C = i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
theta = C*r2 / (1 + C*theta*r2)
h2l_R2 return(h2l_R2)
}=0.1 ## population prevalence
K= sum(data$pheno)/nrow(data) ### propotion of cases in the sample
P P
## [1] 0.5073
### linear regression with 0/1 values
= lm(pheno~age+sex, data=data)
lmR = lm(pheno~age+sex+prs, data=data)
lmF #R2v = summary(lmF)$"r.square" - summary(lmR)$"r.square"
#To strictly follow the equation 7 in Lee et al.
= 1-exp((2/N)*(logLik(lmR)[1]-logLik(lmF)[1]))
R2v R2v
## [1] 0.06595017
### convert to liability scale
h2l_R2(K,R2v,P)
## [1] 0.07130684
### we simulate data now
set.seed(612) ### set a seed for reproducation
=1000; m=100 ### n: sample size; m: the number of SNPs
n
### simulate gneotype from binomial distribution
### minor allele frequency from uniform distribution
= runif(m, 0.05, 0.5)
mafs = do.call("cbind", lapply(1:m, function(x) rbinom(n, 2, mafs[x])))
x colnames(x) = paste("SNP", 1:m, sep="")
### simulate a phenotype from an independent standard normal distribution
### null hypothesis: prediction accuracy should be 0
= rnorm(n)
y = data.frame(y, x) data
### E(R2) = m/n when m<n
summary(lm(as.formula(paste("y~", paste("SNP", 1:m, sep="", collapse="+"), sep="")),data=data.frame(data)))$"r.square"
## [1] 0.08951179
What is the R2? Is it close to m/n
, where m is the
number of SNPs and n is the sample size?
### perform GWAS: association using R lm() function
### instead of Plink software (only practical for small data)
### select the top 10 SNPs
= sapply(1:m, function(i) coef(summary(lm(y~x[,i])))[2,4])
pvals ### No p-values passed Bonferroni threshold of 5e-4 (0.05/100)
head(sort(pvals))
## [1] 0.0007738269 0.0144049898 0.0185012440 0.0612186306 0.0612745215
## [6] 0.0616904469
= colnames(x)[head(order(pvals), 10)]
top10SNPs summary(lm(as.formula(paste("y~", paste(top10SNPs, collapse="+"), sep="")),data=data))$"r.square"
## [1] 0.04027331
What is the R2? Is it close to m/n
, where m is the
number of SNPs and n is the sample size?
### first 900 samples into discovery set
= y[1:900]
y_dis = x[1:900,]
x_dis ### the remaining 100 samples into target set
= y[901:1000]
y_target = x[901:1000,]
x_target
### estimate effect sizes in the total sample
= sapply(1:m, function(i) coef(summary(lm(y~x[,i])))[2,1])
b_total ### evaluate the PRS in the target sample
= x_target %*% b_total
prs1 summary(lm(y_target~prs1))$"r.square"
## [1] 0.1573479
### estimate effect sizes only in the discovery sample
= sapply(1:m, function(i) coef(summary(lm(y_dis~x_dis[,i])))[2,1])
b_dis ### evaluate the PRS in the target sample
= x_target %*% b_dis
prs2 summary(lm(y_target~prs2))$"r.square"
## [1] 0.001510061
What is the R2? Is it expected?
A work by Huanwei Wang (huanwei.wang@uq.edu.au)