We will learn how to evaluate the prediction accuracy and demonstrate some pitfalls for genetic prediction.

Further information: https://cnsgenomics.com/data/teaching/SISG/module_10/Mod10_Session7Naomi/Practical7/

1 Prediction accurarcy

1.1 R-square for quantitative traits

### the input data is store in this folder
infolder = "/data/module5/sim/"

### read phenotype value and prediction scores
pheno = read.table(paste(infolder, "prac2_quan.pheno", sep=""), header=T)
prs = read.table(paste(infolder, "prac2_quan.prs", sep=""), header=T)
covar = read.table(paste(infolder, "prac2_quan.covar", sep=""), header=T)
data = merge(pheno, prs, by="ID")
data = merge(data, covar, by="ID")

### linear regression
lmR = lm(pheno ~ age + sex, data=data)  ### reduced module
lmF = lm(pheno ~ age + sex + prs, data=data) ### full module

### look at the summary results for the two models
summary(lmR)
summary(lmF)

### incremental r-square
summary(lmF)$"r.square" - summary(lmR)$"r.square"

## [1] 0.09699013

1.2 Nagelkerke’s R-sqaure

### read phenotype value and prediction scores
pheno = read.table(paste(infolder, "prac2_bin.pheno", sep=""), header=T)
prs = read.table(paste(infolder, "prac2_bin.prs", sep=""), header=T)
covar = read.table(paste(infolder, "prac2_bin.covar", sep=""), header=T)
data = merge(pheno, prs, by="ID")
data = merge(data, covar, by="ID")

### logistic regression
glmR = glm(pheno ~ age + sex, data=data, family=binomial(logit))  ### reduced module
glmF = glm(pheno ~ age + sex + prs, data=data, family=binomial(logit))  ### full module

### look at the summary results for the two models
summary(glmR)
summary(glmF)

### log-likelihood
N = nrow(data)
LLF = logLik(glmF)
LLR = logLik(glmR)

### Cox&Snell R2
CSv <-  1-exp((2/N)*(LLR[1]-LLF[1]))
CSv

## [1] 0.065939

### Nagelkerke's R2
NKv <- CSv/(1-exp((2/N)*LLR[1]))
NKv

## [1] 0.08815463

1.3 AUC

### AUC
### install.packages('pROC')
library('pROC')
aucF = auc(data$pheno, glmF$linear.predictors)  ### AUC for full module
aucR = auc(data$pheno, glmR$linear.predictors)  ### AUC for reduced module
aucF; aucR

## Area under the curve: 0.6549

## Area under the curve: 0.5509

aucF - aucR

## [1] 0.104029

### draw the ROC
#install.packages("PredictABEL")
library(PredictABEL)
plotROC(data=data, cOutcome=2, predrisk=glmF$linear.predictors)

## AUC [95% CI] for the model 1 :  0.655 [ 0.644  -  0.666 ]

1.4 Decile Odds Ratio

### cut into deciles
data$decile = cut(data$prs, breaks=c(quantile(data$prs, probs=seq(0,1,by=0.1))), labels=1:10, include.lowest=T)

### calculate manually the odds in each decile
#### install.packages("tidyverse")
library(dplyr)
data %>% group_by(decile) %>% 
    summarise(n_case=sum(pheno==1), n_control=sum(pheno==0)) %>%
    mutate(odds = n_case/n_control) %>%
    mutate(ORs = odds/odds[1])

### calculate ORs using logistic regression
glmD <- glm(pheno ~ decile, data = data, family = binomial(logit))
### Odds for being a case compared to control in each decile
ORD <- exp(glmD$coefficients)
ORD

## (Intercept)     decile2     decile3     decile4     decile5     decile6 
##   0.3888889   1.4590237   2.0040671   2.0204082   2.6444509   2.7634855 
##     decile7     decile8     decile9    decile10 
##   3.4086379   3.6098901   4.6314526   6.9877854

### Plot odds
ORDL <- exp(glmD$coefficients-1.96*summary(glmD)$coefficients[,2])
ORDH <- exp(glmD$coefficients+1.96*summary(glmD)$coefficients[,2])
plot(ORD,ylim=c(min(ORDL),max(ORDH)))
arrows(seq(1,10,1), ORD, seq(1,10,1), ORDH, angle=90,length=0.10) # Draw error bars
arrows(seq(1,10,1), ORD, seq(1,10,1), ORDL, angle=90,length=0.10) # Draw error bars

1.5 Variance explained on liability scale

### function to convert R-square from 0-1 observed scale
### to liability scale
h2l_R2 <- function(k, r2, p) {
  # k baseline disease risk
  # r2 from a linear regression model of genomic profile risk score
  # p proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  x = qnorm(1-k)
  z = dnorm(x)
  i = z/k
  C = k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta = i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  h2l_R2 = C*r2 / (1 + C*theta*r2)
  return(h2l_R2)
}
K=0.1  ## population prevalence
P = sum(data$pheno)/nrow(data)  ### propotion of cases in the sample
P

## [1] 0.5073

### linear regression with 0/1 values
lmR = lm(pheno~age+sex, data=data)
lmF = lm(pheno~age+sex+prs, data=data)
#R2v = summary(lmF)$"r.square" - summary(lmR)$"r.square"
#To strictly follow the equation 7 in Lee et al.
R2v = 1-exp((2/N)*(logLik(lmR)[1]-logLik(lmF)[1]))
R2v

## [1] 0.06595017

### convert to liability scale
h2l_R2(K,R2v,P)

## [1] 0.07130684

2 Pitfalls

2.1 Simulate data

### we simulate data now
set.seed(612)    ### set a seed for reproducation
n=1000; m=100    ### n: sample size; m: the number of SNPs

### simulate gneotype from binomial distribution
### minor allele frequency from uniform distribution
mafs = runif(m, 0.05, 0.5)
x = do.call("cbind", lapply(1:m, function(x) rbinom(n, 2, mafs[x]))) 
colnames(x) = paste("SNP", 1:m, sep="")

### simulate a phenotype from an independent standard normal distribution
### null hypothesis: prediction accuracy should be 0
y = rnorm(n)
data = data.frame(y, x)

2.2 Directly report R2 in the discovery sample

### E(R2) = m/n when m<n
summary(lm(as.formula(paste("y~", paste("SNP", 1:m, sep="", collapse="+"), sep="")),data=data.frame(data)))$"r.square"

## [1] 0.08951179

What is the R2? Is it close to m/n, where m is the number of SNPs and n is the sample size?

2.3 Winner’s curse

### perform GWAS: association using R lm() function
### instead of Plink software (only practical for small data)
### select the top 10 SNPs
pvals = sapply(1:m, function(i) coef(summary(lm(y~x[,i])))[2,4])
### No p-values passed Bonferroni threshold of 5e-4 (0.05/100)
head(sort(pvals))

## [1] 0.0007738269 0.0144049898 0.0185012440 0.0612186306 0.0612745215
## [6] 0.0616904469

top10SNPs = colnames(x)[head(order(pvals), 10)]
summary(lm(as.formula(paste("y~", paste(top10SNPs, collapse="+"), sep="")),data=data))$"r.square"

## [1] 0.04027331