##### SISG 2017 Module 10 ############################
# Session 7 Practical: Profile Scoring Practical
# Naomi Wray
######################################################

# set to your own directory
# (in Rstudio - at top Session > Set Working Directory > To Source File Location)
setwd("~/Documents/Naomi/Teaching/SISG/2017Brisbane/Session7Naomi/Practical7")

########## FILES #####################################
## SNP ln(OR) from a "Discovery" GWAS ################
# lists of SNPs with their OR have been created
# files: Discovery_PLT_* (PLT = P-value of association Less Than)
# files contain SNPs at different P-value thresholds:
# P<5e-8, P<1e-5, P<1e-2, P<5e-2, P<1e-1, P<5e-1, P<1
# (file suffixes 5e8, 1e5, 1e2, 5e2, 1e1, 5e1, 1)

## Target sample for profile scoring #################
# target.bed target.bim target.fam

########## STEP 1 ####################################
## Create profile scores in plink ####################
# Use the system() command in R to submit to the terminal
# if your computer is very slow then reduce the number of pval alternatives
# (copy the line and hash out the original)
for(Pd in c('5e8','1e5','1e2','5e2','1e1','5e1','1')){
#for(Pd in c('5e8')){
  scoring_command <- sprintf("./plink --bfile target --score Discovery_PLT_%s.txt --out %s_scores --noweb", Pd, Pd)
  system(scoring_command)
  # see http://pngu.mgh.harvard.edu/~purcell/plink/profile.shtml
}

########## STEP 2 ####################################
## Genomic Profile Risk Score analysis ###############

######### FUNCTIONS NEEDED ###########################
# proportion of variability on the liability scale
h2l_R2 <- function(k, r2, p) {
  # k  baseline disease risk
  # r2 from a linear regression model of genomic profile risk score
  # p  proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of
  # determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  x = qnorm(1-k)
  z = dnorm(x)
  i = z/k
  C = k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta = i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  h2l_R2 = C*r2 / (1 + C*theta*r2)
}
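# Illustrative check of the liability-scale conversion (not part of the original
# practical): the values below (K=0.01, observed-scale R2 of 0.05, 50% cases)
# are hypothetical and chosen only to show how h2l_R2() is called.
example_h2l <- h2l_R2(k=0.01, r2=0.05, p=0.5)
example_h2l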
# se of the variance explained on the liability scale from the
# variance explained on the linear scale (r2) and its se (se)
se_h2l_R2 <- function(k, r2, se, p) {
  # k  baseline disease risk
  # r2 from a linear regression model attributable to genomic profile risk score
  # se standard error of r2
  # p  proportion of sample that are cases
  # calculates the se of the proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of
  # determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  # SE on the liability scale (from a Taylor series expansion)
  # var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d being calculus differentiation
  x = qnorm(1-k)
  z = dnorm(x)
  i = z/k
  C = k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta = i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  se_h2l_R2 = C*(1-r2*theta)*se
}

################################################################################
### Calculate AUC and R2 estimates for one of the polygenic score thresholds ###
#### There is code below this section to loop through all of the thresholds ####
################################################################################
K <- 0.01 # risk of disease
ri <- read.table("1_scores.profile", header=T)
head(ri)

# normalize the score
ri$PHENO <- ri$PHENO - 1                             # change from 1, 2 to 0, 1
ri$ZSCORE <- (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)  # z-score of the profile score
P <- sum(ri$PHENO)/length(ri$PHENO)                  # proportion of target sample that are cases
# check out the help page ?scale for a quick way to standardize a variable

# Plot the density of the polygenic risk score for cases and controls.
#install.packages("sm")
library(sm)
sm.density.compare(ri$ZSCORE, ri$PHENO, xlab="Polygenic risk score (Z-scored)")
title(main="Polygenic Risk Score by Case Control Status")
leg <- factor(ri$PHENO, levels=c(0,1), labels=c("Control", "Case")) # PHENO is now coded 0/1
colfill <- 2:(1+length(levels(leg)))
# add legend via mouse click:
#legend(locator(1), levels(leg), fill=colfill)
legend("top", levels(leg), fill=colfill)

# logistic model
tst1 <- glm(PHENO ~ ZSCORE, data = ri, family = binomial(logit)) # logit model
tst0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))      # logit model
# view the summary results for the two models - is the polygenic score predictive of disease status?
summary(tst0)
summary(tst1)

# Cox & Snell R2
# NOTE: To calculate pseudo R2 we need the likelihood or logLikelihood of the model.
# However, glm output gives us -2*logLikelihood as the fit statistic.
# We therefore need to recompute it; the logLik() function does just that!
# See Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of
# determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
N <- length(ri$PHENO) # would this command work if there were missing data?
LL1 <- logLik(tst1)
LL0 <- logLik(tst0)
# NOTE: because we are using logLikelihoods we take the exponent
CSv <- 1-exp((2/N)*(LL0[1]-LL1[1]))
CSv
# NOTE: Cox & Snell R2 can never reach 1; Nagelkerke's R2 rescales it
# by its maximum possible value so that it can

# Nagelkerke's R2
NKv <- CSv/(1-exp((2/N)*LL0[1]))
NKv
# Using a package
#install.packages('fmsb')
library('fmsb')
NagelkerkeR2(tst1)

# pvalue
# NOTE: the null -2*logLikelihood is also reported in the alternative model
# tst1$null.deviance
# Interpret fit?
tst0$deviance
tst1$null.deviance
tst1$deviance
# is it significant?
devdiff <- tst0$deviance-tst1$deviance
df <- tst0$df.residual-tst1$df.residual
pval <- pchisq(devdiff,df,lower.tail=F)
pval

# linear model R2
std_y <- ri$PHENO
ri$std_y <- (std_y-mean(std_y))/sd(std_y)
lm1 <- lm(std_y ~ ZSCORE, data = ri)
lm0 <- lm(std_y ~ 1, data = ri)
R2v <- 1-exp((2/N)*(logLik(lm0)[1]-logLik(lm1)[1]))
R2v
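# Sanity check (not in the original script): for a Gaussian linear model, the
# likelihood-based R2v above should equal the usual regression R-squared,
# so the value below is expected to match R2v up to floating-point error.
summary(lm1)$r.squared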
# standard error of R2v
# from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
np <- 1 # number of parameters
# vr is se^2 of R2v
vr <- 4/length(std_y)*R2v*(1-R2v)^2*(1-(2*np+3)/length(std_y))

# calculate liability R2
h2l_r2 <- h2l_R2(K,R2v,P) # linear model
# SE on the liability scale (from a Taylor series expansion)
# var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d: calculus differentiation
se_h2l_r2 <- se_h2l_R2(K,h2l_r2,vr^.5,P)

# area under curve
#install.packages('pROC')
library('pROC')
aucv1 <- auc(ri$PHENO,tst1$linear.predictors)
aucv1
# aucv1 may be approximately close to the correct value (even though covariates are ignored)
#install.packages("PredictABEL")
library(PredictABEL)
plotROC(data=ri, cOutcome=3, predrisk=tst1$linear.predictors)

# make ntiles using the cut command - compare deciles to pentiles
nt=5
ri$ntil = cut(ri$ZSCORE, breaks=c(quantile(ri$ZSCORE, probs = seq(0, 1, by = 1/nt))),
              labels=1:nt, include.lowest=T)
tstd1 <- glm(PHENO ~ ntil, data = ri, family = binomial(logit)) # logit model, all ntiles
tstd0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))    # logit model
# Odds ratio of being a case compared to a control in each ntile
# (relative to the 1st ntile; the first element is the exponentiated intercept)
ORD <- exp(tstd1$coefficients)
ORDL <- exp(tstd1$coefficients-1.96*summary(tstd1)$coefficients[,2])
ORDH <- exp(tstd1$coefficients+1.96*summary(tstd1)$coefficients[,2])
plot(ORD,ylim=c(min(ORDL),max(ORDH)))
arrows(seq_along(ORD), ORD, seq_along(ORD), ORDH, angle=90,length=0.10) # Draw error bars
arrows(seq_along(ORD), ORD, seq_along(ORD), ORDL, angle=90,length=0.10) # Draw error bars
# Odds ratio of being a case compared to a control in each ntile relative to the 1st ntile
ORDR=ORD/ORD[1]
plot(ORDR)

###########################################################
### all of the output summarised in a single data frame ###
###########################################################
# Pd - p-value cutoff for discovery cohort
# N - total sample size
# P - proportion of sample that are cases
# NKv - Nagelkerke's R2
# pval - pvalue of the R2
# K - population risk of disease used in converting NKv to liability scale
# h2l_r2 - proportion of variance explained by the score on the liability scale
# se_h2l_r2 - standard error of h2l_r2
# aucv1 - what we think is the most appropriate estimate of AUC attributed to the score
# nt - number of ntiles (eg nt=10 for deciles)
# ORD - the odds ratio when comparing top to bottom ntile
# ORDL - lower CI of the ORD
# ORDH - upper CI of the ORD
# The following are computed in the Advanced section at the end of the script:
# pcase_con50 - proportion of cases identified when 50% of controls tested
# pcase_pop50 - proportion of cases identified when 50% of population tested
# s - proportion of cases identified
# pcont_case_s - proportion of controls tested to identify s proportion of cases
# ppop_case_s - proportion of population tested to identify s proportion of cases
O=data.frame(Pd,N,P,NKv,pval,K,h2l_r2,se_h2l_r2,aucv1,nt,ORD[nt],ORDL[nt],ORDH[nt],ORDR[nt])
print(O,digits=3)
# write a header row for the results file (the loop below appends one row per threshold)
O=data.frame("Pd","N","P","NKv","pval","K","h2l_r2","se_h2l_r2","aucv1","nt","ORD[nt]","ORDL","ORDH")
write.table(O,file="profile_out.csv",row.names=F,col.names=F,quote=F,sep=",")
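# The ntile code above suggests comparing deciles to pentiles. A minimal decile
# sketch (not part of the original practical), using separate hypothetical object
# names (nt10, ntil10, ...) so the pentile objects created above are not overwritten.
nt10 <- 10
ri$ntil10 <- cut(ri$ZSCORE, breaks=quantile(ri$ZSCORE, probs=seq(0, 1, by=1/nt10)),
                 labels=1:nt10, include.lowest=T)
tstd10 <- glm(PHENO ~ ntil10, data=ri, family=binomial(logit))
ORD10 <- exp(tstd10$coefficients)
ORD10[nt10] # odds ratio for top vs bottom decile
ORD[nt]     # compare with the top vs bottom pentile odds ratio from above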
#######################################################################################################
### The following code loops through all of the polygenic score thresholds that have been generated ###
######### There is a single output object, which houses the various AUC and R2 estimates ##############
#######################################################################################################
for(Pd in c('5e8','1e5','1e2','5e2','1e1','5e1','1')){
  # read in the score file *_scores.profile
  fname=paste(Pd,"_scores.profile",sep="")
  ri <- read.table(fname, header=T)
  # normalize the score
  ri$ZSCORE <- (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE) # z-score of the profile score
  ri$PHENO <- ri$PHENO - 1                            # change from 1, 2 to 0, 1
  P <- sum(ri$PHENO)/length(ri$PHENO)                 # proportion of target sample that are cases

  # logistic model
  tst1 <- glm(PHENO ~ ZSCORE, data = ri, family = binomial(logit)) # logit model
  tst0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))      # logit model

  # area under curve
  aucv1 <- auc(ri$PHENO,tst1$linear.predictors)
  # aucv1 may be approximately close to the correct value (even though covariates are ignored)

  # Cox & Snell R2
  # NOTE: To calculate pseudo R2 we need the likelihood or logLikelihood of the model.
  # However, glm output gives us -2*logLikelihood as the fit statistic.
  # We therefore need to recompute it; the logLik() function does just that!
  N <- length(ri$PHENO)
  LL1 <- logLik(tst1)
  LL0 <- logLik(tst0)
  # NOTE: because we are using logLikelihoods we take the exponent
  CSv <- 1-exp((2/N)*(LL0[1]-LL1[1]))
  # NOTE: Cox & Snell R2 can never reach 1; Nagelkerke's R2 rescales it
  # by its maximum possible value so that it can

  # Nagelkerke's R2
  NKv <- CSv/(1-exp((2/N)*LL0[1]))

  # pvalue
  # NOTE: the null -2*logLikelihood is also reported in the alternative model
  # tst1$null.deviance
  # Interpret fit?
  tst0$deviance
  tst1$null.deviance
  tst1$deviance
  # is it significant?
  devdiff <- tst0$deviance-tst1$deviance
  df <- tst0$df.residual-tst1$df.residual
  pval <- pchisq(devdiff,df,lower.tail=F)

  # linear model R2
  std_y <- ri$PHENO
  ri$std_y <- (std_y-mean(std_y))/sd(std_y)
  lm1 <- lm(std_y ~ ZSCORE, data = ri)
  lm0 <- lm(std_y ~ 1, data = ri)
  R2v <- 1-exp((2/N)*(logLik(lm0)[1]-logLik(lm1)[1]))
  # standard error of R2v
  # from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
  np <- 1 # number of parameters
  vr <- 4/length(std_y)*R2v*(1-R2v)^2*(1-(2*np+3)/length(std_y))

  # calculate liability R2
  K <- 0.05 # baseline disease risk in the population (note: differs from the K=0.01 used above)
  h2l_r2 <- h2l_R2(K,R2v,P) # linear model
  # SE on the liability scale (from a Taylor series expansion)
  # var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d: calculus differentiation
  se_h2l_r2 <- se_h2l_R2(K,h2l_r2,vr^.5,P)

  # make ntiles using the cut command - compare deciles to pentiles
  nt=5
  ri$ntil = cut(ri$ZSCORE, breaks=c(quantile(ri$ZSCORE, probs = seq(0, 1, by = 1/nt))),
                labels=1:nt, include.lowest=T)
  tstd1 <- glm(PHENO ~ ntil, data = ri, family = binomial(logit)) # logit model, all ntiles
  tstd0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))    # logit model
  # Odds ratio of being a case compared to a control in each ntile
  ORD <- exp(tstd1$coefficients)
  ORDL <- exp(tstd1$coefficients-1.96*summary(tstd1)$coefficients[,2])
  ORDH <- exp(tstd1$coefficients+1.96*summary(tstd1)$coefficients[,2])
  # Odds ratio of being a case compared to a control in each ntile relative to the 1st ntile
  ORDR=ORD/ORD[1]

  # output
  # Pd - p-value cutoff for discovery cohort
  # N - total sample size
  # P - proportion of sample that are cases
  # NKv - Nagelkerke's R2
  # pval - pvalue of the R2
  # K - population risk of disease used in converting NKv to liability scale
  # h2l_r2 - proportion of variance explained by the score on the liability scale
  # se_h2l_r2 - standard error of h2l_r2
  # aucv1 - what we think is the most appropriate estimate of AUC attributed to the score
  # ORD - the odds ratio when comparing top to bottom ntile
  # ORDL - lower CI of the ORD
  # ORDH - upper CI of the ORD
  O=data.frame(Pd,N,P,NKv,pval,K,h2l_r2,se_h2l_r2,aucv1,nt,ORD[nt],ORDL[nt],ORDH[nt])
  print(O,digits=3,row.names=F)
  write.table(O,file="profile_out.csv",row.names=F,col.names=F,quote=F,append=T,sep=",")
}

######################################################
# Advanced
######################################################
# Calculate the proportion of the population that needs to be
# screened to capture proportion s of cases.
K <- 0.01 # risk of disease
ri <- read.table("1_scores.profile", header=T)
head(ri)
# normalize the score
ri$PHENO <- ri$PHENO - 1                             # change from 1, 2 to 0, 1
ri$ZSCORE <- (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)  # z-score of the profile score
library(PredictABEL)
# tst1 should be the logistic model fitted to these scores (re-fit it here if needed);
# h2l_r2 below is taken from the most recent calculation above
plotROC(data=ri, cOutcome=3, predrisk=tst1$linear.predictors)

# Proportion of cases captured when the population screened includes 50% of all controls
T0 = qnorm(1-K)
z = dnorm(T0)
i = z/K
v = -i*K/(1-K)                        # mean liability of controls
vcase = h2l_r2*(1-h2l_r2*i*(i-T0))    # variance in cases
vcont = h2l_r2*(1-h2l_r2*v*(v-T0))    # variance in controls
t50 = qnorm(0.5,v*h2l_r2,sqrt(vcont)) # score threshold above which 50% of controls lie
pcase_con50 = 1-pnorm(t50,i*h2l_r2,sqrt(vcase)) # prop of cases captured when pop screened includes 50% of controls
pcase_con50

# Proportion of cases captured when 50% of the population has been screened
t50 = qnorm(0.5,0,sqrt(h2l_r2))
pcase_pop50 = 1-pnorm(t50,i*h2l_r2,sqrt(vcase)) # prop of cases captured when 50% of pop screened
pcase_pop50

# What proportion of the population is needed to capture proportion s of cases?
s = 0.5
ts = qnorm(s,i*h2l_r2,sqrt(vcase))
pcont_case_s = 1-pnorm(ts,v*h2l_r2,sqrt(vcont)) # prop of controls screened when s proportion of cases is captured
pcont_case_s
ppop_case_s = 1-pnorm(ts,0,sqrt(h2l_r2))        # prop of pop screened to capture s proportion of cases
ppop_case_s

# calculate the AUC implied by the variance explained on the liability scale
# probability of a case being ranked higher than a control
aucchk = pnorm((i-v)*h2l_r2/(sqrt(vcase+vcont)))
aucchk
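########## Optional: compare thresholds ##############
# A minimal sketch (not part of the original practical) for inspecting profile_out.csv,
# assuming the loop above was run once so the file holds the header row plus one row
# per P-value threshold, in loop order (delete any old profile_out.csv before re-running).
res <- read.csv("profile_out.csv", header=T, check.names=F)
pd_labels <- c('5e8','1e5','1e2','5e2','1e1','5e1','1') # same order as the loop
# Nagelkerke's R2 by discovery P-value threshold
barplot(res$NKv, names.arg=pd_labels, xlab="Discovery P-value threshold", ylab="Nagelkerke R2")
# liability-scale R2 with an approximate 95% CI (h2l_r2 +/- 1.96*se)
lo <- res$h2l_r2 - 1.96*res$se_h2l_r2
hi <- res$h2l_r2 + 1.96*res$se_h2l_r2
plot(seq_along(res$h2l_r2), res$h2l_r2, pch=19, xaxt="n", ylim=range(c(lo,hi)),
     xlab="Discovery P-value threshold", ylab="Liability-scale R2")
axis(1, at=seq_along(res$h2l_r2), labels=pd_labels)
arrows(seq_along(lo), lo, seq_along(hi), hi, angle=90, code=3, length=0.05)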