# ---------------------------------------------------------------------------------
# Summer Institute of Statistical Genetics 2016
# Module 18: Statistical & Quantitative Genetics of Disease
# Practical 3 R code
# Naomi Wray naomi.wray@uq.edu.au
#-----------------------------------------------
#Set up single locus disease risk model
#----------------------------------------------
K = 0.01    # lifetime risk of disease between 0 and 1
p = 0.2     # probability of risk allele
R = 1.2     # relative risk for heterozygote compared to homozygote
PG=c(rep(0,3)); PDgivG=c(rep(0,3)); PDandG=c(rep(0,3)) # set up vectors
PG[1] = (1-p)^2  # genotype frequency of homozygote non-risk allele
PG[2] = 2*p*(1-p)
PG[3] = p^2
f0 = K/(1+p*(R-1))^2
PDgivG[1] = f0
PDgivG[2] = f0*R
PDgivG[3] = f0*R*R
PDandG = PDgivG*PG
PGgivD = PDandG/K

PG
PDgivG
PDandG
sum(PDandG) # check equals K
PGgivD

PG <- c((1-p)^2,2*p*(1-p),p^2)
PDgivG <- c(f0,f0*R,f0*R^2)       # Probability of disease given the genotype


# ---------------------------------------------------------------------------------
# b) Power in case-control study design
# ---------------------------------------------------------------------------------
N <- 10000                        # Total sample size
v <- 0.5                          # Proportion of cases
p <- 0.2                          # Frequency of risk allele A
R <- 1.2                          # Heterozygote genotype relative risk; homozygote GRR is R^2
K <- 0.01                         # Overall risk (or probability) of disease in population
alpha <- 5e-8                     # Level of significance that will be used in the association study
t <- qnorm(alpha/2,0,1)           # Normal distribution threshold for declaring significance
# Basic single locus parameters
f0 <- K/(1+p*(R-1))^2             # Probability of disease in those homozygote for non-risk allele aa
PG <- c((1-p)^2,2*p*(1-p),p^2)    # Probability of genotypes aa, Aa, AA assumes Hardy-Weinberg equilibrium
# Cases 
PDgivG <- c(f0,f0*R,f0*R^2)       # Probability of disease given the genotype
PGandD <- PDgivG*PG               # Probability of  genotype and disease 
# PGandD <- c(f0*(1-p)^2,f0*R*2*p*(1-p),f0*R^2*p^2)
sum(PGandD); K                    # Check equals K
PGgivD <- PGandD/K                # Probability of genotype in people who are diseased - compare PGgivD to PG
pcase <- 0.5*PGgivD[2]+PGgivD[3]  # Frequency of allele A in cases
# Controls
PNDgivG <- 1-PDgivG               # Probability of being not diseased given the genotype
PGandND <- PNDgivG*PG             # Probability of  genotype and not diseased 
sum(PGandND); (1-K)               # Check equals 1-K
PGgivND <- PGandND/(1-K)          # Probability of genotype in not diseased - compare PGgivND + PG
pcont <- 0.5*PGgivND[2]+PGgivND[3]# Frequency of allele A in controls
# Checks
p;K*pcase+(1-K)*pcont             # Check that population weighted average of pcase and pcontrol is p
pbar <- v*pcase+(1-v)*pcont       # Mean allele frequency in case-control sample
#NCP
NCP <- (pcase-pcont)*(pcase-pcont)*2*N*v*(1-v)/(pbar*(1-pbar)) # Chi-square non-centrality parameter
pow <- pnorm(sqrt(NCP)+t)         # Power - normal distribution is sqrt of chi-square
pow; NCP                          # Check agrees with genetic power calculator: 
# N <- 10000,v<-0.5,p<-0.2,R<-1.2,K<-0.01,alpha<-5e-8,Dprime<-1: 
# Genetic Power Calculator
#http://pngu.mgh.harvard.edu/~purcell/gpc/
# Power <- 0.4586; NCP<-28.59
# Q: Compare power to direct calculation as coded above
# b) N =10000,v = 0.5,p=0.2,R=1.2,K=0.01,alpha=5e-8
# Q:Compare power for screened and unscreened controls
# A: - the code aboves assumes controls are screened, for unscreened controls pcont=p
# for schizophenia K =0.01 and Major depression K = 0.15
pcont=p
pbar <- v*pcase+(1-v)*pcont       # Mean allele frequency in case-control sample
#NCP
NCP <- (pcase-pcont)*(pcase-pcont)*2*N*v*(1-v)/(pbar*(1-pbar)) # Chi-square non-centrality parameter
pow <- pnorm(sqrt(NCP)+t)         # Power - normal distribution is sqrt of chi-square
pow; NCP  # 0.4469, 28.012 - hardly affected by screening because K=0.01

# Q:Compare the impact on power of screening controls for 
#   schizophrenia K =0.01 and Major depression K = 0.15
# A: Now use K=0.15 re-run from line 44
K=0.15
#screened pow; NCP 0.79; 39.1
#unscreened pow; NCP 0.44 28.0
# Q: For which disorders is screening of controls most recommended
# A: Common disorders
# ---------------------------------------------------------------------------------
# c) Power in case-control study design based on RR
# ---------------------------------------------------------------------------------
getpower=function(p,GRR,Ncase,Ncont,A,K){
  N=Ncase+Ncont
  v<-Ncase/N
  Thr<-qnorm(A/2,0,1)
  pcase<-p*GRR/(p*GRR+1-p)
  pcont<-(p/(1-K))*(1-K*GRR/(1+p*(GRR-1)))
  pdiff=pcase-pcont
  pbar<-pcase*v+pcont*(1-v)
  #exactly the same as GPC
  NCP<-pdiff*pdiff*2*N*v*(1-v)/(pbar*(1-pbar))
  pow=pnorm(sqrt(NCP)+Thr)
  pbar=0.5*(pcase+pcont)
  Nequiv=NCP*pbar*(1-pbar)/pdiff^2   # sample size of equal cases and controls that gives same power
  result=list(pow=pow,NCP=NCP,Nequiv=Nequiv)
  result
}
getpower(0.2,1.2,5000,5000,5e-8,0.01) #confirm same as programmed above

#-------------------------
# run this section to get a power graph
#------------------------
opar<-par(mfrow=c(1,1))
Ncase=5000
Ncont=5000
maxGRR=1.5
A<-5e-8
x<-c(1,maxGRR)
y<-c(0,1)
plot(x,y,xlim=c(1,maxGRR),ylim=c(0,1),xlab="Genotype Relative Risk",ylab="power",
     main="Power for sample size of",type="n",cex.lab=2,cex.main=1.5)
mtext(paste(Ncase,"cases and",Ncont,"controls"),3,0,cex =1.5)
curve(getpower(0.1,x,Ncase,Ncont,A,K)$pow,from=1,to=maxGRR,col=1,lty=1,lwd=2,add=TRUE)
curve(getpower(0.3,x,Ncase,Ncont,A,K)$pow,from=1,to=maxGRR,col=2,lty=1,lwd=2,add=TRUE)
curve(getpower(0.5,x,Ncase,Ncont,A,K)$pow,from=1,to=maxGRR,col=3,lty=1,lwd=2,add=TRUE)
legend(x=1.3,y=0.2,col=c(1:3),lty=1,legend=paste("RAF =",c(0.1,0.3,0.5)),box.col="white")

#-----------------------------
# Q: Now make a graph keeping a constant allele frequency and change the disease prevalence 
# K=0.001,K=0.01,K=0.1
#-----------------------------
Ncase=5000
Ncont=5000
maxGRR=1.5
A<-5e-8
x<-c(1,maxGRR)
y<-c(0,1)
plot(x,y,xlim=c(1,maxGRR),ylim=c(0,1),xlab="Genotype Relative Risk",ylab="power",
     main="Power for sample size of",type="n",cex.lab=2,cex.main=1.5)
mtext(paste(Ncase,"cases and",Ncont,"controls"),3,0,cex =1.5)
curve(getpower(0.1,x,Ncase,Ncont,A,0.001)$pow,from=1,to=maxGRR,col=1,lty=1,lwd=2,add=TRUE)
curve(getpower(0.1,x,Ncase,Ncont,A,0.01)$pow,from=1,to=maxGRR,col=2,lty=1,lwd=2,add=TRUE)
curve(getpower(0.1,x,Ncase,Ncont,A,0.1)$pow,from=1,to=maxGRR,col=3,lty=1,lwd=2,add=TRUE)
legend(x=1.2,y=0.2,col=c(1:3),lty=1,legend=paste("Pop Disease risk, K =",c(0.001,0.01,0.1)),box.col="white")

# Q: For a given GRR is the power bigger or smaller as disease prevalence increases
# Ans: Power is higher for higher disease prevalence
# Q: Why does this make sense?
# Ans: Intuitively it doesnt make sense, because the difference in mean liability between cases and controls
# is smaller for diseases with higher disease prevalence
# However, for the same RR, the variance explained by the locus is greater for a more common disease
# We are not comparing like with like
# It makes more sense to compare power for different diseases based on variance explained

# ---------------------------------------------------------------------------------
# d) Power in case-control study design based on variance explained
# ---------------------------------------------------------------------------------
# Code for power curve function 
power_curveV <- function(p, PP, K, Talpha, v, N01){
  P <- PP / 100
  Th <- -qnorm(K,0,1)    
  # Height of the normal distribution at T
  z <- dnorm(Th) 
  # Mean liability of affected group, often (e.g., Falconer and Mackay) called the selection intensity       
  i <- z/K
  R <- sqrt(P*i*i/(2*p*(1-p)))+1
  pcase <- p*R/(p*R+1-p)
  pcont <- (p/(1-K))*(1-K*R/(1+p*(R-1)))
  pbar <- pcase*v+pcont*(1-v)
  NCP <- (pcase-pcont)*(pcase-pcont)*2*N01*v*(1-v)/(pbar*(1-pbar))
  power_curve <- pnorm(sqrt(NCP)+Talpha)
}
# Power for given risk allele frequency and population risk K
# -----------------------------------------------------------
p <- c(0.05,0.10,0.20,0.40,0.60,0.8,0.90)
alpha=5e-08
Talpha <- qnorm(alpha/2,0,1)
P <- 0.1  # % explained
title <- paste("variant that explains ",P,"% of variance in liability",sep="")
plot(p, p, xlim=c(0,1), ylim=c(0,1),
     xlab = "risk allele frequency",
     ylab = "power",main=paste("power for",title),type="n")
Ncase <- 5000
Ncont <- 5000
mtext(paste(Ncase,"cases and",Ncont,"controls"),3,0,cex =1.5)
N01 <- Ncase+Ncont
v <- Ncase/N01

# Alter the prevalence
K <- 0.001
curve(power_curveV(x,P,K,Talpha,v,N01),from=0.001,to=0.999,add=TRUE,col=1,lty=1,lwd=3)
K=0.01
curve(power_curveV(x,P,K,Talpha,v,N01),from=0.001,to=0.999,add=TRUE,col=2,lty=1,lwd=3)
K=0.10
curve(power_curveV(x,P,K,Talpha,v,N01),from=0.001,to=0.999,add=TRUE,col=3,lty=1,lwd=3)
legend(0.3,0.8,col=c(seq(1,3)),lwd=4,lty=1,legend=paste("Population risk=",c("0.1%","1%","10%")),box.col="white")
# Power for given % variance and population risk K
# ------------------------------------------------
y <- c(0,1)
P <- c(0,0.0004*100)
Ncase <- 100000
Ncont <- 100000
title <- paste(Ncase," cases and", Ncont," controls")
plot(y,P,xlim=c(0,0.0004*100),
     ylim=c(0,1),
     xlab="% of variance in liability explained by SNP",
     ylab="power",main=paste(title),type="n")
N01 <- Ncase+Ncont
v <- Ncase/N01
P <- 0.001
K <- 0.001
curve(power_curveV(0.5,x,K,Talpha,v,N01),from=0.00001*100,to=0.0004*100,add=TRUE,col=1,lty=1,lwd=4)
K <- 0.01
curve(power_curveV(0.5,x,K,Talpha,v,N01),from=0.00001*100,to=0.0004*100,add=TRUE,col=2,lty=1,lwd=4)
K <- 0.10
curve(power_curveV(0.5,x,K,Talpha,v,N01),from=0.00001*100,to=0.0004*100,add=TRUE,col=3,lty=1,lwd=4)
legend(0.025,0.2,col=c(seq(1,3)),lwd=4,lty=1,legend=paste("Population risk=",c("0.1%","1%","10%")))