# --------------------------------------------------------------------------------- # Summer Institute of Statistical Genetics 2016 # Module 18: Statistical & Quantitative Genetics of Disease # Practical 3 R code # Naomi Wray naomi.wray@uq.edu.au #----------------------------------------------- #Set up single locus disease risk model #---------------------------------------------- K = 0.01 # lifetime risk of disease between 0 and 1 p = 0.2 # probability of risk allele R = 1.2 # relative risk for heterozygote compared to homozygote PG=c(rep(0,3)); PDgivG=c(rep(0,3)); PDandG=c(rep(0,3)) # set up vectors PG[1] = (1-p)^2 # genotype frequency of homozygote non-risk allele PG[2] = 2*p*(1-p) PG[3] = p^2 f0 = K/(1+p*(R-1))^2 PDgivG[1] = f0 PDgivG[2] = f0*R PDgivG[3] = f0*R*R PDandG = PDgivG*PG PGgivD = PDandG/K PG PDgivG PDandG sum(PDandG) # check equals K PGgivD PG <- c((1-p)^2,2*p*(1-p),p^2) PDgivG <- c(f0,f0*R,f0*R^2) # Probability of disease given the genotype # --------------------------------------------------------------------------------- # b) Power in case-control study design # --------------------------------------------------------------------------------- N <- 10000 # Total sample size v <- 0.5 # Proportion of cases p <- 0.2 # Frequency of risk allele A R <- 1.2 # Heterozygote genotype relative risk; homozygote GRR is R^2 K <- 0.01 # Overall risk (or probability) of disease in population alpha <- 5e-8 # Level of significance that will be used in the association study t <- qnorm(alpha/2,0,1) # Normal distribution threshold for declaring significance # Basic single locus parameters f0 <- K/(1+p*(R-1))^2 # Probability of disease in those homozygote for non-risk allele aa PG <- c((1-p)^2,2*p*(1-p),p^2) # Probability of genotypes aa, Aa, AA assumes Hardy-Weinberg equilibrium # Cases PDgivG <- c(f0,f0*R,f0*R^2) # Probability of disease given the genotype PGandD <- PDgivG*PG # Probability of genotype and disease # PGandD <- c(f0*(1-p)^2,f0*R*2*p*(1-p),f0*R^2*p^2) sum(PGandD); K # Check equals K PGgivD <- PGandD/K # Probability of genotype in people who are diseased - compare PGgivD to PG pcase <- 0.5*PGgivD[2]+PGgivD[3] # Frequency of allele A in cases # Controls PNDgivG <- 1-PDgivG # Probability of being not diseased given the genotype PGandND <- PNDgivG*PG # Probability of genotype and not diseased sum(PGandND); (1-K) # Check equals 1-K PGgivND <- PGandND/(1-K) # Probability of genotype in not diseased - compare PGgivND + PG pcont <- 0.5*PGgivND[2]+PGgivND[3]# Frequency of allele A in controls # Checks p;K*pcase+(1-K)*pcont # Check that population weighted average of pcase and pcontrol is p pbar <- v*pcase+(1-v)*pcont # Mean allele frequency in case-control sample #NCP NCP <- (pcase-pcont)*(pcase-pcont)*2*N*v*(1-v)/(pbar*(1-pbar)) # Chi-square non-centrality parameter pow <- pnorm(sqrt(NCP)+t) # Power - normal distribution is sqrt of chi-square pow; NCP # Check agrees with genetic power calculator: # N <- 10000,v<-0.5,p<-0.2,R<-1.2,K<-0.01,alpha<-5e-8,Dprime<-1: # Genetic Power Calculator #http://pngu.mgh.harvard.edu/~purcell/gpc/ # Power <- 0.4586; NCP<-28.59 # Q: Compare power to direct calculation as coded above # b) N =10000,v = 0.5,p=0.2,R=1.2,K=0.01,alpha=5e-8 # Q:Compare power for screened and unscreened controls # A: - the code aboves assumes controls are screened, for unscreened controls pcont=p # for schizophenia K =0.01 and Major depression K = 0.15 pcont=p pbar <- v*pcase+(1-v)*pcont # Mean allele frequency in case-control sample #NCP NCP <- (pcase-pcont)*(pcase-pcont)*2*N*v*(1-v)/(pbar*(1-pbar)) # Chi-square non-centrality parameter pow <- pnorm(sqrt(NCP)+t) # Power - normal distribution is sqrt of chi-square pow; NCP # 0.4469, 28.012 - hardly affected by screening because K=0.01 # Q:Compare the impact on power of screening controls for # schizophrenia K =0.01 and Major depression K = 0.15 # A: Now use K=0.15 re-run from line 44 K=0.15 #screened pow; NCP 0.79; 39.1 #unscreened pow; NCP 0.44 28.0 # Q: For which disorders is screening of controls most recommended # A: Common disorders # --------------------------------------------------------------------------------- # c) Power in case-control study design based on RR # --------------------------------------------------------------------------------- getpower=function(p,GRR,Ncase,Ncont,A,K){ N=Ncase+Ncont v<-Ncase/N Thr<-qnorm(A/2,0,1) pcase<-p*GRR/(p*GRR+1-p) pcont<-(p/(1-K))*(1-K*GRR/(1+p*(GRR-1))) pdiff=pcase-pcont pbar<-pcase*v+pcont*(1-v) #exactly the same as GPC NCP<-pdiff*pdiff*2*N*v*(1-v)/(pbar*(1-pbar)) pow=pnorm(sqrt(NCP)+Thr) pbar=0.5*(pcase+pcont) Nequiv=NCP*pbar*(1-pbar)/pdiff^2 # sample size of equal cases and controls that gives same power result=list(pow=pow,NCP=NCP,Nequiv=Nequiv) result } getpower(0.2,1.2,5000,5000,5e-8,0.01) #confirm same as programmed above #------------------------- # run this section to get a power graph #------------------------ opar<-par(mfrow=c(1,1)) Ncase=5000 Ncont=5000 maxGRR=1.5 A<-5e-8 x<-c(1,maxGRR) y<-c(0,1) plot(x,y,xlim=c(1,maxGRR),ylim=c(0,1),xlab="Genotype Relative Risk",ylab="power", main="Power for sample size of",type="n",cex.lab=2,cex.main=1.5) mtext(paste(Ncase,"cases and",Ncont,"controls"),3,0,cex =1.5) curve(getpower(0.1,x,Ncase,Ncont,A,K)$pow,from=1,to=maxGRR,col=1,lty=1,lwd=2,add=TRUE) curve(getpower(0.3,x,Ncase,Ncont,A,K)$pow,from=1,to=maxGRR,col=2,lty=1,lwd=2,add=TRUE) curve(getpower(0.5,x,Ncase,Ncont,A,K)$pow,from=1,to=maxGRR,col=3,lty=1,lwd=2,add=TRUE) legend(x=1.3,y=0.2,col=c(1:3),lty=1,legend=paste("RAF =",c(0.1,0.3,0.5)),box.col="white") #----------------------------- # Q: Now make a graph keeping a constant allele frequency and change the disease prevalence # K=0.001,K=0.01,K=0.1 #----------------------------- Ncase=5000 Ncont=5000 maxGRR=1.5 A<-5e-8 x<-c(1,maxGRR) y<-c(0,1) plot(x,y,xlim=c(1,maxGRR),ylim=c(0,1),xlab="Genotype Relative Risk",ylab="power", main="Power for sample size of",type="n",cex.lab=2,cex.main=1.5) mtext(paste(Ncase,"cases and",Ncont,"controls"),3,0,cex =1.5) curve(getpower(0.1,x,Ncase,Ncont,A,0.001)$pow,from=1,to=maxGRR,col=1,lty=1,lwd=2,add=TRUE) curve(getpower(0.1,x,Ncase,Ncont,A,0.01)$pow,from=1,to=maxGRR,col=2,lty=1,lwd=2,add=TRUE) curve(getpower(0.1,x,Ncase,Ncont,A,0.1)$pow,from=1,to=maxGRR,col=3,lty=1,lwd=2,add=TRUE) legend(x=1.2,y=0.2,col=c(1:3),lty=1,legend=paste("Pop Disease risk, K =",c(0.001,0.01,0.1)),box.col="white") # Q: For a given GRR is the power bigger or smaller as disease prevalence increases # Ans: Power is higher for higher disease prevalence # Q: Why does this make sense? # Ans: Intuitively it doesnt make sense, because the difference in mean liability between cases and controls # is smaller for diseases with higher disease prevalence # However, for the same RR, the variance explained by the locus is greater for a more common disease # We are not comparing like with like # It makes more sense to compare power for different diseases based on variance explained # --------------------------------------------------------------------------------- # d) Power in case-control study design based on variance explained # --------------------------------------------------------------------------------- # Code for power curve function power_curveV <- function(p, PP, K, Talpha, v, N01){ P <- PP / 100 Th <- -qnorm(K,0,1) # Height of the normal distribution at T z <- dnorm(Th) # Mean liability of affected group, often (e.g., Falconer and Mackay) called the selection intensity i <- z/K R <- sqrt(P*i*i/(2*p*(1-p)))+1 pcase <- p*R/(p*R+1-p) pcont <- (p/(1-K))*(1-K*R/(1+p*(R-1))) pbar <- pcase*v+pcont*(1-v) NCP <- (pcase-pcont)*(pcase-pcont)*2*N01*v*(1-v)/(pbar*(1-pbar)) power_curve <- pnorm(sqrt(NCP)+Talpha) } # Power for given risk allele frequency and population risk K # ----------------------------------------------------------- p <- c(0.05,0.10,0.20,0.40,0.60,0.8,0.90) alpha=5e-08 Talpha <- qnorm(alpha/2,0,1) P <- 0.1 # % explained title <- paste("variant that explains ",P,"% of variance in liability",sep="") plot(p, p, xlim=c(0,1), ylim=c(0,1), xlab = "risk allele frequency", ylab = "power",main=paste("power for",title),type="n") Ncase <- 5000 Ncont <- 5000 mtext(paste(Ncase,"cases and",Ncont,"controls"),3,0,cex =1.5) N01 <- Ncase+Ncont v <- Ncase/N01 # Alter the prevalence K <- 0.001 curve(power_curveV(x,P,K,Talpha,v,N01),from=0.001,to=0.999,add=TRUE,col=1,lty=1,lwd=3) K=0.01 curve(power_curveV(x,P,K,Talpha,v,N01),from=0.001,to=0.999,add=TRUE,col=2,lty=1,lwd=3) K=0.10 curve(power_curveV(x,P,K,Talpha,v,N01),from=0.001,to=0.999,add=TRUE,col=3,lty=1,lwd=3) legend(0.3,0.8,col=c(seq(1,3)),lwd=4,lty=1,legend=paste("Population risk=",c("0.1%","1%","10%")),box.col="white") # Power for given % variance and population risk K # ------------------------------------------------ y <- c(0,1) P <- c(0,0.0004*100) Ncase <- 100000 Ncont <- 100000 title <- paste(Ncase," cases and", Ncont," controls") plot(y,P,xlim=c(0,0.0004*100), ylim=c(0,1), xlab="% of variance in liability explained by SNP", ylab="power",main=paste(title),type="n") N01 <- Ncase+Ncont v <- Ncase/N01 P <- 0.001 K <- 0.001 curve(power_curveV(0.5,x,K,Talpha,v,N01),from=0.00001*100,to=0.0004*100,add=TRUE,col=1,lty=1,lwd=4) K <- 0.01 curve(power_curveV(0.5,x,K,Talpha,v,N01),from=0.00001*100,to=0.0004*100,add=TRUE,col=2,lty=1,lwd=4) K <- 0.10 curve(power_curveV(0.5,x,K,Talpha,v,N01),from=0.00001*100,to=0.0004*100,add=TRUE,col=3,lty=1,lwd=4) legend(0.025,0.2,col=c(seq(1,3)),lwd=4,lty=1,legend=paste("Population risk=",c("0.1%","1%","10%")))