##### SISG 2017 Module 10 ############################
# Session 7 Practical: Profile Scoring Practical
# Naomi Wray
######################################################

# set to your own directory
# (in Rstudio - at top Session > Set Working Directory > To Source File Location)
setwd("~/Documents/Naomi/Teaching/SISG/2017Brisbane/Session7Naomi/Practical7")

########## FILES #####################################
## SNP ln(OR) from a "Discovery" GWAS ################
# lists of SNPs with their OR have been created
# files: Discovery_PLT_* (PLT = P-value of association Less Than)
# files contain SNPs at different P-value thresholds:
# P<5e-8, P<1e-5, P<1e-2, P<5e-2, P<1e-1, P<5e-1, P<1
# (file suffixes 5e8, 1e5, 1e2, 5e2, 1e1, 5e1, 1)

## Target sample for profile scoring #################
# target.bed target.bim target.fam

########## STEP 1 ####################################
## Create profile scores in plink ####################
# Use the system() command in R to submit to the terminal
# if your computer is very slow then reduce the number of pval alternatives
# (copy the line and hash out the original)
for(Pd in c('5e8','1e5','1e2','5e2','1e1','5e1','1')){
#for(Pd in c('5e8')){
  scoring_command <- sprintf("./plink --bfile target --score Discovery_PLT_%s.txt --out %s_scores --noweb", Pd, Pd)
  system(scoring_command)
  # see http://pngu.mgh.harvard.edu/~purcell/plink/profile.shtml
}

########## STEP 2 ####################################
## Genomic Profile Risk Score analysis ###############

######### FUNCTIONS NEEDED ###########################
# proportion of variability on the liability scale
h2l_R2 <- function(k, r2, p) {
  # k  baseline disease risk
  # r2 from a linear regression model of genomic profile risk score
  # p  proportion of sample that are cases
  # calculates proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of
  # determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  x = qnorm(1-k)
  z = dnorm(x)
  i = z/k
  C = k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta = i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  h2l_R2 = C*r2 / (1 + C*theta*r2)
}
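# Illustrative check of the liability-scale conversion (not part of the original
# practical): the values below (K=0.01, observed-scale R2 of 0.05, 50% cases)
# are hypothetical and chosen only to show how h2l_R2() is called.
example_h2l <- h2l_R2(k=0.01, r2=0.05, p=0.5)
example_h2l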
# se of the variance explained on the liability scale from the
# variance explained on the linear scale (r2) and its se (se)
se_h2l_R2 <- function(k, r2, se, p) {
  # k  baseline disease risk
  # r2 from a linear regression model attributable to genomic profile risk score
  # se standard error of r2
  # p  proportion of sample that are cases
  # calculates the se of the proportion of variance explained on the liability scale
  # from ABC at http://www.complextraitgenomics.com/software/
  # Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of
  # determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
  # SE on the liability scale (from a Taylor series expansion)
  # var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d being calculus differentiation
  x = qnorm(1-k)
  z = dnorm(x)
  i = z/k
  C = k*(1-k)*k*(1-k)/(z^2*p*(1-p))
  theta = i*((p-k)/(1-k))*(i*((p-k)/(1-k))-x)
  se_h2l_R2 = C*(1-r2*theta)*se
}

################################################################################
### Calculate AUC and R2 estimates for one of the polygenic score thresholds ###
#### There is code below this section to loop through all of the thresholds ####
################################################################################
K <- 0.01 # risk of disease
ri <- read.table("1_scores.profile", header=T)
head(ri)

# normalize the score
ri$PHENO <- ri$PHENO - 1                             # change from 1, 2 to 0, 1
ri$ZSCORE <- (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)  # z-score of the profile score
P <- sum(ri$PHENO)/length(ri$PHENO)                  # proportion of target sample that are cases
# check out the help page ?scale for a quick way to standardize a variable

# Plot the density of the polygenic risk score for cases and controls.
#install.packages("sm")
library(sm)
sm.density.compare(ri$ZSCORE, ri$PHENO, xlab="Polygenic risk score (Z-scored)")
title(main="Polygenic Risk Score by Case Control Status")
leg <- factor(ri$PHENO, levels=c(0,1), labels=c("Control", "Case")) # PHENO is now coded 0/1
colfill <- 2:(1+length(levels(leg)))
# add legend via mouse click:
#legend(locator(1), levels(leg), fill=colfill)
legend("top", levels(leg), fill=colfill)

# logistic model
tst1 <- glm(PHENO ~ ZSCORE, data = ri, family = binomial(logit)) # logit model
tst0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))      # logit model
# view the summary results for the two models - is the polygenic score predictive of disease status?
summary(tst0)
summary(tst1)

# Cox & Snell R2
# NOTE: To calculate pseudo R2 we need the likelihood or logLikelihood of the model.
# However, glm output gives us -2*logLikelihood as the fit statistic.
# We therefore need to recompute it; the logLik() function does just that!
# See Lee SH, Goddard ME, Wray NR, Visscher PM. (2012) A better coefficient of
# determination for genetic profile analysis. Genet Epidemiol. 2012 Apr;36(3):214-24.
N <- length(ri$PHENO) # would this command work if there were missing data?
LL1 <- logLik(tst1)
LL0 <- logLik(tst0)
# NOTE: because we are using logLikelihoods we take the exponent
CSv <- 1-exp((2/N)*(LL0[1]-LL1[1]))
CSv
# NOTE: Cox & Snell R2 can never reach 1; Nagelkerke's R2 rescales it
# by its maximum possible value so that it can

# Nagelkerke's R2
NKv <- CSv/(1-exp((2/N)*LL0[1]))
NKv
# Using a package
#install.packages('fmsb')
library('fmsb')
NagelkerkeR2(tst1)

# pvalue
# NOTE: the null -2*logLikelihood is also reported in the alternative model
# tst1$null.deviance
# Interpret fit?
tst0$deviance
tst1$null.deviance
tst1$deviance
# is it significant?
devdiff <- tst0$deviance-tst1$deviance
df <- tst0$df.residual-tst1$df.residual
pval <- pchisq(devdiff,df,lower.tail=F)
pval

# linear model R2
std_y <- ri$PHENO
ri$std_y <- (std_y-mean(std_y))/sd(std_y)
lm1 <- lm(std_y ~ ZSCORE, data = ri)
lm0 <- lm(std_y ~ 1, data = ri)
R2v <- 1-exp((2/N)*(logLik(lm0)[1]-logLik(lm1)[1]))
R2v
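# Sanity check (not in the original script): for a Gaussian linear model, the
# likelihood-based R2v above should equal the usual regression R-squared,
# so the value below is expected to match R2v up to floating-point error.
summary(lm1)$r.squared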
# standard error of R2v
# from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
np <- 1 # number of parameters
# vr is se^2 of R2v
vr <- 4/length(std_y)*R2v*(1-R2v)^2*(1-(2*np+3)/length(std_y))

# calculate liability R2
h2l_r2 <- h2l_R2(K,R2v,P) # linear model
# SE on the liability scale (from a Taylor series expansion)
# var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d: calculus differentiation
se_h2l_r2 <- se_h2l_R2(K,h2l_r2,vr^.5,P)

# area under curve
#install.packages('pROC')
library('pROC')
aucv1 <- auc(ri$PHENO,tst1$linear.predictors)
aucv1
# aucv1 may be approximately close to the correct value (even though covariates are ignored)
#install.packages("PredictABEL")
library(PredictABEL)
plotROC(data=ri, cOutcome=3, predrisk=tst1$linear.predictors)

# make ntiles using the cut command - compare deciles to pentiles
nt=5
ri$ntil = cut(ri$ZSCORE, breaks=c(quantile(ri$ZSCORE, probs = seq(0, 1, by = 1/nt))),
              labels=1:nt, include.lowest=T)
tstd1 <- glm(PHENO ~ ntil, data = ri, family = binomial(logit)) # logit model, all ntiles
tstd0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))    # logit model
# Odds ratio of being a case compared to a control in each ntile
# (relative to the 1st ntile; the first element is the exponentiated intercept)
ORD <- exp(tstd1$coefficients)
ORDL <- exp(tstd1$coefficients-1.96*summary(tstd1)$coefficients[,2])
ORDH <- exp(tstd1$coefficients+1.96*summary(tstd1)$coefficients[,2])
plot(ORD,ylim=c(min(ORDL),max(ORDH)))
arrows(seq_along(ORD), ORD, seq_along(ORD), ORDH, angle=90,length=0.10) # Draw error bars
arrows(seq_along(ORD), ORD, seq_along(ORD), ORDL, angle=90,length=0.10) # Draw error bars
# Odds ratio of being a case compared to a control in each ntile relative to the 1st ntile
ORDR=ORD/ORD[1]
plot(ORDR)

###########################################################
### all of the output summarised in a single data frame ###
###########################################################
# Pd - p-value cutoff for discovery cohort
# N - total sample size
# P - proportion of sample that are cases
# NKv - Nagelkerke's R2
# pval - pvalue of the R2
# K - population risk of disease used in converting NKv to liability scale
# h2l_r2 - proportion of variance explained by the score on the liability scale
# se_h2l_r2 - standard error of h2l_r2
# aucv1 - what we think is the most appropriate estimate of AUC attributed to the score
# nt - number of ntiles (eg nt=10 for deciles)
# ORD - the odds ratio when comparing top to bottom ntile
# ORDL - lower CI of the ORD
# ORDH - upper CI of the ORD
# The following are computed in the Advanced section at the end of the script:
# pcase_con50 - proportion of cases identified when 50% of controls tested
# pcase_pop50 - proportion of cases identified when 50% of population tested
# s - proportion of cases identified
# pcont_case_s - proportion of controls tested to identify s proportion of cases
# ppop_case_s - proportion of population tested to identify s proportion of cases
O=data.frame(Pd,N,P,NKv,pval,K,h2l_r2,se_h2l_r2,aucv1,nt,ORD[nt],ORDL[nt],ORDH[nt],ORDR[nt])
print(O,digits=3)
# write a header row for the results file (the loop below appends one row per threshold)
O=data.frame("Pd","N","P","NKv","pval","K","h2l_r2","se_h2l_r2","aucv1","nt","ORD[nt]","ORDL","ORDH")
write.table(O,file="profile_out.csv",row.names=F,col.names=F,quote=F,sep=",")
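# The ntile code above suggests comparing deciles to pentiles. A minimal decile
# sketch (not part of the original practical), using separate hypothetical object
# names (nt10, ntil10, ...) so the pentile objects created above are not overwritten.
nt10 <- 10
ri$ntil10 <- cut(ri$ZSCORE, breaks=quantile(ri$ZSCORE, probs=seq(0, 1, by=1/nt10)),
                 labels=1:nt10, include.lowest=T)
tstd10 <- glm(PHENO ~ ntil10, data=ri, family=binomial(logit))
ORD10 <- exp(tstd10$coefficients)
ORD10[nt10] # odds ratio for top vs bottom decile
ORD[nt]     # compare with the top vs bottom pentile odds ratio from above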
#######################################################################################################
### The following code loops through all of the polygenic score thresholds that have been generated ###
######### There is a single output object, which houses the various AUC and R2 estimates ##############
#######################################################################################################
for(Pd in c('5e8','1e5','1e2','5e2','1e1','5e1','1')){
  # read in the score file *_scores.profile
  fname=paste(Pd,"_scores.profile",sep="")
  ri <- read.table(fname, header=T)
  # normalize the score
  ri$ZSCORE <- (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE) # z-score of the profile score
  ri$PHENO <- ri$PHENO - 1                            # change from 1, 2 to 0, 1
  P <- sum(ri$PHENO)/length(ri$PHENO)                 # proportion of target sample that are cases

  # logistic model
  tst1 <- glm(PHENO ~ ZSCORE, data = ri, family = binomial(logit)) # logit model
  tst0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))      # logit model

  # area under curve
  aucv1 <- auc(ri$PHENO,tst1$linear.predictors)
  # aucv1 may be approximately close to the correct value (even though covariates are ignored)

  # Cox & Snell R2
  # NOTE: To calculate pseudo R2 we need the likelihood or logLikelihood of the model.
  # However, glm output gives us -2*logLikelihood as the fit statistic.
  # We therefore need to recompute it; the logLik() function does just that!
  N <- length(ri$PHENO)
  LL1 <- logLik(tst1)
  LL0 <- logLik(tst0)
  # NOTE: because we are using logLikelihoods we take the exponent
  CSv <- 1-exp((2/N)*(LL0[1]-LL1[1]))
  # NOTE: Cox & Snell R2 can never reach 1; Nagelkerke's R2 rescales it
  # by its maximum possible value so that it can

  # Nagelkerke's R2
  NKv <- CSv/(1-exp((2/N)*LL0[1]))

  # pvalue
  # NOTE: the null -2*logLikelihood is also reported in the alternative model
  # tst1$null.deviance
  # Interpret fit?
  tst0$deviance
  tst1$null.deviance
  tst1$deviance
  # is it significant?
  devdiff <- tst0$deviance-tst1$deviance
  df <- tst0$df.residual-tst1$df.residual
  pval <- pchisq(devdiff,df,lower.tail=F)

  # linear model R2
  std_y <- ri$PHENO
  ri$std_y <- (std_y-mean(std_y))/sd(std_y)
  lm1 <- lm(std_y ~ ZSCORE, data = ri)
  lm0 <- lm(std_y ~ 1, data = ri)
  R2v <- 1-exp((2/N)*(logLik(lm0)[1]-logLik(lm1)[1]))
  # standard error of R2v
  # from Olkin and Finn (Psychological Bulletin, 1995, Vol. 118, No. 1, 155-164)
  np <- 1 # number of parameters
  vr <- 4/length(std_y)*R2v*(1-R2v)^2*(1-(2*np+3)/length(std_y))

  # calculate liability R2
  K <- 0.05 # baseline disease risk in the population (note: differs from the K=0.01 used above)
  h2l_r2 <- h2l_R2(K,R2v,P) # linear model
  # SE on the liability scale (from a Taylor series expansion)
  # var(h2l_r2) = [d(h2l_r2)/d(R2v)]^2 * var(R2v), with d: calculus differentiation
  se_h2l_r2 <- se_h2l_R2(K,h2l_r2,vr^.5,P)

  # make ntiles using the cut command - compare deciles to pentiles
  nt=5
  ri$ntil = cut(ri$ZSCORE, breaks=c(quantile(ri$ZSCORE, probs = seq(0, 1, by = 1/nt))),
                labels=1:nt, include.lowest=T)
  tstd1 <- glm(PHENO ~ ntil, data = ri, family = binomial(logit)) # logit model, all ntiles
  tstd0 <- glm(PHENO ~ 1, data = ri, family = binomial(logit))    # logit model
  # Odds ratio of being a case compared to a control in each ntile
  ORD <- exp(tstd1$coefficients)
  ORDL <- exp(tstd1$coefficients-1.96*summary(tstd1)$coefficients[,2])
  ORDH <- exp(tstd1$coefficients+1.96*summary(tstd1)$coefficients[,2])
  # Odds ratio of being a case compared to a control in each ntile relative to the 1st ntile
  ORDR=ORD/ORD[1]

  # output
  # Pd - p-value cutoff for discovery cohort
  # N - total sample size
  # P - proportion of sample that are cases
  # NKv - Nagelkerke's R2
  # pval - pvalue of the R2
  # K - population risk of disease used in converting NKv to liability scale
  # h2l_r2 - proportion of variance explained by the score on the liability scale
  # se_h2l_r2 - standard error of h2l_r2
  # aucv1 - what we think is the most appropriate estimate of AUC attributed to the score
  # ORD - the odds ratio when comparing top to bottom ntile
  # ORDL - lower CI of the ORD
  # ORDH - upper CI of the ORD
  O=data.frame(Pd,N,P,NKv,pval,K,h2l_r2,se_h2l_r2,aucv1,nt,ORD[nt],ORDL[nt],ORDH[nt])
  print(O,digits=3,row.names=F)
  write.table(O,file="profile_out.csv",row.names=F,col.names=F,quote=F,append=T,sep=",")
}

######################################################
# Advanced
######################################################
# Calculate the proportion of the population that needs to be
# screened to capture proportion s of cases.
K <- 0.01 # risk of disease
ri <- read.table("1_scores.profile", header=T)
head(ri)
# normalize the score
ri$PHENO <- ri$PHENO - 1                             # change from 1, 2 to 0, 1
ri$ZSCORE <- (ri$SCORE-mean(ri$SCORE))/sd(ri$SCORE)  # z-score of the profile score
library(PredictABEL)
# tst1 should be the logistic model fitted to these scores (re-fit it here if needed);
# h2l_r2 below is taken from the most recent calculation above
plotROC(data=ri, cOutcome=3, predrisk=tst1$linear.predictors)

# Proportion of cases captured when the population screened includes 50% of all controls
T0 = qnorm(1-K)
z = dnorm(T0)
i = z/K
v = -i*K/(1-K)                        # mean liability of controls
vcase = h2l_r2*(1-h2l_r2*i*(i-T0))    # variance in cases
vcont = h2l_r2*(1-h2l_r2*v*(v-T0))    # variance in controls
t50 = qnorm(0.5,v*h2l_r2,sqrt(vcont)) # score threshold above which 50% of controls lie
pcase_con50 = 1-pnorm(t50,i*h2l_r2,sqrt(vcase)) # prop of cases captured when pop screened includes 50% of controls
pcase_con50

# Proportion of cases captured when 50% of the population has been screened
t50 = qnorm(0.5,0,sqrt(h2l_r2))
pcase_pop50 = 1-pnorm(t50,i*h2l_r2,sqrt(vcase)) # prop of cases captured when 50% of pop screened
pcase_pop50

# What proportion of the population is needed to capture proportion s of cases?
s = 0.5
ts = qnorm(s,i*h2l_r2,sqrt(vcase))
pcont_case_s = 1-pnorm(ts,v*h2l_r2,sqrt(vcont)) # prop of controls screened when s proportion of cases is captured
pcont_case_s
ppop_case_s = 1-pnorm(ts,0,sqrt(h2l_r2))        # prop of pop screened to capture s proportion of cases
ppop_case_s

# calculate the AUC implied by the variance explained on the liability scale
# probability of a case being ranked higher than a control
aucchk = pnorm((i-v)*h2l_r2/(sqrt(vcase+vcont)))
aucchk
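########## Optional: compare thresholds ##############
# A minimal sketch (not part of the original practical) for inspecting profile_out.csv,
# assuming the loop above was run once so the file holds the header row plus one row
# per P-value threshold, in loop order (delete any old profile_out.csv before re-running).
res <- read.csv("profile_out.csv", header=T, check.names=F)
pd_labels <- c('5e8','1e5','1e2','5e2','1e1','5e1','1') # same order as the loop
# Nagelkerke's R2 by discovery P-value threshold
barplot(res$NKv, names.arg=pd_labels, xlab="Discovery P-value threshold", ylab="Nagelkerke R2")
# liability-scale R2 with an approximate 95% CI (h2l_r2 +/- 1.96*se)
lo <- res$h2l_r2 - 1.96*res$se_h2l_r2
hi <- res$h2l_r2 + 1.96*res$se_h2l_r2
plot(seq_along(res$h2l_r2), res$h2l_r2, pch=19, xaxt="n", ylim=range(c(lo,hi)),
     xlab="Discovery P-value threshold", ylab="Liability-scale R2")
axis(1, at=seq_along(res$h2l_r2), labels=pd_labels)
arrows(seq_along(lo), lo, seq_along(hi), hi, angle=90, code=3, length=0.05)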