############################################################################################################################## ################################################ DATA HARMONIZATION PRACTICAL ################################################ ############################################################################################################################## #With thanks to Kaitlin Wade rm(list=ls()) #Options(width = 500) #################### # PRELIMINARY STEPS # #################### #If you haven't already installed the necessary packages with libraries, please do so! For this practical, you will need: #install.packages("metafor") #install.packages("plyr") #install.packages("meta") #install.packages("rmeta") library(metafor) library(plyr) library(meta) library(rmeta) # Set working directory #setwd("/home/cflatl/data/module4/Practical 3 Two sample Mendelian randomization/") # Set the file path to your local directory ################################################################# PART 1 ################################################################# ################################################################### # UNDERSTANDING DATA AND DEFINING OUR GENETIC INSTRUMENTS FOR BMI # ################################################################### ### 1. We will be using results from the Locke et al. 2015 paper using data from the GIANT Consortium (downloaded from https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files) # Note: If you want to have a look at the full GIANT data, then download, save to your working directory and load in the full results (this is a fairly large file) using the following code: # giant_full <- read.table("./GIANT_raw.uniq", header = T) # In the Locke et al. paper, the authors describe a certain number of SNPs that are "approximately independently associated with BMI" across all ancestries. Let's have a look at these SNPs. # Note: This file was generated by taking the first rows from the Supplementary Table 8 of the Locke et al. paper snps <- read.csv("giant_snps_all.csv", header = T) # a. How many SNPs do authors describe as being independently associated with BMI in all ancestries? dim(snps) head(snps) # b. What information does this file contain that are needed for Mendelian randomization analyses? colnames(snps) # c. How many are associated within only Europeans? table(snps$Sig_Analysis) # d. Why might it be best to use the SNPs that have been identified as being associated with BMI in Europeans only? ### 2. Read in the second sheet of this file to get the estimates of the SNPs associated with BMI in Europeans. euro_snps <- read.csv("./giant_snps_euro.csv", header = T) dim(euro_snps) head(euro_snps) # a. Check that these are all associated with BMI at a conventional level of genome-wide significance sort(euro_snps$P) length(which(euro_snps$P<=5E-8)) # b. Are all of these SNPs "good instruments"? What else might we want to check to see if they are strongly and independently associated with BMI? ### 3. We're going to make sure the effect allele is the allele that increases BMI using the effect allele and beta column # Browse the data, are all SNPs coded so that the effect allele increases BMI? # a. Are all SNP effects in the same direction? euro_snps[,c("SNP","Effect_Allele","Beta","SE","P")] summary(euro_snps$Beta) ################################################################# PART 2 ################################################################# ############################################################ #SNP LOOKUP IN CARDIOGRAM, A GWAS OF CORONARY HEART DISEASE# ############################################################ ### 1. These were downloaded from the CARDIOGRAM website http://www.cardiogramplusc4d.org/ and provided for you (however, this is quite a large file so we have truncated for you using the following code): # CARDIOGRAM <- read.table("./CARDIoGRAM_GWAS_RESULTS.txt", header = T) # BMI_SNPS_in_CARDIOGRAM <- CARDIOGRAM[CARDIOGRAM$SNP %in% euro_snps$SNP,] # top_1000 <- CARDIOGRAM[c(1:1000),] # bottom_1000 <- CARDIOGRAM[c((nrow(CARDIOGRAM)-1000):(nrow(CARDIOGRAM))),] # CARDIOGRAM_TRUNC <- rbind(BMI_SNPS_in_CARDIOGRAM, top_1000, bottom_1000) # write.table(CARDIOGRAM_TRUNC, "./CARDIOGRAM_CLEANED.txt", quote = F, col.names = T, row.names = F, sep = "\t") # a. How many SNPs are in this truncated CARDIOGRAM dataset? CARDIOGRAM <- read.table("./CARDIOGRAM_CLEANED.txt", sep="\t", header=T, colClasses = "character") dim(CARDIOGRAM) head(CARDIOGRAM) # b. Does this file contain everything that is required to perform a two-sample Mendelian randomization analysis? colnames(CARDIOGRAM) ### 2. How many of the BMI SNPs are included in the CARDIOGRAM dataset? BMI_SNPs <- euro_snps$SNP BMI_SNPs <- as.vector(BMI_SNPs) matches <- unique(grep(paste(BMI_SNPs, collapse="|"), CARDIOGRAM$SNP, value=TRUE)) matches # View the data from CARDIOGRAM for our BMI SNPs CARDIOGRAM_BMI <- CARDIOGRAM[grepl(paste(BMI_SNPs, collapse="|"), CARDIOGRAM$SNP),] CARDIOGRAM_BMI # 3. Merge the GIANT and CARDIOGRAM SNP summary associations # First, make sure the column headings are easy to understand (i.e., add "BMI" and "CHD" onto the respective datasets) colnames(euro_snps) <- paste("BMI", colnames(euro_snps), sep = "_") head(euro_snps) colnames(CARDIOGRAM) <- paste("CHD", colnames(CARDIOGRAM), sep = "_") head(CARDIOGRAM) merged <- merge(euro_snps,CARDIOGRAM, by.x="BMI_SNP", by.y="CHD_SNP") dim(merged) head(merged) ################################################################# PART 3 ################################################################# ##################################################################### #HARMONIZING THE EFFECT ALLELES IN THE GIANT AND CARDIOGRAM DATASETS# ##################################################################### ### 1. Make sure that the effect alleles in the CARDIOGRAM and GIANT datasets are the same. We want the cardiogram effect allele to be the allele that increases BMI # But be careful of palindromic SNPs or SNPs on different strands # First we need to see whether the effect alleles are the same # Browse the data merged[,c("BMI_SNP", "BMI_Effect_Allele","CHD_reference_allele","BMI_Other_Allele","CHD_other_allele","BMI_EAF","CHD_ref_allele_frequency")] # a. How can we tell if the CARDIOGRAM and GIANT SNPs are coded using the same reference strand? # b. Are CARDIOGRAM and the GIANT SNPs coded using the same reference strand? # c. Are there any palindromic SNPs? palindromic_at<-subset(merged,BMI_Effect_Allele %in% "A" & BMI_Other_Allele %in% "T") palindromic_ta<-subset(merged,BMI_Effect_Allele %in% "T" & BMI_Other_Allele %in% "A") palindromic_gc<-subset(merged,BMI_Effect_Allele %in% "G" & BMI_Other_Allele %in% "C") palindromic_cg<-subset(merged,BMI_Effect_Allele %in% "C" & BMI_Other_Allele %in% "G") dim(palindromic_at) dim(palindromic_ta) dim(palindromic_gc) dim(palindromic_cg) # d. How can we tell whether the effect alleles are the same in both datasets for palindromic SNPs (i.e., the allele that increases BMI is the same as the reference allele in CARDIOGRAM)? ### 2. Make sure the CARDIOGRAM log odds ratio reflects the allele that increases BMI in the GIANT data # First, find the positions of SNPs with different effect alleles head(merged) effect_diff <- which(merged$BMI_Effect_Allele != merged$CHD_reference_allele) # The position of SNPs where effect alleles are different merged[effect_diff,c("BMI_SNP", "BMI_Effect_Allele","CHD_reference_allele","BMI_Other_Allele","CHD_other_allele","BMI_EAF","CHD_ref_allele_frequency")] # a. How many SNPs have effect alleles that are coded in the opposite direction? # b. Where the effect alleles are different, flip the direction of the log odds ratio by multiplying it by -1 # If you want, you can also generate new columns that reflect the effect allele change but this isn't used in the causal estimate merged$CHD_flip_log_odds <- as.numeric(merged$CHD_log_odds) # Mke log odds ratio numeric merged$CHD_log_odds_se <- as.numeric(merged$CHD_log_odds_se) # Make standard error numeric merged$CHD_flip_log_odds[effect_diff] <- merged$CHD_flip_log_odds[effect_diff]*(-1) head(merged) dim(merged) # c. Check that all of the effect estimates have been flipped appropriately merged[effect_diff,c("BMI_SNP", "BMI_Effect_Allele","CHD_reference_allele", "CHD_log_odds", "CHD_flip_log_odds")] merged[-effect_diff,c("BMI_SNP", "BMI_Effect_Allele","CHD_reference_allele", "CHD_log_odds", "CHD_flip_log_odds")] # d. Check that the effect allele frequencies are correlated # Check correlation of effect allele frequency between BMI and CARDIOGRAM datasets before harmonising alleles merged$BMI_EAF <- as.numeric(merged$BMI_EAF) merged$CHD_ref_allele_frequency <- as.numeric(merged$CHD_ref_allele_frequency) cor(merged$BMI_EAF,merged$CHD_ref_allele_frequency) # Check correlation of effect allele frequency between BMI and CARDIOGRAM datasets after harmonising alleles merged$CHD_ref_allele_frequency[effect_diff] <- 1-merged$CHD_ref_allele_frequency[effect_diff] cor(merged$BMI_EAF,merged$CHD_ref_allele_frequency) # e. What happened and why? ################################################################# PART 4 ################################################################# ################################### #ESTIMATE THE EFFECT OF BMI ON CHD# ################################### ### 1. Estimate the Wald ratios for each SNP and their delta approximated standard errors gp <- merged$BMI_Beta # The effect of the SNP on BMI segp <- merged$BMI_SE # The standard error of the SNP effect on BMI gd <- merged$CHD_flip_log_odds # The log odds ratio for CHD (that were harmonized to reflect an increase in BMI) segd <- merged$CHD_log_odds_se # Standard error of the log odds ratio wald_ratio <- gd/gp # The log odds ratios of CHD per unit change in BMI Cov <- 0 # Only required when the SNP-BMI and SNP-CHD associations are estimated in the same participants (therefore for two-sample MR with non-overlapping samples, this is set to 0) wald_ratio_se <- sqrt((segd^2/gp^2) + (gd^2/gp^4)*segp^2 - 2*(gd/gp^3)*Cov) # Delta approximated standard error of the wald ratio; see Thomas, D. C., Lawlor, D. a, & Thompson, J. R. (2007). Re: Estimation of bias in nongenetic observational studies using Mendelian triangulation by Bautista et al. Annals of Epidemiology, 17(7), 5113. doi:10.1016/j.annepidem.2006.12.005 z <- wald_ratio/wald_ratio_se # Z statistic for the wald ratio p <- 2*pnorm(abs(z) ,lower.tail=F) # P value for the z statistics under the null hypothesis that there is not effect wald_ratio_var = wald_ratio_se^2 # Variance weight <- 1/wald_ratio_var # Inverse variance weight snps <- merged$BMI_SNP # SNPs that we will use in the estimates ### 2. Combine the Wald ratios by fixed effects meta-analysis meta_results <- metagen(wald_ratio,wald_ratio_se,comb.fixed=T,sm="OR") #combine the SNPs by fixed effects meta-analysis meta_results ### 3. Create a nice table of the results, which you could export to other programs e.g. excel, STATA etc mr_results <- data.frame(matrix(c(as.character(merged$BMI_SNP),round(wald_ratio,2),round(wald_ratio_se,2),round(p,3)),nrow=length(merged$BMI_SNP),ncol=4)) names(mr_results) <- c("SNP","log_odds","se","p") mr_results_order <- mr_results[order(mr_results$log_odds),] overall_genetic_effect <- data.frame(matrix(c("Overall genetic effect",meta_results$TE.fixed,meta_results$seTE.fixed,meta_results$pval.fixed),nrow=1,ncol=4)) names(overall_genetic_effect) <- c("SNP","log_odds","se","p") overall_genetic_effect twosampleResults <- rbind.fill(mr_results_order,overall_genetic_effect) twosampleResults # Calculate heterogeneity p value p_het <- pchisq(meta_results$Q,meta_results$df.Q,lower.tail=F) twosampleResults$p_chi<-NA # Add the p value for heterogeneity to the results table twosampleResults$p_chi[twosampleResults$SNP=="Overall genetic effect"] <- p_het #write.table(twosampleResults,"./twosample_results_BMI_CHD.txt",sep="\t",col.names=T,row.names=F,quote=F) twosampleResults ### 4. Create a forest plot of the results and compare the genetic and observational associations # The observational effect is 1.23 (95% CI: 1.17, 1.29) per 4.56 kg/m2 (i.e., per SD) increase in BMI # Formula for SE from 95% confidence interval: (log(uci)-log(lci))/(1.96*2) effect <- c(wald_ratio[order(wald_ratio)],meta_results$TE.fixed,log(1.23)) se <- c(wald_ratio_se[order(wald_ratio)],meta_results$seTE.fixed,(log(1.29)-log(1.17))/(1.96*2)) snps <- c(as.character(merged$BMI_SNP)[order(wald_ratio)],"Overall genetic effect","Observational effect") metaplot(effect,se,labels=snps,conf.level=0.95,logeffect=T,nn=0.1,boxsize=1,xlab="Odds ratio & 95% confidence interval",ylab="SNP",cex=0.7) ### 5. Interpret the results # a. Is the MR-derived effect similar to the observational association? # b. Is there evidence of heterogeneity in the genetic effects? How do you interpret this? # c. Can you think of reasons for caution? ################################################################# PART 5 ################################################################# ###################### #SENSITIVITY ANALYSES# ###################### ## All that is required is summary level results for each SNP (remember gp, segp, gd, segd from PART 4) #gp <- merged$BMI_Beta # The effect of the SNP on BMI #segp <- merged$BMI_SE # The standard error of the SNP effect on BMI #gd <- merged$CHD_flip_log_odds # The log odds ratio for CHD (that were harmonized to reflect an increase in BMI) #segd <- merged$CHD_log_odds_se # Standard error of the log odds ratio ### 1. These functions define the IVW, MR-Egger, weighted median and weighted mode estimators, respectively, and a function that wraps up the results #set seed for replication purposes set.seed(50) two.sample.iv.ivw <- function(x, y, sigmax, sigmay) { beta.ivw.fit = summary(lm(y~x-1, weights=sigmay^-2)) beta.ivw.fit.only = lm(y~x-1, weights=sigmay^-2) beta.ivw = beta.ivw.fit$coef[1,1] beta.se.ivw = beta.ivw.fit$coef[1,2]/min(beta.ivw.fit$sigma,1) beta.df.ivw = length(y) - 1 beta.p.ivw = 2*(1-pt(abs(beta.ivw/beta.se.ivw),beta.df.ivw)) beta.lower.ivw = beta.ivw + (-1*qt(df=beta.df.ivw, 0.975)*beta.se.ivw) beta.upper.ivw = beta.ivw + (1*qt(df=beta.df.ivw, 0.975)*beta.se.ivw) return(list(beta.ivw=beta.ivw,beta.se.ivw=beta.se.ivw,beta.lower.ivw=beta.lower.ivw,beta.upper.ivw=beta.upper.ivw,beta.t.ivw=beta.ivw/beta.se.ivw,beta.p.ivw=beta.p.ivw, beta.ivw.fit.only=beta.ivw.fit.only,beta.df.ivw=beta.df.ivw,beta.ivw.fit=beta.ivw.fit)) } weighted.median <- function(x, w) { N = length(x) ord = order(x); x = x[ord]; w = w[ord]; Sn = cumsum(w) S_N = Sn[N] Pn = (100/S_N)*(Sn-w/2) if(sort(abs(Pn-50))[1] == 0){M = which(Pn==50); return(x[M])} Q = length(Pn[sign(Pn-50)==-1]) V1 = Q; V2 = Q+1 M = x[V1] + (50 - Pn[V1])*(x[V2]-x[V1])/(Pn[V2]-Pn[V1]) return(list(beta.median=M,CumSum.median=Sn,ordX.median=x)) } weighted.median.boot <- function(x, y, sigmax, sigmay, Nsim, alpha, W) { med = NULL for (i in 1:Nsim){ y_boot = rnorm(length(y), mean=y, sd=sigmay) x_boot = rnorm(length(x), mean=x, sd=sigmax) iv_boot = y_boot/x_boot run = weighted.median(iv_boot,W) med[i] = run$beta.median } lower = Nsim*alpha/2 upper = Nsim*(1-alpha/2) Sort = sort(med) lowerCI = Sort[lower] upperCI = Sort[upper] se = sd(med) t = mean(med)/se p = 2*(1-pt(abs(t),length(y)-1)) return(list(beta.se.median=se,beta.lower.median=lowerCI,beta.upper.median=upperCI,beta.t.median=t,beta.p.median=p)) } two.sample.iv.egger <- function(x, y, sigmax, sigmay) { egger.fit = summary(lm(y~x, weights=sigmay^-2)) df.egger = length(y) - 2 beta.egger = egger.fit$coef[2,1] beta.se.egger = egger.fit$coef[2,2] / min(egger.fit$sigma, 1) beta.p.egger = 2*(1-pt(abs(beta.egger/beta.se.egger),df.egger)) beta.lower.egger = beta.egger + (-1*qt(df=df.egger, 0.975)*beta.se.egger) beta.upper.egger = beta.egger + (1*qt(df=df.egger, 0.975)*beta.se.egger) alpha.egger = egger.fit$coef[1,1] alpha.se.egger = egger.fit$coef[1,2] / min(egger.fit$sigma, 1) alpha.p.egger = 2*(1-pt(abs(alpha.egger/alpha.se.egger),df.egger)) alpha.lower.egger = alpha.egger + (-1*qt(df=df.egger, 0.975)*alpha.se.egger) alpha.upper.egger = alpha.egger + (1*qt(df=df.egger, 0.975)*alpha.se.egger) return(list(beta.egger=beta.egger,beta.se.egger=beta.se.egger,beta.lower.egger=beta.lower.egger,beta.upper.egger=beta.upper.egger,beta.t.egger=beta.egger/beta.se.egger,beta.p.egger=beta.p.egger, alpha.egger=alpha.egger,alpha.se.egger=alpha.se.egger,alpha.lower.egger=alpha.lower.egger,alpha.upper.egger=alpha.upper.egger,alpha.t.egger=alpha.egger/alpha.se.egger,alpha.p.egger=alpha.p.egger)) } ModeEstimator <- function(x, y, sigmax, sigmay, phi=c(1,0.5,0.25), n_boot=1e4, alpha=0.05) { beta <- function(BetaIV.in, seBetaIV.in) { s <- 0.9*(min(sd(BetaIV.in), mad(BetaIV.in)))/length(BetaIV.in)^(1/5) weights <- seBetaIV.in^-2/sum(seBetaIV.in^-2) beta <- NULL for(cur_phi in phi) { h <- s*cur_phi densityIV <- density(BetaIV.in, weights=weights, bw=h) beta[length(beta)+1] <- densityIV$x[densityIV$y==max(densityIV$y)] } return(beta) } boot <- function(BetaIV.in, seBetaIV.in, beta_Mode.in) { beta.boot <- matrix(nrow=n_boot, ncol=length(beta_Mode.in)) for(i in 1:n_boot) { BetaIV.boot <- rnorm(length(BetaIV.in), mean=BetaIV.in, sd=seBetaIV.in[,1]) BetaIV.boot_NOME <- rnorm(length(BetaIV.in), mean=BetaIV.in, sd=seBetaIV.in[,2]) beta.boot[i,1:length(phi)] <- beta(BetaIV.in=BetaIV.boot, seBetaIV.in=rep(1, length(BetaIV))) beta.boot[i,(length(phi)+1):(2*length(phi))] <- beta(BetaIV.in=BetaIV.boot, seBetaIV.in=seBetaIV.in[,1]) beta.boot[i,(2*length(phi)+1):(3*length(phi))] <- beta(BetaIV.in=BetaIV.boot_NOME, seBetaIV.in=rep(1, length(BetaIV))) beta.boot[i,(3*length(phi)+1):(4*length(phi))] <- beta(BetaIV.in=BetaIV.boot_NOME, seBetaIV.in=seBetaIV.in[,2]) } return(beta.boot) } BetaIV <- y/x seBetaIV <- cbind(sqrt((sigmay^2)/(x^2) + ((y^2)*(sigmax^2))/(x^4)), sigmay/abs(x)) beta_SimpleMode <- beta(BetaIV.in=BetaIV, seBetaIV.in=rep(1, length(BetaIV))) beta_WeightedMode <- beta(BetaIV.in=BetaIV, seBetaIV.in=seBetaIV[,1]) beta_WeightedMode_NOME <- beta(BetaIV.in=BetaIV, seBetaIV.in=seBetaIV[,2]) beta_Mode <- rep(c(beta_SimpleMode, beta_WeightedMode, beta_SimpleMode, beta_WeightedMode_NOME)) beta_Mode.boot <- boot(BetaIV.in=BetaIV, seBetaIV.in=seBetaIV, beta_Mode.in=beta_Mode) se_Mode <- apply(beta_Mode.boot, 2, mad) CIlow_Mode <- beta_Mode-qnorm(1-alpha/2)*se_Mode CIupp_Mode <- beta_Mode+qnorm(1-alpha/2)*se_Mode P_Mode <- pt(abs(beta_Mode/se_Mode), df=length(x)-1, lower.tail=F)*2 Method <- rep(c('Simple', 'Weighted', 'Simple (NOME)', 'Weighted (NOME)'), each=length(phi)) Results <- data.frame(Method, phi, beta_Mode, se_Mode, CIlow_Mode, CIupp_Mode, P_Mode) colnames(Results) <- c('Method', 'phi', 'Estimate', 'SE', 'CI_low', 'CI_upp', 'P') return(Results) } MR_output <- function(ivw,egger,median, mode) { output = data.frame(matrix(NA, nrow=5, ncol=7)) names(output) = c("test", "parameter", "estimate", "se", "lower_CI", "upper_CI","p_value") output[1:5,1] = c("IVW","MR-Egger","MR-Egger","Weighted_median","Weighted_mode") output[1:5,2] = c("beta","beta","alpha","beta","beta") output[1,3:7] = c(IVW$beta.ivw,IVW$beta.se.ivw,IVW$beta.lower.ivw,IVW$beta.upper.ivw,IVW$beta.p.ivw) output[2,3:7] = c(Egger$beta.egger,Egger$beta.se.egger,Egger$beta.lower.egger,Egger$beta.upper.egger,Egger$beta.p.egger) output[3,3:7] = c(Egger$alpha.egger,Egger$alpha.se.egger,Egger$alpha.lower.egger,Egger$alpha.upper.egger,Egger$alpha.p.egger) output[4,3:7] = c(Median$beta.median,MedianBoot$beta.se.median,MedianBoot$beta.lower.median,MedianBoot$beta.upper.median,MedianBoot$beta.p.median) output[5,3:7] = c(Mode$Estimate[Mode$Method=="Weighted (NOME)" & Mode$phi==1.00],Mode$SE[Mode$Method=="Weighted (NOME)" & Mode$phi==1.00],Mode$CI_low[Mode$Method=="Weighted (NOME)" & Mode$phi==1.00],Mode$CI_upp[Mode$Method=="Weighted (NOME)" & Mode$phi==1.00],Mode$P[Mode$Method=="Weighted (NOME)" & Mode$phi==1.00]) return(output) } ### 2. Use the functions to estimate the results IVW <- two.sample.iv.ivw(gp,gd,segp,segd) Egger <- two.sample.iv.egger(gp,gd,segp,segd) Median <- weighted.median(wald_ratio,weight) MedianBoot <- weighted.median.boot(gp,gd,segp,segd,1000,0.05,weight) Mode <- ModeEstimator(gp,gd,segp,segd) sensitivity <- MR_output(IVW,Egger,Median,Mode) sensitivity #write.table(sensitivity,"./twosample_sensitivity_BMI_CHD.txt",sep="\t",col.names=T,row.names=F,quote=F) ###3 Interpret the results # a. How consistent do the estimates of the causal effect look across the different approaches? # b. Does a p value of p=0.64 for the MR Egger intercept indicate the pleiotropy is not present in the data? # c. Should we be concerned that the p value for the slope of MR Egger is > 0.05?