########################################################## ########## mQTL Analysis using MatrixEQTL ################ ########################################################## # Load required packages library(MatrixEQTL) library(stringr) library(ggplot2) library(ggpubr) library(snpStats) setwd("/working/sally/mQTL_Data/") # Replace with your working directory ### 1. File Preparation ### # Load methylation data meth <- read.table("/data/module2/mqtl/BSGS_Methylation_chr2_Norm_Beta.txt", header=T, check.names = F) meth[1:10,1:10] # Load probe information probe <- read.csv("/data/module2/mqtl/BSGS_Probe_Info.csv", header=T) probe[1:10,1:13] # We only require the probe/CpG site identifier, the chromosome number and the bp position probe_info <- probe[,c(1,12:13)] probe_info[1:10,] # Load genotype data (this data is in plink variant-major additive format to display the allele dosages, produced by "--recode A-transpose") geno <- read.table("/data/module2/mqtl/BSGS_Genotypes_chr2_clean.traw", header=T, check.names = F) geno[1:10,1:10] # We require the variant identifier, the chromosome number and the bp position in a separate file rownames(geno) <- geno$SNP geno[1:10,1:10] snp_pos <- geno[,c(2,1,4)] snp_pos[1:10,] # We require the genotype information for each individual in a separate file geno <- geno[,7:237] geno[1:10,1:10] # Sample ids need to be consistent between the files. In this dataset the number preceding "_" was already removed from the methylation sample IDs and so we will remove this from the genotype IDs. colnames(geno) <- str_split_fixed(colnames(geno), "_", 2)[,2] geno[1:10,1:10] # Load covariate data cov <- read.table("/data/module2/mqtl/BSGS_Cov.txt", header = T, check.names = F) cov[1:10,1:12] # Check that the same individuals are in the methylation, genotype and covariate tables. length(which(colnames(meth)%in%colnames(geno))) length(which(colnames(meth)%in%cov$IID)) # Check that the same probes are in the methylation and probe information tables. length(which(rownames(meth)%in%probe_info$IlmnID)) # Label columns and order the variant information table colnames(snp_pos) <- c("snp","chr","pos") snp_pos[1:10,] snp_pos <- snp_pos[order(snp_pos$snp),] write.table(snp_pos,"SNP_Pos.txt", row.names = F, quote = F, sep = "\t") # Label columns and order the probe information table colnames(probe_info) <- c("geneid","chr","s1") # We require a start and stop bp position for the probes. We have the start position and each probe is 50bp long so we can calculate the stop position. probe_info$s2 <- probe_info$s1+50 probe_info[1:10,] probe_info <- probe_info[order(probe_info$geneid),] write.table(probe_info,"Probe_Pos.txt", row.names = F, quote = F, sep = "\t") # The first column of the genotype table needs to contain variant identifiers. Both columns and rows are ordered. geno$id <- rownames(geno) geno <- geno[order(rownames(geno)),order(colnames(geno))] geno <- geno[,c(232,1:231)] geno[1:10,1:10] # Check that the variant identifiers are in the same order between the variant information table and the genotype table length(which(geno$id==snp_pos$snp)) write.table(geno,"BSGS_Genotype.txt", row.names = F, quote = F, sep = "\t") # The first column of the methylation table needs to contain probe identifiers. Both columns and rows are ordered. meth$id <- rownames(meth) meth <- meth[order(rownames(meth)),order(colnames(meth))] meth <- meth[,c(232,1:231)] meth[1:10,1:10] # Check that the probe identifiers are in the same order between the probe information table and the methylation table length(which(meth$id==probe_info$geneid)) write.table(meth,"BSGS_Methylation.txt", row.names = F, quote = F, sep = "\t") # We prepare the covariate file by removing unnecessary columns and recoding categorical variables as binary variables. # Individuals should be orientated as columns and covariates as rows. cov$FID <- NULL colnames(cov)[1] <- "id" cov$Sex[which(cov$Sex=="M")] <- 1 cov$Sex[which(cov$Sex=="F")] <- 0 cov[1:10,1:11] # A quick way of recoding a categorical variable into a series of binary variables is to use the model.matrix function cov$Slide <- as.factor(cov$Slide) Slide <- cov$Slide slide <- as.data.frame(model.matrix(~ Slide - 1)) slide[1:10,] cov2 <- cbind(cov,slide) cov2$Slide <- NULL # collinear variables must be removed cov2$Slide7512578087 <- NULL cov3 <- as.data.frame(t(cov2)) cov3[,1:5] write.table(cov3,"BSGS_covariates.txt", row.names = T, col.names=F, quote = F, sep = "\t") #### 2. Run MatrixEQTL Analysis #### # Location of the data files. base.dir = "/working/sally/mQTL_Data"; # Replace with your working directory ## Settings # Genotype and variant information file names SNP_file_name = paste(base.dir,"/BSGS_Genotype.txt",sep=""); snps_location_file_name = paste(base.dir,"/SNP_Pos.txt",sep=""); snps <- read.table(SNP_file_name, sep="\t", header=T, row.names = 1, check.names = F) snps[1,] # Methylation and probe information file names expression_file_name = paste(base.dir,"/BSGS_Methylation.txt",sep=""); gene_location_file_name = paste(base.dir,"/Probe_Pos.txt",sep=""); expr = read.table(expression_file_name, sep="\t", header=T, row.names = 1, check.names = F) expr[1,] # Covariate file name covariates_file_name = paste(base.dir,"/BSGS_covariates.txt",sep=""); cvrt = read.table(covariates_file_name, sep = "\t", header=T, row.names = 1, check.names = F) cvrt[1,] # Example of basic linear regression m1 =as.numeric(expr[1,]) s1 =as.numeric(snps[1,]) lm1 = lm(m1~s1) summary(lm1) plot(m1~jitter(s1), col=(s1+1), xaxt="n", xlab="Genotype", ylab="Methylation") axis(1,at=(0:2),labels=c("AA","Aa","aa")) lines(lm1$fitted.values~s1,type="b",pch=15,col="darkgrey") png("Example.png", width=3000, height=3000, res = 300) plot(m1~jitter(s1), col=(s1+1), xaxt="n", xlab="Genotype", ylab="Methylation") axis(1,at=(0:2),labels=c("AA","Aa","aa")) lines(lm1$fitted.values~s1,type="b",pch=15,col="darkgrey") dev.off() # Output file names output_file_name_cis = tempfile(); output_file_name_tra = tempfile(); # Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS # Only associations significant at this level will be saved pvOutputThreshold_cis = 1e-5; pvOutputThreshold_tra = 0; # Error covariance matrix # Set to numeric() for identity. errorCovariance = numeric(); # Distance for local gene-SNP pairs cisDist = 2.5e5; ## Load genotype data snps = SlicedData$new(); snps$fileDelimiter = "\t"; # the TAB character snps$fileOmitCharacters = "NA"; # denote missing values; snps$fileSkipRows = 1; # one row of column labels snps$fileSkipColumns = 1; # one column of row labels snps$fileSliceSize = 2000; # read file in slices of 2,000 rows snps$LoadFile(SNP_file_name); ## Load methylation data gene = SlicedData$new(); gene$fileDelimiter = "\t"; # the TAB character gene$fileOmitCharacters = "NA"; # denote missing values; gene$fileSkipRows = 1; # one row of column labels gene$fileSkipColumns = 1; # one column of row labels gene$fileSliceSize = 2000; # read file in slices of 2,000 rows gene$LoadFile(expression_file_name); ## Load covariates cvrt = SlicedData$new(); cvrt$fileDelimiter = "\t"; # the TAB character cvrt$fileOmitCharacters = "NA"; # denote missing values; cvrt$fileSkipRows = 1; # one row of column labels cvrt$fileSkipColumns = 1; # one column of row labels if(length(covariates_file_name)>0) { cvrt$LoadFile(covariates_file_name); } ## Run the analysis snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE); genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE); me = Matrix_eQTL_main( snps = snps, gene = gene, cvrt = cvrt, output_file_name = output_file_name_tra, pvOutputThreshold = pvOutputThreshold_tra, useModel = useModel, errorCovariance = errorCovariance, verbose = TRUE, output_file_name.cis = output_file_name_cis, pvOutputThreshold.cis = pvOutputThreshold_cis, snpspos = snpspos, genepos = genepos, cisDist = cisDist, pvalue.hist = "qqplot", min.pv.by.genesnp = FALSE, noFDRsaveMemory = FALSE); ## Results: unlink(output_file_name_tra); unlink(output_file_name_cis); dim(me$cis$eqtls) me$cis$eqtls[1:10,] results <- me$cis$eqtls ## Apply a more stringent correction for multiple testing (Bonferroni) # Running it without a significance threshold will return all SNP x Probe combinations within cis distance specified. # Total number of cis tests is 38,735,041. This number can be used when applying a Bonferroni significance cutoff. sig <- which(results$pvalue<(0.05/38735041)) sig_results <- results[sig,] write.csv(sig_results,"mQTL_output.csv", quote = F, row.names = F) ## Select the lead mQTL for each probe/CpG site sig_results <- sig_results[order(sig_results$pvalue),] head(sig_results) sig_results_lead <- sig_results[!duplicated(sig_results$gene),] head(sig_results_lead) #### 3. Plotting mQTLs #### # Format methylation data meth[1:10,1:10] meth$id <- NULL meth[1:10,1:10] # Set variant and probe of interest variant <- "rs842663" cpg <- "cg22276800" # Read in genotype data for SNP of interest (genotypes are in plink binary format) snp <- read.plink("/data/module2/mqtl/BSGS_unrelated_chr2.bed", "/data/module2/mqtl/BSGS_unrelated_chr2.bim", "/data/module2/mqtl/BSGS_unrelated_chr2.fam", na.strings = c("0", "-9"), sep = "." , select.subjects = NULL, select.snps = variant) genotypes <- as.data.frame(snp$genotypes) genotypes[1:10,] snp$map # Recode the numerical genotype with the alleles genotypes$genotype <- 0 index<- which(genotypes[,1]==1) allele1 <- paste(snp$map$allele.1,snp$map$allele.1,sep = "") genotypes$genotype[index] <- allele1 index<- which(genotypes[,1]==2) allele12 <- paste(snp$map$allele.1,snp$map$allele.2,sep = "") genotypes$genotype[index] <- allele12 index<- which(genotypes[,1]==3) allele2 <- paste(snp$map$allele.2,snp$map$allele.2,sep = "") genotypes$genotype[index] <- allele2 genotypes <- genotypes[sort(rownames(genotypes)),] head(genotypes) # Subset methylation data for probe of interest index <- which(rownames(meth)==cpg) probe <- meth[index,] head(probe) probe <- t(probe) probe <- probe[sort(rownames(probe)),] probe <- as.data.frame(probe) # Merge genotype and methylation data genotypes$ID <- rownames(genotypes) probe$ID <- rownames(probe) data <- merge(genotypes, probe, by.x = "ID", by.y = "ID") head(data) # Plot plot <- ggplot(data, aes(x = genotype, y = probe)) + geom_boxplot(fill = "mediumpurple1") + ylab("Methylation") + xlab("Genotype") + theme(text = element_text(size=20), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black")) + ggtitle(paste(variant,"_",cpg,sep = "")) + geom_point() plot png(paste(cpg,"_",variant,"_plot.png",sep=""), width=3000, height=3000, res = 300) plot dev.off()