##########################################################
########## mQTL Analysis using MatrixEQTL ################
##########################################################

# Load required packages
library(MatrixEQTL)
library(stringr)
library(ggplot2)
library(ggpubr)
library(snpStats)

setwd("/working/sally/mQTL_Data/") # Replace with your working directory

### 1. File Preparation ###

# Load methylation data
meth <- read.table("/data/module2/mqtl/BSGS_Methylation_chr2_Norm_Beta.txt", header=T, check.names = F)
meth[1:10,1:10]

# Load probe information
probe <- read.csv("/data/module2/mqtl/BSGS_Probe_Info.csv", header=T)
probe[1:10,1:13]
# We only require the probe/CpG site identifier, the chromosome number and the bp position
probe_info <- probe[,c(1,12:13)]
probe_info[1:10,]

# Load genotype data (this data is in plink variant-major additive format to display the allele dosages, produced by "--recode A-transpose")
geno <- read.table("/data/module2/mqtl/BSGS_Genotypes_chr2_clean.traw", header=T, check.names = F)
geno[1:10,1:10]
# We require the variant identifier, the chromosome number and the bp position in a separate file
rownames(geno) <- geno$SNP
geno[1:10,1:10]
snp_pos <- geno[,c(2,1,4)]
snp_pos[1:10,]
# We require the genotype information for each individual in a separate file
geno <- geno[,7:237]
geno[1:10,1:10]
# Sample ids need to be consistent between the files. In this dataset the number preceding "_" was already removed from the methylation sample IDs and so we will remove this from the genotype IDs.
colnames(geno) <- str_split_fixed(colnames(geno), "_", 2)[,2]
geno[1:10,1:10]

# Load covariate data
cov <- read.table("/data/module2/mqtl/BSGS_Cov.txt", header = T, check.names = F)
cov[1:10,1:12]

# Check that the same individuals are in the methylation, genotype and covariate tables.
length(which(colnames(meth)%in%colnames(geno)))
length(which(colnames(meth)%in%cov$IID))

# Check that the same probes are in the methylation and probe information tables.
length(which(rownames(meth)%in%probe_info$IlmnID))

# Label columns and order the variant information table
colnames(snp_pos) <- c("snp","chr","pos")
snp_pos[1:10,]
snp_pos <- snp_pos[order(snp_pos$snp),]
write.table(snp_pos,"SNP_Pos.txt", row.names = F, quote = F, sep = "\t")

# Label columns and order the probe information table
colnames(probe_info) <- c("geneid","chr","s1")
# We require a start and stop bp position for the probes. We have the start position and each probe is 50bp long so we can calculate the stop position.
probe_info$s2 <- probe_info$s1+50
probe_info[1:10,]
probe_info <- probe_info[order(probe_info$geneid),]
write.table(probe_info,"Probe_Pos.txt", row.names = F, quote = F, sep = "\t")

# The first column of the genotype table needs to contain variant identifiers. Both columns and rows are ordered.
geno$id <- rownames(geno)
geno <- geno[order(rownames(geno)),order(colnames(geno))]
geno <- geno[,c(232,1:231)]
geno[1:10,1:10]
# Check that the variant identifiers are in the same order between the variant information table and the genotype table
length(which(geno$id==snp_pos$snp))
write.table(geno,"BSGS_Genotype.txt", row.names = F, quote = F, sep = "\t")

# The first column of the methylation table needs to contain probe identifiers. Both columns and rows are ordered.
meth$id <- rownames(meth)
meth <- meth[order(rownames(meth)),order(colnames(meth))]
meth <- meth[,c(232,1:231)]
meth[1:10,1:10]
# Check that the probe identifiers are in the same order between the probe information table and the methylation table
length(which(meth$id==probe_info$geneid))
write.table(meth,"BSGS_Methylation.txt", row.names = F, quote = F, sep = "\t")

# We prepare the covariate file by removing unnecessary columns and recoding categorical variables as binary variables.
# Individuals should be orientated as columns and covariates as rows.
cov$FID <- NULL
colnames(cov)[1] <- "id"
cov$Sex[which(cov$Sex=="M")] <- 1
cov$Sex[which(cov$Sex=="F")] <- 0
cov[1:10,1:11]
# A quick way of recoding a categorical variable into a series of binary variables is to use the model.matrix function
cov$Slide <- as.factor(cov$Slide)
Slide <- cov$Slide
slide <- as.data.frame(model.matrix(~ Slide - 1))
slide[1:10,]
cov2 <- cbind(cov,slide)
cov2$Slide <- NULL
# collinear variables must be removed
cov2$Slide7512578087 <- NULL
cov3 <- as.data.frame(t(cov2))
cov3[,1:5]
write.table(cov3,"BSGS_covariates.txt", row.names = T, col.names=F, quote = F, sep = "\t")

#### 2. Run MatrixEQTL Analysis ####

# Location of the data files.
base.dir = "/working/sally/mQTL_Data"; # Replace with your working directory

## Settings

# Genotype and variant information file names
SNP_file_name = paste(base.dir,"/BSGS_Genotype.txt",sep="");
snps_location_file_name = paste(base.dir,"/SNP_Pos.txt",sep="");

snps <- read.table(SNP_file_name, sep="\t", header=T, row.names = 1, check.names = F)
snps[1,]

# Methylation and probe information file names
expression_file_name = paste(base.dir,"/BSGS_Methylation.txt",sep="");
gene_location_file_name = paste(base.dir,"/Probe_Pos.txt",sep="");

expr = read.table(expression_file_name, sep="\t", header=T, row.names = 1, check.names = F)
expr[1,]

# Covariate file name
covariates_file_name = paste(base.dir,"/BSGS_covariates.txt",sep="");
cvrt = read.table(covariates_file_name, sep = "\t", header=T, row.names = 1, check.names = F)
cvrt[1,]

# Example of basic linear regression
m1 =as.numeric(expr[1,])
s1 =as.numeric(snps[1,])
lm1 = lm(m1~s1)
summary(lm1)

plot(m1~jitter(s1), col=(s1+1), xaxt="n", xlab="Genotype", ylab="Methylation")
axis(1,at=(0:2),labels=c("AA","Aa","aa"))
lines(lm1$fitted.values~s1,type="b",pch=15,col="darkgrey")

png("Example.png", width=3000, height=3000, res = 300)
plot(m1~jitter(s1), col=(s1+1), xaxt="n", xlab="Genotype", ylab="Methylation")
axis(1,at=(0:2),labels=c("AA","Aa","aa"))
lines(lm1$fitted.values~s1,type="b",pch=15,col="darkgrey")
dev.off()

# Output file names
output_file_name_cis = tempfile();
output_file_name_tra = tempfile();

# Linear model to use, modelANOVA, modelLINEAR, or modelLINEAR_CROSS
useModel = modelLINEAR; # modelANOVA, modelLINEAR, or modelLINEAR_CROSS

# Only associations significant at this level will be saved
pvOutputThreshold_cis = 1e-5;
pvOutputThreshold_tra = 0;

# Error covariance matrix
# Set to numeric() for identity.
errorCovariance = numeric();

# Distance for local gene-SNP pairs
cisDist = 2.5e5;

## Load genotype data

snps = SlicedData$new();
snps$fileDelimiter = "\t";      # the TAB character
snps$fileOmitCharacters = "NA"; # denote missing values;
snps$fileSkipRows = 1;          # one row of column labels
snps$fileSkipColumns = 1;       # one column of row labels
snps$fileSliceSize = 2000;      # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name);

## Load methylation data

gene = SlicedData$new();
gene$fileDelimiter = "\t";      # the TAB character
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1;          # one row of column labels
gene$fileSkipColumns = 1;       # one column of row labels
gene$fileSliceSize = 2000;      # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name);

## Load covariates

cvrt = SlicedData$new();
cvrt$fileDelimiter = "\t";      # the TAB character
cvrt$fileOmitCharacters = "NA"; # denote missing values;
cvrt$fileSkipRows = 1;          # one row of column labels
cvrt$fileSkipColumns = 1;       # one column of row labels
if(length(covariates_file_name)>0) {
  cvrt$LoadFile(covariates_file_name);
}

## Run the analysis
snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);

me = Matrix_eQTL_main(
  snps = snps, 
  gene = gene, 
  cvrt = cvrt,
  output_file_name     = output_file_name_tra,
  pvOutputThreshold     = pvOutputThreshold_tra,
  useModel = useModel, 
  errorCovariance = errorCovariance, 
  verbose = TRUE, 
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos, 
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE);

## Results:
unlink(output_file_name_tra);
unlink(output_file_name_cis);
dim(me$cis$eqtls)
me$cis$eqtls[1:10,]
results <- me$cis$eqtls

## Apply a more stringent correction for multiple testing (Bonferroni)
# Running it without a significance threshold will return all SNP x Probe combinations within cis distance specified.
# Total number of cis tests is 38,735,041. This number can be used when applying a Bonferroni significance cutoff.
sig <- which(results$pvalue<(0.05/38735041))
sig_results <- results[sig,]
write.csv(sig_results,"mQTL_output.csv", quote = F, row.names = F)

## Select the lead mQTL for each probe/CpG site
sig_results <- sig_results[order(sig_results$pvalue),]
head(sig_results)
sig_results_lead <- sig_results[!duplicated(sig_results$gene),]
head(sig_results_lead)

#### 3. Plotting mQTLs ####

# Format methylation data
meth[1:10,1:10]
meth$id <- NULL
meth[1:10,1:10]

# Set variant and probe of interest
variant <- "rs842663"
cpg <- "cg22276800"

# Read in genotype data for SNP of interest (genotypes are in plink binary format)
snp <- read.plink("/data/module2/mqtl/BSGS_unrelated_chr2.bed", "/data/module2/mqtl/BSGS_unrelated_chr2.bim", "/data/module2/mqtl/BSGS_unrelated_chr2.fam", na.strings = c("0", "-9"), sep = "." , select.subjects = NULL, select.snps = variant)
genotypes <- as.data.frame(snp$genotypes)
genotypes[1:10,]
snp$map
# Recode the numerical genotype with the alleles
genotypes$genotype <- 0
index<- which(genotypes[,1]==1)
allele1 <- paste(snp$map$allele.1,snp$map$allele.1,sep = "")
genotypes$genotype[index] <- allele1
index<- which(genotypes[,1]==2)
allele12 <- paste(snp$map$allele.1,snp$map$allele.2,sep = "")
genotypes$genotype[index] <- allele12
index<- which(genotypes[,1]==3)
allele2 <- paste(snp$map$allele.2,snp$map$allele.2,sep = "")
genotypes$genotype[index] <- allele2
genotypes <- genotypes[sort(rownames(genotypes)),]
head(genotypes)

# Subset methylation data for probe of interest
index <- which(rownames(meth)==cpg)
probe <- meth[index,]
head(probe)
probe <- t(probe)
probe <- probe[sort(rownames(probe)),]
probe <- as.data.frame(probe)

# Merge genotype and methylation data
genotypes$ID <- rownames(genotypes)
probe$ID <- rownames(probe)
data <- merge(genotypes, probe, by.x = "ID", by.y = "ID")
head(data)

# Plot 
plot <- ggplot(data, aes(x = genotype, y = probe)) + geom_boxplot(fill = "mediumpurple1") + ylab("Methylation") + xlab("Genotype") + theme(text = element_text(size=20), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black")) + ggtitle(paste(variant,"_",cpg,sep = "")) + geom_point()
plot
png(paste(cpg,"_",variant,"_plot.png",sep=""), width=3000, height=3000, res = 300)
plot
dev.off()