#############################
## BayesR
## Author: Jian Zeng
## Created: 16 June 2022
## Updated: 13 June 2024
#############################

library(MCMCpack)

bayesr = function(X, y, niter = 1000, gamma = c(0, 0.01, 0.1, 1), startPi = c(0.5, 0.3, 0.15, 0.05), startH2 = 0.5){
  # X: genotype matrix
  # y: phenotype vector
  # niter: number of iterations (MCMC chain length)
  # gamma: a vector of scalers for the variance in the multi-normal mixture prior
  # startPi: starting value for the distribution membership propabilities
  # startH2: starting value for the SNP-based heritability
  
  # Step 1. Declare and initialise variables
  n = nrow(X)          # number of observations
  m = ncol(X)          # number of SNPs
  ndist = length(startPi)  # number of mixture distributions
  pi = startPi         # starting value for pi
  h2 = startH2         # starting value for heritability
  vary = var(y)        # phenotypic variance
  varg = vary*h2       # starting value for genetic variance
  vare = vary*(1-h2)   # starting value for residual variance
  sigmaSq = varg/(m*sum(gamma*pi))    # common factor of SNP effect variance
  nub = 4              # prior degrees of freedom for SNP effect variance
  nue = 4              # prior degrees of freedom for residual variance
  scaleb = (nub-2)/nub*sigmaSq  # prior scale parameter for SNP effect variance
  scalee = (nue-2)/nue*vare     # prior scale parameter for residual variance
  beta = array(0,m)    # vector of SNP effects
  beta_mcmc = matrix(0,niter,m) # MCMC samples of SNP effects
  mu = mean(y)         # overall mean
  xpx = apply(X, 2, crossprod)  ## diagonal elements of X'X
  probDelta = vector("numeric", ndist)
  logDelta = array(0,2)
  keptIter = NULL      # keep MCMC samples
  
  # adjust/correct y for population mean and all SNP effects (which all have zero as initial value)
  ycorr = y - mu 

  # MCMC begins
  for (iter in 1:niter){
    # Step 2. Sample the mean from a normal posterior
    ycorr = ycorr + mu   # unadjust y with the old sample
    rhs = sum(ycorr)     # right hand side of the mixed model equation
    invLhs = 1/n         # inverse of left hand side of the mixed model equation
    muHat = invLhs*rhs   # BLUE solution
    mu = rnorm(1, muHat, sqrt(invLhs*vare))   # posterior is a normal distribution
    ycorr = ycorr - mu   # adjust y with the new sample

    # Step 3. Sample each SNP effect from a multi-normal mixture distribution
    logPi = log(pi)
    logPiComp = log(1-pi)
    invSigmaSq = 1/(gamma*c(sigmaSq))
    logSigmaSq = log(gamma*c(sigmaSq))
    nsnpDist = rep(0, ndist)  # number of SNPs asigned to each mixture component
    ssq = 0  # weighted sum of squares of SNP effects 
    nnz = 0  # number of nonzero effects
    ghat = array(0,n)  # individual genetic value
    # loop over SNPs
    for (j in 1:m){
      oldSample = beta[j]
      rhs = crossprod(X[,j], ycorr) + xpx[j]*oldSample  # right hand side of the mixed model equation
      rhs = rhs/vare
      invLhs = 1.0/(xpx[j]/c(vare) + invSigmaSq)        # inverse of left hand side of the mixed model equation
      betaHat = invLhs*c(rhs)                           # BLUP solution
      
      # sample the mixture distribution membership
      logDelta = 0.5*(log(invLhs) - logSigmaSq + betaHat*c(rhs)) + logPi  # log likelihood + prior for nonzero effects
      logDelta[1] = logPi[1]											                        # log likelihood + prior for zero effect
      for (k in 1:ndist) {
        probDelta[k] = 1.0/sum(exp(logDelta - logDelta[k]))   # posterior probability for each distribution membership
      }
      
      delta = sample(1:ndist, 1, prob = probDelta)   # indicator variable for the distribution membership
      nsnpDist[delta] = nsnpDist[delta] + 1
      
      if (delta > 1) {
        beta[j] = rnorm(1, betaHat[delta], sqrt(invLhs[delta]))  # given the distribution membership, the posterior is a normal distribution
        ycorr = ycorr + X[,j]*(oldSample - beta[j])              # update ycorr with the new sample
        ghat = ghat + X[,j]*beta[j]                              
        ssq = ssq + beta[j]^2 / gamma[delta]
        nnz = nnz + 1
      } else {
        if (oldSample) ycorr = ycorr + X[,j]*oldSample           # update ycorr with the new sample which is zero
        beta[j] = 0
      }
    }	
    beta_mcmc[iter,] = beta
    
    # Step 4. Sample the distribution membership probabilities from a Dirichlet distribution
    pi = rdirichlet(1, nsnpDist + 1)
    
    # Step 5. Sample the SNP effect variance from a scaled inverse chi-square distribution
    sigmaSq = (ssq + nub*scaleb)/rchisq(1, nnz+nub)
    
    # Step 6. Sample the residual variance from a scaled inverse chi-square distribution
    vare = (crossprod(ycorr) + nue*scalee)/rchisq(1, n+nue)
    
    # Step 7. Compute the SNP-based heritability
    varg = var(ghat)
    h2  = varg/(varg + vare)

	# The final step in each iteration is to store the MCMC samples for the parameter values
    keptIter <- rbind(keptIter,c(pi, nnz, sigmaSq, h2, vare, varg))
    
    if (!(iter%%100)) {
      cat (sprintf("\n iter %4s, nnz = %4s, sigmaSq = %6.3f, h2 = %6.3f, vare = %6.3f, varg = %6.3f \n", iter, nnz, sigmaSq, h2, vare, varg))
    }
  }
  
  colnames(keptIter) <- c(paste0("Pi", 1:ndist),"Nnz","SigmaSq","h2","Vare","Varg")
  postMean = apply(keptIter, 2, mean)  # posterior mean of MCMC samples is used as the point estimate for each parameter
  cat("\nPosterior mean:\n")
  print(postMean)
  return(list(par=keptIter, beta=beta_mcmc))
}