#####################################
# Summer Institute of Statistical Genetics 2016
# Module 18: Statistical & Quantitative Genetics of Disease
# Naomi Wray
# Practical 5 
# Genetic Risk Models
# Purpose to compare genetic risk models and show that they are exchangeable based on available data
#         to demonstrate why the liability threshold model is the model of choice
####################################
#Section 1
#Additive risk model.
#Section 1a) Set up the simulation and explore the data
####################################
N = 1e5                         # number of families
n = 100                         # number of loci
R = 1.1                         # relative risk of each risk allele
p = 0.2                         # allele frequency of each risk allele
K = 0.01                        # probability of disease
b=K/(2*n*p*R)                   # weight of each risk allele to be consistent with the observed population risk of disease of disease
indA=rbinom(N,2*n,p)            # number of risk loci carried by each individual
indR=indA*b*R                   # probability of disease (number of risk alleles * RR of each allele* linear regression)
indu=runif(N,0,1)               # random uniform number between 0 and 1
indD=c(rep(0,N))                # disease status set all to not diseased
indD[indR>indu]=1               # identify diseased individuals
round(length(indD[indD==1])/N,digits =3)         # check disease frequency is correct

opar<-par(mfrow=c(2,2))
hist(indA,xlab="indA = # risk alleles",main="Histogram of # risk alleles")
hist(indR,xlab="indR = probability of disease given # risk alleles ",main="Histogram of probability of disease")
plot(indA[1:min(1e5,N)],indR[1:min(1e5,N)],xlab='indA = # risk alleles',ylab="indR = probability of disease",main="Relationship between # alleles & prob of disease")
hist(indA[indD==0],col='black',xlab='indA = # risk alleles',main='Histogram of # risk alleles',xlim=c(min(indA),max(indA)))
hist(indA[indD==1],col='red',add=T)
legend('topleft',legend=c('Not diseased','Diseased'),fill=c('black','red'))
par(opar)
####################################
#Section 1b) Test under different scenarios
####################################
# Run the script in section 1a several times changing n,R,p,K e.g R=1.5, p=0.5, K=0.10
# Look at relationship between number of risk alleles and probability of disease, 
# could this be consistent with increased risk to relatives
# Pay particular attention to the maximum value of the probability of disease


####################################
#Section 2
#Multiplicative risk model. Slatkin 2008
#Section 2a) Set up the simulation and explore the data
####################################
N = 1e5                         # number of families
n = 100                         # number of loci
R = 1.1                         # relative risk of each risk allele
p = 0.2                         # allele frequency of each locus
K = 0.1                        # probabilty of disease
k0 = K/(1+p*(R-1))^(2*n)        # probability of disease in those that carry no risk alleles
indA=rbinom(N,2*n,p)            # number of risk loci carried by each individual
indR=exp(log(k0)+indA*log(R))   # probability of disease, same as indR=k0*R^indA, but avoids numerical issues of power 

# multiplicative on the disease scale but additive on the log scale
indu=runif(N,0,1)               # uniform number
indD=c(rep(0,N))                # disease status set all to not diseased
indD[indR>indu]=1               # identify diseased individuals
#indR[indR>1]=1
round(length(indD[indD==1])/N,digits =3)         # check disease frequency is correct
round(k0,digits=6)              # print probability of disease in those that carry no risk alleles

opar<-par(mfrow=c(2,2))
hist(indA,xlab="indA = # risk alleles",main="Histogram of # risk alleles")
hist(indR,xlab="indR = probability of disease given # risk alleles ",main="Histogram of probability of disease")
plot(indA[1:min(1e5,N)],indR[1:min(1e5,N)],pch=".",xlab='indA = # risk alleles',ylab="indR = probability of disease",main="Relationship between # alleles & prob of disease")
hist(indA[indD==0],col='black',xlab='indA = # risk alleles',main='Histogram of # risk alleles',xlim=c(min(indA),max(indA)))
hist(indA[indD==1],col='red',add=T)
legend('topleft',legend=c('Not diseased','Diseased'),fill=c('black','red'))
par(opar)
####################################
#Section 2b) Test under different scenarios
####################################
# Run the script in section 2b several times changing n,R,p,K, eg i) K=0.1, ii) K=0.1,R=1.2
# Pay particular attention to the maximum value of the probability of disease
# What do you conclude?

# After the line indR= add the line indR[indR>1]=1
# What does this do and what impact does this have?

####################################
#Section 3
#Logistic risk model. Janssens et al (2006) Pedictive testing for complex diseases using multiple genes: Fact or fiction? Genetics in Medicine
#Section 3a) Set up the simulation and explore the data
####################################
N = 1e5                         # number of families
n = 100                         # number of loci
R = 1.5                         # odds ratio of each risk allele
p = 0.2                         # allele frequency of each locus
K = 0.1                         # probabilty of disease                
prOdds=1/(n*R)^10               #Starting value for odds of disease for those carrying no risk loci 
indA=rbinom(N,2*n,p)            # number of risk loci carried by each individual
K_est=0
while (K_est< K){               #iterarte to find a prOdds for those carrying no risk loci that is consistent with K
poOdds=prOdds*R^indA
indR=poOdds/(1+poOdds)
indu=runif(N,0,1)               # uniform number
indD=c(rep(0,N))                # disease status set all to not diseased
indD[indR>indu]=1               # identify diseased individuals
K_est =length(indD[indD==1])/N  # trying to get this to match K  
prOdds=prOdds*1.05
}
round(length(indD[indD==1])/N,digits =3) # check disease frequency is correct
opar<-par(mfrow=c(2,2))
hist(indA,xlab="indA = # risk alleles",main="Histogram of # risk alleles")
hist(indR,xlab="indR = probability of disease given # risk alleles ",main="Histogram of probability of disease")
plot(indA[1:min(1e5,N)],indR[1:min(1e5,N)],pch=".",xlab='indA = # risk alleles',ylab="indR = probability of disease",main="Relationship between # alleles & prob of disease")
hist(indA[indD==0],col='black',xlab='indA = # risk alleles',main='Histogram of # risk alleles',xlim=c(min(indA),max(indA)))
hist(indA[indD==1],col='red',add=T)
legend('topleft',legend=c('Not diseased','Diseased'),fill=c('black','red'))
par(opar)
####################################
#Section 3b) Test under different scenarios
####################################
# Run the script in section 3a several times changing n,R,p,K
# Dont go higher than n=100, R =2 ..otherwise goestoo slowly
# Pay particular attention to the maximum value of the probability of disease
# Is this a useful parameterization to model disease

##################################
#Section 4 Liability threshold model
##################################
N = 1e5                         # number of families
h2 = 0.8                        # heritability of liability that reflects many combinations of p, R and n
K = 0.10                         # probabilty of disease                
indA=rnorm(N,0,sqrt(h2))          # N(0,1) representation of number of risk allele
t=qnorm(1-K)
indR=pnorm((indA-t)/sqrt(1-h2))
indu=runif(N,0,1)               # uniform number
indD=c(rep(0,N))                # disease status set all to not diseased
indD[indR>indu]=1               # identify diseased individuals
round(length(indD[indD==1])/N,digits =3) # check disease frequency is correct
                
opar<-par(mfrow=c(2,2))
hist(indA,xlab="indA = # risk alleles",main="Histogram of # risk alleles")
hist(indR,xlab="indR = probability of disease given # risk alleles ",main="Histogram of probability of disease")
plot(indA[1:min(1e5,N)],indR[1:min(1e5,N)],pch=".",xlab='indA = # risk alleles',ylab="indR = probability of disease",main="Relationship between # alleles & prob of disease")
hist(indA[indD==0],col='black',xlab='indA = # risk alleles',main='Histogram of # risk alleles',xlim=c(min(indA),max(indA)))
hist(indA[indD==1],col='red',add=T)
legend('topleft',legend=c('Not diseased','Diseased'),fill=c('black','red'))
par(opar)
#################################
# Run the script several times changing the parameters
# Is this a useful parameterization to model disease?
##################################