The goal of fine-mapping is to prioritise SNPs that are most likely to be causal (or in LD with causal variants).
In this practical session will apply the Sum of Single Effects (SuSiE) fine-mapping method to investigate the genetic effects underlying height.
We will use summary statistics from Wood et al (2014). This is a GWAS meta-analysis of adult height from 79 studies consisting of 253,288 individuals of European ancestry. This study identified 697 SNPs that reached genome-wide significance (P < 5 × 10−8) that were approximately conditionally independent. We will fine-map some of the regions surrounding these loci.
The data can be found in the directory
/data/module1/11_fineMapping/
on the cluster.
Read the GWAS summary statistics in R
. The alleles have
already been aligned to 1000Genomes which will be our reference
dataset.
Wood = read.table("/data/module1/11_fineMapping/Height_Wood_2014_aligned100G.ma", header=T)
Extract the 1Mb region surrounding SNP rs955748
locus1 = Wood[Wood$CHR==4 & Wood$POS < 184215675 + 5e5 & Wood$POS > 184215675-5e5,]
Export the locus for fine-mapping and the list of SNPs to construct the LD reference
# Export the SNPs in this locus
write.table(locus1$SNP, file="locus1_snps.txt", col.names=F, row.names=F, quote=F)
# Export the summary stats for the locus
write.table(locus1, file="locus1_sumstats.txt", col.names=T, row.names=F, quote=F)
We can create a locus plot of this region. This helps us to understand the genetic architecture underlying this signal.
library(ggplot2)
library(ggrepel)
ggplot(locus1, aes(x=POS, y=-log10(P))) +
geom_point(alpha=0.8, size=1.3) +
geom_label_repel(data=subset(locus1, locus1$SNP=="rs955748"),
aes(label=SNP), size=4, max.overlaps = 20)
As the original genotypes are not available we can use a reference panel to estimate the LD matrix using Plink.
Here we use 1000Genomes as the LD reference using European samples to match the ancestry of our summary statistics. We create an LD matrix from those SNPs in our locus. Note --r calculates and reports raw inter-variant allele count correlations, while --r2 reports squared correlations. This can be run in the command line.
plink \
--bfile /data/module1/Ref_bfile/g1000_eur \
--keep-allele-order \
--r square \
--extract locus1_snps.txt \
--out locus1_1000G
susieR
using locus summary statistics
and reference LD matrixRead in the input locus and reference LD matrix in R
.
Check these have the same number of SNPs present in both.
library(susieR)
locus1 = read.table("locus1_sumstats.txt", header=T)
colnames(locus1) <- c("SNP", "A1", "A2", "freq", "b", "se", "P", "N", "CHR", "POS")
ld = read.csv("locus1_1000G.ld",sep="\t",header=F)
ld.mat = matrix(as.vector(data.matrix(ld)), nrow=nrow(locus1), ncol=nrow(locus1))
We use the function susie_rss
. This performs variable
selection under a sparse Bayesian multiple linear regression. There are
an unknown number of causal SNPs in our region. By default
susieR
assumes at most L = 10
.
fitted_rss = susie_rss(bhat = locus1$b, shat = locus1$se, R = ld.mat, n = median(locus1$N))
For the susieR
method credible sets are defined as a set
of SNPs containing one of the causal SNPs with some fixed
coverage (95% by default). We can output the 95% credible set.
Constructing the credible set involves sorting SNPs by decreasing PIP
and then including variables in the credible set until their cumulative
probability exceeds the threshold.
print(fitted_rss$sets)
## $cs
## $cs$L1
## [1] 338 361 362 364 367
##
##
## $purity
## min.abs.corr mean.abs.corr median.abs.corr
## L1 0.929309 0.9633603 0.9643675
##
## $cs_index
## [1] 1
##
## $coverage
## [1] 0.9917749
##
## $requested_coverage
## [1] 0.95
susieR
reports one 95% credible set ($cs
containing 5 variants). The minimum absolute correlation
$purity$min.abs.corr
between SNPs in the set is 0.929. This
tells us the credible set contains highly correlated variables.
Merge the credible set with the locus summary statistics to get the SNP IDs
locus1$variable = rownames(locus1)
locus1_susie = merge(x=locus1, y=summary(fitted_rss)$vars,
by="variable", all.x=TRUE)
locus1_susie[locus1_susie$cs!=-1,]
Identify the SNPs from credible set in the locus plot
library(ggplot2)
library(ggrepel)
ggplot(locus1_susie, aes(x=POS, y=-log10(P))) +
geom_point(alpha=0.8, size=1.3) +
geom_point(data=subset(locus1_susie, locus1_susie$cs!=-1), aes(color=cs), size=4) +
geom_label_repel( data=subset(locus1_susie, locus1_susie$cs!=-1), aes(label=SNP), size=4, max.overlaps = 20)
Using summary
, we can examine the PIP for each variable
in the credible set. PIP is defined as the probability for a SNP to be
one of the causal SNPs in the region.
summary(fitted_rss)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 362 0.75082567 1
## 361 0.07989141 1
## 367 0.06965246 1
## 364 0.06951996 1
## 338 0.05953722 1
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2 variable
## 1 10.10509 0.9280631 0.8636152 338,361,362,364,367
Question: If two SNPs have the same (or almost the
same) PIP what does this mean in terms of the LD between these SNPs? You
can estimate the LD between two SNPs using the --ld
command
in plink
Plot the PIP for each SNP within the locus
ggplot(locus1_susie, aes(x=POS, y=variable_prob)) +
geom_point(alpha=0.8, size=1.3) +
geom_point(data=subset(locus1_susie, locus1_susie$cs!=-1), aes(color=cs), size=4) +
geom_label_repel(data=subset(locus1_susie, locus1_susie$cs!=-1), aes(label=SNP), size=4, max.overlaps = 20)
Question: Consider the 95% credible set above and the PIP of each SNP in the set. Why was the coverage of this set greater than the 95% requested?
Question: What happens if we change the credible set coverage threshold from 95% to 80%? What is the probability that this set contains at least one causal SNP ?
fitted_rss_0.8 = susie_rss(bhat = locus1$b, shat = locus1$se, R = ld.mat, n = median(locus1$N), coverage = 0.8)
print(fitted_rss_0.8$sets)
Extract the 1Mb region surrounding SNP rs17075869
Wood = read.table("/data/module1/11_fineMapping/Height_Wood_2014_aligned100G.ma", header=T)
locus2 = Wood[Wood$CHR==5 & Wood$POS < 172811280 + 5e5 & Wood$POS > 172811280-5e5,]
write.table(locus2$SNP, file="locus2_snps.txt", col.names=F, row.names=F, quote=F)
write.table(locus2, file="locus2_sumstats.txt", col.names=T, row.names=F, quote=F)
Construct LD matrix using reference data
plink \
--bfile /data/module1/Ref_bfile/g1000_eur \
--keep-allele-order \
--r square \
--extract locus2_snps.txt \
--out locus2_1000G
Fine-mapping with susieR
library(susieR)
locus2 = read.table("locus2_sumstats.txt", header=T)
colnames(locus2) <- c("SNP", "A1", "A2", "freq", "b", "se", "P", "N", "CHR", "POS")
ld2 = read.csv("locus2_1000G.ld",sep="\t",header=F)
ld.mat2 = matrix(as.vector(data.matrix(ld2)), nrow=nrow(locus2), ncol=nrow(locus2))
fitted_rss2 = susie_rss(bhat = locus2$b, shat = locus2$se, R = ld.mat2, n = median(locus2$N))
## HINT: For large R or large XtX, consider installing the Rfast package for better performance.
Credible sets
print(fitted_rss2$sets)
## $cs
## $cs$L1
## [1] 642 645 656 660
##
## $cs$L3
## [1] 714 725
##
## $cs$L2
## [1] 367 374 376 377 409
##
##
## $purity
## min.abs.corr mean.abs.corr median.abs.corr
## L1 0.997801 0.998534 0.997801
## L3 0.990243 0.990243 0.990243
## L2 0.854276 0.940031 0.935560
##
## $cs_index
## [1] 1 3 2
##
## $coverage
## [1] 0.9857992 0.9633436 0.9560693
##
## $requested_coverage
## [1] 0.95
Question: How many credible sets are there?
Question: What does it mean to have more than one credible set?
Merge this with the locus summary statistics to get the SNP ID
locus2$variable = rownames(locus2)
locus2_susie = merge(x=locus2, y=summary(fitted_rss2)$vars,
by="variable", all.x=TRUE)
locus2_susie[locus2_susie$cs!=-1,]
Identify the SNPs from credible set in the locus plot
library(ggplot2)
library(ggrepel)
ggplot(locus2_susie, aes(x=POS, y=-log10(P))) +
geom_point(alpha=0.8, size=1.3) +
geom_point(data=subset(locus2_susie, locus2_susie$cs!=-1), aes(color=cs), size=4) +
geom_label_repel( data=subset(locus2_susie, locus2_susie$cs!=-1), aes(label=SNP), size=4, max.overlaps = 20)
Posterior inclusion probabilities
summary(fitted_rss2)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 714 0.73014071 3
## 409 0.66962497 2
## 656 0.33147718 1
## 660 0.33147718 1
## 725 0.23441525 3
## 642 0.16317456 1
## 645 0.16317456 1
## 367 0.08941335 2
## 376 0.08941335 2
## 377 0.08941335 2
## 374 0.02316548 2
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2 variable
## 1 19.687063 0.9970701 0.9956068 642,645,656,660
## 3 5.083564 0.9805812 0.9805812 714,725
## 2 2.927427 0.8836583 0.7297875 367,374,376,377,409
Question: Which SNPs have the greatest evidence of having a non-zero effect on height?
ggplot(locus2_susie, aes(x=POS, y=variable_prob)) +
geom_point(alpha=0.8, size=1.3) +
geom_point(data=subset(locus2_susie, locus2_susie$cs!=-1), aes(color=cs), size=4) +
geom_label_repel(data=subset(locus2_susie, locus2_susie$cs!=-1), aes(label=SNP), size=4, max.overlaps = 20)
Question: What factors may affect the fine-mapping results in the above example?
Locus 1: Extract the 1Mb region surrounding SNP rs955748
UKB = read.table("/data/module1/11_fineMapping/Height_UKB_EURu_aligned100G.ma", header=T)
locus1_UKB = UKB[UKB$CHR==4 & UKB$POS < 184215675 + 5e5 & UKB$POS > 184215675-5e5,]
# Export the SNPs in this locus
write.table(locus1_UKB$SNP, file="locus1_UKB_snps.txt", col.names=F, row.names=F, quote=F)
# Export the summary stats for the locus
write.table(locus1_UKB, file="locus1_UKB_sumstats.txt", col.names=T, row.names=F, quote=F)
Construct LD matrix using reference data
plink \
--bfile /data/module1/Ref_bfile/g1000_eur \
--keep-allele-order \
--r square \
--extract locus1_UKB_snps.txt \
--out locus1_UKB_1000G
Fine-mapping with susieR
library(susieR)
locus1_UKB = read.table("locus1_UKB_sumstats.txt", header=T)
colnames(locus1_UKB) <- c("SNP", "A1", "A2", "freq", "b", "se", "P", "N", "CHR", "POS")
ld1_UKB = read.csv("locus1_UKB_1000G.ld",sep="\t",header=F)
ld.mat1_UKB = matrix(as.vector(data.matrix(ld1_UKB)), nrow=nrow(locus1_UKB), ncol=nrow(locus1_UKB))
fitted_rss_UKB = susie_rss(bhat = locus1_UKB$b, shat = locus1_UKB$se, R = ld.mat1_UKB, n = median(locus1_UKB$N))
print(fitted_rss_UKB$sets)
locus1_UKB$variable = rownames(locus1_UKB)
locus1_UKB_susie = merge(x=locus1_UKB, y=summary(fitted_rss_UKB)$vars,
by="variable", all.x=TRUE)
locus1_UKB_susie[locus1_UKB_susie$cs!=-1,]
Question: Why might we obtain different results when using different studies?
G. Wang, G., Sarkar, A., Carbonetto, P. & Stephens, M. (2020). A simple new approach to variable selection in regression, with application to genetic fine mapping. Journal of the Royal Statistical Society, Series B 82, 1273–1300. https://doi.org/10.1111/rssb.12388
susieR
package in R: https://cran.r-project.org/web/packages/susieR/vignettes/finemapping_summary_statistics.html
Wood, A., Esko, T., Yang, J. et al. Defining the role of common
variation in the genomic and biological architecture of adult human
height. Nat Genet 46, 1173–1186 (2014). https://doi.org/10.1038/ng.3097