Pathway activity functions • bixverse

Single-sample pathway activity

A common question in transcriptomics is whether the genes belonging to a given pathway are concordantly up- or down-regulated in each sample. Several algorithms tackle this by collapsing a gene-by-sample expression matrix into a pathway-by-sample score matrix, the most widely used being GSVA and ssGSEA. biverse provides fast versions of both with some tricks under the hood to make them faster than the original implementations. Moreover, if you have multiple contrasts (different drugs vs. DMSO for example). bixverse also provides Rust-accelerated implementations from mitch.

library(bixverse)

Data

Following the GSVA vignette convention, we generate some synthetic expression data (Gaussian-distributed to mimic normalised microarray or RNA-seq log counts, and Poisson-distributed to mimic raw counts) together with a collection of random gene sets.

p <- 10000L # genes
n <- 100L # samples

# Gaussian expression matrix
X <- matrix(
  rnorm(p * n),
  nrow = p,
  dimnames = list(paste0("g", seq_len(p)), paste0("s", seq_len(n)))
)

# Poisson count matrix
X_counts <- matrix(
  rpois(p * n, lambda = 10),
  nrow = p,
  dimnames = list(paste0("g", seq_len(p)), paste0("s", seq_len(n)))
)
storage.mode(X_counts) <- "numeric"

# Random gene sets of varying size
gs <- as.list(sample(10:100, size = 250, replace = TRUE))
gs <- lapply(
  gs,
  function(n, p) paste0("g", sample(seq_len(p), size = n, replace = FALSE)),
  p
)
names(gs) <- paste0("gs", seq_along(gs))

GSVA

Gaussian kernel

The Gaussian kernel version of GSVA is appropriate for continuous expression values (log-CPM, microarray intensities, etc.). Running it in bixverse is a single call:

bixverse_res_gaussian <- calc_gsva(
  exp = X,
  pathways = gs,
  gaussian = TRUE
)

bixverse_res_gaussian[1:5, 1:5]
#>               s1          s2          s3            s4          s5
#> gs1  0.009863134 -0.13265560  0.03860958 -0.0007354569  0.09979839
#> gs2  0.145475387  0.06788168  0.16272776 -0.0484834371 -0.07349338
#> gs3 -0.141597186 -0.09199815 -0.03758254 -0.0770037606 -0.05471550
#> gs4 -0.186252365 -0.08253650 -0.02488495  0.0419060853  0.01726811
#> gs5 -0.016465263 -0.03264040 -0.18299280  0.0884396512  0.08989500

If the original GSVA Bioconductor package is available we can verify that the results are essentially identical. Minor differences in numerical precision arise from optimisations on the Rust side, but per-pathway correlations between the two implementations are consistently above 0.99.

library(GSVA)

gsvaPar <- gsvaParam(X, gs)
gsva_res_gaussian <- as.matrix(gsva(gsvaPar, verbose = FALSE))

correlations <- diag(cor(bixverse_res_gaussian, gsva_res_gaussian))

print(sprintf(
  "All pathway correlations >= 0.99: %s (min: %.4f)",
  all(correlations >= 0.99),
  min(correlations)
))
#> [1] "All pathway correlations >= 0.99: TRUE (min: 1.0000)"

Let’s compare speed (differences will be more pronounced, the more computational fire power your system has.)

microbenchmark::microbenchmark(
  gsva = {
    gsvaPar <- gsvaParam(X, gs)
    gsva(gsvaPar, verbose = FALSE)
  },
  bixverse = calc_gsva(exp = X, pathways = gs, gaussian = TRUE),
  times = 3L
)
#> Unit: milliseconds
#>      expr      min        lq      mean    median        uq       max neval
#>      gsva 2460.576 2465.6239 2481.7239 2470.6716 2492.2977 2513.9238     3
#>  bixverse  580.856  581.0158  581.8882  581.1756  582.4043  583.6329     3

Poisson kernel

For count data a Poisson kernel is more appropriate. The only change is setting gaussian = FALSE:

bixverse_res_poisson <- calc_gsva(
  exp = X_counts,
  pathways = gs,
  gaussian = FALSE
)

bixverse_res_poisson[1:5, 1:5]
#>              s1          s2          s3          s4          s5
#> gs1  0.22094438 -0.13988088  0.07786652  0.12042579 -0.14646733
#> gs2  0.00717930 -0.03540182 -0.03552337  0.04872137  0.02476266
#> gs3  0.07494343 -0.06666265  0.02612589 -0.20409502 -0.11010636
#> gs4 -0.17342691  0.16554904 -0.21089376  0.31440297 -0.09433910
#> gs5 -0.08230767  0.16365369 -0.18232393 -0.01777528 -0.13746922

And vs. the original

gsvaParPoisson <- gsvaParam(X_counts, gs, kcdf = "Poisson")
gsva_res_poisson <- as.matrix(gsva(gsvaParPoisson, verbose = FALSE))

correlations <- diag(cor(bixverse_res_poisson, gsva_res_poisson))

print(sprintf(
  "All pathway correlations >= 0.99: %s (min: %.4f)",
  all(correlations >= 0.99),
  min(correlations)
))
#> [1] "All pathway correlations >= 0.99: TRUE (min: 1.0000)"

And speed:

microbenchmark::microbenchmark(
  gsva = {
    gsvaPar <- gsvaParam(X_counts, gs, kcdf = "Poisson")
    gsva(gsvaPar, verbose = FALSE)
  },
  bixverse = calc_gsva(exp = X_counts, pathways = gs, gaussian = FALSE),
  times = 3L
)
#> Unit: seconds
#>      expr       min        lq      mean    median        uq       max neval
#>      gsva 18.832861 18.842568 18.848288 18.852276 18.856002 18.859728     3
#>  bixverse  2.786899  2.787046  2.792641  2.787193  2.795512  2.803831     3

ssGSEA

ssGSEA, first described in Barbie et al., takes a similar approach but does not apply a kernel-based normalisation across samples. bixverse provides a Rust-optimised version here as well:

bixverse_res_ssgsea <- calc_ssgsea(
  exp = X,
  pathways = gs
)

bixverse_res_ssgsea[1:5, 1:5]
#>             s1         s2           s3         s4         s5
#> gs1 0.10762277 0.05601719  0.138749022 0.14962965 0.16567135
#> gs2 0.18438116 0.09872824  0.213621747 0.10770997 0.12138715
#> gs3 0.04110268 0.03513645  0.067135131 0.02300473 0.07935425
#> gs4 0.02771066 0.12243466  0.095543228 0.10847501 0.08636451
#> gs5 0.10298632 0.07888029 -0.002787812 0.13500292 0.12466942

Let’s compare again against the GSVA version:

ssgseaPar <- ssgseaParam(X, gs)
ssgsea_res <- as.matrix(gsva(ssgseaPar, verbose = FALSE))

correlations <- diag(cor(bixverse_res_ssgsea, ssgsea_res))

print(sprintf(
  "All pathway correlations >= 0.99: %s (min: %.4f)",
  all(correlations >= 0.99),
  min(correlations)
))
#> [1] "All pathway correlations >= 0.99: TRUE (min: 1.0000)"

And check the underlying speed differences:

microbenchmark::microbenchmark(
  gsva = {
    ssgseaPar <- ssgseaParam(X, gs)
    gsva(ssgseaPar, verbose = FALSE)
  },
  bixverse = calc_ssgsea(exp = X, pathways = gs),
  times = 3L
)
#> Unit: milliseconds
#>      expr      min       lq     mean   median       uq      max neval
#>      gsva 611.5410 612.2522 614.6657 612.9634 616.2281 619.4928     3
#>  bixverse 110.5719 110.8645 111.4187 111.1572 111.8422 112.5271     3

Multi-contrast enrichment (mitch)

When you have multiple contrasts - say, differential expression results from several comparisons or different omics layers - you often want to know which pathways show coordinated changes across contrasts. The mitch method does exactly this: it ranks genes within each contrast, computes mean ranks per pathway, and then uses a MANOVA test to identify pathways that are enriched in one or more contrasts simultaneously. bixverse re-implements the core algorithm so that it slots into the same workflow as the other pathway activity functions… Also, heavily multi-threaded and designed to go brrrrrr in terms of speed.

calc_mitch expects a numeric matrix of contrast statistics (genes in rows, contrasts in columns) and a named list of gene sets. It returns a data.table containing per-pathway MANOVA p-values, FDR-adjusted p-values, and the individual contrast-level enrichment scores and p-values.

set.seed(42L)

contrast_data <- matrix(rnorm(3 * 26), nrow = 26)
colnames(contrast_data) <- sprintf("contrast_%i", 1:3)
rownames(contrast_data) <- letters

gene_sets <- list(
  pathway_A = sample(letters, 4),
  pathway_B = sample(letters, 5),
  pathway_C = sample(letters, 6),
  pathway_D = sample(letters, 7)
)

res <- calc_mitch(
  contrast_mat = contrast_data,
  gene_set_list = gene_sets
)

res

Note that pathways smaller than the internal minimum set size are dropped from the output, and the results are sorted by significance. The function also validates its input: passing a matrix containing NA values will throw an error rather than silently producing nonsense.

Comparison with the mitch package

When the original mitch package is available we can verify that the two implementations produce identical results. Here we use the example data bundled with mitch:

data(myImportedData, genesetsExample, package = "mitch")

mitch_res <- suppressMessages(mitch::mitch_calc(
  myImportedData,
  genesetsExample,
  priority = "significance",
  minsetsize = 5,
  cores = 2
))

bixverse_res <- calc_mitch(
  contrast_mat = as.matrix(myImportedData),
  gene_set_list = genesetsExample
)

# Pathway ordering and FDR values should match exactly
all(bixverse_res$pathway_names == mitch_res$enrichment_result$set) &&
  all.equal(bixverse_res$manova_fdr, mitch_res$enrichment_result$p.adjustMANOVA)

And let’s check the speed differences:

microbenchmark::microbenchmark(
  mitch = suppressMessages(mitch::mitch_calc(
    myImportedData,
    genesetsExample,
    priority = "significance",
    minsetsize = 5,
    cores = 2
  )),
  bixverse = calc_mitch(
    contrast_mat = as.matrix(myImportedData),
    gene_set_list = genesetsExample
  ),
  times = 5L
)

As with the GSVA and ssGSEA implementations, you should observe meaningful speed improvements from Rust, particularly as the number of gene sets or contrasts grows and the more cores/oomph your system has.