Browse code

Updated vignette, documentation and NEWS file, corrected some messages and warnings.

Robert Castelo authored on 18/05/2021 21:31:15
Showing12 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: GSVA
2
-Version: 1.39.33
2
+Version: 1.39.34
3 3
 Title: Gene Set Variation Analysis for microarray and RNA-seq data
4 4
 Authors@R: c(person("Justin", "Guinney", role=c("aut", "cre"), email="justin.guinney@sagebase.org"),
5 5
              person("Robert", "Castelo", role="aut", email="robert.castelo@upf.edu"),
... ...
@@ -1,3 +1,18 @@
1
+CHANGES IN VERSION 1.40
2
+-----------------------
3
+
4
+USER VISIBLE CHANGES
5
+
6
+   o The vignette has been rewritten in R Markdown to produce an HTML vignette page and make it shorter and faster to produce.
7
+
8
+   o Development of a shiny app available through the function 'igsva()'.
9
+
10
+BUG FIXES
11
+
12
+   o Replaced fastmatch::fmatch() by IRanges::match,CharacterList-method after disscussion at https://github.com/rcastelo/GSVA/issues/39 to avoid the row names of an input expression matrix being altered by fastmatch::fmatch() adding an attribute.
13
+
14
+   o Fixed wrong call to .mapGeneSetsToFeatures() when gene sets are given in a GeneSetCollection object.
15
+
1 16
 CHANGES IN VERSION 1.36
2 17
 -----------------------
3 18
 
... ...
@@ -22,6 +22,8 @@ setMethod("gsva", signature(expr="HDF5Array", gset.idx.list="list"),
22 22
   method <- match.arg(method)
23 23
   kcdf <- match.arg(kcdf)
24 24
   
25
+  warning("Using 'HDF5Array' objects as input is still in an experimental stage.")
26
+
25 27
   ## filter genes according to verious criteria,
26 28
   ## e.g., constant expression
27 29
   expr <- .filterFeatures(expr, method)
... ...
@@ -69,6 +71,8 @@ setMethod("gsva", signature(expr="SingleCellExperiment", gset.idx.list="list"),
69 71
   method <- match.arg(method)
70 72
   kcdf <- match.arg(kcdf)
71 73
   
74
+  warning("Using 'SingleCellExperiment' objects as input is still in an experimental stage.")
75
+
72 76
   if (length(assays(expr)) == 0L)
73 77
     stop("The input SummarizedExperiment object has no assay data.")
74 78
   
... ...
@@ -139,6 +143,8 @@ setMethod("gsva", signature(expr="dgCMatrix", gset.idx.list="list"),
139 143
   method <- match.arg(method)
140 144
   kcdf <- match.arg(kcdf)
141 145
   
146
+  warning("Using 'dgCMatrix' objects as input is still in an experimental stage.")
147
+
142 148
   ## filter genes according to verious criteria,
143 149
   ## e.g., constant expression
144 150
   expr <- .filterFeatures(expr, method)
... ...
@@ -208,8 +214,8 @@ setMethod("gsva", signature(expr="SummarizedExperiment", gset.idx.list="GeneSetC
208 214
 
209 215
   annotpkg <- metadata(se)$annotation
210 216
   if (!is.null(annotpkg) && length(annotpkg) > 0 && is.character(annotpkg) && annotpkg != "") {
211
-    if (!annotpkg %in% installed.packages())
212
-      stop(sprintf("Please install the nnotation package %s", annotpkg))
217
+    if (all(!c(annotpkg, paste0(annotpkg, ".db")) %in% installed.packages()))
218
+      stop(sprintf("Please install the annotation package %s. If %s does not seem to exist as a package, please try to append the suffix .db to its name.", annotpkg, annotpkg))
213 219
 
214 220
     if (verbose)
215 221
       cat("Mapping identifiers between gene sets and feature names\n")
... ...
@@ -404,8 +410,8 @@ setMethod("gsva", signature(expr="ExpressionSet", gset.idx.list="GeneSetCollecti
404 410
 
405 411
   annotpkg <- Biobase::annotation(eset)
406 412
   if (length(annotpkg) > 0 && annotpkg != "") {
407
-    if (!annotpkg %in% installed.packages())
408
-      stop(sprintf("Please install the nnotation package %s", annotpkg))
413
+    if (all(!c(annotpkg, paste0(annotpkg, ".db")) %in% installed.packages()))
414
+      stop(sprintf("Please install the annotation package %s. If %s does not seem to exist as a package, please try to append the suffix .db to its name.", annotpkg, annotpkg))
409 415
 
410 416
     if (verbose)
411 417
       cat("Mapping identifiers between gene sets and feature names\n")
... ...
@@ -573,6 +579,8 @@ setMethod("gsva", signature(expr="matrix", gset.idx.list="list"),
573 579
   BPPARAM=SerialParam(progressbar=verbose)) {
574 580
   
575 581
   if(is(expr, "DelayedArray")){
582
+    warning("Using 'DelayedArray' objects as input is still in an experimental stage.")
583
+
576 584
     return(.gsvaDelayedArray(expr, gset.idx.list, method, kcdf, rnaseq, abs.ranking,
577 585
                              parallel.sz, mx.diff, tau, kernel, ssgsea.norm, verbose, BPPARAM))
578 586
   }
... ...
@@ -62,7 +62,8 @@
62 62
     
63 63
     return(plageDelayed(expr, gset.idx.list, parallel.sz, verbose, BPPARAM=BPPARAM))
64 64
   }
65
-  
65
+
66
+  stop("Not yet implemented for 'method=\"gsva\"'")
66 67
 }
67 68
 
68 69
 h5BackendRealization <- function(gSetIdx, FUN, Z, bpp) {
... ...
@@ -7,12 +7,12 @@ geneSetsUI <- function(id){
7 7
                    "From workspace" = "varGeneset")),
8 8
     conditionalPanel(
9 9
       condition = "input.genesetSourceType == 'fileGeneset'", ns = ns,
10
-      fileInput(ns("genesetFile"), "Choose GeneSet file:",
10
+      fileInput(ns("genesetFile"), "Choose gene sets file:",
11 11
                 accept = c(".gmt", "text/csv", ".csv"))
12 12
     ),
13 13
     conditionalPanel(
14 14
       condition = "input.genesetSourceType == 'varGeneset'", ns = ns, 
15
-      selectInput(ns("genesetVar"), "Choose GeneSet var:",
15
+      selectInput(ns("genesetVar"), "Choose gene sets object:",
16 16
                   ls(envir=.GlobalEnv))
17 17
     )
18 18
   )
... ...
@@ -42,4 +42,4 @@ geneSetsServer <- function(id){
42 42
       }
43 43
     })
44 44
   })
45
-}
46 45
\ No newline at end of file
46
+}
... ...
@@ -7,7 +7,7 @@ matrixUI <- function(id){
7 7
                    "From workspace" = "varMatrix")),
8 8
     conditionalPanel(
9 9
       condition = "input.matrixSourceType == 'fileMatrix'", ns = ns,
10
-      fileInput(ns("matrixFile"), "Choose matrix file:",
10
+      fileInput(ns("matrixFile"), "Choose expression data matrix file:",
11 11
                 accept = c(
12 12
                   "text/csv",
13 13
                   "text/comma-separated-values,text/plain",
... ...
@@ -15,7 +15,7 @@ matrixUI <- function(id){
15 15
     ),
16 16
     conditionalPanel(
17 17
       condition = "input.matrixSourceType == 'varMatrix'", ns= ns,
18
-      selectInput(ns("matrixVar"), "Choose matrix var:",
18
+      selectInput(ns("matrixVar"), "Choose expression data matrix object:",
19 19
                   ls(envir=.GlobalEnv))
20 20
     )
21 21
   )
... ...
@@ -40,4 +40,4 @@ matrixServer <- function(id){
40 40
       }
41 41
     })
42 42
   })
43
-}
44 43
\ No newline at end of file
44
+}
... ...
@@ -21,11 +21,11 @@ plot2_Server <- function(id, eventData1, rv){
21 21
         data <- rv$dat.t[Sample==rv$sample.c]
22 22
         p <- ggplot(data = data, aes(x=value, color=Sample)) +
23 23
           stat_ecdf(geom="point") + theme(legend.position = "none") +
24
-          labs(x=paste0(method, " Scores in selected sample"), y="Empirical Cumulative Density") +
24
+          labs(x=paste0(method, " scores in selected sample"), y="Empirical cumulative distribution") +
25 25
           scale_color_manual("Legend", values = rv$dd.col)
26 26
         rv$p2 <- ggplotly(p, source="click2") %>% style(text=data$gene.sets)
27 27
         rv$p2
28 28
       })
29 29
     }
30 30
   )
31
-}
32 31
\ No newline at end of file
32
+}
... ...
@@ -24,7 +24,7 @@ plot3_Server <- function(id, eventData2, rv, matrix, genesets){
24 24
         rv$p3 <- ggplot(data = df, aes(x=x, color = Sample, label = Gene)) +
25 25
           stat_density(geom="line", position = "identity") +
26 26
           geom_rug() + theme(legend.position = "none") +
27
-          labs(x="Gene Expressions in selected sample", y="Density") +
27
+          labs(x="Gene expression values in selected gene set and sample", y="Density") +
28 28
           xlim(as.numeric(range(matrix()))) +
29 29
           scale_color_manual("legend", values= rv$dd.col)
30 30
         ggplotly(rv$p3, tooltip = c("Gene", "x")) %>% style(hoverinfo="none", traces = 1) %>%
... ...
@@ -33,4 +33,4 @@ plot3_Server <- function(id, eventData2, rv, matrix, genesets){
33 33
       })
34 34
     }
35 35
   )
36
-}
37 36
\ No newline at end of file
37
+}
... ...
@@ -188,17 +188,20 @@ function(input, output, session) {
188 188
     req(rv$gs)
189 189
     tagList(
190 190
       br(),
191
-      div("To see the Empirical Cumulative Distribution Function 
192
-      of a Sample, click on its line in this plot and go to the 
193
-      'Gene.Set' Panel", style="text-align: center;")
191
+      div("Non-parametric kernel density estimation of sample
192
+          profiles of GSVA enrichment scores. Clicking on the
193
+          line of a sample will display the empirical cumulative
194
+          distribution of GSVA scores for that sample on the
195
+          'GeneSets' tab", style="text-align: center;")
194 196
     )
195 197
   })
196 198
   
197 199
   # TABLE
198 200
   output$result <- renderTable({
199 201
     req(rv$gs)
200
-    resultInformation <- data.frame("Nr of gene sets" = nrow(rv$gs),
201
-                                    "Nr of samples" = ncol(rv$gs))
202
+    resultInformation <- data.frame("Nr. of gene sets" = nrow(rv$gs),
203
+                                    "Nr. of samples" = ncol(rv$gs),
204
+                                    check.names=FALSE)
202 205
     resultInformation
203 206
   }, bordered = TRUE)
204 207
   
... ...
@@ -212,9 +215,11 @@ function(input, output, session) {
212 215
   output$text3 <- renderUI({
213 216
     tagList(
214 217
       br(),
215
-      div("To see the Kernel Density Estimation of genes of any given
216
-      Gene Set in this Sample,  click on any point in this plot and a
217
-      second plot will appear bellow it", style = "text-align: center;")
218
+      div("Empirical cumulative distribution of GSVA scores, where each
219
+          point is a gene set. Clicking on a gene set will display below
220
+          the individual gene expression values of its constituent genes
221
+          and the non-parametric kernel density estimation of their
222
+          distribution", style = "text-align: center;")
218 223
     )
219 224
   })
220 225
   
... ...
@@ -256,7 +256,7 @@ In general, the default values for the previous parameters are suitable for
256 256
 most analysis settings, which usually consist of some kind of normalized
257 257
 continuous expression values.
258 258
 
259
-# Gene set definitions and gene identifier mapping
259
+# Gene set definitions
260 260
 
261 261
 Gene sets constitute a simple, yet useful, way to define pathways because we
262 262
 use pathway membership definitions only, neglecting the information on molecular
... ...
@@ -297,6 +297,173 @@ length(genesbygo)
297 297
 head(genesbygo)
298 298
 ```
299 299
 
300
+A more sophisticated container for gene sets is the `GeneSetCollection`
301
+object class defined in the `r Biocpkg("GSEABase")` package, which also
302
+provides the function `getGmt()` to import
303
+[gene matrix transposed (GMT)](http://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29) files such as those
304
+provided by [MSigDB](https://www.gsea-msigdb.org/gsea/msigdb) into a
305
+`GeneSetCollection` object. The experiment data package
306
+`r Biocpkg("GSVAdata")` provides one such object with the old (3.0) version
307
+of the C2 collection of curated genesets from [MSigDB](https://www.gsea-msigdb.org/gsea/msigdb),
308
+which can be loaded as follows.
309
+
310
+```{r}
311
+library(GSEABase)
312
+library(GSVAdata)
313
+
314
+data(c2BroadSets)
315
+class(c2BroadSets)
316
+c2BroadSets
317
+```
318
+
319
+The documentation of `r Biocpkg("GSEABase")` contains a description of the
320
+`GeneSetCollection` class and its associated methods.
321
+
322
+# Quantification of pathway activity in bulk microarray and RNA-seq data
323
+
324
+Here we illustrate how GSVA provides an analogous quantification of pathway
325
+activity in both microarray and RNA-seq data by using two such datasets that
326
+have been derived from the same biological samples. More concretely, we will
327
+use gene expression data of lymphoblastoid cell lines (LCL) from HapMap
328
+individuals that have been profiled using both technologies
329
+[@huang_genome-wide_2007, @pickrell_understanding_2010]. These data form part
330
+of the experimental package `r Biocpkg("GSVAdata")` and the corresponding help
331
+pages contain details on how the data were processed. We start loading these
332
+data and verifying that they indeed contain expression data for the same genes
333
+and samples, as follows:
334
+
335
+```{r}
336
+library(Biobase)
337
+
338
+data(commonPickrellHuang)
339
+
340
+stopifnot(identical(featureNames(huangArrayRMAnoBatchCommon_eset),
341
+                    featureNames(pickrellCountsArgonneCQNcommon_eset)))
342
+stopifnot(identical(sampleNames(huangArrayRMAnoBatchCommon_eset),
343
+                    sampleNames(pickrellCountsArgonneCQNcommon_eset)))
344
+```
345
+Next, for the current analysis we use the subset of canonical pathways from the C2
346
+collection of MSigDB Gene Sets. These correspond to the following pathways from
347
+KEGG, REACTOME and BIOCARTA:
348
+
349
+```{r}
350
+canonicalC2BroadSets <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)),
351
+                                      grep("^REACTOME", names(c2BroadSets)),
352
+                                      grep("^BIOCARTA", names(c2BroadSets)))]
353
+canonicalC2BroadSets
354
+```
355
+Additionally, we extend this collection of gene sets with two formed by genes
356
+with sex-specific expression, which also form part of the `r Biocpkg("GSVAdata")`
357
+experiment data package. Here we use the constructor function `GeneSet` from the
358
+`r Biocpkg("GSEABase")` package to build the objects that we add to the
359
+`GeneSetCollection` object `canonicalC2BroadSets`.
360
+
361
+```{r}
362
+data(genderGenesEntrez)
363
+
364
+MSY <- GeneSet(msYgenesEntrez, geneIdType=EntrezIdentifier(),
365
+                              collectionType=BroadCollection(category="c2"), setName="MSY")
366
+MSY
367
+XiE <- GeneSet(XiEgenesEntrez, geneIdType=EntrezIdentifier(),
368
+                              collectionType=BroadCollection(category="c2"), setName="XiE")
369
+XiE
370
+
371
+canonicalC2BroadSets <- GeneSetCollection(c(canonicalC2BroadSets, MSY, XiE))
372
+canonicalC2BroadSets
373
+```
374
+We calculate now GSVA enrichment scores for these gene sets using first the
375
+microarray data and then the RNA-seq integer count data. Note that the only
376
+requirement to do the latter is to set the argument `kcdf="Poisson"`, which is
377
+`"Gaussian"` by default. Note, however, that if our RNA-seq derived expression
378
+levels would be continuous, such as log-CPMs, log-RPKMs or log-TPMs, the default
379
+value of the `kcdf` argument should remain unchanged.
380
+
381
+```{r, results="hide"}
382
+esmicro <- gsva(huangArrayRMAnoBatchCommon_eset, canonicalC2BroadSets, min.sz=5, max.sz=500)
383
+esrnaseq <- gsva(pickrellCountsArgonneCQNcommon_eset, canonicalC2BroadSets, min.sz=5, max.sz=500,
384
+                 kcdf="Poisson")
385
+```
386
+We are going to assess how gene expression profiles correlate between microarray
387
+and RNA-seq data and compare those correlations with the ones derived at pathway
388
+level. To compare gene expression values of both technologies, we will transform
389
+first the RNA-seq integer counts into log-CPM units of expression using the
390
+`cpm()` function from the `r Biocpkg("edgeR")` package.
391
+
392
+```{r}
393
+library(edgeR)
394
+
395
+lcpms <- cpm(exprs(pickrellCountsArgonneCQNcommon_eset), log=TRUE)
396
+```
397
+We calculate Spearman correlations between gene expression profiles of the
398
+previous log-CPM values and the microarray RMA values.
399
+
400
+```{r}
401
+genecorrs <- sapply(1:nrow(lcpms),
402
+                    function(i, expmicro, exprnaseq) cor(expmicro[i, ], exprnaseq[i, ],
403
+                                                         method="spearman"),
404
+                    exprs(huangArrayRMAnoBatchCommon_eset), lcpms)
405
+names(genecorrs) <- rownames(lcpms)
406
+```
407
+Now calculate Spearman correlations between GSVA enrichment scores derived
408
+from the microarray and the RNA-seq data.
409
+
410
+```{r}
411
+pwycorrs <- sapply(1:nrow(esmicro),
412
+                   function(i, esmicro, esrnaseq) cor(esmicro[i, ], esrnaseq[i, ],
413
+                                                      method="spearman"),
414
+                   exprs(esmicro), exprs(esrnaseq))
415
+names(pwycorrs) <- rownames(esmicro)
416
+```
417
+Figure \@ref(fig:compcorrs) below shows the two distributions of these
418
+correlations and we can see that GSVA enrichment scores provide an agreement
419
+between microarray and RNA-seq data comparable to the one observed between
420
+gene-level units of expression.
421
+
422
+```{r compcorrs, height=500, width=1000, fig.cap="Comparison of correlation values of gene and pathway expression profiles derived from microarray and RNA-seq data."}
423
+par(mfrow=c(1, 2), mar=c(4, 5, 3, 2))
424
+hist(genecorrs, xlab="Spearman correlation", main="Gene level\n(RNA-seq log-CPMs vs microarray RMA)",
425
+     xlim=c(-1, 1), col="grey", las=1)
426
+hist(pwycorrs, xlab="Spearman correlation", main="Pathway level\n(GSVA enrichment scores)",
427
+     xlim=c(-1, 1), col="grey", las=1)
428
+```
429
+
430
+Finally, in Figure \@ref(fig:compsexgenesets) we compare the actual GSVA
431
+enrichment scores for two gene sets formed by genes with sex-specific expression.
432
+Concretely, one gene set (XIE) formed by genes that escape chromosome
433
+X-inactivation in females [@carrel_x-inactivation_2005] and another gene set
434
+(MSY) formed by genes located on the male-specific region of chromosome Y
435
+[@skaletsky_male-specific_2003].
436
+
437
+```{r compsexgenesets, height=500, width=1000, fig.cap="Comparison of GSVA enrichment scores obtained from microarray and RNA-seq data for two gene sets formed by genes with sex-specific expression."}
438
+par(mfrow=c(1, 2))
439
+rmsy <- cor(exprs(esrnaseq)["MSY", ], exprs(esmicro)["MSY", ])
440
+plot(exprs(esrnaseq)["MSY", ], exprs(esmicro)["MSY", ], xlab="GSVA scores RNA-seq",
441
+     ylab="GSVA scores microarray", main=sprintf("MSY R=%.2f", rmsy), las=1, type="n")
442
+abline(lm(exprs(esmicro)["MSY", ] ~ exprs(esrnaseq)["MSY", ]), lwd=2, lty=2, col="grey")
443
+points(exprs(esrnaseq["MSY", pickrellCountsArgonneCQNcommon_eset$Gender == "Female"]),
444
+       exprs(esmicro)["MSY", huangArrayRMAnoBatchCommon_eset$Gender == "Female"],
445
+       col="red", pch=21, bg="red", cex=1)
446
+points(exprs(esrnaseq)["MSY", pickrellCountsArgonneCQNcommon_eset$Gender == "Male"],
447
+       exprs(esmicro)["MSY", huangArrayRMAnoBatchCommon_eset$Gender == "Male"],
448
+       col="blue", pch=21, bg="blue", cex=1)
449
+legend("topleft", c("female", "male"), pch=21, col=c("red", "blue"), pt.bg=c("red", "blue"), inset=0.01)
450
+rxie <- cor(exprs(esrnaseq)["XiE", ], exprs(esmicro)["XiE", ])
451
+plot(exprs(esrnaseq)["XiE", ], exprs(esmicro)["XiE", ], xlab="GSVA scores RNA-seq",
452
+     ylab="GSVA scores microarray", main=sprintf("XiE R=%.2f", rxie), las=1, type="n")
453
+abline(lm(exprs(esmicro)["XiE", ] ~ exprs(esrnaseq)["XiE", ]), lwd=2, lty=2, col="grey")
454
+points(exprs(esrnaseq["XiE", pickrellCountsArgonneCQNcommon_eset$Gender == "Female"]),
455
+       exprs(esmicro)["XiE", huangArrayRMAnoBatchCommon_eset$Gender == "Female"],
456
+       col="red", pch=21, bg="red", cex=1)
457
+points(exprs(esrnaseq)["XiE", pickrellCountsArgonneCQNcommon_eset$Gender == "Male"],
458
+       exprs(esmicro)["XiE", huangArrayRMAnoBatchCommon_eset$Gender == "Male"],
459
+       col="blue", pch=21, bg="blue", cex=1)
460
+legend("topleft", c("female", "male"), pch=21, col=c("red", "blue"), pt.bg=c("red", "blue"), inset=0.01)
461
+```
462
+We can see how microarray and RNA-seq single-sample GSVA enrichment scores
463
+correlate very well in these gene sets, with $\rho=0.80$ for the male-specific
464
+gene set and $\rho=0.79$ for the female-specific gene set. Male and female
465
+samples show higher GSVA enrichment scores in their corresponding gene set.
466
+
300 467
 # Example applications
301 468
 
302 469
 ## Molecular signature identification
... ...
@@ -313,8 +480,6 @@ into account. We start by having a quick glance to the data, which forms part of
313 480
 the `r Biocpkg("GSVAdata")` package:
314 481
 
315 482
 ```{r}
316
-library(GSVAdata)
317
-
318 483
 data(gbm_VerhaakEtAl)
319 484
 gbm_eset
320 485
 head(featureNames(gbm_eset))
... ...
@@ -327,8 +492,8 @@ lapply(brainTxDbSets, head)
327 492
 GSVA enrichment scores for the gene sets contained in `brainTxDbSets`
328 493
 are calculated, in this case using `mx.diff=FALSE`,  as follows:
329 494
 
330
-```{r}
331
-gbm_es <- gsva(gbm_eset, brainTxDbSets, mx.diff=FALSE, verbose=FALSE)
495
+```{r, results="hide"}
496
+gbm_es <- gsva(gbm_eset, brainTxDbSets, mx.diff=FALSE)
332 497
 ```
333 498
 
334 499
 Figure \@ref(fig:gbmSignature) shows the GSVA enrichment scores obtained for the
... ...
@@ -374,6 +539,117 @@ mtext("Gene sets", side=4, line=0, cex=1.5)
374 539
 mtext("Samples          ", side=1, line=4, cex=1.5)
375 540
 ```
376 541
 
542
+## Differential expression at pathway level
543
+
544
+We illustrate here how to conduct a differential expression analysis at
545
+pathway level. We will use an example gene expression microarray data from
546
+@armstrong_mll_2002, which consists of 37 different individuals with human
547
+acute leukemia, where 20 of them have conventional childhood acute
548
+lymphoblastic leukemia (ALL) and the other 17 are affected with the MLL
549
+(mixed-lineage leukemia gene) translocation. This leukemia data set is stored as
550
+an `ExpressionSet` object called `leukemia` in the `r Biocpkg("GSVAdata")`
551
+package and and details on how the data was pre-processed can be found in the
552
+corresponding help page. Jointly with the RMA expression values, we
553
+provide some metadata including the main phenotype corresponding to the leukemia
554
+sample subtype.
555
+
556
+```{r}
557
+data(leukemia)
558
+leukemia_eset
559
+```
560
+
561
+Next, we calculate GSVA enrichment scores setting the minimum and maximum
562
+gene set size to 10 and 500 genes, respectively.
563
+
564
+```{r, results="hide"}
565
+leukemia_es <- gsva(leukemia_eset, c2BroadSets, min.sz=10, max.sz=500)
566
+```
567
+The object returned by the function `gsva()` is always of the same class
568
+as the input object with the expression data. Therefore, in this case,
569
+we obtain an `ExpressionSet` object with features corresponding to gene
570
+sets.
571
+
572
+```{r}
573
+class(leukemia_es)
574
+leukemia_es
575
+head(featureNames(leukemia_es))
576
+```
577
+We will perform now a differential expression analysis using `r Biocpkg("limma")`
578
+[@Smyth_2004] between the two different leukemia subtypes.
579
+
580
+```{r}
581
+library(limma)
582
+
583
+mod <- model.matrix(~ factor(leukemia_es$subtype))
584
+colnames(mod) <- c("ALL", "MLLvsALL")
585
+fit <- lmFit(leukemia_es, mod)
586
+fit <- eBayes(fit)
587
+res <- decideTests(fit, p.value=0.01)
588
+summary(res)
589
+```
590
+
591
+There are `r sum(res[, 2] != 0)` MSigDB C2 differentially expressed pathways with
592
+FDR < 1%. Figure @\ref(fig:leukemiavolcano) below shows a volcano plot of the
593
+expression changes.
594
+
595
+```{r leukemiavolcano, height=700, width=500, fig.cap="Volcano plot for the differential expression analysis at pathway level between two leukemia subtypes."}
596
+tt <- topTable(fit, coef=2, n=Inf)
597
+DEpwys <- rownames(tt)[tt$adj.P.Val <= 0.01]
598
+plot(tt$logFC, -log10(tt$P.Value), pch=".", cex=4, col=grey(0.75),
599
+     main="", xlab="GSVA enrichment score difference", ylab=expression(-log[10]~~Raw~P-value))
600
+abline(h=-log10(max(tt$P.Value[tt$adj.P.Val <= 0.01])), col=grey(0.5), lwd=1, lty=2)
601
+points(tt$logFC[match(DEpwys, rownames(tt))],
602
+       -log10(tt$P.Value[match(DEpwys, rownames(tt))]), pch=".", cex=5, col="darkred")
603
+text(max(tt$logFC)*0.85, -log10(max(tt$P.Value[tt$adj.P.Val <= 0.01])), "1% FDR", pos=3)
604
+```
605
+
606
+Figure @\ref(fig:leukemiaheatmap} below shows a heatmap of GSVA enrichment scores for
607
+the `r sum(res[, 2] != 0)` differentially expressed pathways.
608
+
609
+```{r leukemiaheatmap, height=500, width=1200, fig.cap="Heatmap of GSVA enrichment scores for the differentially expressed pathways between two leukemia subtypes."}
610
+DEpwys_es <- exprs(leukemia_es[DEpwys, ])
611
+colorLegend <- c("darkred", "darkblue")
612
+names(colorLegend) <- c("ALL", "MLL")
613
+sample.color.map <- colorLegend[pData(leukemia_es)[, "subtype"]]
614
+names(sample.color.map) <- colnames(DEpwys_es)
615
+sampleClustering <- hclust(as.dist(1-cor(DEpwys_es, method="spearman")),
616
+                           method="complete")
617
+geneSetClustering <- hclust(as.dist(1-cor(t(DEpwys_es), method="pearson")),
618
+                            method="complete")
619
+heatmap(DEpwys_es, ColSideColors=sample.color.map, xlab="samples",
620
+        ylab="Pathways", margins=c(2, 20),
621
+        labRow=substr(gsub("_", " ", gsub("^KEGG_|^REACTOME_|^BIOCARTA_", "",
622
+                                          rownames(DEpwys_es))), 1, 35),
623
+        labCol="", scale="row", Colv=as.dendrogram(sampleClustering),
624
+        Rowv=as.dendrogram(geneSetClustering))
625
+legend("topleft", names(colorLegend), fill=colorLegend, inset=0.01, bg="white")
626
+```
627
+
628
+# Interactive web app
629
+
630
+The `gsva()` function can be also used through an interactive web app developed
631
+with `r CRANpkg("shiny")`. To start it just type on the R console:
632
+
633
+```{r, eval=FALSE}
634
+res <- igsva()
635
+```
636
+
637
+It will open your browser with the web app shown here below. The button
638
+`SAVE & CLOSE` will close the app and return the resulting object on the
639
+R console. Hence, the need to call igsva() on the right-hand side of an
640
+assignment if you want to store the result in your workspace. Alternatively,
641
+you can use the `DOWNLOAD` button to download the result in a CSV file.
642
+
643
+![](webapp1.png)
644
+
645
+In the starting window of the web app, after running GSVA, a non-parametric
646
+kernel density estimation of sample profiles of GSVA scores will be shown.
647
+By clicking on one of the lines, the cumulative distribution of GSVA scores
648
+for the corresponding samples will be shown in the `GeneSets` tab, as
649
+illustrated in the image below.
650
+
651
+![](webapp2.png)
652
+
377 653
 # Session information {.unnumbered}
378 654
 
379 655
 Here is the output of `sessionInfo()` on the system on which this document was
380 656
new file mode 100644
381 657
Binary files /dev/null and b/vignettes/webapp1.png differ
382 658
new file mode 100644
383 659
Binary files /dev/null and b/vignettes/webapp2.png differ