Browse code

adding dgCMatrix/list support

pablo-rodr-bio2 authored on 02/11/2020 18:20:08
Showing 5 changed files

... ...
@@ -7,7 +7,7 @@ Authors@R: c(person("Justin", "Guinney", role=c("aut", "cre"), email="justin.gui
7 7
              person("Pablo Sebastian", "Rodriguez", role="ctb", email="pablosebastian.rodriguez@upf.edu"))
8 8
 Depends: R (>= 3.5.0)
9 9
 Imports: methods, stats, utils, graphics, BiocGenerics, S4Vectors,
10
-         Biobase, SummarizedExperiment, GSEABase,
10
+         Biobase, SummarizedExperiment, GSEABase, Matrix,
11 11
          parallel, BiocParallel, fastmatch, SingleCellExperiment
12 12
 Suggests: RUnit, limma, RColorBrewer, genefilter, edgeR, GSVAdata,
13 13
           shiny, shinythemes, ggplot2, data.table, plotly
... ...
@@ -7,6 +7,7 @@ importClassesFrom(Biobase, ExpressionSet)
7 7
 importClassesFrom(SummarizedExperiment, SummarizedExperiment)
8 8
 importClassesFrom(GSEABase, GeneSetCollection)
9 9
 importClassesFrom(SingleCellExperiment, SingleCellExperiment)
10
+importClassesFrom(Matrix, dgCMatrix)
10 11
 
11 12
 importMethodsFrom(Biobase, featureNames,
12 13
                            phenoData,
... ...
@@ -81,6 +81,61 @@ setMethod("gsva", signature(expr="SingleCellExperiment", gset.idx.list="list"),
81 81
   
82 82
   rval
83 83
           })
84
+          
85
+setMethod("gsva", signature(expr="dgCMatrix", gset.idx.list="list"),
86
+          function(expr, gset.idx.list, annotation,
87
+  method=c("gsva", "ssgsea", "zscore", "plage"),
88
+  kcdf=c("Gaussian", "Poisson", "none"),
89
+  abs.ranking=FALSE,
90
+  min.sz=1,
91
+  max.sz=Inf,
92
+  parallel.sz=1L,
93
+  mx.diff=TRUE,
94
+  tau=switch(method, gsva=1, ssgsea=0.25, NA),
95
+  ssgsea.norm=TRUE,
96
+  verbose=TRUE,
97
+  BPPARAM=SerialParam(progressbar=verbose))
98
+{
99
+  method <- match.arg(method)
100
+  kcdf <- match.arg(kcdf)
101
+  
102
+  ## filter genes according to verious criteria,
103
+  ## e.g., constant expression
104
+  expr <- .filterFeaturesSparse(expr, method)
105
+  
106
+  if (nrow(expr) < 2)
107
+    stop("Less than two genes in the input assay object\n")
108
+  
109
+  ## map to the actual features for which expression data is available
110
+  mapped.gset.idx.list <- lapply(gset.idx.list,
111
+                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
112
+                                 rownames(expr))
113
+  
114
+  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
115
+    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
116
+  
117
+  ## remove gene sets from the analysis for which no features are available
118
+  ## and meet the minimum and maximum gene-set size specified by the user
119
+  mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
120
+                                         min.sz=max(1, min.sz),
121
+                                         max.sz=max.sz)
122
+  
123
+  if (!missing(kcdf)) {
124
+    if (kcdf == "Gaussian") {
125
+      rnaseq <- FALSE
126
+      kernel <- TRUE
127
+    } else if (kcdf == "Poisson") {
128
+      rnaseq <- TRUE
129
+      kernel <- TRUE
130
+    } else
131
+      kernel <- FALSE
132
+  }
133
+  
134
+  rval <- .gsva(expr, mapped.gset.idx.list, method, kcdf, rnaseq, abs.ranking,
135
+                parallel.sz, mx.diff, tau, kernel, ssgsea.norm, verbose, BPPARAM)
136
+  
137
+  rval
138
+})
84 139
 
85 140
 setMethod("gsva", signature(expr="SummarizedExperiment", gset.idx.list="GeneSetCollection"),
86 141
           function(expr, gset.idx.list, annotation,
... ...
@@ -13,3 +13,42 @@
13 13
 
14 14
   expr
15 15
 }
16
+
17
+## transforms a dgCMatrix into a list of its
18
+## non-zero values by MARGIN (1 for row, 2 for column)
19
+.sparseToList <-function(dgCMat, MARGIN){
20
+  MARGIN <- as.integer(MARGIN)
21
+  J <- rep(1:ncol(dgCMat), diff(dgCMat@p))
22
+  I <- dgCMat@i + 1
23
+  x <- dgCMat@x
24
+  if (MARGIN == 1L) {
25
+    result <- split(x, I)
26
+    names(result) <- rownames(dgCMat)[as.numeric(names(result))]
27
+  } else if (MARGIN == 2L) {
28
+    result <- split(x, J)
29
+    names(result) <- colnames(dgCMat)[as.numeric(names(result))]
30
+  }
31
+  else {
32
+    warning("invalid MARGIN; return NULL")
33
+    result <- NULL
34
+  }
35
+  result
36
+}
37
+
38
+## filter out genes which non-zero values have
39
+## constant expression values
40
+.filterFeaturesSparse <- function(expr, method) {
41
+  
42
+  sdGenes <- sapply(.sparseToList(expr, 1), sd)
43
+  
44
+  if (any(sdGenes == 0) || any(is.na(sdGenes))) {
45
+    warning(sum(sdGenes == 0 | is.na(sdGenes)),
46
+            " genes with constant expression values throuhgout the samples.")
47
+    if (method != "ssgsea") {
48
+      warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.")
49
+      expr <- expr[sdGenes > 0 & !is.na(sdGenes), ]
50
+    }
51
+  }
52
+  
53
+  expr
54
+}
16 55
\ No newline at end of file
... ...
@@ -1,6 +1,7 @@
1 1
 \name{gsva}
2 2
 \alias{gsva}
3 3
 \alias{gsva,SingleCellExperiment,list-method}
4
+\alias{gsva,dgCMatrix,list-method}
4 5
 \alias{gsva,SummarizedExperiment,list-method}
5 6
 \alias{gsva,SummarizedExperiment,GeneSetCollection-method}
6 7
 \alias{gsva,ExpressionSet,list-method}
... ...
@@ -29,6 +30,18 @@ Estimates GSVA enrichment scores.
29 30
     ssgsea.norm=TRUE,
30 31
     verbose=TRUE,
31 32
     BPPARAM=SerialParam(progressbar=verbose))
33
+\S4method{gsva}{dgCMatrix,list}(expr, gset.idx.list, annotation,
34
+    method=c("gsva", "ssgsea", "zscore", "plage"),
35
+    kcdf=c("Gaussian", "Poisson", "none"),
36
+    abs.ranking=FALSE,
37
+    min.sz=1,
38
+    max.sz=Inf,
39
+    parallel.sz=1L,
40
+    mx.diff=TRUE,
41
+    tau=switch(method, gsva=1, ssgsea=0.25, NA),
42
+    ssgsea.norm=TRUE,
43
+    verbose=TRUE,
44
+    BPPARAM=SerialParam(progressbar=verbose))
32 45
 \S4method{gsva}{SummarizedExperiment,GeneSetCollection}(expr, gset.idx.list, annotation,
33 46
     method=c("gsva", "ssgsea", "zscore", "plage"),
34 47
     kcdf=c("Gaussian", "Poisson", "none"),
... ...
@@ -106,7 +119,8 @@ Estimates GSVA enrichment scores.
106 119
   \item{expr}{Gene expression data which can be given either as a
107 120
               \code{SummarizedExperiment}, \code{SingleCellExperiment}
108 121
               \code{ExpressionSet} object, or as a matrix of expression
109
-              values where rows correspond to genes and columns correspond to samples.}
122
+              values where rows correspond to genes and columns correspond to samples.
123
+              This matrix can be also in a sparse format, as a \code{dgCMatrix}.}
110 124
   \item{gset.idx.list}{Gene sets provided either as a \code{list} object or as a
111 125
                        \code{GeneSetCollection} object.}
112 126
   \item{annotation}{In the case of calling \code{gsva()} on a
... ...
@@ -190,10 +204,11 @@ However, then the input gene sets in \code{gset.idx.list} is provided as a
190 204
 those identifiers to the type of identifier in the input expression data \code{expr}.
191 205
 Such an automatic conversion, however, will only occur in three scenarios: 1. when
192 206
 \code{expr} is an \code{ExpressionSet} object with an appropriately set
193
-\code{annotation} slot; 2. when \code{expr} is a \code{SummarizedExperiment} object
194
-with an appropriately set \code{annotation} slot in the metadata of \code{expr};
195
-3. when \code{expr} is a \code{matrix} and the \code{annotation} argument of the
196
-\code{gsva()} function is set to the name of the annotation package that provides
207
+\code{annotation} slot; 2. when \code{expr} is a \code{SummarizedExperiment} or a
208
+\code{SingleCellExperiment} object with an appropriately set \code{annotation} slot
209
+in the metadata of \code{expr}; 3. when \code{expr} is a \code{matrix} or a 
210
+\code{dgCMatrix} and the \code{annotation} argument of the \code{gsva()} function
211
+is set to the name of the annotation package that provides
197 212
 the relationships between the type of identifiers in \code{expr} and \code{gset.idx.list}.
198 213
 
199 214
 The collection of gene sets resulting from the previous identifier matching,
... ...
@@ -201,7 +216,8 @@ can be further filtered to require a minimun and/or maximum size by using the
201 216
 arguments \code{min.sz} and \code{max.sz}.
202 217
 }
203 218
 \value{
204
-A gene-set by sample matrix of GSVA enrichment scores.
219
+A gene-set by sample matrix (of \code{matrix} or \code{dgCMatrix} type, 
220
+depending on the input) of GSVA enrichment scores.
205 221
 }
206 222
 \references{
207 223
 Barbie, D.A. et al. Systematic RNA interference reveals that oncogenic KRAS-driven