Browse code

added hdf5 support for plage method

pablo-rodr-bio2 authored on 20/11/2020 18:38:41
Showing 7 changed files

... ...
@@ -1,512 +1,512 @@
1
-x1 <- t(scale(t(m)))
2
-x1
3
-m[gene.sets[[1]],]
4
-x1[gene.sets[[1]], ]
5
-x1 <- t(scale(t(m)))
6
-x1 <- svd(x1)
7
-x1 <- x1$v[,1]
8
-x1
9
-x2 <- runExactSVD(m, scale = TRUE, center = TRUE)
10
-x2 <- x2$v[,1]
11
-x2
12
-x2 <- runExactSVD(t(m), scale = TRUE, center = TRUE)
13
-x2 <- x2$v[,1]
14
-x2
15
-x1
16
-t(m)
17
-scale(t(m))
18
-x2 <- runExactSVD(m, scale = TRUE, center = TRUE)
19
-x1 <- t(scale(t(m)))
20
-x1 <- svd(x1)
21
-x1 <- x1$v[,1]
22
-x1
23
-x2 <- runExactSVD(m, scale = colMeans(m), center = TRUE)
24
-x2 <- x2$v[,1]
25
-x2
26
-colMeans(m)
27
-x2 <- runExactSVD(m, scale = colSds(m), center = TRUE)
28
-x2 <- x2$v[,1]
29
-x2
30
-x1
31
-x2
32
-x2 <- runExactSVD(m, scale = colSds(m), center = colMeans(m))
33
-x2 <- x2$v[,1]
34
-x1
35
-x2
36
-colSds(m)
37
-colMeans(m)
38
-x2 <- runExactSVD(t(m), scale = colSds(m), center = colMeans(m))
39
-x2 <- x2$v[,1]
40
-x2
41
-x1
42
-x2 <- runExactSVD(m, scale = colSds(t(m)), center = colMeans(t(m)))
43
-x2 <- x2$v[,1]
44
-x2
45
-x1
46
-x1 <- t(scale(t(m)))
47
-x1
48
-colSds(m)
49
-rowSds(m)
50
-x1 <- t(scale(t(m)))
51
-x1 <- svd(x1)
52
-x1 <- x1$v[,1]
53
-x1
54
-x2 <- runExactSVD(m, center= colMeans(m), scale= rowSds(m))
55
-x2 <- x2$v[,1]
56
-x2
57
-x2 <- runExactSVD(m, center= colMeans(m), scale= rowSds(m))
58
-x2$v[,1]
59
-x3 <- runSVD(m, k=Inf)
60
-x4 <- runExactSVD(m)
61
-x3
62
-x4 <- runExactSVD(m)
63
-x4
64
-x1 <- t(scale(t(m)))
65
-x1 <- svd(x1[gene.sets[[1]],])
66
-x1 <- x1$v[,1]
67
-x1
68
-x2 <- runExactSVD(m[gene.sets[[1]],], center= colMeans(m), scale= rowSds(m))
69
-x2 <- x2$v[,1]
70
-x2
71
-x3 <- runSVD(m[gene.sets[[1]],], k=Inf)
72
-x3$v[,1]
73
-x2
74
-x2 <- runExactSVD(m[gene.sets[[1]],], center= colMeans(m), scale= rowSds(m))
75
-x2
76
-x1
77
-x1 <- t(scale(t(m)))
78
-x1
79
-x1 <- svd(x1[gene.sets[[1]],])
80
-x1
81
-x2 <- runExactSVD(m[gene.sets[[1]],], center= colMeans(m), scale= rowSds(m))
82
-x2
83
-rowSds(m)
84
-x1 <- t(scale(t(m)))
85
-x1 <- svd(x1[gene.sets[[1]],])
86
-x1 <- x1$v[,1]
87
-x1
88
-x2 <- runExactSVD(m[gene.sets[[1]],], center= colMeans(m), scale= rowSds(t(m)))
89
-x2 <- x2$v[,1]
90
-x2
91
-colMeans(m)
92
-apply(m, 2, sd)
93
-x1 <- t(scale(t(m)))
94
-x1 <- svd(x1[gene.sets[[1]],])
95
-x1 <- x1$v[,1]
96
-x1
97
-x2 <- runExactSVD(m[gene.sets[[1]],], center= colMeans(m), scale= TRUE)
98
-x2 <- x2$v[,1]
99
-x2
100
-x1 <- scale(m)
101
-x1 <- t(scale(t(m)))
102
-x1 <- svd(x1[1,])
103
-x1
104
-x1 <- svd(x1[1,,drop=FALSE])
105
-x1 <- svd(x1[1, , drop=FALSE])
106
-x1[1, , drop=FALSE]
107
-x1[1,  drop=FALSE]
108
-x1 <- t(scale(t(m)))
109
-x1 <- t(scale(t(m)))
110
-x1 <- svd(x1[1,  ,drop=FALSE])
111
-x1
112
-x2 <- runExactSVD(m[1, ])
113
-x2 <- runExactSVD(m[1, , drop=FALSE])
114
-x2
115
-x2 <- runExactSVD(m[1, , drop=FALSE], center=TRUE, scale=TRUE)
116
-x2
117
-x1 <- t(scale(t(m)))
118
-x1 <- svd(x1[1:2,])
119
-x1
120
-x2 <- runExactSVD(m[1:2,], center=TRUE, scale=TRUE)
121
-x2
122
-x2 <- runExactSVD(m[1:2,], center=colMeans(t(m)), scale=TRUE)
123
-x2
124
-x <- gene.sets[[1]]
125
-x
126
-x1 <- svd(x1[x,])
127
-x <- gene.sets[[1]]
128
-x1 <- t(scale(t(m)))
129
-x1 <- svd(x1[x,])
130
-x1 <- x1$v[,1]
131
-x1
132
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= colSds(t(m)))
133
-x2 <- x2$v[,1]
134
-x2
135
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= colSds(m))
136
-x2 <- x2$v[,1]
137
-x2
138
-x1 <- t(scale(t(m)))
139
-x1 <- svd(x1[x,])
140
-x1 <- x1$v[,1]
141
-x1
142
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(t(m)))
143
-x2 <- x2$v[,1]
144
-x2
145
-x2 <- runExactSVD(m[x,], center= colMeans(m), scale= rowSds(t(m)))
146
-x2 <- x2$v[,1]
147
-x2
148
-x2 <- runExactSVD(m[x,], center= colMeans(m), scale= colSds(t(m)))
149
-x2 <- x2$v[,1]
150
-x2
151
-x1
152
-x2 <- runExactSVD(m[x,], center= colMeans(m), scale= colSds(m))
153
-x2 <- x2$v[,1]
154
-x2
155
-x1
156
-scale(t(m))
157
-colMeans(t(m))
158
-colSds(m)
159
-colSds(t(m))
160
-rowSds(m)
161
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(m))
162
-x2 <- x2$v[,1]
163
-x2
164
-x1
165
-x1 <- t(scale(t(m)))
166
-x1 <- svd(x1[x,])
167
-x1
168
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(m))
169
-x2
170
-min(dim(m[x,]))
171
-x2 <- runSvd(m[x,], k=2, center= colMeans(t(m)), scale= rowSds(m))
172
-x2 <- runSVD(m[x,], k=2, center= colMeans(t(m)), scale= rowSds(m))
173
-x2
174
-x1$v[,1]
175
-x2$v[,1]
176
-x2 <- runSVD(m[x,], k=2, center= colMeans(t(m)), scale= rowSds(m), BSPARAM = ExactParam())
177
-x2$v[,1]
178
-x2 <- runSVD(m[x,], k=2, center= colMeans(t(m)), scale= rowSds(m), BSPARAM = FastAutoParam())
179
-x2$v[,1]
180
-x2 <- runSVD(m[x,], k=2, center= colMeans(t(m)), scale= rowSds(m), BSPARAM = bsparam())
181
-x2$v[,1]
182
-x2 <- runSVD(m[x,], k=2, center= colMeans(t(m)), scale= rowSds(m), BSPARAM = ExactParam(deferred=TRUE))
183
-x2$v[,1]
184
-x1 <- t(scale(t(m)))
185
-x1
186
-colMeans(t(m))
187
-owSds(m)
188
-rowSds(m)
189
-m
190
-m <- matrix(sample.int(10, 25, T), 5, 5)
191
-colnames(m) <- paste0("cell_", 1:5)
192
-rownames(m) <- paste0("gene_", 1:5)
193
-set.seed(123)
194
-m <- matrix(sample.int(10, 25, T), 5, 5)
195
-colnames(m) <- paste0("cell_", 1:5)
196
-rownames(m) <- paste0("gene_", 1:5)
197
-gene.sets <- list("my_list1"= c(1,2))
198
-m <- matrix(sample.int(10, 25, T), 5, 5)
199
-colnames(m) <- paste0("cell_", 1:5)
200
-rownames(m) <- paste0("gene_", 1:5)
201
-x <- c(1,2)
202
-x1 <- t(scale(t(m)))
203
-x1 <- svd(x1[x,])
204
-x1$v[,1]
205
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(m))
206
-x2$v[,1]
207
-rowSds(m)
208
-scale(t(m))
209
-t(scale(t(m)))
210
-x1 <- t(scale(t(m)))
211
-x1@scaled
212
-str(x)
213
-str(x1)
214
-x1$scaled
215
-x1@scaled
216
-attr(x1)
217
-attr(x1, "scaled:scale")
218
-x2 <- runExactSVD(t(m[x,]), center= colMeans(t(m)), scale= rowSds(m))
219
-x2$v[,1]
220
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(m))
221
-x2$v[,1]
222
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(t(m)))
223
-x1$v[,1]
224
-x1 <- t(scale(t(m)))
225
-x1 <- svd(x1[x,])
226
-x1$v[,1]
227
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(t(m)))
228
-x2$v[,1]
229
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= rowSds(m))
230
-x2$v[,1]
231
-C <- colMeans(t(m))
232
-i <- 1
233
-sqrt(sum((m[,i] - C[i])^2)/(ncol(m)-1))
234
-x2 <- runExactSVD(m[x,], center= colMeans(t(m)), scale= TRUE)
235
-x2$v[,1]
236
-x1$v[,1]
237
-colMeans(t(m))
238
-rowMeans(m)
239
-rowSds(m)
240
-x1 <- t(scale(t(m)))
241
-x1 <- svd(x1[x,])
242
-x1$v[,1]
243
-x2 <- runExactSVD(m[x,], center= rowMeans(m), scale= rowSds(m))
244
-x2$v[,1]
245
-library(BiocSingular)
246
-set.seed(123)
247
-m <- matrix(sample.int(10, 25, T), 5, 5)
248
-x <- c(1,2)
249
-x1 <- t(scale(t(m)))
250
-x1 <- svd(x1[x,])
251
-x1$v[,1]
252
-x2 <- runExactSVD(t(scale(t(m))))
253
-x2$v[,1]
254
-x1 <- t(scale(t(m)))
255
-res1 <- svd(x1[x,])
256
-res1$v[,1]
257
-x2 <- runExactSVD(x1[x,])
258
-x2$v[,1]
259
-x3 <- runExactSVD(m[x,], center=colMeans(t(m)))
260
-x3
261
-library(matrixStats)
262
-x1 <- t(scale(t(m)))
263
-x2 <- t( (t(m) - colMeans(t(m))) /  colSds(t(m)) )
264
-x1
265
-x2
266
-x2 <- t( (t(m) - colMeans(t(m))) /  rowSds(t(m)) )
267
-x2
268
-x1 <- t(scale(t(m)))
269
-x2 <- t( (t(m) - colMeans(t(m))) /  apply(t(m), 2, sd) )
270
-x1
271
-x2
272
-apply(t(m), 2, sd)
273
-m
274
-x1
275
-t(m)
276
-scale(m)
277
-m
278
-x1 <- t(scale(t(m)))
279
-x1
280
-x1 <- t(scale(t(m)))
281
-x2 <- t( (t(m) - colMeans(t(m))) /  apply(t(m), 2, sd) )
282
-x1
283
-x2
284
-colMeans(t(m))
285
-apply(t(m), 2, sd)
1
+sdGenes <- apply(expr, 1, sd)
2
+}
3
+if (any(sdGenes == 0) || any(is.na(sdGenes))) {
4
+warning(sum(sdGenes == 0 | is.na(sdGenes)),
5
+" genes with constant expression values throuhgout the samples.")
6
+if (method != "ssgsea") {
7
+warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.")
8
+expr <- expr[sdGenes > 0 & !is.na(sdGenes), ]
9
+}
10
+}
11
+expr
12
+}
13
+expr <- .filterFeatures(expr, method)
14
+expr
15
+m <- matrix(sample.int(10, 25, T), 15, 15)
16
+colnames(m) <- paste0("cell_", 1:10)
17
+colnames(m) <- paste0("cell_", 1:15)
18
+rownames(m) <- paste0("gene_", 1:15)
19
+m <- as(m, "HDF5Array")
286 20
 m
287
-x1 <- scale(t(m))
288
-x2 <- (t(m) - colMeans(t(m))) /  apply(t(m), 2, sd)
289
-x1
290
-x2
291
-x2 <- (t(m) - colMeans(t(m))) /  (apply(t(m), 2, sd))
292
-x1
293
-x2
294
-x1 <- scale(m)
295
-x2 <- (m - colMeans(m)) /  (apply(m, 2, sd))
296
-x1
297
-x2
21
+gset.idx.list <- list("my_list1"= paste0("gene_", 1:2),
22
+"my_list2"= paste0("gene_", 6:7))
23
+expr <- m
24
+.filterFeatures <- function(expr, method) {
25
+if(is(expr, "DelayedArray")){
26
+sdGenes <- DelayedMatrixStats::rowSds(expr)
27
+} else {
28
+sdGenes <- apply(expr, 1, sd)
29
+}
30
+if (any(sdGenes == 0) || any(is.na(sdGenes))) {
31
+warning(sum(sdGenes == 0 | is.na(sdGenes)),
32
+" genes with constant expression values throuhgout the samples.")
33
+if (method != "ssgsea") {
34
+warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.")
35
+expr <- expr[sdGenes > 0 & !is.na(sdGenes), ]
36
+}
37
+}
38
+expr
39
+}
40
+expr <- .filterFeatures(expr, method)
41
+expr
42
+X <- expr
43
+Z <- t(DelayedArray::scale(t(X)))
44
+Z
45
+dim(m)
46
+nrow(m)
47
+rightsingularsvdvectorgset <- function(gSetIdx, Z) {
48
+s <- svd(Z[gSetIdx, ])
49
+s$v[, 1]
50
+}
51
+sink <- HDF5RealizationSink(dim = c(1L, nrow(Z)))
52
+dim(sink)
53
+rightsingularsvdvectorgset <- function(gSetIdx, Z) {
54
+s <- svd(Z[gSetIdx, ])
55
+s$v[, 1]
56
+}
57
+right2 <- function(gSetIdx, Z) {
58
+sink <- HDF5RealizationSink(dim = c(1L, nrow(Z)))
59
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
60
+for(bid in seq_along(sink_grid)){
61
+block <- rightsingularsvdvectorgset(Z, gSetIdx)
62
+block <- as(matrix(block, 1, nrow(Z)), "HDF5Matrix")
63
+sink <- write_block(sink, sink_grid[[bid]], block)
64
+}
65
+close(sink)
66
+res <- as(sink, "DelayedArray")
67
+res
68
+}
69
+Z <- t(DelayedArray::scale(t(X)))
70
+Z <- as(Z, "HDF5Array")
71
+es <- bplapply(geneSets, right2, Z,
72
+BPPARAM=BPPARAM)
73
+geneSets <- list("my_list1"=c(1,2), "mylist2"=c(3,4))
74
+es <- bplapply(geneSets, right2, Z,
75
+BPPARAM=BPPARAM)
76
+names(geneSets)
77
+length(names(geneSets))
78
+right2 <- function(gSetIdx, Z) {
79
+sink <- HDF5RealizationSink(dim = c(1L, nrow(Z)))
80
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
81
+for(bid in seq_along(sink_grid)){
82
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
83
+block <- as(matrix(block, 1, nrow(Z)), "HDF5Matrix")
84
+sink <- write_block(sink, sink_grid[[bid]], block)
85
+}
86
+close(sink)
87
+res <- as(sink, "DelayedArray")
88
+res
89
+}
90
+Z <- t(DelayedArray::scale(t(X)))
91
+Z <- as(Z, "HDF5Array")
92
+es <- bplapply(geneSets, right2, Z,
93
+BPPARAM=BPPARAM)
94
+es
95
+es <- do.call(rbind, es)
96
+es
97
+es <- as(es, "HDF5Array")
98
+es
99
+geneSets
100
+es2 <- es
101
+Z <- t(scale(t(X)))
102
+es <- bplapply(geneSets, rightsingularsvdvectorgset, Z,
103
+BPPARAM=BPPARAM)
104
+es <- do.call(rbind, es)
105
+es
106
+es2
107
+library(scRNAseq)
108
+sce <- ReprocessedAllenData("tophat_counts")
109
+library(scRNAseq)
110
+x <- do.call(cbind, lapply(1:20, function(j) {
111
+rpois(n = 10000, lambda = sample(20:40, 10000, replace = TRUE))
112
+}))
113
+colnames(x) <- paste0("S", 1:20)
114
+x <- realize(x, "HDF5Array")
115
+x
116
+geneSets <- list("my_list1"=1:500,
117
+"my_list2"=1001:1500)
118
+geneSets
119
+X <- x
120
+X
121
+plage2 <- function(X, geneSets, parallel.sz, verbose=TRUE,
122
+BPPARAM=SerialParam(progressbar=verbose)) {
123
+Z <- t(DelayedArray::scale(t(X)))
124
+Z <- as(Z, "HDF5Array")
125
+es <- bplapply(geneSets, right2, Z,
126
+BPPARAM=BPPARAM)
127
+es <- do.call(rbind, es)
128
+es <- as(es, "HDF5Array")
129
+es
130
+}
131
+res1 <- plage(x, geneSets)
132
+plage <- function(X, geneSets, parallel.sz, verbose=TRUE,
133
+BPPARAM=SerialParam(progressbar=verbose)) {
134
+Z <- t(scale(t(X)))
135
+es <- bplapply(geneSets, rightsingularsvdvectorgset, Z,
136
+BPPARAM=BPPARAM)
137
+es <- do.call(rbind, es)
138
+if (length(geneSets) == 1)
139
+es <- matrix(es, nrow=1)
140
+rownames(es) <- names(geneSets)
141
+colnames(es) <- colnames(X)
142
+es
143
+}
144
+res1 <- plage(x, geneSets)
145
+res1
146
+res2 <- plage2(x, geneSets)
147
+res2
298 148
 library(bench)
299
-m <- matrix(sample.int(10, 25, T), 10, 10)
300
-genes <- c(1,2)
301
-x <- t(scale(t(m)))
302
-m <- matrix(sample.int(10, 25, T), 10, 10)
303
-genes <- c(1,2)
304 149
 bench::mark(
305
-svd(x[genes,]),
306
-runExactSVD(x[genes,])
150
+plage1(x, geneSets),
151
+plage2(x, geneSets),
307 152
 )
308
-library(Matrix)
309
-m<-rsparsematrix(10,10,.5)
310
-m
311
-# m <- matrix(sample.int(10, 25, T), 10, 10)
312
-m<-rsparsematrix(1000,10000,.5)
313
-genes <- c(1:400)
314
-x <- t(scale(t(m)))
315
-X
316
-x
317
-x1 <- t(scale(t(m)))
318
-x2 <- as(x1, "dgCMatrix")
319 153
 bench::mark(
320
-svd(x1[genes,]),
321
-runExactSVD(x2[genes,])
154
+plage(x, geneSets),
155
+plage2(x, geneSets),
156
+check=FALSE
322 157
 )
158
+print(object.size(res), unit="Kb")
159
+print(object.size(res2), unit="Kb")
160
+res
161
+print(object.size(res1), unit="Kb")
162
+print(object.size(res2), unit="Kb")
163
+sce <- readRDS("/home/bort/Downloads/pbmc.sce.rds")
164
+# sce <- readRDS("/home/bort/curro/gsva-devel/shortdgC.sce.rds")
165
+gset.idx.list <- list("my_genes1" = sample(rownames(sce), 5000),
166
+"my_genes2" = sample(rownames(sce), 5000))
167
+se <- sce
168
+annotation <- names(assays(se))[1]
169
+expr <- assays(se)[[annotation]]
170
+.filterFeatures <- function(expr, method) {
171
+if(is(expr, "DelayedArray")){
172
+sdGenes <- DelayedMatrixStats::rowSds(expr)
173
+} else {
174
+sdGenes <- apply(expr, 1, sd)
175
+}
176
+if (any(sdGenes == 0) || any(is.na(sdGenes))) {
177
+warning(sum(sdGenes == 0 | is.na(sdGenes)),
178
+" genes with constant expression values throuhgout the samples.")
179
+if (method != "ssgsea") {
180
+warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.")
181
+expr <- expr[sdGenes > 0 & !is.na(sdGenes), ]
182
+}
183
+}
184
+expr
185
+}
186
+expr <- .filterFeatures(expr, method)
187
+expr
188
+expr <- as(expr, "HDF5Array")
189
+expr <- NULL
190
+sce <- NULL
191
+se <- NULL
192
+library(TEN)
193
+library(TENxBrainData)
194
+sce <- TENxBrainData20k()
195
+sce
196
+rownames(sce) <- rowData(sce)$Symbol
197
+colData(sce)
198
+colnames(sce) <- colData(sce)$Sequence
199
+sce
200
+se <- sce
201
+annotation <- names(assays(se))[1]
202
+expr <- assays(se)[[annotation]]
203
+expr
204
+expr <- as(expr, "HDF5Array")
205
+se <- sce
206
+annotation <- names(assays(se))[1]
207
+expr <- assays(se)[[annotation]]
208
+expr
209
+# sce <- readRDS("/home/bort/curro/gsva-devel/shortdgC.sce.rds")
210
+gset.idx.list <- list("my_genes1" = sample(rownames(sce), 5000),
211
+"my_genes2" = sample(rownames(sce), 5000))
212
+gset.idx.list
213
+se <- NULL
214
+sce <- NULL
215
+expr <- .filterFeatures(expr, method)
216
+expr
217
+mapped.gset.idx.list <- gset.idx.list
218
+mapped.gset.idx.list <- lapply(mapped.gset.idx.list,
219
+function(x, y) na.omit(fastmatch::fmatch(x, y)),
220
+rownames(expr))
221
+filterGeneSets <- function(gSets, min.sz=1, max.sz=Inf) {
222
+gSetsLen <- sapply(gSets,length)
223
+return (gSets[gSetsLen >= min.sz & gSetsLen <= max.sz])
224
+}
225
+mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
226
+min.sz=max(1, min.sz),
227
+max.sz=max.sz)
228
+eSco2 <- plage2(expr, mapped.gset.idx.list , parallel.sz, verbose, BPPARAM=BPPARAM)
229
+X <- expr
230
+Z <- t(DelayedArray::scale(t(X)))
231
+t(X)
232
+dimnames(X)
233
+t(X)
234
+colnames(X)
235
+rownames(X)
236
+t(X)
237
+is.character(dimnames(X))
238
+which(is.character(dimnames(X)))
239
+as.character(dimnames(X))
240
+dinmaes <- as.character(dimnames(X))
241
+dinmaes <- NULL
242
+colnames(X) <- NULL
243
+sce
244
+BiocManager::install("scRNAseq")
245
+options(Ncpus=6)
246
+BiocManager::install("scRNAseq")
247
+sce <- ReprocessedAllenData("tophat_counts")
248
+library(scRNAseq)
249
+sce <- ReprocessedAllenData("tophat_counts")
250
+counts <- assay(sce, "tophat_counts")
251
+counts
252
+str(counts)
253
+print(object.size(counts), unit="Kb")
254
+print(object.size(counts), unit="Mb")
255
+counts <- as(counts, "HDF5Array")
256
+library(HDF5Array)
257
+counts <- as(counts, "HDF5Array")
258
+counts
259
+expr <- as(t(counts), "HDF5Array")
260
+expr
261
+# sce <- readRDS("/home/bort/curro/gsva-devel/shortdgC.sce.rds")
262
+gset.idx.list <- list("my_genes1" = sample(rownames(expr), 100),
263
+"my_genes2" = sample(rownames(expr), 100))
264
+method <- "plage"
265
+kcdf <- "Gaussian"
266
+min.sz <- 1
267
+max.sz <- Inf
268
+abs.ranking <- FALSE
269
+parallel.sz=1L
270
+mx.diff=TRUE
271
+tau=1
272
+kernel=TRUE
273
+ssgsea.norm=TRUE
274
+verbose=TRUE
275
+rnaseq=FALSE
276
+BPPARAM=SerialParam(progressbar=verbose)
277
+library(BiocParallel)
278
+BPPARAM=SerialParam(progressbar=verbose)
279
+.filterFeatures <- function(expr, method) {
280
+if(is(expr, "DelayedArray")){
281
+sdGenes <- DelayedMatrixStats::rowSds(expr)
282
+} else {
283
+sdGenes <- apply(expr, 1, sd)
284
+}
285
+if (any(sdGenes == 0) || any(is.na(sdGenes))) {
286
+warning(sum(sdGenes == 0 | is.na(sdGenes)),
287
+" genes with constant expression values throuhgout the samples.")
288
+if (method != "ssgsea") {
289
+warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.")
290
+expr <- expr[sdGenes > 0 & !is.na(sdGenes), ]
291
+}
292
+}
293
+expr
294
+}
295
+expr <- .filterFeatures(expr, method)
296
+expr
297
+mapped.gset.idx.list <- gset.idx.list
298
+mapped.gset.idx.list <- lapply(mapped.gset.idx.list,
299
+function(x, y) na.omit(fastmatch::fmatch(x, y)),
300
+rownames(expr))
301
+filterGeneSets <- function(gSets, min.sz=1, max.sz=Inf) {
302
+gSetsLen <- sapply(gSets,length)
303
+return (gSets[gSetsLen >= min.sz & gSetsLen <= max.sz])
304
+}
305
+mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
306
+min.sz=max(1, min.sz),
307
+max.sz=max.sz)
308
+rightsingularsvdvectorgset <- function(gSetIdx, Z) {
309
+s <- svd(Z[gSetIdx, ])
310
+s$v[, 1]
311
+}
312
+right2 <- function(gSetIdx, Z) {
313
+sink <- HDF5RealizationSink(dim = c(1L, nrow(Z)))
314
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
315
+for(bid in seq_along(sink_grid)){
316
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
317
+block <- as(matrix(block, 1, nrow(Z)), "HDF5Matrix")
318
+sink <- write_block(sink, sink_grid[[bid]], block)
319
+}
320
+close(sink)
321
+res <- as(sink, "DelayedArray")
322
+res
323
+}
324
+plage2 <- function(X, geneSets, parallel.sz, verbose=TRUE,
325
+BPPARAM=SerialParam(progressbar=verbose)) {
326
+Z <- t(DelayedArray::scale(t(X)))
327
+Z <- as(Z, "HDF5Array")
328
+es <- bplapply(geneSets, right2, Z,
329
+BPPARAM=BPPARAM)
330
+es <- do.call(rbind, es)
331
+es <- as(es, "HDF5Array")
332
+es
333
+}
334
+plage3 <- function(X, geneSets, parallel.sz, verbose=TRUE,
335
+BPPARAM=SerialParam(progressbar=verbose)) {
336
+Z <- t(DelayedArray::scale(t(X)))
337
+es <- bplapply(geneSets, right2, Z,
338
+BPPARAM=BPPARAM)
339
+es <- do.call(rbind, es)
340
+es <- as(es, "HDF5Array")
341
+es
342
+}
343
+plage <- function(X, geneSets, parallel.sz, verbose=TRUE,
344
+BPPARAM=SerialParam(progressbar=verbose)) {
345
+Z <- t(scale(t(X)))
346
+es <- bplapply(geneSets, rightsingularsvdvectorgset, Z,
347
+BPPARAM=BPPARAM)
348
+es <- do.call(rbind, es)
349
+if (length(geneSets) == 1)
350
+es <- matrix(es, nrow=1)
351
+rownames(es) <- names(geneSets)
352
+colnames(es) <- colnames(X)
353
+es
354
+}
355
+library(bench)
323 356
 bench::mark(
324
-svd(x1[genes,]),
325
-runExactSVD(m[genes,])
357
+plage(expr, mapped.gset.idx.list),
358
+plage2(expr, mapped.gset.idx.list),
359
+plage3(expr, mapped.gset.idx.list),
360
+check=FALSE
326 361
 )
327 362
 bench::mark(
328
-svd(x1[genes,]),
329
-runExactSVD(m[genes,]),
363
+plage(expr, mapped.gset.idx.list),
364
+plage2(expr, mapped.gset.idx.list),
365
+plage3(expr, mapped.gset.idx.list),
330 366
 check=FALSE
331 367
 )
368
+plage3(expr, mapped.gset.idx.list)
369
+plage2(expr, mapped.gset.idx.list)
370
+expr
371
+sink <- HDF5RealizationSink(dim = c(1L, ncol(Z)))
372
+sink <- HDF5RealizationSink(dim = c(1L, ncol(expr)))
373
+sink
374
+right2 <- function(gSetIdx, Z) {
375
+sink <- HDF5RealizationSink(dim = c(1L, ncol(Z)))
376
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
377
+for(bid in seq_along(sink_grid)){
378
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
379
+block <- as(matrix(block, 1, nrow(Z)), "HDF5Matrix")
380
+sink <- write_block(sink, sink_grid[[bid]], block)
381
+}
382
+close(sink)
383
+res <- as(sink, "DelayedArray")
384
+res
385
+}
332 386
 bench::mark(
333
-svd(x1[genes,]),
334
-runExactSVD(x1[genes,]),
387
+plage(expr, mapped.gset.idx.list),
388
+plage2(expr, mapped.gset.idx.list),
389
+plage3(expr, mapped.gset.idx.list),
335 390
 check=FALSE
336 391
 )
337
-x1 <- NULL
338
-x2 <- NULL
339
-m <- NULL
340
-x <- NULL
341
-library(BiocSingular)
342
-set.seed(123)
343
-m <- matrix(sample.int(10, 25, T), 10, 10)
344
-genes <- 1:2
345
-genes
346
-genes <- c(1:2)
347
-genes
348
-genes <- 1:2
349
-m <- matrix(sample.int(10, 25, T), 10, 10)
350
-genes <- 1:2
351
-x1 <- t(scale(t(m)))
352
-res1 <- svd(x1[genes,])
353
-res$v[,1]
354
-res1$v[,1]
355
-m
356
-t(m)
357
-apply(t(m),2,sd)
358
-res2 <- runExactSVD(m, center=colMeans(t(m)), scale=apply(t(m),2,sd))
359
-res2$v[,1]
360
-res1$v[,1]
361
-res2 <- runExactSVD(m, center=colMeans(t(m)), scale=apply(m,2,sd))
362
-res2$v[,1]
363
-svd( t( (t(m) - C) / colSds(t(m)) )[genes,] )
364
-t(m) - colMeans(t(m))
365
-t(m)
366
-colMeans(t(m))
367
-colSds(t(m))
368
-( t(m) - colMeans(t(m)) ) / colSds(t(m))
369
-sweep(t(m), 2, colMeanst(m))
370
-sweep(t(m), 2, colMeans(t(m)))
371
-svd( t( sweep(t(m), 2, colMeans(t(m))) / colSds(t(m)) ) )
372
-svd( t( sweep(t(m), 2, colMeans(t(m))) / colSds(t(m)) )[genes,] )
373
-svd( t( sweep(t(m), 2, colMeans(t(m))) / colSds(t(m)) )[genes,] )$v[,1]
374
-x1 <- t(scale(t(m)))
375
-res1 <- svd(x1[genes,])
376
-res1$v[,1]
377
-svd( t( sweep(t(m), 2, colMeans(t(m))) / rowSds(t(m)) )[genes,] )$v[,1]
378
-res1$v[,1]
379
-X <- m
380
-Z <- Matrix::t(X)
381
-Z <- .dgCapply(Z, scale, 2)
382
-Z <- Matrix::t(Z)
383
-Z
384
-library(Matrix)
385
-library(SingleCellExperiment)
386
-library(BiocParallel)
387
-library(sparseMatrixStats)
388
-m<-rsparsematrix(10,10,.5)
389
-colnames(m) <- paste0("cell_", 1:10)
390
-rownames(m) <- paste0("gene_", 1:10)
391
-# gset.idx.list <- list("my_genes" = sample(rownames(m), 2))
392
-geneSets <- list("my_genes1" = c(1,3))
393
-library(GSVA)
394
-x <- gsva(m, geneSets, method="plage")
395
-colnames(m) <- paste0("cell_", 1:10)
396
-rownames(m) <- paste0("gene_", 1:10)
397
-x <- gsva(m, geneSets, method="plage")
398
-# gset.idx.list <- list("my_genes" = sample(rownames(m), 2))
399
-geneSets <- list("my_genes1" = c("gene_1","gene_3"))
400
-x <- gsva(m, geneSets, method="plage")
401
-x
402
-x <- gsva(m, geneSets, method="zscore")
403
-x
404
-x <- gsva(m, geneSets, method="ssgsea")
405
-x
406
-X <- m
407
-n <- ncol(X)
408
-if(is(X, "dgCMatrix")){
409
-R <- t(sparseMatrixStats::colRanks(X, ties.method = "average"))
410
-mode(R) <- "integer"
411
-} else {
412
-R <- apply(X, 2, function(x, p) as.integer(rank(x)), p)
413
-}
414
-R
415
-R <- t(sparseMatrixStats::colRanks(X, ties.method = "average"))
416
-mode(R) <- "integer"
417
-R
418
-R <- apply(X, 2, function(x, p) as.integer(rank(x)), p)
419
-R
420
-R <- t(sparseMatrixStats::colRanks(X, ties.method = "average"))
421
-source('~/curro/gsva-devel/devel3_ssgsea.R', echo=TRUE)
422
-mode(R) <- "integer"
423
-R
424
-Ra <- abs(R)^alpha
425
-Ra
426
-es <- bplapply(as.list(1:n), function(j) {
427
-geneRanking <- order(R[, j], decreasing=TRUE)
428
-es_sample <- lapply(geneSets, .fastRndWalk, geneRanking, j, Ra)
429
-unlist(es_sample)
430
-}, BPPARAM=BPPARAM)
431
-.fastRndWalk <- function(gSetIdx, geneRanking, j, Ra) {
432
-n <- length(geneRanking)
433
-k <- length(gSetIdx)
434
-idxs <- sort.int(fastmatch::fmatch(gSetIdx, geneRanking))
435
-stepCDFinGeneSet2 <-
436
-sum(Ra[geneRanking[idxs], j] * (n - idxs + 1)) /
437
-sum((Ra[geneRanking[idxs], j]))
438
-stepCDFoutGeneSet2 <- (n * (n + 1) / 2 - sum(n - idxs + 1)) / (n - k)
439
-walkStat <- stepCDFinGeneSet2 - stepCDFoutGeneSet2
440
-walkStat
441
-}
442
-n <- ncol(X)
443
-es <- bplapply(as.list(1:n), function(j) {
444
-geneRanking <- order(R[, j], decreasing=TRUE)
445
-es_sample <- lapply(geneSets, .fastRndWalk, geneRanking, j, Ra)
446
-unlist(es_sample)
447
-}, BPPARAM=BPPARAM)
448
-es
449
-geneSets <- 1,3
450
-geneSets <- c(1,3)
451
-es <- bplapply(as.list(1:n), function(j) {
452
-geneRanking <- order(R[, j], decreasing=TRUE)
453
-es_sample <- lapply(geneSets, .fastRndWalk, geneRanking, j, Ra)
454
-unlist(es_sample)
455
-}, BPPARAM=BPPARAM)
456
-es
457
-es <- do.call("cbind", es)
458
-es
459
-geneSets <- 1
460
-es <- bplapply(as.list(1:n), function(j) {
461
-geneRanking <- order(R[, j], decreasing=TRUE)
462
-es_sample <- lapply(geneSets, .fastRndWalk, geneRanking, j, Ra)
463
-unlist(es_sample)
464
-}, BPPARAM=BPPARAM)
465
-es
466
-es <- do.call("cbind", es)
467
-es
468
-if (normalization) {
469
-## normalize enrichment scores by using the entire data set, as indicated
470
-## by Barbie et al., 2009, online methods, pg. 2
471
-es <- apply(es, 2, function(x, es) x / (range(es)[2] - range(es)[1]), es)
392
+right2 <- function(gSetIdx, Z) {
393
+sink <- HDF5RealizationSink(dim = c(1L, ncol(Z)))
394
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
395
+for(bid in seq_along(sink_grid)){
396
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
397
+block <- as(matrix(block, 1, nrow(Z)), "HDF5Matrix")
398
+sink <- write_block(sink, sink_grid[[bid]], block)
472 399
 }
473
-normalization=TRUE
474
-if (normalization) {
475
-## normalize enrichment scores by using the entire data set, as indicated
476
-## by Barbie et al., 2009, online methods, pg. 2
477
-es <- apply(es, 2, function(x, es) x / (range(es)[2] - range(es)[1]), es)
400
+close(sink)
401
+res <- as(sink, "DelayedArray")
402
+res
478 403
 }
479
-es
480
-rownames(es) <- names(geneSets)
481
-colnames(es) <- colnames(X)
482
-geneSets <- c(1,3)
483
-es <- bplapply(as.list(1:n), function(j) {
484
-geneRanking <- order(R[, j], decreasing=TRUE)
485
-es_sample <- lapply(geneSets, .fastRndWalk, geneRanking, j, Ra)
486
-unlist(es_sample)
487
-}, BPPARAM=BPPARAM)
488
-es <- do.call("cbind", es)
489
-if (normalization) {
490
-## normalize enrichment scores by using the entire data set, as indicated
491
-## by Barbie et al., 2009, online methods, pg. 2
492
-es <- apply(es, 2, function(x, es) x / (range(es)[2] - range(es)[1]), es)
404
+right2(mapped.gset.idx.list[[1]], expr)
405
+expr
406
+showtree(expr)
407
+right2 <- function(gSetIdx, Z) {
408
+sink <- HDF5RealizationSink(dim = c(1L, ncol(Z)))
409
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
410
+for(bid in seq_along(sink_grid)){
411
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
412
+# block <- as(matrix(block, 1, nrow(Z)), "HDF5Matrix")
413
+sink <- write_block(sink, sink_grid[[bid]], block)
493 414
 }
494
-es
495
-if(is(X, "dgCMatrix")){
496
-es <- as(es, "dgCMatrix")
415
+close(sink)
416
+res <- as(sink, "DelayedArray")
417
+res
497 418
 }
498
-es
499
-rownames(es) <- names(geneSets)
500
-colnames(es) <- colnames(X)
501
-if(is(X, "dgCMatrix")){
502
-es <- as(es, "dgCMatrix")
419
+right2(mapped.gset.idx.list[[1]], expr)
420
+nrow(expr)
421
+sink <- HDF5RealizationSink(dim = c(1L, nrow(Z)))
422
+sink <- HDF5RealizationSink(dim = c(1L, nrow(expr)))
423
+sink
424
+matrix(block, 1, nrow(Z))
425
+matrix(block, 1, nrow(expr))
426
+Z <- expr
427
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
428
+gSetIdx <- mapped.gset.idx.list[[1]]
429
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
430
+block
431
+dim(block)
432
+length(block)
433
+ncol(Z)
434
+block <- as(matrix(block, 1, length(block)), "HDF5Matrix")
435
+block
436
+sink <- HDF5RealizationSink(dim = c(1L, ncol(Z)))
437
+seq_along(sink_grid)
438
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
439
+sink_grid <- defaultAutoGrid(sink, block.shape="first-dim-grows-first")
440
+seq_along(sink_grid)
441
+sink <- write_block(sink, sink_grid[[bid]], block)
442
+sink <- write_block(sink, sink_grid[[1]], block)
443
+sink <- write_block(sink, sink_grid[[1L]], block)
444
+sink
445
+close(sink)
446
+res <- as(sink, "DelayedArray")
447
+res
448
+rightsingularsvdvectorgset(mapped.gset.idx.list[[1]], Z)
449
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
450
+sink <- write_block(sink, sink_grid[[bid]], block)
451
+sink <- write_block(sink, sink_grid[[1L]], block)
452
+block <- matrix(block, 1, length(block))
453
+sink <- write_block(sink, sink_grid[[bid]], block)
454
+sink <- write_block(sink, sink_grid[[1L]], block)
455
+close(sink)
456
+res <- as(sink, "DelayedArray")
457
+res
458
+right2 <- function(gSetIdx, Z) {
459
+sink <- HDF5RealizationSink(dim = c(1L, ncol(Z)))
460
+sink_grid <- rowAutoGrid(sink, nrow = 1)
461
+for(bid in seq_along(sink_grid)){
462
+block <- rightsingularsvdvectorgset(gSetIdx, Z)
463
+block <- matrix(block, 1, length(block))
464
+sink <- write_block(sink, sink_grid[[bid]], block)
503 465
 }
504
-es
466
+close(sink)
467
+res <- as(sink, "DelayedArray")
468
+res
469
+}
470
+right2(mapped.gset.idx.list[[1]], Z)
471
+bench::mark(
472
+plage(expr, mapped.gset.idx.list),
473
+plage2(expr, mapped.gset.idx.list),
474
+plage3(expr, mapped.gset.idx.list),
475
+check=FALSE
476
+)
477
+expr
478
+expr2 <- as(expr, "matrix")
479
+expr2
480
+str(expr2)
481
+x1 <- DelayedMatrixStats::rowSds(expr)
482
+x2 <- apply(expr, 1, sd)
483
+x1
484
+x2
485
+all.equal(x1, x2)
486
+x1
487
+x2
488
+names(x2)
489
+names(x2) <- NULL
490
+x1
491
+x2
492
+all.equal(x1, x2)
493
+x1 <- DelayedMatrixStats::rowSds(expr2)
494
+x2 <- apply(expr2, 1, sd)
495
+x1
496
+x2
497
+names(x2) <- NULL
498
+all.equal(x1, x2)
505 499
 setwd("~/curro/gsva-devel/GSVA")
506 500
 devtools::check(vignettes = FALSE)
507 501
 devtools::check(vignettes = FALSE)
502
+devtools::check(vignettes = FALSE)
503
+devtools::check(vignettes = FALSE)
508 504
 devtools::build(vignettes = FALSE)
509 505
 devtools::install(build_vignettes = FALSE)
510
-x <- gsva(sce, gset.idx.list, method="zscore")
511
-x <- gsva(sce, gset.idx.list, method="ssgsea")
512
-X
506
+library(GSVA)
507
+m <- matrix(sample.int(10, 25, T), 10, 10)
508
+colnames(m) <- paste0("cell_", 1:10)
509
+rownames(m) <- paste0("gene_", 1:10)
510
+genes <- c(3,5)
511
+m <- as(m, "HDF5Array")
512
+res <- gsva(m, genes, method="plage")
... ...
@@ -1,5 +1,5 @@
1 1
 Package: GSVA
2
-Version: 1.39.7
2
+Version: 1.39.8
3 3
 Title: Gene Set Variation Analysis for microarray and RNA-seq data
4 4
 Authors@R: c(person("Justin", "Guinney", role=c("aut", "cre"), email="justin.guinney@sagebase.org"),
5 5
              person("Robert", "Castelo", role="aut", email="robert.castelo@upf.edu"),
... ...
@@ -8,8 +8,9 @@ Authors@R: c(person("Justin", "Guinney", role=c("aut", "cre"), email="justin.gui
8 8
 Depends: R (>= 3.5.0)
9 9
 Imports: methods, stats, utils, graphics, BiocGenerics, S4Vectors,
10 10
          Biobase, SummarizedExperiment, GSEABase, Matrix,
11
-         parallel, BiocParallel, fastmatch, SingleCellExperiment,
12
-         BiocSingular, sparseMatrixStats
11
+         parallel, BiocParallel, fastmatch, SingleCellExperiment, 
12
+         sparseMatrixStats, DelayedArray, DelayedMatrixStats,
13
+         HDF5Array
13 14
 Suggests: RUnit, limma, RColorBrewer, genefilter, edgeR, GSVAdata,
14 15
           shiny, shinythemes, ggplot2, data.table, plotly
15 16
 Description: Gene Set Variation Analysis (GSVA) is a non-parametric, unsupervised method for estimating variation of gene set enrichment through the samples of a expression data set. GSVA performs a change in coordinate systems, transforming the data from a gene by sample matrix to a gene-set by sample matrix, thereby allowing the evaluation of pathway enrichment for each sample. This new matrix of GSVA enrichment scores facilitates applying standard analytical methods like functional enrichment, survival analysis, clustering, CNV-pathway analysis or cross-tissue pathway analysis, in a pathway-centric manner.
... ...
@@ -8,6 +8,8 @@ importClassesFrom(SummarizedExperiment, SummarizedExperiment)
8 8
 importClassesFrom(GSEABase, GeneSetCollection)
9 9
 importClassesFrom(SingleCellExperiment, SingleCellExperiment)
10 10
 importClassesFrom(Matrix, dgCMatrix)
11
+importClassesFrom(DelayedArray, DelayedArray)
12
+importClassesFrom(HDF5Array, HDF5Array)
11 13
 
12 14
 importMethodsFrom(Biobase, featureNames,
13 15
                            phenoData,
... ...
@@ -48,8 +50,12 @@ importFrom(BiocParallel, SerialParam,
48 50
                          multicoreWorkers,
49 51
                          bpnworkers)
50 52
 importFrom(SingleCellExperiment, SingleCellExperiment)
51
-importFrom(BiocSingular, runExactSVD)
52 53
 importFrom(sparseMatrixStats, colRanks)
54
+importFrom(DelayedArray, rowAutoGrid,
55
+			  write_block,
56
+			  close)
57
+importFrom(HDF5Array, HDF5RealizationSink)
58
+importFrom(DelayedMatrixStats, rowSds)
53 59
 
54 60
 exportMethods(gsva,
55 61
               filterGeneSets,
... ...
@@ -5,6 +5,61 @@
5 5
 
6 6
 setGeneric("gsva", function(expr, gset.idx.list, ...) standardGeneric("gsva"))
7 7
 
8
+setMethod("gsva", signature(expr="HDF5Array", gset.idx.list="list"),
9
+          function(expr, gset.idx.list, annotation,
10
+  method=c("gsva", "ssgsea", "zscore", "plage"),
11
+  kcdf=c("Gaussian", "Poisson", "none"),
12
+  abs.ranking=FALSE,
13
+  min.sz=1,
14
+  max.sz=Inf,
15
+  parallel.sz=1L,
16
+  mx.diff=TRUE,
17
+  tau=switch(method, gsva=1, ssgsea=0.25, NA),
18
+  ssgsea.norm=TRUE,
19
+  verbose=TRUE,
20
+  BPPARAM=SerialParam(progressbar=verbose))
21
+{
22
+  method <- match.arg(method)
23
+  kcdf <- match.arg(kcdf)
24
+  
25
+  ## filter genes according to verious criteria,
26
+  ## e.g., constant expression
27
+  expr <- .filterFeatures(expr, method)
28
+  
29
+  if (nrow(expr) < 2)
30
+    stop("Less than two genes in the input assay object\n")
31
+  
32
+  ## map to the actual features for which expression data is available
33
+  mapped.gset.idx.list <- lapply(gset.idx.list,
34
+                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
35
+                                 rownames(expr))
36
+  
37
+  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
38
+    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
39
+  
40
+  ## remove gene sets from the analysis for which no features are available
41
+  ## and meet the minimum and maximum gene-set size specified by the user
42
+  mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
43
+                                         min.sz=max(1, min.sz),
44
+                                         max.sz=max.sz)
45
+  
46
+  if (!missing(kcdf)) {
47
+    if (kcdf == "Gaussian") {
48
+      rnaseq <- FALSE
49
+      kernel <- TRUE
50
+    } else if (kcdf == "Poisson") {
51
+      rnaseq <- TRUE
52
+      kernel <- TRUE
53
+    } else
54
+      kernel <- FALSE
55
+  }
56
+  
57
+  rval <- .gsva(expr, mapped.gset.idx.list, method, kcdf, rnaseq, abs.ranking,
58
+                parallel.sz, mx.diff, tau, kernel, ssgsea.norm, verbose, BPPARAM)
59
+  
60
+  rval
61
+})
62
+
8 63
 setMethod("gsva", signature(expr="SingleCellExperiment", gset.idx.list="list"),
9 64
           function(expr, gset.idx.list, annotation,
10 65
   method=c("gsva", "ssgsea", "zscore", "plage"),
... ...
@@ -589,6 +644,11 @@ setMethod("gsva", signature(expr="matrix", gset.idx.list="list"),
589 644
   ssgsea.norm=TRUE,
590 645
   verbose=TRUE,
591 646
   BPPARAM=SerialParam(progressbar=verbose)) {
647
+  
648
+  if(is(expr, "DelayedArray")){
649
+    return(.gsvaDelayedArray(expr, gset.idx.list, method, kcdf, rnaseq, abs.ranking,
650
+                             parallel.sz, mx.diff, tau, kernel, ssgsea.norm, verbose, BPPARAM))
651
+  }
592 652
 
593 653
 	if (length(gset.idx.list) == 0)
594 654
 		stop("The gene set list is empty! Filter may be too stringent.")
595 655
new file mode 100644
... ...
@@ -0,0 +1,94 @@
1
+.gsvaDelayedArray <- function(expr, gset.idx.list,
2
+                  method=c("gsva", "ssgsea", "zscore", "plage"),
3
+                  kcdf=c("Gaussian", "Poisson", "none"),
4
+                  rnaseq=FALSE,
5
+                  abs.ranking=FALSE,
6
+                  parallel.sz=1L,
7
+                  mx.diff=TRUE,
8
+                  tau=1,
9
+                  kernel=TRUE,
10
+                  ssgsea.norm=TRUE,
11
+                  verbose=TRUE,
12
+                  BPPARAM=SerialParam(progressbar=verbose)) {
13
+  
14
+  if (length(gset.idx.list) == 0)
15
+    stop("The gene set list is empty! Filter may be too stringent.")
16
+  
17
+  if (any(lengths(gset.idx.list) == 1))
18
+    warning("Some gene sets have size one. Consider setting 'min.sz > 1'.")
19
+  
20
+  parallel.sz <- as.integer(parallel.sz)
21
+  if (parallel.sz < 1L)
22
+    parallel.sz <- 1L
23
+  
24
+  ## because we keep the argument 'parallel.sz' for backwards compatibility
25
+  ## we need to harmonize it with the contents of BPPARAM
26
+  if (parallel.sz > 1L && class(BPPARAM) == "SerialParam") {
27
+    BPPARAM=MulticoreParam(progressbar=verbose, workers=parallel.sz, tasks=100)
28
+  } else if (parallel.sz == 1L && class(BPPARAM) != "SerialParam") {
29
+    parallel.sz <- bpnworkers(BPPARAM)
30
+  } else if (parallel.sz > 1L && class(BPPARAM) != "SerialParam") {
31
+    bpworkers(BPPARAM) <- parallel.sz
32
+  }
33
+  
34
+  if (class(BPPARAM) != "SerialParam" && verbose)
35
+    cat(sprintf("Setting parallel calculations through a %s back-end\nwith workers=%d and tasks=100.\n",
36
+                class(BPPARAM), parallel.sz))
37
+
38
+  
39
+  if (method == "plage") {
40
+    if (rnaseq)
41
+      stop("rnaseq=TRUE does not work with method='plage'.")
42
+    
43
+    if(verbose)
44
+      cat("Estimating PLAGE scores for", length(gset.idx.list),"gene sets.\n")
45
+    
46
+    return(plageDelayed(expr, gset.idx.list, parallel.sz, verbose, BPPARAM=BPPARAM))
47
+  }
48
+  
49
+
50
+}
51
+
52
+rightsingularsvdvectorgset <- function(gSetIdx, Z) {
53
+  s <- svd(Z[gSetIdx, ])
54
+  s$v[, 1]
55
+}
56
+
57
+svdDelayed <- function(gSetIdx, Z) {
58
+  
59
+  # step 1: creating a HDF5 sink object
60
+  sink <- HDF5Array::HDF5RealizationSink(dim = c(1L, ncol(Z)))
61
+  
62
+  #step 2: creating the sink's grid
63
+  sink_grid <- DelayedArray::rowAutoGrid(sink, nrow = 1)
64
+  
65
+  # step 3: creating the block and writing it in the sink
66
+  # object using the sink's grid
67
+  block <- rightsingularsvdvectorgset(gSetIdx, Z)
68
+  block <- matrix(block, 1, length(block))
69
+  sink <- DelayedArray::write_block(sink, sink_grid[[1L]], block)
70
+  
71
+  # step 4: closing the sink and realizating it
72
+  # in a hdf5 array object
73
+  DelayedArray::close(sink)
74
+  res <- as(sink, "DelayedArray")
75
+  
76
+  res
77
+  
78
+}
79
+
80
+plageDelayed <- function(X, geneSets, parallel.sz, verbose=TRUE,
81
+                  BPPARAM=SerialParam(progressbar=verbose)) {
82
+  
83
+  Z <- t(DelayedArray::scale(t(X)))
84
+  
85
+  es <- bplapply(geneSets, svdDelayed, Z,
86
+                 BPPARAM=BPPARAM)
87
+  
88
+  es <- do.call(rbind, es)
89
+  
90
+  es <- as(es, "HDF5Array")
91
+  
92
+  es
93
+}
94
+
... ...
@@ -1,6 +1,8 @@
1 1
 .filterFeatures <- function(expr, method) {
2 2
 
3 3
   ## filter out genes with constant expression values
4
+  ## DelayedMatrixStats::rowSds() works for both base and 
5
+  ## DelayedArray matrices
4 6
   sdGenes <- DelayedMatrixStats::rowSds(expr)
5 7
   if (any(sdGenes == 0) || any(is.na(sdGenes))) {
6 8
     warning(sum(sdGenes == 0 | is.na(sdGenes)),
... ...
@@ -1,5 +1,6 @@
1 1
 \name{gsva}
2 2
 \alias{gsva}
3
+\alias{gsva,HDF5Array,list-method}
3 4
 \alias{gsva,SingleCellExperiment,list-method}
4 5
 \alias{gsva,dgCMatrix,list-method}
5 6
 \alias{gsva,SummarizedExperiment,list-method}
... ...
@@ -30,6 +31,18 @@ Estimates GSVA enrichment scores.
30 31
     ssgsea.norm=TRUE,
31 32
     verbose=TRUE,
32 33
     BPPARAM=SerialParam(progressbar=verbose))
34
+\S4method{gsva}{HDF5Array,list}(expr, gset.idx.list, annotation,
35
+    method=c("gsva", "ssgsea", "zscore", "plage"),
36
+    kcdf=c("Gaussian", "Poisson", "none"),
37
+    abs.ranking=FALSE,
38
+    min.sz=1,
39
+    max.sz=Inf,
40
+    parallel.sz=1L,
41
+    mx.diff=TRUE,
42
+    tau=switch(method, gsva=1, ssgsea=0.25, NA),
43
+    ssgsea.norm=TRUE,
44
+    verbose=TRUE,
45
+    BPPARAM=SerialParam(progressbar=verbose))
33 46
 \S4method{gsva}{dgCMatrix,list}(expr, gset.idx.list, annotation,
34 47
     method=c("gsva", "ssgsea", "zscore", "plage"),
35 48
     kcdf=c("Gaussian", "Poisson", "none"),
... ...
@@ -120,7 +133,8 @@ Estimates GSVA enrichment scores.
120 133
               \code{SummarizedExperiment}, \code{SingleCellExperiment}
121 134
               \code{ExpressionSet} object, or as a matrix of expression
122 135
               values where rows correspond to genes and columns correspond to samples.
123
-              This matrix can be also in a sparse format, as a \code{dgCMatrix}.}
136
+              This matrix can be also in a sparse format, as a \code{dgCMatrix}, or
137
+              as an on-disk backend representation, such as \code{HDF5Array} .}
124 138
   \item{gset.idx.list}{Gene sets provided either as a \code{list} object or as a
125 139
                        \code{GeneSetCollection} object.}
126 140
   \item{annotation}{In the case of calling \code{gsva()} on a