Browse code

Fixed call to fastmatch::fmatch() to avoid that this call modifies rownames of input expression matrices. Use this opportunity to factor out relevant code.

Robert Castelo authored on 22/01/2021 11:27:10
Showing 3 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: GSVA
2
-Version: 1.39.11
2
+Version: 1.39.12
3 3
 Title: Gene Set Variation Analysis for microarray and RNA-seq data
4 4
 Authors@R: c(person("Justin", "Guinney", role=c("aut", "cre"), email="justin.guinney@sagebase.org"),
5 5
              person("Robert", "Castelo", role="aut", email="robert.castelo@upf.edu"),
... ...
@@ -26,19 +26,8 @@ setMethod("gsva", signature(expr="HDF5Array", gset.idx.list="list"),
26 26
   ## e.g., constant expression
27 27
   expr <- .filterFeatures(expr, method)
28 28
   
29
-  if (nrow(expr) < 2)
30
-    stop("Less than two genes in the input assay object\n")
31
-  
32
-  if(is.null(rownames(expr)))
33
-    stop("The input assay object doesn't have rownames\n")
34
-  
35 29
   ## map to the actual features for which expression data is available
36
-  mapped.gset.idx.list <- lapply(gset.idx.list,
37
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
38
-                                 rownames(expr))
39
-  
40
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
41
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
30
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
42 31
   
43 32
   ## remove gene sets from the analysis for which no features are available
44 33
   ## and meet the minimum and maximum gene-set size specified by the user
... ...
@@ -105,20 +94,8 @@ setMethod("gsva", signature(expr="SingleCellExperiment", gset.idx.list="list"),
105 94
     expr <- .filterFeatures(expr, method)
106 95
   }
107 96
   
108
-  
109
-  if (nrow(expr) < 2)
110
-    stop("Less than two genes in the input ExpressionSet object\n")
111
-  
112
-  if(is.null(rownames(expr)))
113
-    stop("The input assay object doesn't have rownames\n")
114
-  
115 97
   ## map to the actual features for which expression data is available
116
-  mapped.gset.idx.list <- lapply(gset.idx.list,
117
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
118
-                                 rownames(expr))
119
-  
120
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
121
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
98
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
122 99
   
123 100
   ## remove gene sets from the analysis for which no features are available
124 101
   ## and meet the minimum and maximum gene-set size specified by the user
... ...
@@ -169,19 +146,8 @@ setMethod("gsva", signature(expr="dgCMatrix", gset.idx.list="list"),
169 146
   ## e.g., constant expression
170 147
   expr <- .filterFeaturesSparse(expr, method)
171 148
   
172
-  if (nrow(expr) < 2)
173
-    stop("Less than two genes in the input assay object\n")
174
-  
175
-  if(is.null(rownames(expr)))
176
-    stop("The input assay object doesn't have rownames\n")
177
-  
178 149
   ## map to the actual features for which expression data is available
179
-  mapped.gset.idx.list <- lapply(gset.idx.list,
180
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
181
-                                 rownames(expr))
182
-  
183
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
184
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
150
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
185 151
   
186 152
   ## remove gene sets from the analysis for which no features are available
187 153
   ## and meet the minimum and maximum gene-set size specified by the user
... ...
@@ -243,12 +209,6 @@ setMethod("gsva", signature(expr="SummarizedExperiment", gset.idx.list="GeneSetC
243 209
   ## e.g., constant expression
244 210
   expr <- .filterFeatures(expr, method)
245 211
 
246
-  if (nrow(expr) < 2)
247
-    stop("Less than two genes in the input ExpressionSet object\n")
248
-  
249
-  if(is.null(rownames(expr)))
250
-    stop("The input assay object doesn't have rownames\n")
251
-
252 212
   annotpkg <- metadata(se)$annotation
253 213
   if (!is.null(annotpkg) && length(annotpkg) > 0 && is.character(annotpkg) && annotpkg != "") {
254 214
     if (!annotpkg %in% installed.packages())
... ...
@@ -272,12 +232,7 @@ setMethod("gsva", signature(expr="SummarizedExperiment", gset.idx.list="GeneSetC
272 232
   }
273 233
 
274 234
   ## map to the actual features for which expression data is available
275
-  mapped.gset.idx.list <- lapply(mapped.gset.idx.list,
276
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
277
-                                 rownames(expr))
278
-
279
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
280
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
235
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
281 236
 
282 237
   ## remove gene sets from the analysis for which no features are available
283 238
   ## and meet the minimum and maximum gene-set size specified by the user
... ...
@@ -344,20 +299,9 @@ setMethod("gsva", signature(expr="SummarizedExperiment", gset.idx.list="list"),
344 299
   ## e.g., constant expression
345 300
   expr <- .filterFeatures(expr, method)
346 301
 
347
-  if (nrow(expr) < 2)
348
-    stop("Less than two genes in the input ExpressionSet object\n")
349
-  
350
-  if(is.null(rownames(expr)))
351
-    stop("The input assay object doesn't have rownames\n")
352
-
353 302
   ## map to the actual features for which expression data is available
354
-  mapped.gset.idx.list <- lapply(gset.idx.list,
355
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
356
-                                 rownames(expr))
357
-
358
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
359
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
360
-
303
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
304
+  
361 305
   ## remove gene sets from the analysis for which no features are available
362 306
   ## and meet the minimum and maximum gene-set size specified by the user
363 307
   mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
... ...
@@ -409,20 +353,9 @@ setMethod("gsva", signature(expr="ExpressionSet", gset.idx.list="list"),
409 353
   ## e.g., constant expression
410 354
   expr <- .filterFeatures(expr, method)
411 355
 
412
-  if (nrow(expr) < 2)
413
-    stop("Less than two genes in the input ExpressionSet object\n")
414
-  
415
-  if(is.null(rownames(expr)))
416
-    stop("The input assay object doesn't have rownames\n")
417
-
418 356
   ## map to the actual features for which expression data is available
419
-  mapped.gset.idx.list <- lapply(gset.idx.list,
420
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
421
-                                 rownames(expr))
422
-
423
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
424
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
425
-
357
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
358
+  
426 359
   ## remove gene sets from the analysis for which no features are available
427 360
   ## and meet the minimum and maximum gene-set size specified by the user
428 361
   mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
... ...
@@ -472,12 +405,6 @@ setMethod("gsva", signature(expr="ExpressionSet", gset.idx.list="GeneSetCollecti
472 405
   ## e.g., constant expression
473 406
   expr <- .filterFeatures(expr, method)
474 407
 
475
-  if (nrow(expr) < 2)
476
-    stop("Less than two genes in the input ExpressionSet object\n")
477
-  
478
-  if(is.null(rownames(expr)))
479
-    stop("The input assay object doesn't have rownames\n")
480
-
481 408
   annotpkg <- Biobase::annotation(eset)
482 409
   if (length(annotpkg) > 0 && annotpkg != "") {
483 410
     if (!annotpkg %in% installed.packages())
... ...
@@ -501,14 +428,11 @@ setMethod("gsva", signature(expr="ExpressionSet", gset.idx.list="GeneSetCollecti
501 428
   }
502 429
 
503 430
   ## map to the actual features for which expression data is available
504
-  tmp <- lapply(mapped.gset.idx.list,
505
-                function(x, y) na.omit(fastmatch::fmatch(x, y)),
506
-                rownames(expr))
507
-  names(tmp) <- names(mapped.gset.idx.list)
508
-
431
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
432
+  
509 433
   ## remove gene sets from the analysis for which no features are available
510 434
   ## and meet the minimum and maximum gene-set size specified by the user
511
-  mapped.gset.idx.list <- filterGeneSets(tmp,
435
+  mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
512 436
                                          min.sz=max(1, min.sz),
513 437
                                          max.sz=max.sz)
514 438
 
... ...
@@ -553,12 +477,6 @@ setMethod("gsva", signature(expr="matrix", gset.idx.list="GeneSetCollection"),
553 477
   ## e.g., constant expression
554 478
   expr <- .filterFeatures(expr, method)
555 479
 
556
-  if (nrow(expr) < 2)
557
-    stop("Less than two genes in the input expression data matrix\n")
558
-  
559
-  if(is.null(rownames(expr)))
560
-    stop("The input assay object doesn't have rownames\n")
561
-
562 480
   ## map gene identifiers of the gene sets to the features in the matrix
563 481
   mapped.gset.idx.list <- gset.idx.list
564 482
   if (!missing(annotation)) {
... ...
@@ -570,17 +488,11 @@ setMethod("gsva", signature(expr="matrix", gset.idx.list="GeneSetCollection"),
570 488
   }
571 489
   
572 490
   ## map to the actual features for which expression data is available
573
-  tmp <- lapply(geneIds(mapped.gset.idx.list),
574
-                                 function(x, y) na.omit(fastmatch::fmatch(x, y)),
575
-                                 rownames(expr))
576
-  names(tmp) <- names(mapped.gset.idx.list)
577
-
578
-  if (length(unlist(tmp, use.names=FALSE)) == 0)
579
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
580
-
491
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
492
+  
581 493
   ## remove gene sets from the analysis for which no features are available
582 494
   ## and meet the minimum and maximum gene-set size specified by the user
583
-  mapped.gset.idx.list <- filterGeneSets(tmp,
495
+  mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
584 496
                                          min.sz=max(1, min.sz),
585 497
                                          max.sz=max.sz)
586 498
 
... ...
@@ -623,16 +535,9 @@ setMethod("gsva", signature(expr="matrix", gset.idx.list="list"),
623 535
   ## e.g., constant expression
624 536
   expr <- .filterFeatures(expr, method)
625 537
 
626
-  if (nrow(expr) < 2)
627
-    stop("Less than two genes in the input expression data matrix\n")
628
-
629
-  mapped.gset.idx.list <- lapply(gset.idx.list,
630
-                                 function(x ,y) na.omit(fastmatch::fmatch(x, y)),
631
-                                 rownames(expr))
632
-
633
-  if (length(unlist(mapped.gset.idx.list, use.names=FALSE)) == 0)
634
-    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
635
-
538
+  ## map to the actual features for which expression data is available
539
+  mapped.gset.idx.list <- .mapGeneSetsToFeatures(gset.idx.list, rownames(expr))
540
+  
636 541
   ## remove gene sets from the analysis for which no features are available
637 542
   ## and meet the minimum and maximum gene-set size specified by the user
638 543
   mapped.gset.idx.list <- filterGeneSets(mapped.gset.idx.list,
... ...
@@ -1091,10 +996,10 @@ setMethod("computeGeneSetsOverlap", signature(gSets="list", uniqGenes="character
1091 996
           function(gSets, uniqGenes, min.sz=1, max.sz=Inf) {
1092 997
   totalGenes <- length(uniqGenes)
1093 998
 
1094
-  ## map to the features requested
1095
-  gSets <- lapply(gSets, function(x, y) as.vector(na.omit(fastmatch::fmatch(x, y))), uniqGenes)
999
+  ## map to the actual features for which expression data is available
1000
+  gSets <- .mapGeneSetsToFeatures(gSets, uniqGenes)
1096 1001
 
1097
-  lenGsets <- sapply(gSets, length)
1002
+  lenGsets <- lengths(gSets)
1098 1003
   totalGsets <- length(gSets)
1099 1004
 
1100 1005
   gSetsMembershipMatrix <- matrix(0, nrow=totalGenes, ncol=totalGsets,
... ...
@@ -1111,9 +1016,9 @@ setMethod("computeGeneSetsOverlap", signature(gSets="list", uniqGenes="Expressio
1111 1016
   totalGenes <- length(uniqGenes)
1112 1017
 
1113 1018
   ## map to the actual features for which expression data is available
1114
-  gSets <- lapply(gSets, function(x, y) as.vector(na.omit(fastmatch::fmatch(x, y))), uniqGenes)
1019
+  gSets <- .mapGeneSetsToFeatures(gSets, uniqGenes)
1115 1020
 
1116
-  lenGsets <- sapply(gSets, length)
1021
+  lenGsets <- lengths(gSets)
1117 1022
   totalGsets <- length(gSets)
1118 1023
 
1119 1024
   gSetsMembershipMatrix <- matrix(0, nrow=totalGenes, ncol=totalGsets,
... ...
@@ -13,9 +13,41 @@
13 13
     }
14 14
   } 
15 15
 
16
+  if (nrow(expr) < 2)
17
+    stop("Less than two genes in the input assay object\n")
18
+  
19
+  if(is.null(rownames(expr)))
20
+    stop("The input assay object doesn't have rownames\n")
21
+  
16 22
   expr
17 23
 }
18 24
 
25
+## maps gene sets content in 'gsets' to 'features', where 'gsets'
26
+## is a 'list' object with character string vectors as elements,
27
+## and 'features' is a character string vector object. it assumes
28
+## features in both input objects follow the same nomenclature,
29
+.mapGeneSetsToFeatures <- function(gsets, features) {
30
+
31
+  ## fastmatch::fmatch() modifies the 'table' argument (i.e., the
32
+  ## second argument) in place by adding the attribute '.match.hash'
33
+  ## https://github.com/rcastelo/GSVA/issues/39
34
+  ## to avoid that undesired feature we duplicate 'features'
35
+  ## by adding an "impossible value" at the end and let
36
+  ## fastmatch::match() work with the duplicated object
37
+  features2 <- c(features, "&!%impossiblevalue%!&")
38
+
39
+  ## map to the actual features for which expression data is available
40
+  mapdgenesets <- lapply(gsets,
41
+                         function(x, y)
42
+                           as.vector(na.omit(fastmatch::fmatch(x, y))),
43
+                         features2)
44
+  
45
+  if (length(unlist(mapdgenesets, use.names=FALSE)) == 0)
46
+    stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.")
47
+
48
+  mapdgenesets
49
+}
50
+
19 51
 ## transforms a dgCMatrix into a list of its
20 52
 ## non-zero values by MARGIN (1 for row, 2 for column)
21 53
 .sparseToList <-function(dgCMat, MARGIN){
... ...
@@ -58,5 +90,11 @@
58 90
     }
59 91
   }
60 92
   
93
+  if (nrow(expr) < 2)
94
+    stop("Less than two genes in the input assay object\n")
95
+  
96
+  if(is.null(rownames(expr)))
97
+    stop("The input assay object doesn't have rownames\n")
98
+  
61 99
   expr
62
-}
63 100
\ No newline at end of file
101
+}