... | ... |
@@ -4,6 +4,10 @@ |
4 | 4 |
## DelayedMatrixStats::rowSds() works for both base and |
5 | 5 |
## DelayedArray matrices |
6 | 6 |
sdGenes <- DelayedMatrixStats::rowSds(expr) |
7 |
+ ## the following fixes this bug, see issues |
|
8 |
+ ## https://github.com/rcastelo/GSVA/issues/54 |
|
9 |
+ ## https://github.com/HenrikBengtsson/matrixStats/issues/204 |
|
10 |
+ sdGenes[sdGenes < 1e-10] <- 0 |
|
7 | 11 |
if (any(sdGenes == 0) || any(is.na(sdGenes))) { |
8 | 12 |
warning(sum(sdGenes == 0 | is.na(sdGenes)), |
9 | 13 |
" genes with constant expression values throuhgout the samples.") |
... | ... |
@@ -66,27 +66,3 @@ |
66 | 66 |
m@x <- unlist(x, use.names=FALSE) |
67 | 67 |
m |
68 | 68 |
} |
69 |
- |
|
70 |
-## filter out genes which non-zero values have |
|
71 |
-## constant expression values |
|
72 |
-.filterFeaturesSparse <- function(expr, method) { |
|
73 |
- |
|
74 |
- sdGenes <- sapply(.sparseToList(expr, 1), sd) |
|
75 |
- |
|
76 |
- if (any(sdGenes == 0) || any(is.na(sdGenes))) { |
|
77 |
- warning(sum(sdGenes == 0 | is.na(sdGenes)), |
|
78 |
- " genes with constant expression values throuhgout the samples.") |
|
79 |
- if (method != "ssgsea") { |
|
80 |
- warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.") |
|
81 |
- expr <- expr[sdGenes > 0 & !is.na(sdGenes), ] |
|
82 |
- } |
|
83 |
- } |
|
84 |
- |
|
85 |
- if (nrow(expr) < 2) |
|
86 |
- stop("Less than two genes in the input assay object\n") |
|
87 |
- |
|
88 |
- if(is.null(rownames(expr))) |
|
89 |
- stop("The input assay object doesn't have rownames\n") |
|
90 |
- |
|
91 |
- expr |
|
92 |
-} |
... | ... |
@@ -28,20 +28,12 @@ |
28 | 28 |
## features in both input objects follow the same nomenclature, |
29 | 29 |
.mapGeneSetsToFeatures <- function(gsets, features) { |
30 | 30 |
|
31 |
- ## fastmatch::fmatch() modifies the 'table' argument (i.e., the |
|
32 |
- ## second argument) in place by adding the attribute '.match.hash' |
|
33 |
- ## https://github.com/rcastelo/GSVA/issues/39 |
|
34 |
- ## to avoid that undesired feature we duplicate 'features' |
|
35 |
- ## by adding an "impossible value" at the end and let |
|
36 |
- ## fastmatch::match() work with the duplicated object |
|
37 |
- features2 <- c(features, "&!%impossiblevalue%!&") |
|
31 |
+ ## Aaron Lun's suggestion at |
|
32 |
+ ## https://github.com/rcastelo/GSVA/issues/39#issuecomment-765549620 |
|
33 |
+ gsets2 <- CharacterList(gsets) |
|
34 |
+ mt <- match(gsets2, features) |
|
35 |
+ mapdgenesets <- as.list(mt[!is.na(mt)]) |
|
38 | 36 |
|
39 |
- ## map to the actual features for which expression data is available |
|
40 |
- mapdgenesets <- lapply(gsets, |
|
41 |
- function(x, y) |
|
42 |
- as.vector(na.omit(fastmatch::fmatch(x, y))), |
|
43 |
- features2) |
|
44 |
- |
|
45 | 37 |
if (length(unlist(mapdgenesets, use.names=FALSE)) == 0) |
46 | 38 |
stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.") |
47 | 39 |
|
... | ... |
@@ -13,9 +13,41 @@ |
13 | 13 |
} |
14 | 14 |
} |
15 | 15 |
|
16 |
+ if (nrow(expr) < 2) |
|
17 |
+ stop("Less than two genes in the input assay object\n") |
|
18 |
+ |
|
19 |
+ if(is.null(rownames(expr))) |
|
20 |
+ stop("The input assay object doesn't have rownames\n") |
|
21 |
+ |
|
16 | 22 |
expr |
17 | 23 |
} |
18 | 24 |
|
25 |
+## maps gene sets content in 'gsets' to 'features', where 'gsets' |
|
26 |
+## is a 'list' object with character string vectors as elements, |
|
27 |
+## and 'features' is a character string vector object. it assumes |
|
28 |
+## features in both input objects follow the same nomenclature, |
|
29 |
+.mapGeneSetsToFeatures <- function(gsets, features) { |
|
30 |
+ |
|
31 |
+ ## fastmatch::fmatch() modifies the 'table' argument (i.e., the |
|
32 |
+ ## second argument) in place by adding the attribute '.match.hash' |
|
33 |
+ ## https://github.com/rcastelo/GSVA/issues/39 |
|
34 |
+ ## to avoid that undesired feature we duplicate 'features' |
|
35 |
+ ## by adding an "impossible value" at the end and let |
|
36 |
+ ## fastmatch::match() work with the duplicated object |
|
37 |
+ features2 <- c(features, "&!%impossiblevalue%!&") |
|
38 |
+ |
|
39 |
+ ## map to the actual features for which expression data is available |
|
40 |
+ mapdgenesets <- lapply(gsets, |
|
41 |
+ function(x, y) |
|
42 |
+ as.vector(na.omit(fastmatch::fmatch(x, y))), |
|
43 |
+ features2) |
|
44 |
+ |
|
45 |
+ if (length(unlist(mapdgenesets, use.names=FALSE)) == 0) |
|
46 |
+ stop("No identifiers in the gene sets could be matched to the identifiers in the expression data.") |
|
47 |
+ |
|
48 |
+ mapdgenesets |
|
49 |
+} |
|
50 |
+ |
|
19 | 51 |
## transforms a dgCMatrix into a list of its |
20 | 52 |
## non-zero values by MARGIN (1 for row, 2 for column) |
21 | 53 |
.sparseToList <-function(dgCMat, MARGIN){ |
... | ... |
@@ -58,5 +90,11 @@ |
58 | 90 |
} |
59 | 91 |
} |
60 | 92 |
|
93 |
+ if (nrow(expr) < 2) |
|
94 |
+ stop("Less than two genes in the input assay object\n") |
|
95 |
+ |
|
96 |
+ if(is.null(rownames(expr))) |
|
97 |
+ stop("The input assay object doesn't have rownames\n") |
|
98 |
+ |
|
61 | 99 |
expr |
62 |
-} |
|
63 | 100 |
\ No newline at end of file |
101 |
+} |
... | ... |
@@ -1,6 +1,8 @@ |
1 | 1 |
.filterFeatures <- function(expr, method) { |
2 | 2 |
|
3 | 3 |
## filter out genes with constant expression values |
4 |
+ ## DelayedMatrixStats::rowSds() works for both base and |
|
5 |
+ ## DelayedArray matrices |
|
4 | 6 |
sdGenes <- DelayedMatrixStats::rowSds(expr) |
5 | 7 |
if (any(sdGenes == 0) || any(is.na(sdGenes))) { |
6 | 8 |
warning(sum(sdGenes == 0 | is.na(sdGenes)), |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
.filterFeatures <- function(expr, method) { |
2 | 2 |
|
3 | 3 |
## filter out genes with constant expression values |
4 |
- sdGenes <- apply(expr, 1, sd) |
|
4 |
+ sdGenes <- DelayedMatrixStats::rowSds(expr) |
|
5 | 5 |
if (any(sdGenes == 0) || any(is.na(sdGenes))) { |
6 | 6 |
warning(sum(sdGenes == 0 | is.na(sdGenes)), |
7 | 7 |
" genes with constant expression values throuhgout the samples.") |
... | ... |
@@ -35,6 +35,12 @@ |
35 | 35 |
result |
36 | 36 |
} |
37 | 37 |
|
38 |
+.dgCapply<-function(m,f, MARGIN){ |
|
39 |
+ x <- lapply(.sparseToList(m, MARGIN), f) |
|
40 |
+ m@x <- unlist(x, use.names=FALSE) |
|
41 |
+ m |
|
42 |
+} |
|
43 |
+ |
|
38 | 44 |
## filter out genes which non-zero values have |
39 | 45 |
## constant expression values |
40 | 46 |
.filterFeaturesSparse <- function(expr, method) { |
... | ... |
@@ -13,3 +13,42 @@ |
13 | 13 |
|
14 | 14 |
expr |
15 | 15 |
} |
16 |
+ |
|
17 |
+## transforms a dgCMatrix into a list of its |
|
18 |
+## non-zero values by MARGIN (1 for row, 2 for column) |
|
19 |
+.sparseToList <-function(dgCMat, MARGIN){ |
|
20 |
+ MARGIN <- as.integer(MARGIN) |
|
21 |
+ J <- rep(1:ncol(dgCMat), diff(dgCMat@p)) |
|
22 |
+ I <- dgCMat@i + 1 |
|
23 |
+ x <- dgCMat@x |
|
24 |
+ if (MARGIN == 1L) { |
|
25 |
+ result <- split(x, I) |
|
26 |
+ names(result) <- rownames(dgCMat)[as.numeric(names(result))] |
|
27 |
+ } else if (MARGIN == 2L) { |
|
28 |
+ result <- split(x, J) |
|
29 |
+ names(result) <- colnames(dgCMat)[as.numeric(names(result))] |
|
30 |
+ } |
|
31 |
+ else { |
|
32 |
+ warning("invalid MARGIN; return NULL") |
|
33 |
+ result <- NULL |
|
34 |
+ } |
|
35 |
+ result |
|
36 |
+} |
|
37 |
+ |
|
38 |
+## filter out genes which non-zero values have |
|
39 |
+## constant expression values |
|
40 |
+.filterFeaturesSparse <- function(expr, method) { |
|
41 |
+ |
|
42 |
+ sdGenes <- sapply(.sparseToList(expr, 1), sd) |
|
43 |
+ |
|
44 |
+ if (any(sdGenes == 0) || any(is.na(sdGenes))) { |
|
45 |
+ warning(sum(sdGenes == 0 | is.na(sdGenes)), |
|
46 |
+ " genes with constant expression values throuhgout the samples.") |
|
47 |
+ if (method != "ssgsea") { |
|
48 |
+ warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.") |
|
49 |
+ expr <- expr[sdGenes > 0 & !is.na(sdGenes), ] |
|
50 |
+ } |
|
51 |
+ } |
|
52 |
+ |
|
53 |
+ expr |
|
54 |
+} |
|
16 | 55 |
\ No newline at end of file |
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,15 @@ |
1 |
+.filterFeatures <- function(expr, method) { |
|
2 |
+ |
|
3 |
+ ## filter out genes with constant expression values |
|
4 |
+ sdGenes <- apply(expr, 1, sd) |
|
5 |
+ if (any(sdGenes == 0) || any(is.na(sdGenes))) { |
|
6 |
+ warning(sum(sdGenes == 0 | is.na(sdGenes)), |
|
7 |
+ " genes with constant expression values throuhgout the samples.") |
|
8 |
+ if (method != "ssgsea") { |
|
9 |
+ warning("Since argument method!=\"ssgsea\", genes with constant expression values are discarded.") |
|
10 |
+ expr <- expr[sdGenes > 0 & !is.na(sdGenes), ] |
|
11 |
+ } |
|
12 |
+ } |
|
13 |
+ |
|
14 |
+ expr |
|
15 |
+} |