Browse code

Allow pathCol (formerly pathwayCol) and geneCol to accept either column names or column numbers as input

Catherine Ross authored on 18/02/2021 20:45:03
Showing 7 changed files

... ...
@@ -4,10 +4,10 @@
4 4
 #' @param pathwayFile (char) path to file with pathway annotations.
5 5
 #' @param header (logical) whether \code{pathwayFile} has a header
6 6
 #'     (default FALSE).
7
-#' @param pathwayCol (char) column name with pathway identifiers.
8
-#'     For use with non-GMT input files (eg "Pathway.ID"; default NULL).
9
-#' @param geneCol (char) column name with gene identifiers.
10
-#'     For use with non-GMT input files (eg "Gene.ID"; default NULL).
7
+#' @param pathCol (char or int) column name or number with pathway identifiers.
8
+#'     For use with non-GMT input files (eg "Pathway.ID" or 2; default NULL).
9
+#' @param geneCol (char or int) column name or number with gene identifiers.
10
+#'     For use with non-GMT input files (eg "Gene.ID" or 5; default NULL).
11 11
 #' @param minGene (integer) minimum number of genes to be considered
12 12
 #'     in a pathway (default 1).
13 13
 #' @param maxGene (integer) maximum number of genes to be considered
... ...
@@ -22,7 +22,7 @@
22 22
 #' )
23 23
 #' pathways <- readPathways(
24 24
 #'     system.file("extdata", "SAFE_terms.xlsx", package = "FEDUP"),
25
-#'     header = TRUE, pathwayCol = "Enriched.GO.names", geneCol = "Gene.ID"
25
+#'     header = TRUE, pathCol = "Enriched.GO.names", geneCol = "Gene.ID"
26 26
 #' )
27 27
 #' @importFrom openxlsx read.xlsx
28 28
 #' @importFrom tibble deframe
... ...
@@ -30,8 +30,7 @@
30 30
 #' @importFrom utils head read.delim tail
31 31
 #' @export
32 32
 readPathways <- function(pathwayFile, header = FALSE,
33
-    pathwayCol = NULL, geneCol = NULL,
34
-    minGene = 1L, maxGene = Inf) {
33
+    pathCol = NULL, geneCol = NULL, minGene = 1L, maxGene = Inf) {
35 34
     s <- c("gmt", "txt", "xlsx")
36 35
     f <- sub(".*\\.", "", pathwayFile)
37 36
     if (!f %in% s) {
... ...
@@ -41,31 +40,32 @@ readPathways <- function(pathwayFile, header = FALSE,
41 40
         ))
42 41
     }
43 42
     if (f == "gmt") {
44
-        pathway_in <- strsplit(readLines(pathwayFile), "\t")
43
+        path_in <- strsplit(readLines(pathwayFile), "\t")
45 44
         if (header) {
46
-            pathway_in <- pathway_in[-1]
45
+            path_in <- path_in[-1]
47 46
         }
48
-        pathways <- lapply(pathway_in, tail, -2)
49
-        names(pathways) <- vapply(pathway_in, head, n = 1, character(1))
47
+        pathways <- lapply(path_in, tail, -2)
48
+        names(pathways) <- vapply(path_in, head, n = 1, character(1))
50 49
     } else {
51 50
         if (f == "xlsx") {
52
-            pathway_in <- read.xlsx(pathwayFile)
53
-        } else if (f == "txt") {
54
-            pathway_in <- read.delim(pathwayFile, header = header)
51
+            path_in <- read.xlsx(pathwayFile)
55 52
         }
56
-        if (missing(pathwayCol) || !pathwayCol %in% colnames(pathway_in)) {
57
-            stop("Pathway ID column (", pathwayCol, ") not in file")
58
-        } else if (missing(geneCol) || !geneCol %in% colnames(pathway_in)) {
53
+        if (f == "txt") {
54
+            path_in <- read.delim(pathwayFile, header = header)
55
+        }
56
+        if (!pathCol %in% names(path_in) && !pathCol %in% seq_along(path_in)) {
57
+            stop("Pathway ID column (", pathCol, ") not in file")
58
+        }
59
+        if (!geneCol %in% names(path_in) && !geneCol %in% seq_along(path_in)) {
59 60
             stop("Gene ID column (", geneCol, ") not in file")
60
-        } else {
61
-            pathway_df <- data.frame(
62
-                pathway = pathway_in[, pathwayCol], gene = pathway_in[, geneCol]
63
-            )
64
-            pathway_df[which(pathway_df$gene == ""), "gene"] <- NA
65
-            pathway_df <- na.omit(pathway_df)
66
-            pathway_df <- aggregate(gene ~ pathway, pathway_df, paste)
67
-            pathways <- deframe(pathway_df)
68 61
         }
62
+        pathway_df <- data.frame(
63
+            pathway = path_in[, pathCol], gene = path_in[, geneCol]
64
+        )
65
+        pathway_df[which(pathway_df$gene == ""), "gene"] <- NA
66
+        pathway_df <- na.omit(pathway_df)
67
+        pathway_df <- aggregate(gene ~ pathway, pathway_df, paste)
68
+        pathways <- deframe(pathway_df)
69 69
     }
70 70
     size <- lapply(pathways, length)
71 71
     pathways_s <- pathways[which(size >= minGene & size <= maxGene)]
... ...
@@ -1,6 +1,6 @@
1 1
 ## code to prepare `testGene` and `backgroundGene` datasets goes here
2
-pathway_file <- system.file("extdata", "Human_Reactome_November_17_2020_symbol.gmt", package = "FEDUP")
3
-pathwaysGMT <- readPathways(pathway_file, MIN_GENE = 10, MAX_GENE = 500)
2
+pathwayFile <- system.file("extdata", "Human_Reactome_November_17_2020_symbol.gmt", package = "FEDUP")
3
+pathwaysGMT <- readPathways(pathwayFile, minGene = 10, maxGene = 500)
4 4
 
5 5
 testGene <- pathwaysGMT[[grep("397014", names(pathwaysGMT))]] # Reactome muscle contraction pathway
6 6
 backgroundGene <- unique(unlist(pathwaysGMT))
... ...
@@ -1,5 +1,5 @@
1 1
 ## code to prepare `pathwaysGMT` dataset goes here
2
-pathway_file <- system.file("extdata", "Human_Reactome_November_17_2020_symbol.gmt", package = "FEDUP")
3
-pathwaysGMT <- readPathways(pathway_file, MIN_GENE = 10, MAX_GENE = 500)
2
+pathwayFile <- system.file("extdata", "Human_Reactome_November_17_2020_symbol.gmt", package = "FEDUP")
3
+pathwaysGMT <- readPathways(pathwayFile, minGene = 10, maxGene = 500)
4 4
 names(pathwaysGMT) <- stringi::stri_trans_general(names(pathwaysGMT), "latin-ascii")
5 5
 usethis::use_data(pathwaysGMT, compress = "xz", version = 2, overwrite = TRUE)
... ...
@@ -1,12 +1,12 @@
1 1
 ## code to prepare `pathwaysTXT` dataset goes here
2 2
 library(tibble)
3 3
 
4
-pathway_file <- system.file("extdata", "SAFE_terms.txt", package = "FEDUP")
4
+pathwayFile <- system.file("extdata", "SAFE_terms.txt", package = "FEDUP")
5 5
 pathwaysTXT <- readPathways(
6
-    pathway_file,
6
+    pathwayFile,
7 7
     header = TRUE,
8
-    pathway_col = "Enriched.GO.names",
9
-    gene_col = "Gene.ID"
8
+    pathCol = "Enriched.GO.names",
9
+    geneCol = "Gene.ID"
10 10
 )
11 11
 
12 12
 names(pathwaysTXT) <- stringi::stri_trans_general(names(pathwaysTXT), "latin-ascii")
... ...
@@ -2,12 +2,12 @@
2 2
 library(openxlsx)
3 3
 library(tibble)
4 4
 
5
-pathway_file <- system.file("extdata", "SAFE_terms.xlsx", package = "FEDUP")
5
+pathwayFile <- system.file("extdata", "SAFE_terms.xlsx", package = "FEDUP")
6 6
 pathwaysXLSX <- readPathways(
7
-    pathway_file,
7
+    pathwayFile,
8 8
     header = TRUE,
9
-    pathway_col = "Enriched.GO.names",
10
-    gene_col = "Gene.ID"
9
+    pathCol = "Enriched.GO.names",
10
+    geneCol = "Gene.ID"
11 11
 )
12 12
 
13 13
 names(pathwaysXLSX) <- stringi::stri_trans_general(names(pathwaysXLSX), "latin-ascii")
... ...
@@ -8,7 +8,7 @@ Currently supports the following file formats: gmt, txt, xlsx.}
8 8
 readPathways(
9 9
   pathwayFile,
10 10
   header = FALSE,
11
-  pathwayCol = NULL,
11
+  pathCol = NULL,
12 12
   geneCol = NULL,
13 13
   minGene = 1L,
14 14
   maxGene = Inf
... ...
@@ -20,11 +20,11 @@ readPathways(
20 20
 \item{header}{(logical) whether \code{pathwayFile} has a header
21 21
 (default FALSE).}
22 22
 
23
-\item{pathwayCol}{(char) column name with pathway identifiers.
24
-For use with non-GMT input files (eg "Pathway.ID"; default NULL).}
23
+\item{pathCol}{(char or int) column name or number with pathway identifiers.
24
+For use with non-GMT input files (eg "Pathway.ID" or 2; default NULL).}
25 25
 
26
-\item{geneCol}{(char) column name with gene identifiers.
27
-For use with non-GMT input files (eg "Gene.ID"; default NULL).}
26
+\item{geneCol}{(char or int) column name or number with gene identifiers.
27
+For use with non-GMT input files (eg "Gene.ID" or 5; default NULL).}
28 28
 
29 29
 \item{minGene}{(integer) minimum number of genes to be considered
30 30
 in a pathway (default 1).}
... ...
@@ -48,6 +48,6 @@ pathways <- readPathways(
48 48
 )
49 49
 pathways <- readPathways(
50 50
     system.file("extdata", "SAFE_terms.xlsx", package = "FEDUP"),
51
-    header = TRUE, pathwayCol = "Enriched.GO.names", geneCol = "Gene.ID"
51
+    header = TRUE, pathCol = "Enriched.GO.names", geneCol = "Gene.ID"
52 52
 )
53 53
 }
... ...
@@ -8,17 +8,17 @@ test_that("Test that readPathways stops without proper inputs", {
8 8
     expect_error(readPathways(
9 9
         pathwayFile,
10 10
         header = TRUE,
11
-        pathwayCol = "Enriched.GO.names", geneCol = "oops"
11
+        pathCol = "Enriched.GO.names", geneCol = "oops"
12 12
     ))
13 13
     expect_error(readPathways(
14 14
         pathwayFile,
15 15
         header = TRUE,
16
-        pathwayCol = "oops", geneCol = "Gene.ID"
16
+        pathCol = "oops", geneCol = "Gene.ID"
17 17
     ))
18 18
     expect_error(readPathways(
19 19
         pathwayFile,
20 20
         header = TRUE, minGene = 500,
21
-        pathwayCol = "Enriched.GO.names", geneCol = "Gene.ID"
21
+        pathCol = "Enriched.GO.names", geneCol = "Gene.ID"
22 22
     ))
23 23
 })
24 24
 
... ...
@@ -43,7 +43,7 @@ test_that("Test that readPathways works with XLSX input", {
43 43
     pathways <- readPathways(
44 44
         pathwayFile,
45 45
         header = TRUE,
46
-        pathwayCol = "Enriched.GO.names", geneCol = "Gene.ID"
46
+        pathCol = "Enriched.GO.names", geneCol = "Gene.ID"
47 47
     )
48 48
     expect_true(is.list(pathways))
49 49
     expect_equal(length(pathways), 30)
... ...
@@ -58,7 +58,7 @@ test_that("Test that readPathways works with TXT input", {
58 58
     pathways <- readPathways(
59 59
         pathwayFile,
60 60
         header = TRUE,
61
-        pathwayCol = "Enriched.GO.names", geneCol = "Gene.ID"
61
+        pathCol = "Enriched.GO.names", geneCol = "Gene.ID"
62 62
     )
63 63
     expect_true(is.list(pathways))
64 64
     expect_equal(length(pathways), 30)