Browse code

fixed bug with annotations

Tom Sherman authored on 09/08/2018 00:00:17
Showing 8 changed files

... ...
@@ -86,13 +86,14 @@ outputToFile=NULL, ...)
86 86
     )
87 87
     allParams <- parseExtraParams(allParams, list(...))
88 88
 
89
-    # display start up message for the user
90
-    startupMessage(data, allParams)
91
-
92 89
     # check file extension
93 90
     if (is(data, "character") & !supported(data))
94 91
         stop("unsupported file extension for data")
95 92
 
93
+    # enforce the use of explicit subsets with manual pattern matching
94
+    if (!is.null(allParams$matchedPatterns) & is.null(allParams$gaps@explicitSets))
95
+        stop("must provide explicit subsets when doing manual pattern matching")
96
+
96 97
     # check uncertainty matrix
97 98
     if (is(data, "character") & !is.null(uncertainty) & !is(uncertainty, "character"))
98 99
         stop("uncertainty must be same data type as data (file name)")
... ...
@@ -154,6 +155,7 @@ outputToFile=NULL, ...)
154 155
     allParams$sampleNames <- sampleNames
155 156
 
156 157
     # run cogaps
158
+    startupMessage(data, allParams)
157 159
     gapsReturnList <- dispatchFunc(data, allParams, uncertainty)
158 160
 
159 161
     # convert list to CogapsResult object
... ...
@@ -21,7 +21,7 @@ supported <- function(file)
21 21
 {
22 22
     if (!is(file, "character"))
23 23
         return(FALSE)
24
-    return(tools::file_ext(file) %in% c("tsv", "csv", "mtx"))
24
+    return(tools::file_ext(file) %in% c("tsv", "csv", "mtx", "gct"))
25 25
 }
26 26
 
27 27
 #' get number of rows from supported file name or matrix
... ...
@@ -38,7 +38,8 @@ nrowHelper <- function(data)
38 38
         return(switch(tools::file_ext(data),
39 39
             "csv" = nrow(data.table::fread(data, select=1)),
40 40
             "tsv" = nrow(data.table::fread(data, select=1)),
41
-            "mtx" = as.numeric(data.table::fread(data, nrows=1, fill=TRUE)[1,1])
41
+            "mtx" = as.numeric(data.table::fread(data, nrows=1, fill=TRUE)[1,1]),
42
+            "gct" = as.numeric(strsplit(as.matrix(data.table::fread(data, nrows=1, sep='\t')), "\\s+")[[1]][1])
42 43
         ))
43 44
     }
44 45
     return(nrow(data))
... ...
@@ -58,7 +59,8 @@ ncolHelper <- function(data)
58 59
         return(switch(tools::file_ext(data),
59 60
             "csv" = ncol(data.table::fread(data, nrows=1)) - 1,
60 61
             "tsv" = ncol(data.table::fread(data, nrows=1)) - 1,
61
-            "mtx" = as.numeric(data.table::fread(data, nrows=1, fill=TRUE)[1,2])
62
+            "mtx" = as.numeric(data.table::fread(data, nrows=1, fill=TRUE)[1,2]),
63
+            "gct" = as.numeric(strsplit(as.matrix(data.table::fread(data, nrows=1, sep='\t')), "\\s+")[[1]][2])
62 64
         ))
63 65
     }
64 66
     return(ncol(data))
... ...
@@ -30,11 +30,16 @@ sampleWithExplictSets <- function(allParams, total)
30 30
 #' @return list of subsets
31 31
 sampleWithAnnotationWeights <- function(allParams, setSize)
32 32
 {
33
+    # sort annotation group and weights so they match up
34
+    weight <- allParams$gaps@samplingWeight
35
+    weight <- weight[order(names(weight))]
33 36
     groups <- unique(allParams$gaps@samplingAnnotation)
37
+    groups <- sort(groups)
38
+
39
+    # sample accordingly
34 40
     return(lapply(1:allParams$gaps@nSets, function(i)
35 41
     {
36
-        groupCount <- sample(groups, size=setSize, replace=TRUE,
37
-            prob=allParams$gaps@samplingWeight)
42
+        groupCount <- sample(groups, size=setSize, replace=TRUE, prob=weight)
38 43
         sort(unlist(sapply(groups, function(g)
39 44
         {
40 45
             groupNdx <- which(allParams$gaps@samplingAnnotation == g)
... ...
@@ -110,6 +110,11 @@ setValidity("CogapsParams",
110 110
 
111 111
         if (!is.null(object@explicitSets) & length(object@explicitSets) != object@nSets)
112 112
             "wrong number of sets given"
113
+        if (length(object@samplingWeight) & is.null(names(object@samplingWeight)))
114
+            "samplingWeight must be a named vector"
115
+
116
+        if (!is.null(object@explicitSets) & !is.null(object@samplingWeight))
117
+            "explicitSets and samplingAnnotation/samplingWeight are both set"
113 118
     }
114 119
 )
115 120
 
116 121
Binary files a/tests/testthat/gaps_checkpoint.out and b/tests/testthat/gaps_checkpoint.out differ
117 122
new file mode 100644
... ...
@@ -0,0 +1,31 @@
1
+context("CoGAPS")
2
+
3
+test_that("Subsetting Data",
4
+{
5
+    data(GIST)
6
+    data(SimpSim)
7
+    testMatrix <- GIST.matrix
8
+
9
+    # sampling with weights
10
+    anno <- sample(letters[1:5], size=nrow(testMatrix), replace=TRUE)
11
+    weights <- c(0,1,4,1,0)
12
+    names(weights) <- letters[1:5]
13
+    params <- new("CogapsParams")
14
+    params <- setAnnotationWeights(params, annotation=anno, weights=weights)
15
+    result <- GWCoGAPS(testMatrix, params, messages=FALSE)
16
+
17
+    getIndex <- function(s) as.numeric(strsplit(s, "_")[[1]][2])
18
+    getIndices <- function(set) unname(sapply(set, getIndex))
19
+    countType <- function(set, type) sum(getIndices(set) %in% which(anno == type))
20
+    getHistogram <- function(set) sapply(letters[1:5], function(letter) countType(set, letter))
21
+    hist <- sapply(getSubsets(result), getHistogram)
22
+    freq <- unname(rowSums(hist) / sum(hist))
23
+    
24
+    expect_true(all.equal(freq, unname(weights / sum(weights)), tol=0.05))
25
+
26
+    # running cogaps with given subsets
27
+    sets <- list(1:225, 226:450, 451:675, 676:900)
28
+    result <- GWCoGAPS(SimpSim.data, explicitSets=sets, messages=FALSE)
29
+    subsets <- lapply(getSubsets(result), getIndices)
30
+    expect_true(all(sapply(1:4, function(i) all.equal(sets[[i]], subsets[[i]]))))
31
+})
0 32
\ No newline at end of file
... ...
@@ -19,6 +19,7 @@ test_that("Valid Top-Level CoGAPS Calls",
19 19
     gistCsvPath <- system.file("extdata/GIST.csv", package="CoGAPS")
20 20
     gistTsvPath <- system.file("extdata/GIST.tsv", package="CoGAPS")
21 21
     gistMtxPath <- system.file("extdata/GIST.mtx", package="CoGAPS")
22
+    gistGctPath <- system.file("extdata/GIST.gct", package="CoGAPS")
22 23
 
23 24
     # data types
24 25
     res <- list()
... ...
@@ -27,15 +28,16 @@ test_that("Valid Top-Level CoGAPS Calls",
27 28
     res[[3]] <- CoGAPS(gistCsvPath, nIterations=100, outputFrequency=50, seed=1, messages=FALSE)
28 29
     res[[4]] <- CoGAPS(gistTsvPath, nIterations=100, outputFrequency=50, seed=1, messages=FALSE)
29 30
     res[[5]] <- CoGAPS(gistMtxPath, nIterations=100, outputFrequency=50, seed=1, messages=FALSE)
31
+    res[[6]] <- CoGAPS(gistGctPath, nIterations=100, outputFrequency=50, seed=1, messages=FALSE)
30 32
     expect_true(all(sapply(res, no_na_in_result)))
31 33
     
32 34
     expect_equal(nrow(res[[1]]@featureLoadings), 1363)
33 35
     expect_equal(ncol(res[[1]]@featureLoadings), 7)
34 36
     expect_equal(nrow(res[[1]]@sampleFactors), 9)
35 37
     expect_equal(ncol(res[[1]]@sampleFactors), 7)
36
-    expect_true(all(sapply(1:4, function(i)
38
+    expect_true(all(sapply(1:5, function(i)
37 39
         res[[i]]@featureLoadings == res[[i+1]]@featureLoadings)))
38
-    expect_true(all(sapply(1:4, function(i)
40
+    expect_true(all(sapply(1:5, function(i)
39 41
         res[[i]]@sampleFactors == res[[i+1]]@sampleFactors)))
40 42
 
41 43
     # transposing data
... ...
@@ -50,15 +52,17 @@ test_that("Valid Top-Level CoGAPS Calls",
50 52
         outputFrequency=50, seed=1, messages=FALSE)
51 53
     res[[5]] <- CoGAPS(gistMtxPath, transposeData=TRUE, nIterations=100,
52 54
         outputFrequency=50, seed=1, messages=FALSE)
55
+    res[[6]] <- CoGAPS(gistGctPath, transposeData=TRUE, nIterations=100,
56
+        outputFrequency=50, seed=1, messages=FALSE)
53 57
     expect_true(all(sapply(res, no_na_in_result)))
54 58
     
55 59
     expect_equal(nrow(res[[1]]@featureLoadings), 9)
56 60
     expect_equal(ncol(res[[1]]@featureLoadings), 7)
57 61
     expect_equal(nrow(res[[1]]@sampleFactors), 1363)
58 62
     expect_equal(ncol(res[[1]]@sampleFactors), 7)
59
-    expect_true(all(sapply(1:4, function(i)
63
+    expect_true(all(sapply(1:5, function(i)
60 64
         res[[i]]@featureLoadings == res[[i+1]]@featureLoadings)))
61
-    expect_true(all(sapply(1:4, function(i)
65
+    expect_true(all(sapply(1:5, function(i)
62 66
         res[[i]]@sampleFactors == res[[i+1]]@sampleFactors)))
63 67
 
64 68
     # passing uncertainty
... ...
@@ -380,8 +380,10 @@ sampling all rows (cols) according to the weight of each category.
380 380
 ```{r}
381 381
 # sampling with weights
382 382
 anno <- sample(letters[1:5], size=nrow(SimpSim.data), replace=TRUE)
383
+w <- c(1,1,2,2,1)
384
+names(w) <- letters[1:5]
383 385
 params <- new("CogapsParams")
384
-params <- setAnnotationWeights(params, annotation=anno, weights=c(1,1,2,2,1))
386
+params <- setAnnotationWeights(params, annotation=anno, weights=w)
385 387
 result <- GWCoGAPS(SimpSim.data, params, messages=FALSE)
386 388
 ```
387 389