Browse code

parallel checkpoints mostly working - some bugs with processing output

sherman5 authored on 30/01/2018 20:50:37
Showing8 changed files

... ...
@@ -23,6 +23,7 @@
23 23
 #' the fixed patterns
24 24
 #' @param fixedPatterns matrix of fixed values in either A or P matrix
25 25
 #' @param checkpointInterval time (in seconds) between creating a checkpoint
26
+#' @param checkpointFile name of the checkpoint file
26 27
 #' @param ... keeps backwards compatibility with arguments from older versions
27 28
 #' @return list with A and P matrix estimates
28 29
 #' @importFrom methods new
... ...
@@ -32,8 +33,9 @@
32 33
 #' @export
33 34
 CoGAPS <- function(D, S, nFactor=7, nEquil=1000, nSample=1000, nOutputs=1000,
34 35
 nSnapshots=0, alphaA=0.01, alphaP=0.01, maxGibbmassA=100, maxGibbmassP=100,
35
-seed=-1, messages=TRUE, singleCellRNASeq=FALSE, whichMatrixFixed = 'N',
36
-fixedPatterns = matrix(0), checkpointInterval=0, ...)
36
+seed=-1, messages=TRUE, singleCellRNASeq=FALSE, whichMatrixFixed='N',
37
+fixedPatterns=matrix(0), checkpointInterval=0, checkpointFile="gaps_checkpoint.out",
38
+...)
37 39
 {
38 40
     # get v2 arguments
39 41
     oldArgs <- list(...)
... ...
@@ -57,12 +59,24 @@ fixedPatterns = matrix(0), checkpointInterval=0, ...)
57 59
         stop('D and S must be matrices')
58 60
     if (any(D < 0) | any(S < 0))
59 61
         stop('D and S matrix must be non-negative')
62
+    if (nrow(D) != nrow(S) | ncol(D) != ncol(S))
63
+        stop('D and S matrix have different dimensions')
64
+    if (whichMatrixFixed == 'A' & nrow(fixedPatterns) != nrow(D))
65
+        stop('invalid number of rows for fixedPatterns')
66
+    if (whichMatrixFixed == 'A' & ncol(fixedPatterns) > nFactor)
67
+        stop('invalid number of columns for fixedPatterns')
68
+    if (whichMatrixFixed == 'P' & nrow(fixedPatterns) > nFactor)
69
+        stop('invalid number of rows for fixedPatterns')
70
+    if (whichMatrixFixed == 'P' & ncol(fixedPatterns) != ncol(D))
71
+        stop('invalid number of columns for fixedPatterns')
60 72
 
61 73
     # run algorithm with call to C++ code
62 74
     result <- cogaps_cpp(D, S, nFactor, nEquil, nEquil/10, nSample, nOutputs,
63 75
         nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages,
64
-        singleCellRNASeq, whichMatrixFixed, fixedPatterns, checkpointInterval)
76
+        singleCellRNASeq, whichMatrixFixed, fixedPatterns, checkpointInterval,
77
+        checkpointFile)
65 78
     
79
+    # label matrices and return list
66 80
     patternNames <- paste('Patt', 1:nFactor, sep='')
67 81
     rownames(result$Amean) <- rownames(result$Asd) <- rownames(D)
68 82
     colnames(result$Amean) <- colnames(result$Asd) <- patternNames
... ...
@@ -80,9 +94,9 @@ fixedPatterns = matrix(0), checkpointInterval=0, ...)
80 94
 #' @param path path to checkpoint file
81 95
 #' @return list with A and P matrix estimates
82 96
 #' @export
83
-CoGapsFromCheckpoint <- function(D, S, path)
97
+CoGapsFromCheckpoint <- function(D, S, path, checkpointFile="gaps_checkpoint.out")
84 98
 {
85
-    cogapsFromCheckpoint_cpp(D, S, path)
99
+    cogapsFromCheckpoint_cpp(D, S, path, checkpointFile)
86 100
 }
87 101
 
88 102
 #' Display Information About Package Compilation
... ...
@@ -3,12 +3,10 @@
3 3
 #' @details calls the C++ MCMC code and performs Bayesian
4 4
 #' matrix factorization returning the two matrices that reconstruct
5 5
 #' the data matrix for whole genome data;
6
-#' @param D data matrix
7
-#' @param S uncertainty matrix (std devs for chi-squared of Log Likelihood)
8 6
 #' @param nFactor number of patterns (basis vectors, metagenes), which must be
9 7
 #' greater than or equal to the number of rows of FP
10 8
 #' @param nCores number of cores for parallelization. If left to the default NA, nCores = nSets.
11
-#' @param Cut number of branches at which to cut dendrogram used in patternMatch4Parallel
9
+#' @param cut number of branches at which to cut dendrogram used in patternMatch4Parallel
12 10
 #' @param minNS minimum of individual set contributions a cluster must contain
13 11
 #' @param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}
14 12
 #' @return list of A and P estimates
... ...
@@ -21,21 +19,23 @@
21 19
 #' @export
22 20
 GWCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, ...)
23 21
 {
22
+    if (!is.null(list(...)$checkpointFile))
23
+        stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter")
24 24
     allDataSets <- preInitialPhase(simulationName, nCores)
25
-    initialResult <- runInitialPhase(allDataSets, nFactor)
26
-    matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut)
27
-    finalResult <- runFinalPhase(allDataSets, matchedPatternSets)
28
-    return(postFinalPhase(allDataSets, matchedPatternSets))
25
+    initialResult <- runInitialPhase(simulationName, allDataSets, nFactor, ...)
26
+    matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut, minNS)
27
+    finalResult <- runFinalPhase(simulationName, allDataSets, matchedPatternSets, ...)
28
+    return(postFinalPhase(finalResult, matchedPatternSets))
29 29
 }
30 30
 
31
-GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA)
31
+GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, ...)
32 32
 {
33 33
     allDataSets <- preInitialPhase(simulationName, nCores)
34 34
 
35 35
     # figure out phase from file signature
36
-    initalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_run_*.out"))
37
-    finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_final_run_*.out"))
38
-    cptPrefix <- ifelse(length(finalCpts), "_final_run_", "_initial_run_")
36
+    initalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_cpt_[0-9]+.out"))
37
+    finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_final_cpt_[0-9]+.out"))
38
+    cptPrefix <- ifelse(length(finalCpts), "_final_cpt_", "_initial_cpt_")
39 39
 
40 40
     # run CoGAPS for each set, starting from appropiate checkpoint file
41 41
     result <- foreach(i=1:nSets) %dopar%
... ...
@@ -55,8 +55,8 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA)
55 55
     }
56 56
     else 
57 57
     {
58
-        matchedPatternSets <- postInitialPhase(result, length(allDataSets), cut)
59
-        finalResult <- runFinalPhase(allDataSets, matchedPatternSets)
58
+        matchedPatternSets <- postInitialPhase(result, length(allDataSets), cut, minNS)
59
+        finalResult <- runFinalPhase(allDataSets, matchedPatternSets, ...)
60 60
         return(postFinalPhase(allDataSets, matchedPatternSets))
61 61
     }
62 62
 }
... ...
@@ -64,8 +64,8 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA)
64 64
 preInitialPhase <- function(simulationName, nCores)
65 65
 {
66 66
     # find data files
67
-    fileSig <- paste(simulationName, "_partition_*.RData", sep="")
68
-    allDataSets <- list.files(full.names=TRUE, pattern = fileSig)
67
+    fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="")
68
+    allDataSets <- list.files(full.names=TRUE, pattern=fileSig)
69 69
 
70 70
     # establish the number of cores that we are able to use
71 71
     if (is.na(nCores))
... ...
@@ -74,7 +74,7 @@ preInitialPhase <- function(simulationName, nCores)
74 74
     return(allDataSets)
75 75
 }
76 76
 
77
-runInitialPhase <- function(allDataSets, nFactor)
77
+runInitialPhase <- function(simulationName, allDataSets, nFactor, ...)
78 78
 {
79 79
     #generate seeds for parallelization
80 80
     nut <- generateSeeds(chains=length(allDataSets), seed=-1)
... ...
@@ -87,14 +87,14 @@ runInitialPhase <- function(allDataSets, nFactor)
87 87
         sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x))))
88 88
 
89 89
         # run CoGAPS without any fixed patterns
90
-        cptFileName <- paste(simulationName, "_initial_run_", i, ".out", sep="")
90
+        cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="")
91 91
         CoGAPS(sampleD, sampleS, nFactor=nFactor, seed=nut[i],
92 92
             checkpointFile=cptFileName, ...)
93 93
     }
94 94
     return(initialResult)
95 95
 }
96 96
 
97
-postInitialPhase <- function(initialResult, nSets, cut)
97
+postInitialPhase <- function(initialResult, nSets, cut, minNS)
98 98
 {
99 99
     nFactor <- ncol(initialResult[[1]]$Amean)
100 100
     BySet <- reOrderBySet(AP=initialResult, nFactor=nFactor, nSets=nSets)
... ...
@@ -106,32 +106,34 @@ postInitialPhase <- function(initialResult, nSets, cut)
106 106
         minNS=minNS, bySet=TRUE))
107 107
 }
108 108
 
109
-runFinalPhase <- function(allDataSets, matchedPatternSets)
109
+runFinalPhase <- function(simulationName, allDataSets, matchedPatternSets, ...)
110 110
 {
111 111
     # generate seeds for parallelization
112 112
     nut <- generateSeeds(chains=length(allDataSets), seed=-1)
113 113
 
114 114
     # final number of factors
115
-    nFactorFinal <- dim(matchedPatternSets)[1]
115
+    nFactorFinal <- nrow(matchedPatternSets[[1]])
116 116
 
117 117
     # run fixed CoGAPS
118
-    finalResult <- foreach(i=1:length(allDataSets) %dopar%
118
+    finalResult <- foreach(i=1:length(allDataSets)) %dopar%
119 119
     {
120 120
         # load data set and shift values so gene minimum is zero
121 121
         load(allDataSets[[i]])
122 122
         sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x))))
123 123
 
124 124
         # run CoGAPS with fixed patterns
125
-        cptFileName <- paste(simulationName, "_final_run_", i, ".out", sep="")
126
-        CoGAPS(sampleD, sampleS, FP=matchedPs, nFactor=nFactorFinal, seed=nut[i],
127
-            checkpointFile=cptFileName, ...)
125
+        cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="")
126
+        CoGAPS(sampleD, sampleS, fixedPatterns=matchedPatternSets[[1]],
127
+            nFactor=nFactorFinal, seed=nut[i], checkpointFile=cptFileName,
128
+            whichMatrixFixed='P', ...)
129
+
128 130
     }
129 131
     return(finalResult)
130 132
 }
131 133
 
132 134
 postFinalPhase <- function(finalResult, matchedPatternSets)
133 135
 {
134
-    Aresult <- postFixed4Parallel(finalResult, matchedPatternSets)
136
+    Aresult <- postFixed4Parallel(finalResult, matchedPatternSets[[1]])
135 137
     finalResult <- list("Amean"=Aresult$A, "Asd"=Aresult$Asd,
136 138
         "Pmean"=matchedPatternSets, "PbySet"=matchedPatternSets[["PBySet"]])
137 139
     class(finalResult) <- append(class(finalResult), "CoGAPS")
... ...
@@ -1,12 +1,12 @@
1 1
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 2
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 3
 
4
-cogaps_cpp <- function(D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval) {
5
-    .Call('_CoGAPS_cogaps_cpp', PACKAGE = 'CoGAPS', D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval)
4
+cogaps_cpp <- function(D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval, cptFile) {
5
+    .Call('_CoGAPS_cogaps_cpp', PACKAGE = 'CoGAPS', D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval, cptFile)
6 6
 }
7 7
 
8
-cogapsFromCheckpoint_cpp <- function(D, S, fileName) {
9
-    .Call('_CoGAPS_cogapsFromCheckpoint_cpp', PACKAGE = 'CoGAPS', D, S, fileName)
8
+cogapsFromCheckpoint_cpp <- function(D, S, fileName, cptFile) {
9
+    .Call('_CoGAPS_cogapsFromCheckpoint_cpp', PACKAGE = 'CoGAPS', D, S, fileName, cptFile)
10 10
 }
11 11
 
12 12
 displayBuildReport_cpp <- function() {
... ...
@@ -5,7 +5,7 @@
5 5
 #' for gapsMapRun
6 6
 #' @return list of two data.frames containing the A matrix estimates or their
7 7
 #' corresponding standard deviations from output of parallel CoGAPS
8
-postFixed4Parallel <- function(AP.fixed=NA, setPs=NA)
8
+postFixed4Parallel <- function(AP.fixed, setPs)
9 9
 {
10 10
     ASummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Amean))
11 11
     Asd <- do.call(rbind,lapply(AP.fixed, function(x) x$Asd))
... ...
@@ -8,7 +8,8 @@ CoGAPS(D, S, nFactor = 7, nEquil = 1000, nSample = 1000,
8 8
   nOutputs = 1000, nSnapshots = 0, alphaA = 0.01, alphaP = 0.01,
9 9
   maxGibbmassA = 100, maxGibbmassP = 100, seed = -1, messages = TRUE,
10 10
   singleCellRNASeq = FALSE, whichMatrixFixed = "N",
11
-  fixedPatterns = matrix(0), checkpointInterval = 0, ...)
11
+  fixedPatterns = matrix(0), checkpointInterval = 0,
12
+  checkpointFile = "gaps_checkpoint.out", ...)
12 13
 }
13 14
 \arguments{
14 15
 \item{D}{data matrix}
... ...
@@ -48,6 +49,8 @@ the fixed patterns}
48 49
 
49 50
 \item{checkpointInterval}{time (in seconds) between creating a checkpoint}
50 51
 
52
+\item{checkpointFile}{name of the checkpoint file}
53
+
51 54
 \item{...}{keeps backwards compatibility with arguments from older versions}
52 55
 }
53 56
 \value{
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{CoGapsFromCheckpoint}
5 5
 \title{Restart CoGAPS from Checkpoint File}
6 6
 \usage{
7
-CoGapsFromCheckpoint(D, S, path)
7
+CoGapsFromCheckpoint(D, S, path, checkpointFile = "gaps_checkpoint.out")
8 8
 }
9 9
 \arguments{
10 10
 \item{D}{data matrix}
... ...
@@ -4,29 +4,15 @@
4 4
 \alias{GWCoGAPS}
5 5
 \title{GWCoGAPS}
6 6
 \usage{
7
-GWCoGAPS(D, S, nFactor, nSets, nCores = NA, saveBySetResults = FALSE,
8
-  fname = "GWCoGAPS.AP.fixed", PatternsMatchFN = patternMatch4Parallel,
9
-  Cut = NA, minNS = NA, ...)
7
+GWCoGAPS(simulationName, nFactor, nCores = NA, cut = NA, minNS = NA, ...)
10 8
 }
11 9
 \arguments{
12
-\item{D}{data matrix}
13
-
14
-\item{S}{uncertainty matrix (std devs for chi-squared of Log Likelihood)}
15
-
16 10
 \item{nFactor}{number of patterns (basis vectors, metagenes), which must be
17 11
 greater than or equal to the number of rows of FP}
18 12
 
19
-\item{nSets}{number of sets for parallelization}
20
-
21 13
 \item{nCores}{number of cores for parallelization. If left to the default NA, nCores = nSets.}
22 14
 
23
-\item{saveBySetResults}{logical indicating whether to save by intermediary by set results. Default is FALSE.}
24
-
25
-\item{fname}{character string used to label file output. Default is "GWCoGAPS.AP.fixed"}
26
-
27
-\item{PatternsMatchFN}{function to use for pattern matching across sets}
28
-
29
-\item{Cut}{number of branches at which to cut dendrogram used in patternMatch4Parallel}
15
+\item{cut}{number of branches at which to cut dendrogram used in patternMatch4Parallel}
30 16
 
31 17
 \item{minNS}{minimum of individual set contributions a cluster must contain}
32 18
 
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{postFixed4Parallel}
5 5
 \title{Post Processing of Parallel Output}
6 6
 \usage{
7
-postFixed4Parallel(AP.fixed = NA, setPs = NA)
7
+postFixed4Parallel(AP.fixed, setPs)
8 8
 }
9 9
 \arguments{
10 10
 \item{AP.fixed}{output of parallel gapsMapRun calls with same FP}