... | ... |
@@ -23,6 +23,7 @@ |
23 | 23 |
#' the fixed patterns |
24 | 24 |
#' @param fixedPatterns matrix of fixed values in either A or P matrix |
25 | 25 |
#' @param checkpointInterval time (in seconds) between creating a checkpoint |
26 |
+#' @param checkpointFile name of the checkpoint file |
|
26 | 27 |
#' @param ... keeps backwards compatibility with arguments from older versions |
27 | 28 |
#' @return list with A and P matrix estimates |
28 | 29 |
#' @importFrom methods new |
... | ... |
@@ -32,8 +33,9 @@ |
32 | 33 |
#' @export |
33 | 34 |
CoGAPS <- function(D, S, nFactor=7, nEquil=1000, nSample=1000, nOutputs=1000, |
34 | 35 |
nSnapshots=0, alphaA=0.01, alphaP=0.01, maxGibbmassA=100, maxGibbmassP=100, |
35 |
-seed=-1, messages=TRUE, singleCellRNASeq=FALSE, whichMatrixFixed = 'N', |
|
36 |
-fixedPatterns = matrix(0), checkpointInterval=0, ...) |
|
36 |
+seed=-1, messages=TRUE, singleCellRNASeq=FALSE, whichMatrixFixed='N', |
|
37 |
+fixedPatterns=matrix(0), checkpointInterval=0, checkpointFile="gaps_checkpoint.out", |
|
38 |
+...) |
|
37 | 39 |
{ |
38 | 40 |
# get v2 arguments |
39 | 41 |
oldArgs <- list(...) |
... | ... |
@@ -57,12 +59,24 @@ fixedPatterns = matrix(0), checkpointInterval=0, ...) |
57 | 59 |
stop('D and S must be matrices') |
58 | 60 |
if (any(D < 0) | any(S < 0)) |
59 | 61 |
stop('D and S matrix must be non-negative') |
62 |
+ if (nrow(D) != nrow(S) | ncol(D) != ncol(S)) |
|
63 |
+ stop('D and S matrix have different dimensions') |
|
64 |
+ if (whichMatrixFixed == 'A' & nrow(fixedPatterns) != nrow(D)) |
|
65 |
+ stop('invalid number of rows for fixedPatterns') |
|
66 |
+ if (whichMatrixFixed == 'A' & ncol(fixedPatterns) > nFactor) |
|
67 |
+ stop('invalid number of columns for fixedPatterns') |
|
68 |
+ if (whichMatrixFixed == 'P' & nrow(fixedPatterns) > nFactor) |
|
69 |
+ stop('invalid number of rows for fixedPatterns') |
|
70 |
+ if (whichMatrixFixed == 'P' & ncol(fixedPatterns) != ncol(D)) |
|
71 |
+ stop('invalid number of columns for fixedPatterns') |
|
60 | 72 |
|
61 | 73 |
# run algorithm with call to C++ code |
62 | 74 |
result <- cogaps_cpp(D, S, nFactor, nEquil, nEquil/10, nSample, nOutputs, |
63 | 75 |
nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, |
64 |
- singleCellRNASeq, whichMatrixFixed, fixedPatterns, checkpointInterval) |
|
76 |
+ singleCellRNASeq, whichMatrixFixed, fixedPatterns, checkpointInterval, |
|
77 |
+ checkpointFile) |
|
65 | 78 |
|
79 |
+ # label matrices and return list |
|
66 | 80 |
patternNames <- paste('Patt', 1:nFactor, sep='') |
67 | 81 |
rownames(result$Amean) <- rownames(result$Asd) <- rownames(D) |
68 | 82 |
colnames(result$Amean) <- colnames(result$Asd) <- patternNames |
... | ... |
@@ -80,9 +94,9 @@ fixedPatterns = matrix(0), checkpointInterval=0, ...) |
80 | 94 |
#' @param path path to checkpoint file |
81 | 95 |
#' @return list with A and P matrix estimates |
82 | 96 |
#' @export |
83 |
-CoGapsFromCheckpoint <- function(D, S, path) |
|
97 |
+CoGapsFromCheckpoint <- function(D, S, path, checkpointFile="gaps_checkpoint.out") |
|
84 | 98 |
{ |
85 |
- cogapsFromCheckpoint_cpp(D, S, path) |
|
99 |
+ cogapsFromCheckpoint_cpp(D, S, path, checkpointFile) |
|
86 | 100 |
} |
87 | 101 |
|
88 | 102 |
#' Display Information About Package Compilation |
... | ... |
@@ -3,12 +3,10 @@ |
3 | 3 |
#' @details calls the C++ MCMC code and performs Bayesian |
4 | 4 |
#' matrix factorization returning the two matrices that reconstruct |
5 | 5 |
#' the data matrix for whole genome data; |
6 |
-#' @param D data matrix |
|
7 |
-#' @param S uncertainty matrix (std devs for chi-squared of Log Likelihood) |
|
8 | 6 |
#' @param nFactor number of patterns (basis vectors, metagenes), which must be |
9 | 7 |
#' greater than or equal to the number of rows of FP |
10 | 8 |
#' @param nCores number of cores for parallelization. If left to the default NA, nCores = nSets. |
11 |
-#' @param Cut number of branches at which to cut dendrogram used in patternMatch4Parallel |
|
9 |
+#' @param cut number of branches at which to cut dendrogram used in patternMatch4Parallel |
|
12 | 10 |
#' @param minNS minimum of individual set contributions a cluster must contain |
13 | 11 |
#' @param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun} |
14 | 12 |
#' @return list of A and P estimates |
... | ... |
@@ -21,21 +19,23 @@ |
21 | 19 |
#' @export |
22 | 20 |
GWCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, ...) |
23 | 21 |
{ |
22 |
+ if (!is.null(list(...)$checkpointFile)) |
|
23 |
+ stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter") |
|
24 | 24 |
allDataSets <- preInitialPhase(simulationName, nCores) |
25 |
- initialResult <- runInitialPhase(allDataSets, nFactor) |
|
26 |
- matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut) |
|
27 |
- finalResult <- runFinalPhase(allDataSets, matchedPatternSets) |
|
28 |
- return(postFinalPhase(allDataSets, matchedPatternSets)) |
|
25 |
+ initialResult <- runInitialPhase(simulationName, allDataSets, nFactor, ...) |
|
26 |
+ matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut, minNS) |
|
27 |
+ finalResult <- runFinalPhase(simulationName, allDataSets, matchedPatternSets, ...) |
|
28 |
+ return(postFinalPhase(finalResult, matchedPatternSets)) |
|
29 | 29 |
} |
30 | 30 |
|
31 |
-GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA) |
|
31 |
+GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, ...) |
|
32 | 32 |
{ |
33 | 33 |
allDataSets <- preInitialPhase(simulationName, nCores) |
34 | 34 |
|
35 | 35 |
# figure out phase from file signature |
36 |
- initalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_run_*.out")) |
|
37 |
- finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_final_run_*.out")) |
|
38 |
- cptPrefix <- ifelse(length(finalCpts), "_final_run_", "_initial_run_") |
|
36 |
+ initalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_cpt_[0-9]+.out")) |
|
37 |
+ finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_final_cpt_[0-9]+.out")) |
|
38 |
+ cptPrefix <- ifelse(length(finalCpts), "_final_cpt_", "_initial_cpt_") |
|
39 | 39 |
|
40 | 40 |
# run CoGAPS for each set, starting from appropiate checkpoint file |
41 | 41 |
result <- foreach(i=1:nSets) %dopar% |
... | ... |
@@ -55,8 +55,8 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA) |
55 | 55 |
} |
56 | 56 |
else |
57 | 57 |
{ |
58 |
- matchedPatternSets <- postInitialPhase(result, length(allDataSets), cut) |
|
59 |
- finalResult <- runFinalPhase(allDataSets, matchedPatternSets) |
|
58 |
+ matchedPatternSets <- postInitialPhase(result, length(allDataSets), cut, minNS) |
|
59 |
+ finalResult <- runFinalPhase(allDataSets, matchedPatternSets, ...) |
|
60 | 60 |
return(postFinalPhase(allDataSets, matchedPatternSets)) |
61 | 61 |
} |
62 | 62 |
} |
... | ... |
@@ -64,8 +64,8 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA) |
64 | 64 |
preInitialPhase <- function(simulationName, nCores) |
65 | 65 |
{ |
66 | 66 |
# find data files |
67 |
- fileSig <- paste(simulationName, "_partition_*.RData", sep="") |
|
68 |
- allDataSets <- list.files(full.names=TRUE, pattern = fileSig) |
|
67 |
+ fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="") |
|
68 |
+ allDataSets <- list.files(full.names=TRUE, pattern=fileSig) |
|
69 | 69 |
|
70 | 70 |
# establish the number of cores that we are able to use |
71 | 71 |
if (is.na(nCores)) |
... | ... |
@@ -74,7 +74,7 @@ preInitialPhase <- function(simulationName, nCores) |
74 | 74 |
return(allDataSets) |
75 | 75 |
} |
76 | 76 |
|
77 |
-runInitialPhase <- function(allDataSets, nFactor) |
|
77 |
+runInitialPhase <- function(simulationName, allDataSets, nFactor, ...) |
|
78 | 78 |
{ |
79 | 79 |
#generate seeds for parallelization |
80 | 80 |
nut <- generateSeeds(chains=length(allDataSets), seed=-1) |
... | ... |
@@ -87,14 +87,14 @@ runInitialPhase <- function(allDataSets, nFactor) |
87 | 87 |
sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x)))) |
88 | 88 |
|
89 | 89 |
# run CoGAPS without any fixed patterns |
90 |
- cptFileName <- paste(simulationName, "_initial_run_", i, ".out", sep="") |
|
90 |
+ cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="") |
|
91 | 91 |
CoGAPS(sampleD, sampleS, nFactor=nFactor, seed=nut[i], |
92 | 92 |
checkpointFile=cptFileName, ...) |
93 | 93 |
} |
94 | 94 |
return(initialResult) |
95 | 95 |
} |
96 | 96 |
|
97 |
-postInitialPhase <- function(initialResult, nSets, cut) |
|
97 |
+postInitialPhase <- function(initialResult, nSets, cut, minNS) |
|
98 | 98 |
{ |
99 | 99 |
nFactor <- ncol(initialResult[[1]]$Amean) |
100 | 100 |
BySet <- reOrderBySet(AP=initialResult, nFactor=nFactor, nSets=nSets) |
... | ... |
@@ -106,32 +106,34 @@ postInitialPhase <- function(initialResult, nSets, cut) |
106 | 106 |
minNS=minNS, bySet=TRUE)) |
107 | 107 |
} |
108 | 108 |
|
109 |
-runFinalPhase <- function(allDataSets, matchedPatternSets) |
|
109 |
+runFinalPhase <- function(simulationName, allDataSets, matchedPatternSets, ...) |
|
110 | 110 |
{ |
111 | 111 |
# generate seeds for parallelization |
112 | 112 |
nut <- generateSeeds(chains=length(allDataSets), seed=-1) |
113 | 113 |
|
114 | 114 |
# final number of factors |
115 |
- nFactorFinal <- dim(matchedPatternSets)[1] |
|
115 |
+ nFactorFinal <- nrow(matchedPatternSets[[1]]) |
|
116 | 116 |
|
117 | 117 |
# run fixed CoGAPS |
118 |
- finalResult <- foreach(i=1:length(allDataSets) %dopar% |
|
118 |
+ finalResult <- foreach(i=1:length(allDataSets)) %dopar% |
|
119 | 119 |
{ |
120 | 120 |
# load data set and shift values so gene minimum is zero |
121 | 121 |
load(allDataSets[[i]]) |
122 | 122 |
sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x)))) |
123 | 123 |
|
124 | 124 |
# run CoGAPS with fixed patterns |
125 |
- cptFileName <- paste(simulationName, "_final_run_", i, ".out", sep="") |
|
126 |
- CoGAPS(sampleD, sampleS, FP=matchedPs, nFactor=nFactorFinal, seed=nut[i], |
|
127 |
- checkpointFile=cptFileName, ...) |
|
125 |
+ cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="") |
|
126 |
+ CoGAPS(sampleD, sampleS, fixedPatterns=matchedPatternSets[[1]], |
|
127 |
+ nFactor=nFactorFinal, seed=nut[i], checkpointFile=cptFileName, |
|
128 |
+ whichMatrixFixed='P', ...) |
|
129 |
+ |
|
128 | 130 |
} |
129 | 131 |
return(finalResult) |
130 | 132 |
} |
131 | 133 |
|
132 | 134 |
postFinalPhase <- function(finalResult, matchedPatternSets) |
133 | 135 |
{ |
134 |
- Aresult <- postFixed4Parallel(finalResult, matchedPatternSets) |
|
136 |
+ Aresult <- postFixed4Parallel(finalResult, matchedPatternSets[[1]]) |
|
135 | 137 |
finalResult <- list("Amean"=Aresult$A, "Asd"=Aresult$Asd, |
136 | 138 |
"Pmean"=matchedPatternSets, "PbySet"=matchedPatternSets[["PBySet"]]) |
137 | 139 |
class(finalResult) <- append(class(finalResult), "CoGAPS") |
... | ... |
@@ -1,12 +1,12 @@ |
1 | 1 |
# Generated by using Rcpp::compileAttributes() -> do not edit by hand |
2 | 2 |
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 |
3 | 3 |
|
4 |
-cogaps_cpp <- function(D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval) { |
|
5 |
- .Call('_CoGAPS_cogaps_cpp', PACKAGE = 'CoGAPS', D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval) |
|
4 |
+cogaps_cpp <- function(D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval, cptFile) { |
|
5 |
+ .Call('_CoGAPS_cogaps_cpp', PACKAGE = 'CoGAPS', D, S, nFactor, nEquil, nEquilCool, nSample, nOutputs, nSnapshots, alphaA, alphaP, maxGibbmassA, maxGibbmassP, seed, messages, singleCellRNASeq, whichMatrixFixed, FP, checkpointInterval, cptFile) |
|
6 | 6 |
} |
7 | 7 |
|
8 |
-cogapsFromCheckpoint_cpp <- function(D, S, fileName) { |
|
9 |
- .Call('_CoGAPS_cogapsFromCheckpoint_cpp', PACKAGE = 'CoGAPS', D, S, fileName) |
|
8 |
+cogapsFromCheckpoint_cpp <- function(D, S, fileName, cptFile) { |
|
9 |
+ .Call('_CoGAPS_cogapsFromCheckpoint_cpp', PACKAGE = 'CoGAPS', D, S, fileName, cptFile) |
|
10 | 10 |
} |
11 | 11 |
|
12 | 12 |
displayBuildReport_cpp <- function() { |
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
#' for gapsMapRun |
6 | 6 |
#' @return list of two data.frames containing the A matrix estimates or their |
7 | 7 |
#' corresponding standard deviations from output of parallel CoGAPS |
8 |
-postFixed4Parallel <- function(AP.fixed=NA, setPs=NA) |
|
8 |
+postFixed4Parallel <- function(AP.fixed, setPs) |
|
9 | 9 |
{ |
10 | 10 |
ASummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Amean)) |
11 | 11 |
Asd <- do.call(rbind,lapply(AP.fixed, function(x) x$Asd)) |
... | ... |
@@ -8,7 +8,8 @@ CoGAPS(D, S, nFactor = 7, nEquil = 1000, nSample = 1000, |
8 | 8 |
nOutputs = 1000, nSnapshots = 0, alphaA = 0.01, alphaP = 0.01, |
9 | 9 |
maxGibbmassA = 100, maxGibbmassP = 100, seed = -1, messages = TRUE, |
10 | 10 |
singleCellRNASeq = FALSE, whichMatrixFixed = "N", |
11 |
- fixedPatterns = matrix(0), checkpointInterval = 0, ...) |
|
11 |
+ fixedPatterns = matrix(0), checkpointInterval = 0, |
|
12 |
+ checkpointFile = "gaps_checkpoint.out", ...) |
|
12 | 13 |
} |
13 | 14 |
\arguments{ |
14 | 15 |
\item{D}{data matrix} |
... | ... |
@@ -48,6 +49,8 @@ the fixed patterns} |
48 | 49 |
|
49 | 50 |
\item{checkpointInterval}{time (in seconds) between creating a checkpoint} |
50 | 51 |
|
52 |
+\item{checkpointFile}{name of the checkpoint file} |
|
53 |
+ |
|
51 | 54 |
\item{...}{keeps backwards compatibility with arguments from older versions} |
52 | 55 |
} |
53 | 56 |
\value{ |
... | ... |
@@ -4,29 +4,15 @@ |
4 | 4 |
\alias{GWCoGAPS} |
5 | 5 |
\title{GWCoGAPS} |
6 | 6 |
\usage{ |
7 |
-GWCoGAPS(D, S, nFactor, nSets, nCores = NA, saveBySetResults = FALSE, |
|
8 |
- fname = "GWCoGAPS.AP.fixed", PatternsMatchFN = patternMatch4Parallel, |
|
9 |
- Cut = NA, minNS = NA, ...) |
|
7 |
+GWCoGAPS(simulationName, nFactor, nCores = NA, cut = NA, minNS = NA, ...) |
|
10 | 8 |
} |
11 | 9 |
\arguments{ |
12 |
-\item{D}{data matrix} |
|
13 |
- |
|
14 |
-\item{S}{uncertainty matrix (std devs for chi-squared of Log Likelihood)} |
|
15 |
- |
|
16 | 10 |
\item{nFactor}{number of patterns (basis vectors, metagenes), which must be |
17 | 11 |
greater than or equal to the number of rows of FP} |
18 | 12 |
|
19 |
-\item{nSets}{number of sets for parallelization} |
|
20 |
- |
|
21 | 13 |
\item{nCores}{number of cores for parallelization. If left to the default NA, nCores = nSets.} |
22 | 14 |
|
23 |
-\item{saveBySetResults}{logical indicating whether to save by intermediary by set results. Default is FALSE.} |
|
24 |
- |
|
25 |
-\item{fname}{character string used to label file output. Default is "GWCoGAPS.AP.fixed"} |
|
26 |
- |
|
27 |
-\item{PatternsMatchFN}{function to use for pattern matching across sets} |
|
28 |
- |
|
29 |
-\item{Cut}{number of branches at which to cut dendrogram used in patternMatch4Parallel} |
|
15 |
+\item{cut}{number of branches at which to cut dendrogram used in patternMatch4Parallel} |
|
30 | 16 |
|
31 | 17 |
\item{minNS}{minimum of individual set contributions a cluster must contain} |
32 | 18 |
|