#' scCoGAPS #' @export #' #' @details calls the C++ MCMC code and performs Bayesian #' matrix factorization returning the two matrices that reconstruct #' the data matrix for whole genome data; #' @param simulationName name of this simulation #' @param nFactor number of patterns (basis vectors, metagenes), which must be #' greater than or equal to the number of rows of FP #' @param nCores number of cores for parallelization. If left to the default NA, nCores = nSets. #' @param cut number of branches at which to cut dendrogram used in patternMatch4singleCell #' @param minNS minimum of individual set contributions a cluster must contain #' @param manualMatch logical indicating whether or not to stop after initial phase for manual pattern matching #' @param consensusAs fixed pattern matrix to be used to ensure reciprocity of A weights accross sets #' @param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun} #' @return list of A and P estimates scCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, manualMatch=FALSE, consensusAs=NULL, ...) { if (!is.null(list(...)$checkpointFile)) { stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter") } if (is.null(consensusAs)) { allDataSets <- sc_preInitialPhase(simulationName, nCores) initialResult <- sc_runInitialPhase(simulationName, allDataSets, nFactor, ...) if (manualMatch) { saveRDS(initialResult,file=paste(simulationName,"_initial.rds", sep="")) stop("Please provide concensus gene weights upon restarting.") } matchedAmplitudes <- sc_postInitialPhase(initialResult, length(allDataSets), cut, minNS) consensusAs <- matchedAmplitudes[[1]] save(consensusAs, file=paste(simulationName, "_matched_As.RData", sep="")) } finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, nCores, ...) return(sc_postFinalPhase(finalResult, consensusAs)) } #' Restart a scCoGAPS run from a Checkpoint #' #' @inheritParams GWCoGAPS #' @return list of A and P estimates #' @importFrom utils file_test scCoGapsFromCheckpoint <- function(simulationName, nCores, cut=NA, minNS=NA, ...) { # find data files allDataSets <- sc_preInitialPhase(simulationName, nCores) # figure out phase from file signature initialCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_cpt_[0-9]+.out", sep="")) finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_final_cpt_[0-9]+.out", sep="")) if (length(finalCpts)) { finalResult <- foreach(i=1:length(allDataSets)) %dopar% { # load data set and shift values so gene minimum is zero load(allDataSets[[i]]) sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x)))) # run CoGAPS with fixed patterns cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="") CoGapsFromCheckpoint(sampleD, sampleS, cptFileName) } load(paste(simulationName, "_matched_As.RData", sep="")) } else if (file_test("-f", paste(simulationName, "_matched_As.RData", sep=""))) { load(paste(simulationName, "_matched_As.RData", sep="")) finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, nCores, ...) } else if (length(initialCpts)) { # initial phase - always needs to be run to get matchedAmplitudes initialResult <- foreach(i=1:length(allDataSets)) %dopar% { # load data set and shift values so gene minimum is zero load(allDataSets[[i]]) sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x)))) # run CoGAPS from checkpoint cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="") CoGapsFromCheckpoint(sampleD, sampleS, cptFileName) } matchedAmplitudes <- sc_postInitialPhase(initialResult, length(allDataSets), cut, minNS) consensusAs <- matchedAmplitudes[[1]] save(consensusAs, file=paste(simulationName, "_matched_As.RData", sep="")) finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, nCores, ...) } else { stop("no checkpoint files found") } return(sc_postFinalPhase(finalResult, consensusAs)) } sc_preInitialPhase <- function(simulationName, nCores) { # find data files fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="") allDataSets <- list.files(full.names=TRUE, pattern=fileSig) # establish the number of cores that we are able to use if (is.na(nCores)) { nCores <- length(allDataSets) } registerDoParallel(cores=nCores) return(allDataSets) } sc_runInitialPhase <- function(simulationName, allDataSets, nFactor, ...) { #generate seeds for parallelization nut <- generateSeeds(chains=length(allDataSets), seed=-1) # run CoGAPS for each set initialResult <- foreach(i=1:length(allDataSets)) %dopar% { # load data set and shift values so gene minimum is zero load(allDataSets[[i]]) sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x)))) # run CoGAPS without any fixed patterns cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="") CoGAPS(sampleD, sampleS, nFactor=nFactor, seed=nut[i], checkpointFile=cptFileName, singleCellRNASeq=TRUE, ...) } return(initialResult) } sc_postInitialPhase <- function(initialResult, nSets, cut, minNS) { nFactor <- ncol(initialResult[[1]]$Amean) BySet <- reOrderBySet(AP=initialResult, nFactor=nFactor, nSets=nSets,match="A") #run postpattern match function if (is.na(cut)) { cut <- nFactor } return(cellMatchR(Atot=BySet$A, nSets=nSets, cnt=cut, minNS=minNS, bySet=TRUE)) } sc_runFinalPhase <- function(simulationName, allDataSets, consensusAs, nCores, ...) { if (length(dim(consensusAs)) != 2) { stop("consensusAs must be a matrix") } # find data files if providing consensus patterns fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="") allDataSets <- list.files(full.names=TRUE, pattern=fileSig) if (is.na(nCores)) { nCores <- length(allDataSets) } registerDoParallel(cores=nCores) # generate seeds for parallelization nut <- generateSeeds(chains=length(allDataSets), seed=-1) # final number of factors nFactorFinal <- ncol(consensusAs) # run fixed CoGAPS finalResult <- foreach(i=1:length(allDataSets)) %dopar% { # load data set and shift values so gene minimum is zero load(allDataSets[[i]]) sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x)))) # run CoGAPS with fixed patterns cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="") CoGAPS(sampleD, sampleS, fixedPatterns=consensusAs, nFactor=nFactorFinal, seed=nut[i], checkpointFile=cptFileName, whichMatrixFixed='A', singleCellRNASeq=TRUE, ...) } return(finalResult) } sc_postFinalPhase <- function(finalResult, consensusAs) { Aresult <- postFixed4Parallel(finalResult, consensusAs, setMatrix="A") finalResult <- list("Pmean"=Aresult$P, "Psd"=Aresult$Psd,"Amean"=consensusAs) class(finalResult) <- append(class(finalResult), "CoGAPS") return(finalResult) }