# driver for the matrix decomposition GAPS <- function(data, unc, outputDir, outputBase="", sep = "\t", isPercentError=FALSE, numPatterns, MaxAtomsA=2^32, alphaA=0.01, MaxAtomsP=2^32, alphaP=0.01, SAIter=1000000000, iter = 500000000, thin=-1, verbose=TRUE, keepChain = FALSE) { # formatting data for use by algorithm if (verbose) { message("Formatting data for GAPS"); } # process data dataInFile = tolower(class(data))=="character" # read in data from a file if (dataInFile) { if (verbose) { message(paste("Reading in data from", data, sep=" ")) } D <- as.matrix(read.table(file = data, sep = sep, row.names=1, header=TRUE)) } else { # data is input in matrix / array form D <- as.matrix(data) } if (numPatterns >= nrow(D) || numPatterns >= ncol(D)) { stop(sprintf("Cannot decompose data into more patterns (%d) than rows (%d) or columns (%d).", numPatterns, nrow(D), ncol(D))) } # process uncertainty sdInFile = tolower(class(unc))=="character" if (sdInFile) { # uncertainty is specified in a file if (isPercentError) { stop(paste("Uncertainty specified in file", unc, "must be specified as additive error.")) } if (verbose) { message(paste("Reading in uncertainty from", unc, sep=" ")) } Sigma <- as.matrix(read.table(file = unc, sep = sep, row.names=1,header=TRUE)) } else { nrunc <- nrow(unc) ncunc <- ncol(unc) if (length(unc)==1) { nrunc <- 1 ncunc <- 1 } # see if input uncertainty is a scalar if ( nrunc == 1 && ncunc == 1 ) { if (verbose) { message("Using single unc value specified in input.") } # compute the error as either fractional or additive if (isPercentError) { Sigma <- pmax(unc*abs(D),unc) } else { Sigma <- matrix(data=unc, nrow=nrow(D), ncol=ncol(D)) } # check that the uncertainty is positive if (any(Sigma <= 0)) { stop(paste("Uncertainty must be positive (Sigma = ", Sigma, ").", sep="")) } } else { if (verbose) { message("Using matrix unc value specified in input.") } # check that dimensions of uncertainty and data match if (nrunc != nrow(D) || ncunc != ncol(D)) { stop(paste("Dimensions of data (", nrow(D), ", ", ncol(D), ") and uncertainty (", nrow(Sigma), ", ", ncol(Sigma), ") do not agree", sep = "")) } # check that the uncertainty is positive if (any(unc <= 0)) { stop(paste("Uncertainty must be positive (unc = ", unc, ").", sep="")) } # ensure that matrix errors additive if (isPercentError) { stop("Uncertainty specified in an array must be additive error.") } Sigma <- as.matrix(unc) } } # make a temporary file to outputBase which contains the row and column names if (outputDir != "" && outputDir != ".") { fullOutputPath <- paste(outputDir,outputBase,sep="/") if (!file.exists(outputDir)) { dir.create(outputDir) } } else { fullOutputPath <- outputBase } if (verbose) { message(paste("Decomposing data of size:", nrow(D), "by", ncol(D), "into", numPatterns, "patterns.", sep=" ")) } # get expected mass of each matrix element based on data if (verbose) { message("Initializing distribution parameters") } # computing the mean mass of atoms in A and P based on the data matrix lambdaA <- alphaA * sqrt(numPatterns) / sqrt(mean(D)) lambdaP <- alphaP * sqrt(numPatterns) / sqrt(mean(D)) # estimate number of outputs based on expected number of matrix elements if (thin <= 0) { if (iter < 10000) { outThin <- 1 } else { outThin <- iter/10000 } } else { outThin <- thin } # assign the unique file identifier to be the number after the decimal if (floor(outThin) - outThin < 1.e-10) { outThin <- outThin + runif(1) } # initialize output arrays, including chain dimensions Amean <- array(0., c(nrow(D), numPatterns)) row.names(Amean) <- row.names(D) colnames(Amean) <- paste("p",1:numPatterns,sep="") Asd <- array(0., c(nrow(D), numPatterns)) row.names(Asd) <- row.names(D) colnames(Amean) <- paste("p",1:numPatterns,sep="") Pmean <- array(0., c(numPatterns, ncol(D))) row.names(Pmean) <- paste("p",1:numPatterns,sep="") colnames(Pmean) <- colnames(D) Psd <- array(0., c(numPatterns, ncol(D))) row.names(Psd) <- paste("p",1:numPatterns,sep="") colnames(Psd) <- colnames(D) meanChi2 <- 0. meanMock <- array(0., c(nrow(D),ncol(D))) if (verbose) { message(paste("Running GAPS.")) message(paste(SAIter, "burn-in steps;", iter, "steps.", sep=" ")) } # run the mcmc mdout <- RunGAPS(D, Sigma, numPatterns, MaxAtomsA, alphaA, lambdaA, MaxAtomsP, alphaP, lambdaP, SAIter, iter, outThin) if (verbose) { message(paste("Normalizing results.")) } # obtain the mean and standard deviation of A and P from the chain files FID <- sprintf('%.10f', outThin - floor(outThin)) AFile <- paste("AResults", FID, ".ChainValues.txt", sep="") PFile <- paste("PResults", FID, ".ChainValues.txt", sep="") normmd <- AtomChain2APNorm(Afile = AFile, Pfile = PFile, nGene = nrow(D), nPattern = numPatterns, nSample = ncol(D)) # extract chain data and store in above arrays Amean[,] <- normmd$Amean Asd[,] <- normmd$Asd Pmean[,] <- normmd$Pmean Psd[,] <- normmd$Psd write.table(Amean[,], file=paste(fullOutputPath,"Amean.",FID,".txt",sep=""), sep="\t") write.table(Asd[,], file=paste(fullOutputPath,"Asd.",FID,".txt",sep=""), sep="\t") write.table(Pmean[,], file=paste(fullOutputPath,"Pmean.",FID,".txt",sep=""), sep="\t") write.table(Psd[,], file=paste(fullOutputPath,"Psd.",FID,".txt",sep=""), sep="\t") if (verbose) { message(paste("Computing chi2")) } atmp <- as.matrix(Amean[,]) ptmp <- as.matrix(Pmean[,]) mtmp <- atmp %*% ptmp meanMock[,] = array(mtmp, c(nrow(D), ncol(D), 1)) meanChi2 <- sum((D-meanMock[,])^2 / Sigma^2) # process the diagnostic file and add appropriate means ADiagFile <- paste("AResults",FID,".Diagnostics.txt",sep="") PDiagFile <- paste("PResults",FID,".Diagnostics.txt",sep="") ADiag <- read.table(ADiagFile, sep="\t", header=T, skip=10) PDiag <- read.table(PDiagFile, sep="\t", header=T, skip=10) chainKeepA <- which(ADiag[,2]==0) meanOfChi2A <- mean(ADiag[chainKeepA,4]) meanAtomsA <- mean(ADiag[chainKeepA,5]) chi2DiagA <- paste("", meanAtomsA, ":", meanOfChi2A, "; chi^2 of mean:", meanChi2, sep=" ") cat("\n\nSummary\n", file=ADiagFile, append=TRUE) cat(chi2DiagA, file=ADiagFile, append=TRUE) cat("\n", file=ADiagFile, append=TRUE) chainKeepP <- which(PDiag[,2]==0) meanOfChi2P <- mean(PDiag[chainKeepP,4]) meanAtomsP <- mean(PDiag[chainKeepP,5]) chi2DiagP <- paste("", meanAtomsP, ":", meanOfChi2P, "; chi^2 of mean:", meanChi2, sep=" ") cat("\n\nSummary\n", file=PDiagFile, append=TRUE) cat(chi2DiagP, file=PDiagFile, append=TRUE) cat("\n", file=PDiagFile, append=TRUE) if (fullOutputPath != "") { file.rename(ADiagFile, paste(fullOutputPath, basename(ADiagFile), sep="")) file.rename(PDiagFile, paste(fullOutputPath, basename(PDiagFile), sep="")) } # delete chain files or move to appropriate directory if (!keepChain) { message("Deleting chain files") file.remove(AFile) file.remove(PFile) } else { message(paste('Moving chain files to ', outputDir, sep=' ')) if (fullOutputPath != "") { file.rename(AFile, paste(fullOutputPath, basename(AFile),sep="")) file.rename(PFile, paste(fullOutputPath, basename(PFile),sep="")) } } if (verbose) { message("Completed simulation.\n") message("Return list contains chi^2 of solution and normalized, mean and standard deviation of A and P matrices.\n") } # output list of data for each chain return(list(meanChi2=meanChi2, D=D, Sigma=Sigma, Amean=Amean, Asd=Asd, Pmean=Pmean, Psd=Psd, meanMock=meanMock)) }