902afe0d |
# driver for the matrix decomposition
GAPS <- function(data, unc,
outputDir, outputBase="",
sep = "\t", isPercentError=FALSE,
numPatterns,
MaxAtomsA=2^32, alphaA=0.01,
MaxAtomsP=2^32, alphaP=0.01,
SAIter=1000000000, iter = 500000000, thin=-1,
verbose=TRUE, keepChain = FALSE) {
# formatting data for use by algorithm
if (verbose) {
message("Formatting data for GAPS");
}
# process data
dataInFile = tolower(class(data))=="character"
# read in data from a file
if (dataInFile) {
if (verbose) {
message(paste("Reading in data from", data, sep=" "))
}
D <- as.matrix(read.table(file = data, sep = sep,
row.names=1, header=TRUE))
} else {
# data is input in matrix / array form
D <- as.matrix(data)
}
if (numPatterns >= nrow(D) || numPatterns >= ncol(D)) {
stop(sprintf("Cannot decompose data into more patterns (%d) than rows (%d) or columns (%d).", numPatterns, nrow(D), ncol(D)))
}
# process uncertainty
sdInFile = tolower(class(unc))=="character"
if (sdInFile) {
# uncertainty is specified in a file
if (isPercentError) {
stop(paste("Uncertainty specified in file", unc,
"must be specified as additive error."))
}
if (verbose) {
message(paste("Reading in uncertainty from", unc, sep=" "))
}
Sigma <- as.matrix(read.table(file = unc, sep = sep,
row.names=1,header=TRUE))
} else {
nrunc <- nrow(unc)
ncunc <- ncol(unc)
if (length(unc)==1) {
nrunc <- 1
ncunc <- 1
}
# see if input uncertainty is a scalar
if ( nrunc == 1 && ncunc == 1 ) {
if (verbose) {
message("Using single unc value specified in input.")
}
# compute the error as either fractional or additive
if (isPercentError) {
Sigma <- pmax(unc*abs(D),unc)
} else {
Sigma <- matrix(data=unc, nrow=nrow(D), ncol=ncol(D))
}
# check that the uncertainty is positive
if (any(Sigma <= 0)) {
stop(paste("Uncertainty must be positive (Sigma = ", Sigma, ").", sep=""))
}
} else {
if (verbose) {
message("Using matrix unc value specified in input.")
}
# check that dimensions of uncertainty and data match
if (nrunc != nrow(D) || ncunc != ncol(D)) {
stop(paste("Dimensions of data (", nrow(D), ", ", ncol(D),
") and uncertainty (", nrow(Sigma), ", ", ncol(Sigma), ") do not agree",
sep = ""))
}
# check that the uncertainty is positive
if (any(unc <= 0)) {
stop(paste("Uncertainty must be positive (unc = ", unc, ").", sep=""))
}
# ensure that matrix errors additive
if (isPercentError) {
stop("Uncertainty specified in an array must be additive error.")
}
Sigma <- as.matrix(unc)
}
}
# make a temporary file to outputBase which contains the row and column names
if (outputDir != "" && outputDir != ".") {
fullOutputPath <- paste(outputDir,outputBase,sep="/")
if (!file.exists(outputDir)) {
dir.create(outputDir)
}
} else {
fullOutputPath <- outputBase
}
if (verbose) {
message(paste("Decomposing data of size:",
nrow(D), "by", ncol(D),
"into", numPatterns, "patterns.", sep=" "))
}
# get expected mass of each matrix element based on data
if (verbose) {
message("Initializing distribution parameters")
}
# computing the mean mass of atoms in A and P based on the data matrix
lambdaA <- alphaA * sqrt(numPatterns) / sqrt(mean(D))
lambdaP <- alphaP * sqrt(numPatterns) / sqrt(mean(D))
# estimate number of outputs based on expected number of matrix elements
if (thin <= 0) {
if (iter < 10000) {
outThin <- 1
} else {
outThin <- iter/10000
}
} else {
outThin <- thin
}
# assign the unique file identifier to be the number after the decimal
if (floor(outThin) - outThin < 1.e-10) {
outThin <- outThin + runif(1)
}
# initialize output arrays, including chain dimensions
Amean <- array(0., c(nrow(D), numPatterns))
row.names(Amean) <- row.names(D)
colnames(Amean) <- paste("p",1:numPatterns,sep="")
Asd <- array(0., c(nrow(D), numPatterns))
row.names(Asd) <- row.names(D)
colnames(Amean) <- paste("p",1:numPatterns,sep="")
Pmean <- array(0., c(numPatterns, ncol(D)))
row.names(Pmean) <- paste("p",1:numPatterns,sep="")
colnames(Pmean) <- colnames(D)
Psd <- array(0., c(numPatterns, ncol(D)))
row.names(Psd) <- paste("p",1:numPatterns,sep="")
colnames(Psd) <- colnames(D)
meanChi2 <- 0.
meanMock <- array(0., c(nrow(D),ncol(D)))
if (verbose) {
message(paste("Running GAPS."))
message(paste(SAIter, "burn-in steps;", iter, "steps.", sep=" "))
}
# run the mcmc
mdout <- RunGAPS(D, Sigma, numPatterns,
MaxAtomsA, alphaA, lambdaA,
MaxAtomsP, alphaP, lambdaP,
SAIter, iter, outThin)
if (verbose) {
message(paste("Normalizing results."))
}
# obtain the mean and standard deviation of A and P from the chain files
FID <- sprintf('%.10f', outThin - floor(outThin))
AFile <- paste("AResults", FID, ".ChainValues.txt", sep="")
PFile <- paste("PResults", FID, ".ChainValues.txt", sep="")
normmd <- AtomChain2APNorm(Afile = AFile,
Pfile = PFile,
nGene = nrow(D),
nPattern = numPatterns,
nSample = ncol(D))
# extract chain data and store in above arrays
Amean[,] <- normmd$Amean
Asd[,] <- normmd$Asd
Pmean[,] <- normmd$Pmean
Psd[,] <- normmd$Psd
write.table(Amean[,],
file=paste(fullOutputPath,"Amean.",FID,".txt",sep=""),
sep="\t")
write.table(Asd[,],
file=paste(fullOutputPath,"Asd.",FID,".txt",sep=""),
sep="\t")
write.table(Pmean[,],
file=paste(fullOutputPath,"Pmean.",FID,".txt",sep=""),
sep="\t")
write.table(Psd[,],
file=paste(fullOutputPath,"Psd.",FID,".txt",sep=""),
sep="\t")
if (verbose) {
message(paste("Computing chi2"))
}
atmp <- as.matrix(Amean[,])
ptmp <- as.matrix(Pmean[,])
mtmp <- atmp %*% ptmp
meanMock[,] = array(mtmp, c(nrow(D), ncol(D), 1))
meanChi2 <- sum((D-meanMock[,])^2 / Sigma^2)
# process the diagnostic file and add appropriate means
ADiagFile <- paste("AResults",FID,".Diagnostics.txt",sep="")
PDiagFile <- paste("PResults",FID,".Diagnostics.txt",sep="")
ADiag <- read.table(ADiagFile, sep="\t", header=T, skip=10)
PDiag <- read.table(PDiagFile, sep="\t", header=T, skip=10)
chainKeepA <- which(ADiag[,2]==0)
meanOfChi2A <- mean(ADiag[chainKeepA,4])
meanAtomsA <- mean(ADiag[chainKeepA,5])
chi2DiagA <- paste("<number of atoms>", meanAtomsA,
"<chi^2>:", meanOfChi2A,
"; chi^2 of mean:", meanChi2, sep=" ")
cat("\n\nSummary\n", file=ADiagFile, append=TRUE)
cat(chi2DiagA, file=ADiagFile, append=TRUE)
cat("\n", file=ADiagFile, append=TRUE)
chainKeepP <- which(PDiag[,2]==0)
meanOfChi2P <- mean(PDiag[chainKeepP,4])
meanAtomsP <- mean(PDiag[chainKeepP,5])
chi2DiagP <- paste("<number of atoms>", meanAtomsP,
"<chi^2>:", meanOfChi2P,
"; chi^2 of mean:", meanChi2, sep=" ")
cat("\n\nSummary\n", file=PDiagFile, append=TRUE)
cat(chi2DiagP, file=PDiagFile, append=TRUE)
cat("\n", file=PDiagFile, append=TRUE)
if (fullOutputPath != "") {
file.rename(ADiagFile, paste(fullOutputPath, basename(ADiagFile), sep=""))
file.rename(PDiagFile, paste(fullOutputPath, basename(PDiagFile), sep=""))
}
# delete chain files or move to appropriate directory
if (!keepChain) {
message("Deleting chain files")
file.remove(AFile)
file.remove(PFile)
} else {
message(paste('Moving chain files to ', outputDir, sep=' '))
if (fullOutputPath != "") {
file.rename(AFile, paste(fullOutputPath, basename(AFile),sep=""))
file.rename(PFile, paste(fullOutputPath, basename(PFile),sep=""))
}
}
if (verbose) {
message("Completed simulation.\n")
message("Return list contains chi^2 of solution and normalized, mean and standard deviation of A and P matrices.\n")
}
# output list of data for each chain
return(list(meanChi2=meanChi2,
D=D, Sigma=Sigma,
Amean=Amean, Asd=Asd,
Pmean=Pmean, Psd=Psd,
meanMock=meanMock))
}
|