Browse code

clean up CoGAPS interface

sherman5 authored on 23/01/2018 16:29:50
Showing 32 changed files

... ...
@@ -1,92 +1,112 @@
1
-# CoGAPS: function to call gapsRun and calculate gene set
2
-#         statistics on result, also to produce A and P plots
3
-# History: v1.0 - EJF original CoGAPS, MFO modified to to match
4
-#                 new variable names
5
-
6
-# Inputs: data - data matrix
7
-#         unc - uncertainty matrix (std devs for chi-squared of Log Likelihood)
8
-#         GStoGenes - data.frame or list of gene sets
9
-#         nFactor - number of patterns (basis vectors, metagenes)
10
-#         nEquil - number of iterations for burn-in
11
-#         nSample - number of iterations for sampling
12
-#         nOutR - how often to print status into R by iterations
13
-#         output_atomic - whether to write atom files (large)
14
-#         simulation_id - name to attach to atoms files if created
15
-#         plot - logical to determine if plots produced
16
-#         nPerm - number of permutations for gene set test
17
-#         alphaA, alphaP - sparsity parameters for A and P domains
18
-#         max_gibbmass_paraA(P) - limit truncated normal to max size
19
-#         nMaxA, nMaxP - PRESENTLY UNUSED, future = limit number of atoms
20
-
1
+#' CoGAPS Matrix Factorization Algorithm
2
+#' 
3
+#' @details calls the C++ MCMC code and performs Bayesian
4
+#'matrix factorization returning the two matrices that reconstruct
5
+#'the data matrix
6
+#' @param D data matrix
7
+#' @param S uncertainty matrix (std devs for chi-squared of Log Likelihood)
8
+#' @param params GapsParams object 
9
+#' @return list with A and P matrix estimates
10
+#' @export
11
+CoGaps <- function(D, S, params = new('GapsParams', 7, 1000, 1000), ...)
12
+{
13
+    # process v2 style arguments for backwards compatibility
14
+    params <- oldParams(params, list(...))
21 15
 
22
-# Output: list with matrices, mean chi-squared value, and gene set
23
-#         results
16
+    # check data
17
+    checkParamsWithData(params, nrow(D), ncol(D), nrow(S), ncol(S))
18
+    if (any(D) < 0 | any(S) < 0) # too slow for large matrices ?
19
+        stop('D and S matrix must be non-negative')
24 20
 
25
-#'\code{CoGAPS} calls the C++ MCMC code through gapsRun and performs Bayesian
26
-#'matrix factorization returning the two matrices that reconstruct
27
-#'the data matrix and then calls calcCoGAPSStat to estimate gene set
28
-#'activity with nPerm set to 500
29
-#'
30
-#'@param data data matrix
31
-#'@param unc uncertainty matrix (std devs for chi-squared of Log Likelihood)
32
-#'@param ABins a matrix of same size as A which gives relative
33
-#' probability of that element being non-zero
34
-#'@param PBins a matrix of same size as P which gives relative
35
-#' probability of that element being non-zero
36
-#'@param GStoGenes data.frame or list with gene sets
37
-#'@param nFactor number of patterns (basis vectors, metagenes)
38
-#'@param simulation_id name to attach to atoms files if created
39
-#'@param nEquil number of iterations for burn-in
40
-#'@param nSample number of iterations for sampling
41
-#'@param nOutR how often to print status into R by iterations
42
-#'@param output_atomic whether to write atom files (large)
43
-#'@param fixedBinProbs Boolean for using relative probabilities
44
-#' given in Abins and Pbins
45
-#'@param fixedDomain character to indicate whether A or P is
46
-#' domain for relative probabilities
47
-#'@param sampleSnapshots Boolean to indicate whether to capture
48
-#' individual samples from Markov chain during sampling
49
-#'@param numSnapshots the number of individual samples to capture
50
-#'@param plot Boolean to indicate whether to produce output graphics
51
-#'@param nPerm number of permutations in gene set test
52
-#'@param alphaA sparsity parameter for A domain
53
-#'@param nMaxA PRESENTLY UNUSED, future = limit number of atoms
54
-#'@param alphaP sparsity parameter for P domain
55
-#'@param max_gibbmass_paraA limit truncated normal to max size
56
-#'@param nMaxP PRESENTLY UNUSED, future = limit number of atoms
57
-#'@param max_gibbmass_paraP limit truncated normal to max size
58
-#'@export
21
+    # run algorithm
22
+    result <- cogaps_cpp(as.matrix(D), as.matrix(S), nFactor, alphaA,
23
+        alphaP, nEquil, floor(nEquil/10), nSample, max_gibbmass_paraA,
24
+        max_gibbmass_paraP, fixedPatterns, whichMatrixFixed, seed, messages,
25
+        singleCellRNASeq, nOutR, numSnapshots, checkpoint_interval)
59 26
 
60
-CoGAPS <- function(data, unc, ABins = data.frame(), PBins = data.frame(), GStoGenes, nFactor = 7, simulation_id="simulation", nEquil=1000,
61
-                nSample=1000, nOutR=1000, output_atomic=FALSE,
62
-                fixedBinProbs = FALSE, fixedDomain = "N",
63
-                sampleSnapshots = TRUE, numSnapshots = 100,
64
-                plot=TRUE, nPerm=500,
65
-                alphaA = 0.01,  nMaxA = 100000,
66
-                max_gibbmass_paraA = 100.0,
67
-                alphaP = 0.01, nMaxP = 100000,
68
-                max_gibbmass_paraP = 100.0) {
27
+    # backwards compatible with v2
28
+    if (length(list(...)$GStoGenes))
29
+    {
30
+        warning('the GStoGenes argument is depracted with v3.0,')
31
+        if (list(...)$plot)
32
+            plotGAPS(result$Amean, result$Pmean)
33
+        GSP <- calcCoGAPSStat(result$Amean, result$Asd, list(...)$GStoGenes,
34
+            list(...)$nPerm)
35
+        result <- list(meanChi2=result$meanChi2, D=D, Sigma=S,
36
+            Amean=result$Amean, Asd=result$Asd, Pmean=result$Pmean,
37
+            Psd=result$Psd, GSUpreg=GSP$GSUpreg, GSDownreg=GSP$GSDownreg,
38
+            GSActEst=GSP$GSActEst)
39
+    }
69 40
 
41
+    return(result)
42
+}
70 43
 
71
-  # decompose the data
72
-  matrixDecomp <- gapsRun(data, unc, ABins, PBins, nFactor, simulation_id,
73
-                    nEquil, nSample, nOutR, output_atomic, fixedBinProbs,
74
-                    fixedDomain, sampleSnapshots, numSnapshots, alphaA,  nMaxA, max_gibbmass_paraA,
75
-                    alphaP, nMaxP, max_gibbmass_paraP)
44
+#' Restart CoGAPS from Checkpoint File
45
+#'
46
+#' @details loads the state of a previous CoGAPS run from a file and
47
+#'  continues the run from that point
48
+#' @param D data matrix
49
+#' @param S uncertainty matrix
50
+#' @param path path to checkpoint file
51
+#' @return list with A and P matrix estimates
52
+#' @export
53
+CoGapsFromCheckpoint <- function(D, S, path)
54
+{
55
+    # TODO add checksum to make sure D,S are correct
56
+    cogapsFromCheckpoint_cpp(D, S, path)
57
+}
76 58
 
77
-  # plot patterns and show heatmap of Anorm
78
-  if (plot) {
79
-    plotGAPS(matrixDecomp$Amean, matrixDecomp$Pmean)
80
-  }
59
+#' Backwards Compatibility with v2
60
+#'
61
+#' @param D data matrix
62
+#' @param S uncertainty matrix
63
+#' @param ... v2 style parameters
64
+#' @return list with A and P matrix estimates
65
+#' @export
66
+gapsRun <- function(D, S, ...)
67
+{
68
+    warning('gapsRun is depracated with v3.0, use CoGAPS')
69
+    params <- new('GapsParams', 7, 1000, 1000)
70
+    params <- oldParams(params, list(...))
71
+    CoGAPS(D, S, params)
72
+}
81 73
 
82
-  # compute the gene set scores
83
-  GSP <- calcCoGAPSStat(matrixDecomp$Amean, matrixDecomp$Asd, GStoGenes, nPerm)
74
+#' Backwards Compatibility with v2
75
+#'
76
+#' @param D data matrix
77
+#' @param S uncertainty matrix
78
+#' @param FP data.frame with rows giving fixed patterns for P
79
+#' @param ... v2 style parameters
80
+#' @return list with A and P matrix estimates
81
+#' @export
82
+gapsMapRun <- function(D, S, FP, ...)
83
+{
84
+    warning('gapsMapRun is depracated with v3.0, use CoGaps')
85
+    params <- new('GapsParams', 7, 1000, 1000)
86
+    params <- oldParams(params, list(...))
87
+    CoGAPS(D, S, params)
88
+}
84 89
 
85
-  return(list(meanChi2=matrixDecomp$calcChiSq,
86
-              D=data, Sigma=unc,
87
-              Amean=matrixDecomp$Amean, Asd=matrixDecomp$Asd,
88
-              Pmean=matrixDecomp$Pmean, Psd=matrixDecomp$Psd,
89
-              GSUpreg=GSP$GSUpreg, GSDownreg=GSP$GSDownreg, GSActEst=GSP$GSActEst))
90
+# helper function to process v2 parameters
91
+oldParams <- function(params, args)
92
+{
93
+    # standard params
94
+    if (length(args$nFactor))      params@nFactor    <- args$nFactor
95
+    if (length(args$nEquil))       params@nEquil     <- args$nEquil
96
+    if (length(args$nSample))      params@nSample    <- args$nSample
97
+    if (length(args$numSnapshots)) params@nSnapshots <- args$numSnapshots
98
+    if (length(args$alphaA))       params@alphaA     <- args$alphaA
99
+    if (length(args$alphaP))       params@alphaP     <- args$alphaP
100
+    if (length(args$seed))         params@seed       <- args$seed
101
+    if (length(args$messages))     params@messages   <- args$messages
90 102
 
103
+    # gapsMap params
104
+    if (length(args$FP))
105
+    {
106
+        params@fixedPatterns <- as.matrix(args$FP)
107
+        params@whichMatrixFixed <- 'P'
108
+    }
91 109
 
110
+    # return v3 style GapsParams
111
+    return(params)    
92 112
 }
... ...
@@ -1,90 +1,214 @@
1
-#' GWCoGAPS
1
+#' Genome Wide CoGAPS
2 2
 #'
3
-#'\code{GWCoGAPS} calls the C++ MCMC code and performs Bayesian
4
-#'matrix factorization returning the two matrices that reconstruct
5
-#'the data matrix for whole genome data;
6
-#'
7
-#'@param D data matrix
8
-#'@param S uncertainty matrix (std devs for chi-squared of Log Likelihood)
9
-#'@param nFactor number of patterns (basis vectors, metagenes), which must be
10
-#'greater than or equal to the number of rows of FP
11
-#'@param nSets number of sets for parallelization
12
-#'@param nCores number of cores for parallelization. If left to the default NA, nCores = nSets.
13
-#'@param saveBySetResults logical indicating whether to save by intermediary by set results. Default is FALSE.
14
-#'@param fname character string used to label file output. Default is "GWCoGAPS.AP.fixed"
15
-#'@param PatternsMatchFN function to use for pattern matching across sets
16
-#'@param Cut number of branches at which to cut dendrogram used in patternMatch4Parallel
17
-#'@param minNS minimum of individual set contributions a cluster must contain
18
-#'@param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}
19
-#'@export
20
-#'@seealso \code{\link{gapsRun}}, \code{\link{patternMatch4Parallel}}, and \code{\link{gapsMapRun}}
3
+#' @details calls the C++ MCMC code and performs Bayesian
4
+#'  matrix factorization returning the two matrices that reconstruct
5
+#'  the data matrix for whole genome data;
6
+#' @param D data matrix
7
+#' @param S uncertainty matrix (std devs for chi-squared of Log Likelihood)
8
+#' @param params GapsParams object 
9
+#' @param nSets number of sets for parallelization
10
+#' @param nCores number of cores for parallelization. If left to the default NA, nCores = nSets.
11
+#' @param saveBySetResults logical indicating whether to save by intermediary by set results. Default is FALSE.
12
+#' @param fname character string used to label file output. Default is "GWCoGAPS.AP.fixed"
13
+#' @param PatternsMatchFN function to use for pattern matching across sets
14
+#' @param Cut number of branches at which to cut dendrogram used in patternMatch4Parallel
15
+#' @param minNS minimum of individual set contributions a cluster must contain
16
+#' @param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}
17
+#' @export
18
+#' @seealso \code{\link{gapsRun}}, \code{\link{patternMatch4Parallel}}, and \code{\link{gapsMapRun}}
21 19
 #' @examples \dontrun{
22 20
 #' GWCoGAPS(nCores=NA, D, S, nFactor, nSets,saveBySetResults=TRUE, fname=fname,
23 21
 #' PatternsMatchFN = patternMatch4Parallel,numSnapshots=numSnapshots,minNS=minNS)
24 22
 #' }
25 23
 #'
24
+GWCoGAPS <- function(D, S, params, nSets, nCores=NA, saveBySetResults=FALSE,
25
+fname="GWCoGAPS.AP.fixed", PatternsMatchFN = patternMatch4Parallel, Cut=NA,
26
+minNS=NA, ...)
27
+{
28
+    # v2 style parameters compatibility
29
+    params <- oldParams(params, ...)
30
+
31
+    # establish the number of cores that you are able to use
32
+    if(is.na(nCores))
33
+        nCores <- nSets
34
+    registerDoParallel(cores=nCores)
35
+
36
+    # break the data into sets
37
+    genesInSets <- createGWCoGAPSSets(data=D, nSets=nSets, keep=FALSE)
38
+
39
+    # set gene min to 0
40
+    D <- sweep(D, 1, apply(D,1,function(x) pmin(0,min(x))))
41
+
42
+    #generate seeds for parallelization
43
+    nut <- generateSeeds(chains=nSets, seed=-1)
44
+
45
+    # run CoGAPS for each set
46
+    AP <- foreach(i=1:nSets) %dopar%
47
+    {
48
+        D <- as.matrix(D[genesInSets[[i]],])
49
+        S <- as.matrix(S[genesInSets[[i]],])
50
+        params <- setParam(params, 'seed', nut[i])
51
+        CoGAPS(D=D, S=S, params)
52
+    }
26 53
 
27
-GWCoGAPS <- function(D, S, nFactor, nSets, nCores=NA,
28
-                                 saveBySetResults=FALSE, fname="GWCoGAPS.AP.fixed",
29
-                                 PatternsMatchFN = patternMatch4Parallel,
30
-                                 Cut=NA, minNS=NA, ...) {
54
+    BySet <- reOrderBySet(AP=AP, nFactor=nFactor, nSets=nSets)
31 55
 
32
-# establish the number of cores that you are able to use
33
-if(is.na(nCores)){nCores<-nSets}
34
-registerDoParallel(cores=nCores)
56
+    #run postpattern match function
57
+    if (is.na(Cut))
58
+        Cut <- nFactor
59
+    matchedPs <- patternMatch4Parallel(Ptot=BySet$P, nP=nFactor, nSets=nSets,
60
+        cnt=Cut, minNS=minNS, bySet=TRUE)
35 61
 
36
-# break the data into sets
37
-genesInSets<-createGWCoGAPSSets(data=D, nSets=nSets, keep=FALSE)
62
+    #save BySet outputs
63
+    class(AP) <- append(class(AP),"CoGAPS")
64
+    if (saveBySetResults==TRUE)
65
+    {
66
+        save(AP,BySet,matchedPs,file=sprintf('APbySet.%s.nP%d.set%d.Rda',fname, nFactor,nSets))
67
+        message(sprintf('APbySet.%s.nP%d.set%d.Rda',fname, nFactor,nSets))
68
+    }
38 69
 
39
-# set gene min to 0
40
-D <- sweep(D, 1, apply(D,1,function(x){pmin(0,min(x))}))
70
+    PbySet <- matchedPs[["PBySet"]]
71
+    matchedPs <- matchedPs[[1]]
72
+
73
+    # generate seeds for parallelization
74
+    nut <- generateSeeds(chains=nSets, seed=-1)
75
+
76
+    # final number of factors
77
+    nFactorFinal <- dim(matchedPs)[1]
78
+
79
+    # run fixed CoGAPS
80
+    params <- setParam('fixedMatrix', as.matrix(matchedPs))
81
+    params <- setParam('whichMatrixFixed', 'P')
82
+    params <- setParam('nFactor', nFactorFinal)
83
+    Fixed <- foreach(i=1:nSets) %dopar%
84
+    {
85
+    	D <- as.matrix(D[genesInSets[[i]],])
86
+    	S <- as.matrix(S[genesInSets[[i]],])
87
+        params <- setParam(params, 'seed', nut[i])
88
+        CoGAPS(D=D, S=S, params)
89
+    }
41 90
 
42
-#generate seeds for parallelization
43
-nut<-generateSeeds(chains=nSets, seed=-1)
91
+    #extract A and Asds
92
+    As4fixPs <- postFixed4Parallel(AP.fixed=Fixed, setPs=matchedPs)
44 93
 
45
-# run CoGAPS for each set
46
-AP <- foreach(i=1:nSets) %dopar% {
47
-     D <- as.matrix(D[genesInSets[[i]],])
48
-     S <- as.matrix(S[genesInSets[[i]],])
49
-     gapsRun(D=D, S=S, nFactor=nFactor,seed=nut[i],...)
94
+    #save final
95
+    AP.fixed <- list("Amean"=As4fixPs$A, "Asd"=As4fixPs$Asd, "Pmean"=matchedPs,
96
+        "PbySet"=PbySet)
97
+    class(AP.fixed) <- append(class(AP.fixed),"CoGAPS")
98
+    save(AP.fixed, file=paste(fname, ".Rda", sep=""))
99
+    message(paste(fname, ".Rda", sep=""))
50 100
 }
51 101
 
52
-BySet<-reOrderBySet(AP=AP,nFactor=nFactor,nSets=nSets)
53
-
54
-#run postpattern match function
55
-if(is.na(Cut)){Cut<-nFactor}
56
-matchedPs<-patternMatch4Parallel(Ptot=BySet$P,nP=nFactor,nSets=nSets,cnt=Cut,minNS=minNS,bySet=TRUE)
102
+#' Create Gene Sets for GWCoGAPS
103
+#'
104
+#'\code{createGWCoGAPSSets} factors whole genome data into randomly generated sets for indexing;
105
+#'
106
+#'@param data data matrix with unique rownames
107
+#'@param nSets number of sets for parallelization
108
+#'@param outRDA name of output file
109
+#'@param keep logical indicating whether or not to save gene set list. Default is TRUE.
110
+#'@export
111
+#'@return list with randomly generated sets of genes from whole genome data
112
+#'@examples \dontrun{
113
+#'createGWCoGAPSSet(D,nSets=nSets)
114
+#'}
115
+createGWCoGAPSSets <- function(data, nSets, outRDA="GenesInCoGAPSSets.Rda",
116
+keep=TRUE)
117
+{
118
+    genes <- rownames(data)
119
+    setSize <- floor(length(genes) / nSets)
120
+    genesInSets <- list()
121
+    for (set in 1:nSets)
122
+    {
123
+        if (set != nSets)
124
+            genesInSets[[set]] <- sample(genes,setSize)
125
+        if (set == nSets)
126
+            genesInSets[[set]] <- genes
127
+        genes <- genes[!genes %in% genesInSets[[set]]]
128
+    }
57 129
 
58
-#save BySet outputs
59
-class(AP)<-append(class(AP),"CoGAPS")
60
-if(saveBySetResults==TRUE){
61
-    save(AP,BySet,matchedPs,file=sprintf('APbySet.%s.nP%d.set%d.Rda',fname, nFactor,nSets))
62
-    print(sprintf('APbySet.%s.nP%d.set%d.Rda',fname, nFactor,nSets))
130
+    if (!identical(sort(unlist(genesInSets)),sort(rownames(data))))
131
+        warning("Gene identifiers not unique!")
132
+    if (keep == TRUE)
133
+        save(list=c('genesInSets'), file=outRDA)
134
+    return(genesInSets)
63 135
 }
64 136
 
65
-PbySet<-matchedPs[["PBySet"]]
66
-matchedPs<-matchedPs[[1]]
67
-
68
-#generate seeds for parallelization
69
-nut<-generateSeeds(chains=nSets, seed=-1)
70
-
71
-#final number of factors
72
-nFactorFinal<-dim(matchedPs)[1]
73
-
74
-# run fixed CoGAPS
75
-Fixed <- foreach(i=1:nSets) %dopar% {
76
-	D <- as.matrix(D[genesInSets[[i]],])
77
-	S <- as.matrix(S[genesInSets[[i]],])
78
-	AP <- gapsMapRun(D, S, FP=matchedPs, nFactor=nFactorFinal, fixedMatrix = "P",seed=nut[i],...)
137
+#' Generate Seeds for Multiple Concurrent Runs
138
+#'
139
+#' @param chains number of seeds to generate (number of chains to run)
140
+#' @param seed positive values are kept, negative values will be overwritten
141
+#'  by a seed generated from the current time
142
+#' @return vector of randomly generated seeds
143
+#' @export
144
+#' @examples
145
+#' generateSeeds(chains=2, seed=-1)
146
+generateSeeds <- function(chains=2, seed=-1)
147
+{
148
+    if (chains < 2 | (as.integer(chains) != chains))
149
+        stop("chains must be >= 2 and an integer")
150
+
151
+    if (seed < 0)
152
+    {
153
+        secs <- as.numeric(difftime(Sys.time(), paste(Sys.Date(), "00:00"),
154
+            units="secs"))
155
+        return(seq_len(chains) * round(secs))
79 156
     }
157
+    else
158
+    {
159
+        return(seq_len(chains) * seed)
160
+    }
161
+}
80 162
 
81
-#extract A and Asds
82
-As4fixPs <- postFixed4Parallel(AP.fixed=Fixed,setPs=matchedPs)
83
-
163
+#' reOrderBySet
164
+#'
165
+#' @description <restructures output of gapsRun into a list containing each sets solution for Amean, Pmean, and Asd>
166
+#' @param AP output of gapsRun in parallel
167
+#' @param nFactor number of patterns
168
+#' @param nSets number of sets
169
+#'
170
+#' @return a list containing the \code{nSets} sets solution for Amean under "A", Pmean under "P", and Asd under "Asd"
171
+#' @export
172
+#'
173
+#' @examples \dontrun{
174
+#' reOrderBySet(AP,nFactor,nSets)
175
+#' }
176
+#'
177
+reOrderBySet<-function(AP, nFactor, nSets)
178
+{
179
+    P <- do.call(rbind,lapply(AP, function(x) x$Pmean))
180
+    rownames(P) <- paste(rep(1:nSets,each=nFactor),rep(1:nFactor,nSets),sep=".")
181
+    A <- lapply(AP, function(x) x$Amean)
182
+    Asd <- lapply(AP, function(x) x$Asd)
183
+    names(A) <- names(Asd) <- paste(rep("Set", nSets), rep(1:nSets), sep="")
184
+    return(list("A"=A, "Asd"=Asd, "P"=P))
185
+}
84 186
 
85
-#save final
86
-AP.fixed<-list("Amean"=As4fixPs$A, "Asd"=As4fixPs$Asd, "Pmean"=matchedPs,"PbySet"=PbySet)
87
-class(AP.fixed)<-append(class(AP.fixed),"CoGAPS")
88
-save(AP.fixed, file=paste(fname,".Rda",sep=""))
89
-print(paste(fname,".Rda",sep=""))
187
+#' postFixed4Parallel
188
+#'
189
+#' @param AP.fixed output of parallel gapsMapRun calls with same FP
190
+#' @param setPs data.frame with rows giving fixed patterns for P used as input for gapsMapRun
191
+#'
192
+#' @return list of two data.frames containing the A matrix estimates or their corresponding standard deviations
193
+#' from output of parallel gapsMapRun
194
+#' @export
195
+postFixed4Parallel <- function(AP.fixed=NA, setPs=NA)
196
+{
197
+    ASummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Amean))
198
+    Asd <- do.call(rbind,lapply(AP.fixed, function(x) x$Asd))
199
+    #PSummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Pmean))
200
+    PSummary <- AP.fixed[[1]]$Pmean
201
+
202
+    Pmax <- apply(PSummary,1,max)
203
+    Pneu <- sweep(PSummary,1,Pmax,FUN="/")
204
+    Aneu <- sweep(ASummary,2,Pmax,FUN="*")
205
+
206
+    X <- apply(Pneu,1,range)
207
+    Y <- apply(setPs,1,range)
208
+    colnames(X) <- colnames(Y)
209
+    if (!all.equal(X, Y, tolerance=1e-5))
210
+        warning("Patterns do not match fixed values.")
211
+
212
+    As4fixPs <- list("A"=Aneu, "Asd"=Asd)
213
+    return(As4fixPs)
90 214
 }
91 215
deleted file mode 100644
... ...
@@ -1,28 +0,0 @@
1
-#Binary Heatmap for Standardized A Matrix
2
-
3
-#'\code{binaryA} creates a binarized heatmap of the A matrix
4
-#'in which the value is 1 if the value in Amean is greater than
5
-#'threshold * Asd and 0 otherwise
6
-#'
7
-#'@param Amean the mean estimate for the A matrix
8
-#'@param Asd the standard deviations on Amean
9
-#'@param threshold the number of standard deviations above zero
10
-#'that an element of Amean must be to get a value of 1
11
-#'@export
12
-binaryA <-function(Amean, Asd, threshold=3) {
13
-
14
-
15
-  BinA_Map <- ifelse(Amean/Asd > threshold,1,0)
16
-  colnames(BinA_Map) <-colnames(BinA_Map,do.NULL=F,prefix = "Pattern ")
17
-  rownames(BinA_Map) <- rep(" ",nrow(BinA_Map))
18
-
19
-
20
-  heatmap.2(BinA_Map, Rowv = FALSE, Colv = FALSE,dendrogram="none",
21
-            scale="none",col = brewer.pal(3,"Blues"), trace="none",
22
-            density.info="none",cexCol=1.3,srtCol=45,
23
-            lmat=rbind(c(0, 3),c(2,1),c(0,4) ),
24
-            lwid=c(1,10),lhei=c(1, 4, 1.2 ),
25
-            main="Heatmap of Standardized A Matrix")
26
-  mtext(paste("(Threshold = ", threshold, ")"))
27
-
28
-}
29 0
deleted file mode 100644
... ...
@@ -1,130 +0,0 @@
1
-# calcCoGAPSStat: calculate gene set statistics for A matrix columns
2
-# History: v1.0 EJF original CoGAPS
3
-
4
-# Inputs: Amean - A matrix mean values
5
-#         Asd - A matrix standard deviations
6
-#         GStoGenes - data.frame, GSA.genesets class, or list with gene sets
7
-#         numPerm - number of permutations for null
8
-
9
-# Output: list with gene set statistics
10
-
11
-#'\code{calcCoGAPSStat} calculates the gene set statistics for each
12
-#'column of A using a Z-score from the elements of the A matrix,
13
-#'the input gene set, and permutation tests
14
-#'
15
-#'@param Amean A matrix mean values
16
-#'@param Asd A matrix standard deviations
17
-#'@param GStoGenes data.frame or list with gene sets
18
-#'@param numPerm number of permutations for null
19
-#'@export
20
-
21
-calcCoGAPSStat <- function (Amean, Asd, GStoGenes, numPerm=500) {
22
-
23
-  # test for std dev of zero, possible mostly in simple simulations
24
-  if (sum(Asd==0) > 0) {
25
-      #temp <- min(Asd[Asd>0])
26
-      Asd[Asd==0] <- .Machine$double.eps
27
-  }
28
-
29
-  # calculate Z scores
30
-  zMatrix <- calcZ(Amean,Asd)
31
-
32
-  # compute the p-value for each gene set belonging to each pattern
33
-
34
-  # check input arguments
35
-  if (!is(GStoGenes, "data.frame") && !is(GStoGenes, "list") && !is(GStoGenes,"GSA.genesets")) {
36
-    stop("GStoGenes must be a data.frame,GSA.genesets, or list with format specified in the users manual.")
37
-  }
38
-
39
-  if (is(GStoGenes, "GSA.genesets")) {
40
-    names(GStoGenes$genesets) <- GStoGenes$geneset.names
41
-    GStoGenes <- GStoGenes$genesets
42
-  }
43
-
44
-  if (is(GStoGenes, "list")) {
45
-    GStoGenesList <- GStoGenes
46
-  } else {
47
-    GStoGenesList <- list()
48
-    for (i in 1:dim(GStoGenes)[2]) {
49
-      GStoGenesList[[as.character(colnames(GStoGenes)[i])]] <- as.character(unique(GStoGenes[,i]))
50
-    }
51
-  }
52
-
53
-  # get dimensions
54
-  numGS   <- length(names(GStoGenesList))
55
-  numPatt <- dim(zMatrix)[2]
56
-  numG    <- dim(zMatrix)[1]+0.9999
57
-
58
-  # initialize matrices
59
-  statsUp   <- matrix(ncol = numGS, nrow = numPatt)
60
-  statsDown <- matrix(ncol = numGS, nrow = numPatt)
61
-  actEst    <- matrix(ncol = numGS, nrow = numPatt)
62
-  results   <- list()
63
-  zPerm     <- matrix(ncol=numPerm,nrow=numPatt)
64
-
65
-  # do permutation test
66
-  for (gs in 1:numGS) {
67
-    genes <- GStoGenesList[[names(GStoGenesList)[gs]]]
68
-    index <- rownames(zMatrix) %in% genes
69
-    zValues <- zMatrix[index,1]
70
-    numGenes <- length(zValues)
71
-    label <- as.character(numGenes)
72
-
73
-    if (!any(names(results)==label)) {
74
-      for (p in 1:numPatt) {
75
-        for (j in 1:numPerm) {
76
-          temp <- floor(runif(numGenes,1,numG))
77
-          temp2 <- zMatrix[temp,p]
78
-          zPerm[p,j] <- mean(temp2)
79
-        }
80
-      }
81
-      results[[label]] <- zPerm
82
-    }
83
-  }
84
-
85
-# get p-value
86
-  for (p in 1:numPatt) {
87
-
88
-    for (gs in 1:numGS) {
89
-
90
-      genes <- GStoGenesList[[names(GStoGenesList)[gs]]]
91
-      index <- rownames(zMatrix) %in% genes
92
-      zValues <- zMatrix[index,p]
93
-      zScore <- mean(zValues)
94
-
95
-      numGenes <- length(zValues)
96
-      label <- as.character(numGenes)
97
-
98
-      permzValues <- results[[label]][p,]
99
-      ordering <- order(permzValues)
100
-      permzValues <- permzValues[ordering]
101
-
102
-      statistic <- sum(zScore > permzValues)
103
-      statUpReg <- 1 - statistic/length(permzValues)
104
-      statsUp[p,gs] <- max(statUpReg, 1/numPerm)
105
-
106
-      statistic <- sum(zScore < permzValues)
107
-      statDownReg <- 1 - statistic/length(permzValues)
108
-      statsDown[p,gs] <- max(statDownReg,1/numPerm)
109
-
110
-      activity <- 1 - 2*max(statUpReg, 1/numPerm)
111
-      actEst[p,gs] <- activity
112
-    }
113
-
114
-  }
115
-
116
-  # format output
117
-  colnames(statsUp) <- names(GStoGenesList)
118
-  colnames(statsDown) <- names(GStoGenesList)
119
-  colnames(actEst) <- names(GStoGenesList)
120
-
121
-  rownames(statsUp) <- colnames(zMatrix)
122
-  rownames(statsDown) <- colnames(zMatrix)
123
-  rownames(actEst) <- colnames(zMatrix)
124
-
125
-  results[['GSUpreg']] <- statsUp
126
-  results[['GSDownreg']] <- statsDown
127
-  results[['GSActEst']] <- actEst
128
-
129
-  return(results)
130
-}
131 0
deleted file mode 100644
... ...
@@ -1,85 +0,0 @@
1
-# calcGeneGSStat: calculate probability gene belongs in gene set
2
-# History: v1.0 EJF original CoGAPS
3
-
4
-# Inputs: Amean - A matrix mean values
5
-#         Asd - A matrix standard deviations
6
-#         GStoGenes - data.frame or list with gene sets
7
-#         numPerm - number of permutations for null
8
-#         Pw - weighting on genes
9
-#         nullGenes - adjust genes
10
-
11
-# Output: environment with gene set statistics
12
-#       NOTE: should make into list object, env historical
13
-
14
-#'\code{calcGeneGSStat} calculates the probability that a gene
15
-#'listed in a gene set behaves like other genes in the set within
16
-#'the given data set
17
-#'
18
-#'@param Amean A matrix mean values
19
-#'@param Asd A matrix standard deviations
20
-#'@param GSGenes data.frame or list with gene sets
21
-#'@param numPerm number of permutations for null
22
-#'@param Pw weight on genes
23
-#'@param nullGenes - logical indicating gene adjustment
24
-#'@export
25
-
26
-calcGeneGSStat  <- function(Amean, Asd, GSGenes, numPerm, Pw=rep(1,ncol(Amean)), nullGenes=F) {
27
-    gsStat <- calcCoGAPSStat(Amean, Asd, data.frame(GSGenes),
28
-                             numPerm=numPerm)
29
-    gsStat <- gsStat$GSUpreg
30
-
31
-    gsStat <- -log(gsStat)
32
-
33
-  if (!all(is.na(Pw))) {
34
-    if (length(Pw) != length(gsStat)) {
35
-      stop('Invalid weighting')
36
-    }
37
-    gsStat <- gsStat*Pw
38
-  }
39
-
40
-    if (nullGenes) {
41
-        ZD <- Amean[setdiff(row.names(Amean), GSGenes),] /
42
-         Asd[setdiff(row.names(Amean), GSGenes),]
43
-    } else {
44
-        ZD <- Amean[GSGenes,]/Asd[GSGenes,]
45
-    }
46
-    outStats <- apply(sweep(ZD,2,gsStat,FUN="*"),1,sum) / (sum(gsStat))
47
-
48
-    outStats <- outStats / apply(ZD,1,sum)
49
-
50
-    outStats[which(apply(ZD,1,sum) < .Machine$double.eps)] <- 0.
51
-
52
-
53
-    if (sum(gsStat) < .Machine$double.eps) {
54
-        return(0.)
55
-    }
56
-
57
-    return(outStats)
58
-
59
-}
60
-
61
-
62
-computeGeneGSProb <- function(Amean, Asd, GSGenes, Pw=rep(1,ncol(Amean)),
63
-                              numPerm=500, PwNull=F) {
64
-    geneGSStat <- calcGeneGSStat(Amean=Amean, Asd=Asd, Pw=Pw,
65
-                                 GSGenes=GSGenes, numPerm=numPerm)
66
-
67
-
68
-
69
-    if (PwNull) {
70
-      permGSStat <- calcGeneGSStat(Amean=Amean, Asd=Asd,
71
-                                  GSGenes=GSGenes, numPerm=numPerm, Pw=Pw,
72
-                                  nullGenes=T)
73
-    } else {
74
-      permGSStat <- calcGeneGSStat(Amean=Amean, Asd=Asd,
75
-                                  GSGenes=GSGenes, numPerm=numPerm,
76
-                                  nullGenes=T)
77
-    }
78
-
79
-    finalStats <- sapply(GSGenes,
80
-                         function(x) length(which(permGSStat > geneGSStat[x])) / length(permGSStat))
81
-
82
-    return(finalStats)
83
-
84
-
85
-}
86 0
deleted file mode 100644
... ...
@@ -1,40 +0,0 @@
1
-# calcZ: function to generate Z-score matrix
2
-# History: v1.0 EJF original CoGAPS
3
-
4
-# Inputs: meanMat - matrix of mean values
5
-#         sdMat - matrix of standard deviations
6
-
7
-# Output: matrix of Z-scores
8
-
9
-#'\code{calcZ} calculates the Z-score for each element based
10
-#'on input mean and standard deviation matrices
11
-#'
12
-#'@param meanMat matrix of mean values
13
-#'@param sdMat matrix of standard deviation values
14
-#'@export
15
-
16
-calcZ <- function (meanMat, sdMat) {
17
-
18
-  # compute the z score for each gene's association to each pattern
19
-
20
-  # find matrix dimensions
21
-  nrows <- dim(meanMat)[1]
22
-  ncols <- dim(meanMat)[2]
23
-
24
-  check <- dim(sdMat)[1]
25
-  if (nrows != check) {
26
-    stop("Number of rows in the mean and standard deviation of A do not agree.")
27
-  }
28
-
29
-  check <- dim(sdMat)[2]
30
-  if (ncols != check) {
31
-    stop("Number of columns in the mean and standard deviation of A do not agree.")
32
-  }
33
-
34
-  # compute the matrix of z scores
35
-  zMat <- meanMat/sdMat
36
-  rownames(zMat) <- rownames(meanMat)
37
-  colnames(zMat) <- colnames(meanMat)
38
-
39
-  return(zMat)
40
-}
41 0
new file mode 100644
... ...
@@ -0,0 +1,107 @@
1
+#' @title GapsParams
2
+#' @description Parameters for running CoGAPS
3
+#'
4
+#' @slot nFactor number of patterns (basis vectors, metagenes)
5
+#' @slot nEquil number of iterations for burn-in
6
+#' @slot nSample number of iterations for sampling
7
+#' @slot nOutput how often (number of iterations) to print status in R console
8
+#' @slot numSnapshots the number of individual samples to capture
9
+#' @slot alphaA sparsity parameter for A domain
10
+#' @slot alphaP sparsity parameter for P domain
11
+#' @slot maxGibbmassA limit truncated normal to max size
12
+#' @slot maxGibbmassP limit truncated normal to max size
13
+#' @slot seed positive values are kept, negative values will be overwritten
14
+#'  by a seed generated from the current time
15
+#' @slot messages display messages during the run
16
+#' @slot singleCelLRNASeq indicate if the data is single cell RNA-seq data
17
+#' @slot fixedPatterns matrix of fixed values in either A or P matrix
18
+#' @slot whichMatrixFixed characted indicating whether A or P contains
19
+#'  the fixed patterns
20
+#' @slot checkpointInterval how often (number of seconds) to create a checkpoint file
21
+#' @slot nCores max number of cores to run on
22
+#' @export
23
+setClass('GapsParams', slots = c(
24
+    nFactor = 'integer',
25
+    nEquil = 'integer',
26
+    nSample = 'integer',
27
+    nOutput = 'integer',
28
+    nSnapshots = 'integer',
29
+    alphaA  = 'numeric',
30
+    alphaP  = 'numeric',
31
+    maxGibbmassA = 'numeric',
32
+    maxGibbmassP = 'numeric',
33
+    seed = 'integer',
34
+    messages  = 'logical',
35
+    singleCelLRNASeq = 'logical',
36
+    fixedPatterns = 'matrix',
37
+    whichMatrixFixed = 'character',
38
+    checkpointInterval = 'integer',
39
+    nCores = 'integer'
40
+))
41
+
42
+setMethod('initialize', 'GapsParams', 
43
+    function(.Object, nFactor, nEquil, nSample, ...)
44
+    {
45
+        .Object@nFactor <- nFactor
46
+        .Object@nEquil <- nEquil
47
+        .Object@nSample <- nSample
48
+        .Object@nOutput <- 1000
49
+        .Object@numSnapshots <- 0
50
+        .Object@alphaA <- 0.01
51
+        .Object@alphaP <- 0.01
52
+        .Object@maxGibbmassA <- 100.0
53
+        .Object@maxGibbmassP <- 100.0
54
+        .Object@seed <- -1
55
+        .Object@messages <- TRUE
56
+        .Object@singleCelLRNASeq <- FALSE
57
+        .Object@fixedPatterns <- matrix(0)
58
+        .Object@whichMatrixFixed <- 'N'
59
+        .Object@checkpointInterval <- 0
60
+
61
+        .Object <- callNextMethod(.Object, ...)
62
+        .Object
63
+    }
64
+)
65
+
66
+setValidity('GapsParams',
67
+    function(object)
68
+    {
69
+        if (object@whichMatrixFixed == 'P' & nrow(object@fixedPatterns) > object@nFactor)
70
+            'number of fixed patterns greater than nFactor'
71
+        if (object@whichMatrixFixed == 'A' & ncol(object@fixedPatterns) > object@nFactor)
72
+            'number of fixed patterns greater than nFactor'
73
+    }
74
+)
75
+
76
+#' @rdname getParam
77
+#' @importFrom methods slot
78
+setMethod("getParam", "GapsParams", function(object, name)
79
+{
80
+    slot(object, name)
81
+})
82
+
83
+#' @rdname setParam
84
+#' @importFrom methods slot<- validObject
85
+setMethod("setParam", "GapsParams", function(object, name, value)
86
+{
87
+    slot(object, name) <- value
88
+    validObject(object)
89
+    return(object)
90
+})
91
+
92
+#' @importFrom methods slotNames
93
+setMethod("show", "GapsParams", function(object)
94
+{
95
+    cat("CoGAPS Parameters Object\n")
96
+    cat("Number of patterns to estimate: ", object@nFactor)
97
+})
98
+
99
+checkParamsWithData <- function(params, nRD, nCD, nRS, nCS)
100
+{
101
+    if (nRD != nRS | nCD != nCS)
102
+        stop('D and S matrices have mismached dimensions')
103
+    if (params@whichMatrixFixed == 'A' & nrow(params@fixedPatterns) != nRD)
104
+        stop('invalid fixed pattern length')
105
+    if (params@whichMatrixFixed == 'P' & ncol(params@fixedPatterns) != nCD)
106
+        stop('invalid fixed pattern length')
107
+})
0 108
deleted file mode 100644
... ...
@@ -1,34 +0,0 @@
1
-#' createGWCoGAPSSets
2
-#'
3
-#'\code{createGWCoGAPSSets} factors whole genome data into randomly generated sets for indexing;
4
-#'
5
-#'@param data data matrix with unique rownames
6
-#'@param nSets number of sets for parallelization
7
-#'@param outRDA name of output file
8
-#'@param keep logical indicating whether or not to save gene set list. Default is TRUE.
9
-#'@export
10
-#'@return list with randomly generated sets of genes from whole genome data
11
-#'@examples \dontrun{
12
-#'createGWCoGAPSSet(D,nSets=nSets)
13
-#'}
14
-#'
15
-
16
-createGWCoGAPSSets<-function(data=D, #data matrix with unique rownames
17
-	nSets=nSets, #number of sets for parallelization
18
-	outRDA="GenesInCoGAPSSets.Rda", #name of output file
19
-	keep=TRUE #logical indicating whether or not to save gene set list. Default is TRUE.
20
-	){
21
-genes=rownames(data)
22
-setSize=floor(length(genes)/nSets)
23
-genesInSets <- list()
24
-for (set in 1:nSets) {
25
-  if(set!=nSets){genesInSets[[set]] <- sample(genes,setSize)}
26
-  if(set==nSets){genesInSets[[set]] <- genes}
27
-  genes=genes[!genes%in%genesInSets[[set]]]
28
-}
29
-if(!identical(sort(unlist(genesInSets)),sort(rownames(data)))){print("Gene identifiers not unique!")}
30
-if(keep==TRUE){save(list=c('genesInSets'),file=outRDA)}
31
-return(genesInSets)
32
-}
33
-
34
-
35 0
deleted file mode 100755
... ...
@@ -1,144 +0,0 @@
1
-#Calculates significant genes in each pattern according to certain threshold
2
-#Returns the significant gene names as well as well as the means of these matrices and number of genes in each
3
-
4
-gapsInterPattern <- function(Amean, Asd, sdThreshold = 3)
5
-{
6
-    #number of rows and cols of Asd
7
-    numGenes = length(Asd[,1]);
8
-    numCols = length(Asd[1,]);
9
-
10
-    #Vector holding the number of each significant gene in each column
11
-    sigGeneNums = data.frame();
12
-
13
-    #Temp number of sig genes in the col
14
-    sigCount = 0;
15
-
16
-    #Number to catch the amount of columns without significant genes
17
-    numEmptyCols = 0;
18
-
19
-    #Keep an array of the significant gene counts
20
-    significantGeneNums = c(0);
21
-
22
-    #Names of the genes from the data matrix
23
-    geneNames = names(Asd[,1]);
24
-
25
-    #Names of the genes that are significant from the data matrix
26
-    sigGeneNames = "0";
27
-
28
-    #The numerator of our statistic
29
-    dimensionStatNumerator = 0;
30
-
31
-    #The denominator of our statistic
32
-    dimensionStatDenominator = 0;
33
-
34
-    #The value of our statistic
35
-    dimensionStatistic = 0;
36
-
37
-    #A matrix holding the values of our statistics
38
-    dimensionStatisticMatrix = matrix(nrow = numCols, ncol = numCols);
39
-
40
-    #The mean of the statistic matrix
41
-    sbar = 0;
42
-
43
-    #A list to return the significant genes in each col and the statistic matrix
44
-    results = list(list());
45
-
46
-    #Scan in the significant genes from each column of Asd
47
-    #The columns of sigGeneNums hold the significant genes for each col of Asd
48
-    for(i in 1:numCols)
49
-    {
50
-        sigCount = 0;
51
-        for(j in 1:numGenes)
52
-        {
53
-            if((Amean[j,i] - (sdThreshold*Asd[j,i])) > 0)
54
-            {
55
-                sigCount = sigCount + 1;
56
-                sigGeneNums[sigCount, i] = j;
57
-            }
58
-        }
59
-
60
-        if(sigCount == 0)
61
-        {
62
-            sigGeneNums[1, i] = 0;
63
-            numEmptyCols = numEmptyCols + 1;
64
-        }
65
-
66
-        #Save the number of sigGenes
67
-        significantGeneNums[i] = sigCount;
68
-
69
-        #Get the names and store them
70
-        if(sigCount != 0)
71
-        {
72
-            for(k in 1:sigCount)
73
-            {
74
-                sigGeneNames[k] = geneNames[sigGeneNums[k, i]];
75
-            }
76
-            results[[1]][[i]] = sigGeneNames;
77
-            sigGeneNames = "0";
78
-        }
79
-        else
80
-        {
81
-            results[[1]][[i]] = "None";
82
-        }
83
-    }
84
-
85
-    if(any(significantGeneNums == 0))
86
-    {
87
-        zeroSigCols = which(significantGeneNums == 0);
88
-        print("Warning: No Significant Genes in Pattern(s): ");
89
-
90
-        for(z in 1:length(zeroSigCols))
91
-        {
92
-            print(zeroSigCols[z]);
93
-        }
94
-    }
95
-
96
-    #Now that we have the significant genes want to see if these genes are significant in other columns
97
-    #Fill across the row then down the column
98
-
99
-    #This compares the significant genes in the specified by 'j' to the same genes in Asd in the column specified by 'i'
100
-    for(i in 1:numCols)
101
-    {
102
-        for(j in 1:numCols)
103
-        {
104
-            #Grab the number of significant genes from the interested column
105
-            sigCount = sum(sigGeneNums[,j] > 0, na.rm = TRUE);
106
-
107
-            if(sigCount != 0)
108
-            {
109
-                dimensionStatDenominator = sigCount;
110
-                dimensionStatNumerator = 0;
111
-
112
-                #loop through the number of significant genes and compare to these genes in the 'ith' col of Asd.
113
-                #Find the significance of these genes, Amean - threshold * Asd
114
-                for(k in 1:sigCount)
115
-                {
116
-                    if((Amean[sigGeneNums[k,j],i] - (sdThreshold*Asd[sigGeneNums[k,j],i])) > 0)
117
-                    {
118
-                        dimensionStatNumerator = dimensionStatNumerator + 1;
119
-                    }
120
-                }
121
-
122
-                dimensionStatistic = dimensionStatNumerator/dimensionStatDenominator;
123
-
124
-                dimensionStatisticMatrix[i, j] = dimensionStatistic;
125
-            }
126
-            else
127
-            {
128
-                dimensionStatisticMatrix[i, j] = 0;
129
-            }
130
-        }
131
-    }
132
-
133
-    #Find mean of the matrices (excluding the diagonal elements)
134
-    #we can subtract numCols as the diagonal elements are 1
135
-    sbar = ((sum(dimensionStatisticMatrix) - (numCols - numEmptyCols))/(length(dimensionStatisticMatrix) - (numCols - numEmptyCols)));
136
-
137
-    results[[2]] = significantGeneNums;
138
-    results[[3]] = t(dimensionStatisticMatrix);
139
-    results[[4]] = sbar;
140
-
141
-    names(results) = c("SignificantGeneNames", "SignificantGeneTotals", "SeparationMatrix", "InterPatternValue");
142
-
143
-    return(results);
144
-}
145 0
deleted file mode 100755
... ...
@@ -1,135 +0,0 @@
1
-#Calculates significant genes in each pattern according to certain threshold
2
-#Returns the significant gene names as well as well as the correlation matrices between these genes and the means of these matrices
3
-
4
-gapsIntraPattern <- function(Amean, Asd, DMatrix, sdThreshold = 3)
5
-{
6
-    #number of rows and cols of Asd
7
-    numGenes = length(Asd[,1]);
8
-    numCols = length(Asd[1,]);
9
-
10
-    #number of samples in DMatrix
11
-    numSamp = ncol(DMatrix);
12
-
13
-    #Vector holding the number of each significant gene in each column
14
-    sigGeneNums = data.frame();
15
-
16
-    #Temp number of sig genes in the col
17
-    sigCount = 0;
18
-
19
-    #Keep an array of the significant gene counts
20
-    significantGeneNums = c(0);
21
-
22
-    #A matrix to hold the significant genes in D for the current pattern
23
-    #The matrix just acts as a subset of D, just eliminates non relevant rows
24
-    tempSubsetD = matrix();
25
-
26
-    #A matrix holding the values of our correlation coefficients between genes for the current column
27
-    tempGeneCorrMatrix = matrix();
28
-
29
-    #A list to hold all the correlation matrices
30
-    geneCorrMatrices = list();
31
-
32
-    #A list to hold all the means
33
-    geneCorrMatrMeans = list();
34
-
35
-    #The mean of all the correlation matrices
36
-    cbar = 0;
37
-
38
-    #A list to return the means and the matrices
39
-    results = list();
40
-
41
-    #Scan in the significant genes from each column of Asd
42
-    #The columns of sigGeneNums hold the significant genes for each col of Asd
43
-    for(i in 1:numCols)
44
-    {
45
-        sigCount = 0;
46
-        for(j in 1:numGenes)
47
-        {
48
-            if((Amean[j,i] - (sdThreshold*Asd[j,i])) > 0)
49
-            {
50
-                sigCount = sigCount + 1;
51
-                sigGeneNums[sigCount, i] = j;
52
-            }
53
-        }
54
-
55
-        if(sigCount == 0)
56
-        {
57
-            sigGeneNums[1, i] = 0;
58
-        }
59
-
60
-        #Save the number of sigGenes
61
-        significantGeneNums[i] = sigCount;
62
-    }
63
-
64
-    #If a pattern has no significant genes this is clearly an error so return such
65
-    if(any(significantGeneNums == 0))
66
-    {
67
-        zeroSigCols = which(significantGeneNums == 0);
68
-        warning("Warning: No Significant Genes in Pattern(s): ");
69
-
70
-        for(z in 1:length(zeroSigCols))
71
-        {
72
-            message(zeroSigCols[z]);
73
-        }
74
-    }
75
-
76
-
77
-    #Now that we have the significant genes want to grab these from our original D matrix
78
-    #and find the sigGene x sigGene correlation matrix and find its mean
79
-
80
-    for(j in 1:numCols)
81
-    {
82
-        #Grab the number of significant genes from the interested column
83
-        sigCount = sum(sigGeneNums[,j] > 0, na.rm = TRUE);
84
-
85
-        if(sigCount != 0)
86
-        {
87
-
88
-            #loop through the number of significant genes and pull out the rows of D that represent these genes.
89
-            #Then find the correlation between them with the built in R corr function
90
-            tempSubsetD = matrix(nrow = sigCount, ncol = numSamp);
91
-            for(k in 1:sigCount)
92
-            {
93
-                #Subset D based on significant Genes
94
-                #need to transpose as it reads this in as column vector otherwise
95
-                tempSubsetD[k,] = t(DMatrix[sigGeneNums[k,j], ]);
96
-            }
97
-
98
-            #Find the correlation between these genes in D
99
-            #Need to transpose as it calculates correlations between the columns
100
-            tempGeneCorrMatrix = cor(t(tempSubsetD));
101
-
102
-            #Find the mean of this matrix
103
-            tempGeneCorrMatrMean = mean(tempGeneCorrMatrix);
104
-
105
-        }
106
-        else
107
-        {
108
-            tempGeneCorrMatrix = 0;
109
-            tempGeneCorrMatrMean = 0;
110
-        }
111
-
112
-        #Save these in the overall list
113
-        geneCorrMatrices[[j]] = tempGeneCorrMatrix;
114
-        geneCorrMatrMeans[[j]] = tempGeneCorrMatrMean;
115
-
116
-    }
117
-
118
-    #Return as an overall list of lists
119
-    # We return Corr Matrices themselves, their means, and the means of the means (cbar)
120
-    results[[1]] = geneCorrMatrices;
121
-    results[[2]] = geneCorrMatrMeans;
122
-
123
-    #Return as an overall list of lists
124
-    for(i in 1:numCols)
125
-    {
126
-        cbar = cbar + results[[2]][[i]];
127
-    }
128
-
129
-    cbar = cbar/numCols;
130
-    results[[3]] = cbar;
131
-
132
-    names(results) = c("CorrelationMatrices", "CorrelationMatrixMeans", "IntraPatternValue");
133
-
134
-    return(results);
135
-}
136 0
deleted file mode 100644
... ...
@@ -1,69 +0,0 @@
1
-
2
-# gapsMapRun: function to call C++ cogaps code
3
-# History: v1.0 CK with MFO edits, August 2014
4
-
5
-# Inputs: D - data matrix
6
-#         S - uncertainty matrix (std devs for chi-squared of Log Likelihood)
7
-#         FP - fixed A columns or P rows
8
-#         nFactor - number of patterns (basis vectors, metagenes)
9
-#         simulation_id - name to attach to atoms files if created
10
-#         nEquil - number of iterations for burn-in
11
-#         nSample - number of iterations for sampling
12
-#         nOutR - how often to print status into R by iterations
13
-#         output_atomic - whether to write atom files (large)
14
-#         alphaA, alphaP - sparsity parameters for A and P domains
15
-#         max_gibbmass_paraA(P) - limit truncated normal to max size
16
-#         nMaxA, nMaxP - PRESENTLY UNUSED, future = limit number of atoms
17
-
18
-
19
-# Output: list with A and P matrix estimates, chi-squared and atom
20
-#         numbers of sample by iteration, and chi-squared of mean
21
-
22
-#'\code{gapsMapRun} calls the C++ MCMC code and performs Bayesian
23
-#'matrix factorization returning the two matrices that reconstruct
24
-#'the data matrix; as opposed to gapsRun, this method takes an
25
-#'additional input specifying set patterns in the P matrix
26
-#'
27
-#'@param D data matrix
28
-#'@param S uncertainty matrix (std devs for chi-squared of Log Likelihood)
29
-#'@param FP data.frame with rows giving fixed patterns for P
30
-#'@param ABins a matrix of same size as A which gives relative
31
-#' probability of that element being non-zero
32
-#'@param PBins a matrix of same size as P which gives relative
33
-#' probability of that element being non-zero
34
-#'@param nFactor number of patterns (basis vectors, metagenes), which must be
35
-#'greater than or equal to the number of rows of FP
36
-#'@param  simulation_id name to attach to atoms files if created
37
-#'@param  nEquil number of iterations for burn-in
38
-#'@param  nSample number of iterations for sampling
39
-#'@param  nOutR how often to print status into R by iterations
40
-#'@param  output_atomic whether to write atom files (large)
41
-#'@param  fixedMatrix character indicating whether A or P matrix
42
-#' has fixed columns or rows respectively
43
-#'@param fixedBinProbs Boolean for using relative probabilities
44
-#' given in Abins and Pbins
45
-#'@param fixedDomain character to indicate whether A or P is
46
-#' domain for relative probabilities
47
-#'@param sampleSnapshots Boolean to indicate whether to capture
48
-#' individual samples from Markov chain during sampling
49
-#'@param numSnapshots the number of individual samples to capture
50
-#'@param alphaA sparsity parameter for A domain
51
-#'@param nMaxA PRESENTLY UNUSED, future = limit number of atoms
52
-#'@param max_gibbmass_paraA limit truncated normal to max size
53
-#'@param alphaP sparsity parameter for P domain
54
-#'@param nMaxP PRESENTLY UNUSED, future = limit number of atoms
55
-#'@param max_gibbmass_paraP limit truncated normal to max size
56
-#'@param seed Set seed for reproducibility. Positive values provide initial seed, negative values just use the time.
57
-#'@param messages Display progress messages
58
-#'@export
59
-
60
-gapsMapRun <- function(D, S, FP, ABins = data.frame(), PBins = data.frame(), nFactor = 7, simulation_id = "simulation",
61
-                       nEquil = 1000, nSample = 1000, nOutR = 1000, output_atomic = FALSE, fixedMatrix = "P",
62
-                       fixedBinProbs = FALSE, fixedDomain = "N", sampleSnapshots = TRUE, numSnapshots = 100, alphaA = 0.01,
63
-                       nMaxA = 100000, max_gibbmass_paraA = 100.0, alphaP = 0.01, nMaxP = 100000, max_gibbmass_paraP = 100.0,
64
-                       seed=-1, messages=TRUE)
65
-{
66
-    gapsRun(D, S, ABins, PBins, nFactor, simulation_id, nEquil, nSample, nOutR, output_atomic, fixedBinProbs,
67
-        fixedDomain, sampleSnapshots, numSnapshots, alphaA, nMaxA, max_gibbmass_paraA, alphaP, nMaxP,
68
-        max_gibbmass_paraP, seed, messages, FALSE, FP, 'P')
69
-}
70 0
deleted file mode 100644
... ...
@@ -1,202 +0,0 @@
1
-
2
-# gapsRun: function to call C++ cogaps code
3
-# History: v1.0 CK with MFO edits, August 2014
4
-
5
-# Inputs: D - data matrix
6
-#         S - uncertainty matrix (std devs for chi-squared of Log Likelihood)
7
-#         nFactor - number of patterns (basis vectors, metagenes)
8
-#         simulation_id - name to attach to atoms files if created
9
-#         nEquil - number of iterations for burn-in
10
-#         nSample - number of iterations for sampling
11
-#         nOutR - how often to print status into R by iterations
12
-#         output_atomic - whether to write atom files (large)
13
-#         alphaA, alphaP - sparsity parameters for A and P domains
14
-#         max_gibbmass_paraA(P) - limit truncated normal to max size
15
-#         nMaxA, nMaxP - PRESENTLY UNUSED, future = limit number of atoms
16
-
17
-
18
-# Output: list with A and P matrix estimates, chi-squared and atom
19
-#         numbers of sample by iteration, and chi-squared of mean
20
-#'\code{gapsRun} calls the C++ MCMC code and performs Bayesian
21
-#'matrix factorization returning the two matrices that reconstruct
22
-#'the data matrix
23
-#'
24
-#'@param D data matrix
25
-#'@param S uncertainty matrix (std devs for chi-squared of Log Likelihood)
26
-#'@param ABins a matrix of same size as A which gives relative
27
-#' probability of that element being non-zero
28
-#'@param PBins a matrix of same size as P which gives relative
29
-#' probability of that element being non-zero
30
-#'@param nFactor number of patterns (basis vectors, metagenes), which must be
31
-#'greater than or equal to the number of rows of FP
32
-#'@param  simulation_id name to attach to atoms files if created
33
-#'@param  nEquil number of iterations for burn-in
34
-#'@param  nSample number of iterations for sampling
35
-#'@param  nOutR how often to print status into R by iterations
36
-#'@param  output_atomic whether to write atom files (large)
37
-#'@param fixedBinProbs Boolean for using relative probabilities
38
-#' given in Abins and Pbins
39
-#'@param fixedDomain character to indicate whether A or P is
40
-#' domain for relative probabilities
41
-#'@param numSnapshots the number of individual samples to capture
42
-#'@param alphaA sparsity parameter for A domain
43
-#'@param nMaxA PRESENTLY UNUSED, future = limit number of atoms
44
-#'@param max_gibbmass_paraA limit truncated normal to max size
45
-#'@param alphaP sparsity parameter for P domain
46
-#'@param nMaxP PRESENTLY UNUSED, future = limit number of atoms
47
-#'@param max_gibbmass_paraP limit truncated normal to max size
48
-#'@param seed Set seed for reproducibility. Positive values provide initial seed, negative values just use the time.
49
-#'@param messages Display progress messages
50
-#'@param singleCellRNASeq T/F indicating if the data is single cell RNA-seq data
51
-#'@param fixedPatterns matrix of fixed values in either A or P matrix
52
-#'@param whichMatrixFixed character to indicate whether A or P matrix
53
-#'  contains the fixed patterns
54
-#'@param checkpoint_interval time (in seconds) between cogaps checkpoints
55
-#'@export
56
-
57
-#--CHANGES 1/20/15--
58
-#Added FixedPatt frame to C++ version to match Ondrej's code for updating bin sizes based on an input matrix
59
-gapsRun <- function(D, S, ABins = data.frame(), PBins = data.frame(),
60
-                    nFactor = 7, simulation_id = "simulation",
61
-                    nEquil = 1000, nSample = 1000, nOutR = 1000,
62
-                    output_atomic = FALSE, fixedBinProbs = FALSE,
63
-                    fixedDomain = "N", numSnapshots = 0, alphaA = 0.01,
64
-                    nMaxA = 100000, max_gibbmass_paraA = 100.0,
65
-                    alphaP = 0.01, nMaxP = 100000, max_gibbmass_paraP = 100.0,
66
-                    seed=-1, messages=TRUE, singleCellRNASeq=FALSE,
67
-                    fixedPatterns = matrix(0), whichMatrixFixed = 'N',
68
-                    checkpoint_interval = 0)
69
-{
70
-    # Floor the parameters that are integers to prevent allowing doubles.
71
-    nFactor <- floor(nFactor)
72
-    nEquil <- floor(nEquil)
73
-    nSample <- floor(nSample)
74
-    nOutR <- floor(nOutR)
75
-    numSnapshots <- floor(numSnapshots)
76
-    nMaxA <- floor(nMaxA)
77
-    nMaxP <- floor(nMaxP)
78
-
79
-    # check the fixed patterns
80
-    if ((whichMatrixFixed == 'A' & nrow(fixedPatterns) != nrow(D))
81
-    | (whichMatrixFixed == 'P' & nrow(fixedPatterns) > nFactor)) 
82
-    {
83
-        stop('invalid number of rows in fixed pattern matrix')
84
-    }
85
-    if ((whichMatrixFixed == 'A' & ncol(fixedPatterns) > nFactor)
86
-    | (whichMatrixFixed == 'P' & ncol(fixedPatterns) != ncol(D))) 
87
-    {
88
-        stop('invalid number of cols in fixed pattern matrix')
89
-    }
90
-
91
-    geneNames <- rownames(D);
92
-    sampleNames <- colnames(D);
93
-
94
-    # label patterns as Patt N
95
-    patternNames <- c("0");
96
-    for(i in 1:nFactor)
97
-    {
98
-        patternNames[i] <- paste('Patt', i);
99
-    }
100
-
101
-    # call to C++ Rcpp code
102
-    cogapResult <- cogaps(as.matrix(D), as.matrix(S), nFactor, alphaA, alphaP,
103
-        nEquil, floor(nEquil/10), nSample, max_gibbmass_paraA, max_gibbmass_paraP,
104
-        fixedPatterns, whichMatrixFixed, seed, messages, singleCellRNASeq,
105
-        nOutR, numSnapshots, checkpoint_interval)
106
-
107
-    # convert returned files to matrices to simplify visualization and processing
108
-    cogapResult$Amean <- as.matrix(cogapResult$Amean);
109
-    cogapResult$Asd <- as.matrix(cogapResult$Asd);
110
-    cogapResult$Pmean <- as.matrix(cogapResult$Pmean);
111
-    cogapResult$Psd <- as.matrix(cogapResult$Psd);
112
-
113
-    ## label matrices
114
-    colnames(cogapResult$Amean) <- patternNames;
115
-    rownames(cogapResult$Amean) <- geneNames;
116
-    colnames(cogapResult$Asd) <- patternNames;
117
-    rownames(cogapResult$Asd) <- geneNames;
118
-    colnames(cogapResult$Pmean) <- sampleNames;
119
-    rownames(cogapResult$Pmean) <- patternNames;
120