Tom Sherman authored on 30/04/2018 17:17:10
Showing 37 changed files

1 1
deleted file mode 100644
... ...
@@ -1 +0,0 @@
1
-load("/Users/maggiewodicka/Desktop/CoGAPS/GWCoGAPS.AP.fixed.Rda")
... ...
@@ -1,16 +1,18 @@
1 1
 Package: CoGAPS
2
-Version: 2.99.0
3
-Date: 2014-08-23
2
+Version: 2.99.2
3
+Date: 2018-04-24
4 4
 Title: Coordinated Gene Activity in Pattern Sets
5 5
 Author: Thomas Sherman, Wai-shing Lee, Conor Kelton, Ondrej Maxian, Jacob Carey,
6
-    Genevieve Stein-O'Brien, Michael Considine, Maggie Wodicka, John Stansfield, Shawn Sivy, Carlo Colantuoni, Alexander Favorov, Mike Ochs, Elana Fertig
6
+    Genevieve Stein-O'Brien, Michael Considine, Maggie Wodicka, John Stansfield,
7
+    Shawn Sivy, Carlo Colantuoni, Alexander Favorov, Mike Ochs, Elana Fertig
7 8
 Description: Coordinated Gene Activity in Pattern Sets (CoGAPS)
8 9
     implements a Bayesian MCMC matrix factorization algorithm,
9 10
     GAPS, and links it to gene set statistic methods to infer biological
10 11
     process activity.  It can be used to perform sparse matrix factorization on
11 12
     any data, and when this data represents biomolecules, to do gene set
12 13
     analysis.
13
-Maintainer: Elana J. Fertig <ejfertig@jhmi.edu>
14
+Maintainer: Elana J. Fertig <ejfertig@jhmi.edu>,
15
+    Thomas D. Sherman <tomsherman159@gmail.com>
14 16
 Depends:
15 17
     R (>= 3.4.0),
16 18
     Rcpp (>= 0.11.0)
... ...
@@ -1,7 +1,6 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3 3
 export(CoGAPS)
4
-export(CoGapsFromCheckpoint)
5 4
 export(GWCoGAPS)
6 5
 export(GWCoGapsFromCheckpoint)
7 6
 export(binaryA)
... ...
@@ -9,16 +8,21 @@ export(buildReport)
9 8
 export(calcCoGAPSStat)
10 9
 export(calcGeneGSStat)
11 10
 export(calcZ)
11
+export(cellMatchR)
12 12
 export(computeGeneGSProb)
13 13
 export(createGWCoGAPSSets)
14
+export(createscCoGAPSSets)
14 15
 export(gapsMapRun)
15 16
 export(gapsRun)
17
+export(patternMarkers)
18
+export(patternMatch4Parallel)
16 19
 export(plotAtoms)
17 20
 export(plotDiag)
18 21
 export(plotGAPS)
19 22
 export(plotP)
20 23
 export(reconstructGene)
21 24
 export(residuals)
25
+export(scCoGAPS)
22 26
 import(doParallel)
23 27
 import(foreach)
24 28
 import(ggplot2)
... ...
@@ -34,8 +34,6 @@
34 34
 #' @param fixedPatterns matrix of fixed values in either A or P matrix
35 35
 #' @param checkpointInterval time (in seconds) between creating a checkpoint
36 36
 #' @param checkpointFile name of the checkpoint file
37
-#' @param pumpThreshold type of threshold for pump statistic
38
-#' @param nPumpSamples number of samples used in pump statistic
39 37
 #' @param ... keeps backwards compatibility with arguments from older versions
40 38
 #' @return list with A and P matrix estimates
41 39
 #' @importFrom methods new
... ...
@@ -47,8 +45,7 @@ CoGAPS <- function(D, S, nFactor=7, nEquil=1000, nSample=1000, nOutputs=1000,
47 45
 nSnapshots=0, alphaA=0.01, alphaP=0.01, maxGibbmassA=100, maxGibbmassP=100,
48 46
 seed=-1, messages=TRUE, singleCellRNASeq=FALSE, whichMatrixFixed='N',
49 47
 fixedPatterns=matrix(0), checkpointInterval=0, 
50
-checkpointFile="gaps_checkpoint.out", pumpThreshold="unique",
51
-nPumpSamples=0, ...)
48
+checkpointFile="gaps_checkpoint.out", ...)
52 49
 {
53 50
     # get v2 arguments
54 51
     oldArgs <- list(...)
... ...
@@ -67,6 +64,14 @@ nPumpSamples=0, ...)
67 64
     if (missing(S) & !is.null(oldArgs$unc))
68 65
         S <- oldArgs$unc
69 66
 
67
+    # get pump arguments - hidden for now from user
68
+    pumpThreshold <- "unique"
69
+    nPumpSamples <- 0
70
+    if (!is.null(list(...)$pumpThreshold))
71
+        pumpThreshold <- list(...)$pumpThreshold
72
+    if (!is.null(list(...)$nPumpSamples))
73
+        pumpThreshold <- list(...)$nPumpSamples
74
+
70 75
     # check arguments
71 76
     if (class(D) != "matrix" | class(S) != "matrix")
72 77
         stop('D and S must be matrices')
... ...
@@ -115,7 +120,6 @@ nPumpSamples=0, ...)
115 120
 #' @param path path to checkpoint file
116 121
 #' @param checkpointFile name for future checkpooints made
117 122
 #' @return list with A and P matrix estimates
118
-#' @export
119 123
 CoGapsFromCheckpoint <- function(D, S, path, checkpointFile=NA)
120 124
 {
121 125
     if (is.na(checkpointFile))
... ...
@@ -129,7 +133,7 @@ CoGapsFromCheckpoint <- function(D, S, path, checkpointFile=NA)
129 133
 #'  compiler/version was used, which compile time options were enabled, etc...
130 134
 #' @return display builds information
131 135
 #' @examples
132
-#'  CoGAPS::displayBuildReport()
136
+#'  CoGAPS::buildReport()
133 137
 #' @export
134 138
 buildReport <- function()
135 139
 {
... ...
@@ -166,7 +170,7 @@ output_atomic=FALSE, fixedBinProbs=FALSE, fixedDomain="N", sampleSnapshots=TRUE,
166 170
 numSnapshots=100, alphaA=0.01, nMaxA=100000, max_gibbmass_paraA=100.0,
167 171
 alphaP=0.01, nMaxP=100000, max_gibbmass_paraP=100.0, seed=-1, messages=TRUE)
168 172
 {
169
-    #warning('gapsRun is deprecated with v3.0, use CoGAPS')
173
+    warning('gapsRun is deprecated with v3.0, use CoGAPS')
170 174
     CoGAPS(D, S, nFactor=nFactor, nEquil=nEquil, nSample=nSample,
171 175
         nOutputs=nOutR, nSnapshots=ifelse(sampleSnapshots,numSnapshots,0),
172 176
         alphaA=alphaA, alphaP=alphaP, maxGibbmassA=max_gibbmass_paraA,
... ...
@@ -196,7 +200,7 @@ sampleSnapshots=TRUE, numSnapshots=100, alphaA=0.01, nMaxA=100000,
196 200
 max_gibbmass_paraA=100.0, alphaP=0.01, nMaxP=100000, max_gibbmass_paraP=100.0,
197 201
 seed=-1, messages=TRUE)
198 202
 {
199
-    #warning('gapsMapRun is deprecated with v3.0, use CoGaps')
203
+    warning('gapsMapRun is deprecated with v3.0, use CoGaps')
200 204
     CoGAPS(D, S, nFactor=nFactor, nEquil=nEquil, nSample=nSample,
201 205
         nOutputs=nOutR, nSnapshots=ifelse(sampleSnapshots,numSnapshots,0),
202 206
         alphaA=alphaA, alphaP=alphaP, maxGibbmassA=max_gibbmass_paraA,
... ...
@@ -209,7 +213,7 @@ v2CoGAPS <- function(result, ...)
209 213
 {
210 214
     if (!is.null(list(...)$GStoGenes))
211 215
     {
212
-        #warning('GStoGenes is deprecated with v3.0, see CoGAPS documentation')
216
+        warning('GStoGenes is deprecated with v3.0, see CoGAPS documentation')
213 217
         if (is.null(list(...)$plot) | list(...)$plot)
214 218
         {
215 219
             plotGAPS(result$Amean, result$Pmean)
... ...
@@ -9,6 +9,8 @@
9 9
 #' @param nCores number of cores for parallelization. If left to the default NA, nCores = nSets.
10 10
 #' @param cut number of branches at which to cut dendrogram used in patternMatch4Parallel
11 11
 #' @param minNS minimum of individual set contributions a cluster must contain
12
+#' @param manualMatch logical indicating whether or not to stop after initial phase for manual pattern matching
13
+#' @param consensusPatterns fixed pattern matrix to be used to ensure reciprocity of A weights accross sets 
12 14
 #' @param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}
13 15
 #' @return list of A and P estimates
14 16
 #' @seealso \code{\link{gapsRun}}, \code{\link{patternMatch4Parallel}}, and \code{\link{gapsMapRun}}
... ...
@@ -16,18 +18,30 @@
16 18
 #' data(SimpSim)
17 19
 #' sim_name <- "example"
18 20
 #' createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name)
19
-#' result <- GWCoGAPS(sim_name, nFactor=3, nEquil=1000, nSample=1000)
21
+#' result <- GWCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200)
20 22
 #' @export
21
-GWCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, ...)
23
+GWCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, manualMatch=FALSE, consensusPatterns=NULL, ...)
22 24
 {
23 25
     if (!is.null(list(...)$checkpointFile))
26
+    {
24 27
         stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter")
25
-    allDataSets <- preInitialPhase(simulationName, nCores)
26
-    initialResult <- runInitialPhase(simulationName, allDataSets, nFactor, ...)
27
-    matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut, minNS)
28
-    save(matchedPatternSets, file=paste(simulationName, "_matched_ps.RData", sep=""))
29
-    finalResult <- runFinalPhase(simulationName, allDataSets, matchedPatternSets, ...)
30
-    return(postFinalPhase(finalResult, matchedPatternSets))
28
+    }
29
+
30
+    if (is.null(consensusPatterns))
31
+    {
32
+        allDataSets <- preInitialPhase(simulationName, nCores)
33
+        initialResult <- runInitialPhase(simulationName, allDataSets, nFactor, ...)
34
+        if (manualMatch)
35
+        {
36
+            saveRDS(initialResult,file=paste(simulationName,"_initial.rds", sep=""))
37
+            stop("Please provide consensus patterns upon restarting.")
38
+        }
39
+        matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut, minNS)
40
+        save(matchedPatternSets, file=paste(simulationName, "_matched_ps.RData", sep=""))
41
+        consensusPatterns <- matchedPatternSets[[1]]
42
+    } 
43
+    finalResult <- runFinalPhase(simulationName, allDataSets, consensusPatterns, nCores, ...)
44
+    return(postFinalPhase(finalResult, consensusPatterns))
31 45
 }
32 46
 
33 47
 #' Restart a GWCoGaps Run from Checkpoint
... ...
@@ -35,15 +49,23 @@ GWCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, ...)
35 49
 #' @inheritParams GWCoGAPS
36 50
 #' @return list of A and P estimates
37 51
 #' @importFrom utils file_test
52
+#' @examples
53
+#' data(SimpSim)
54
+#' sim_name <- "example"
55
+#' createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name)
56
+#' trash <- GWCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200)
57
+#' result <- GWCoGapsFromCheckpoint(sim_name, 2)
38 58
 #' @export
39
-GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, ...)
59
+GWCoGapsFromCheckpoint <- function(simulationName, nCores, cut=NA, minNS=NA, ...)
40 60
 {
41 61
     # find data files
42 62
     allDataSets <- preInitialPhase(simulationName, nCores)
43 63
 
44 64
     # figure out phase from file signature
45
-    initialCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_cpt_[0-9]+.out", sep=""))
46
-    finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_final_cpt_[0-9]+.out", sep=""))
65
+    initialCpts <- list.files(full.names=TRUE, pattern=paste(simulationName,
66
+        "_initial_cpt_[0-9]+.out", sep=""))
67
+    finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName,
68
+        "_final_cpt_[0-9]+.out", sep=""))
47 69
 
48 70
     if (length(finalCpts))
49 71
     {
... ...
@@ -51,10 +73,12 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA,
51 73
         {
52 74
             # load data set and shift values so gene minimum is zero
53 75
             load(allDataSets[[i]])
54
-            sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x))))
76
+            sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x)
77
+                pmin(0,min(x))))
55 78
     
56 79
             # run CoGAPS with fixed patterns
57
-            cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="")
80
+            cptFileName <- paste(simulationName, "_final_cpt_", i, ".out",
81
+                sep="")
58 82
             CoGapsFromCheckpoint(sampleD, sampleS, cptFileName)
59 83
         }
60 84
         load(paste(simulationName, "_matched_ps.RData", sep=""))
... ...
@@ -62,7 +86,9 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA,
62 86
     else if (file_test("-f", paste(simulationName, "_matched_ps.RData", sep="")))
63 87
     {
64 88
         load(paste(simulationName, "_matched_ps.RData", sep=""))
65
-        finalResult <- runFinalPhase(simulationName, allDataSets, matchedPatternSets, ...)
89
+        consensusPatterns<-matchedPatternSets[[1]]
90
+        finalResult <- runFinalPhase(simulationName, allDataSets,
91
+            consensusPatterns, nCores, ...)
66 92
     }
67 93
     else if (length(initialCpts))
68 94
     {
... ...
@@ -71,21 +97,27 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA,
71 97
         {
72 98
             # load data set and shift values so gene minimum is zero
73 99
             load(allDataSets[[i]])
74
-            sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x))))
100
+            sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x)
101
+                pmin(0,min(x))))
75 102
 
76 103
             # run CoGAPS from checkpoint
77
-            cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="")
104
+            cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out",
105
+                sep="")
78 106
             CoGapsFromCheckpoint(sampleD, sampleS, cptFileName)
79 107
         }
80
-        matchedPatternSets <- postInitialPhase(initialResult, length(allDataSets), cut, minNS)
81
-        save(matchedPatternSets, file=paste(simulationName, "_matched_ps.RData", sep=""))
82
-        finalResult <- runFinalPhase(simulationName, allDataSets, matchedPatternSets, ...)
108
+        matchedPatternSets <- postInitialPhase(initialResult,
109
+            length(allDataSets), cut, minNS)
110
+        save(matchedPatternSets, file=paste(simulationName, "_matched_ps.RData",
111
+            sep=""))
112
+        consensusPatterns<-matchedPatternSets[[1]]
113
+        finalResult <- runFinalPhase(simulationName, allDataSets,
114
+            consensusPatterns, nCores, ...)
83 115
     }
84 116
     else
85 117
     {
86 118
         stop("no checkpoint files found")
87 119
     }
88
-    return(postFinalPhase(finalResult, matchedPatternSets))
120
+    return(postFinalPhase(finalResult, consensusPatterns))
89 121
 }
90 122
 
91 123
 preInitialPhase <- function(simulationName, nCores)
... ...
@@ -96,7 +128,9 @@ preInitialPhase <- function(simulationName, nCores)
96 128
 
97 129
     # establish the number of cores that we are able to use
98 130
     if (is.na(nCores))
131
+    {
99 132
         nCores <- length(allDataSets)
133
+    }
100 134
     registerDoParallel(cores=nCores)
101 135
     return(allDataSets)
102 136
 }
... ...
@@ -128,18 +162,36 @@ postInitialPhase <- function(initialResult, nSets, cut, minNS)
128 162
 
129 163
     #run postpattern match function
130 164
     if (is.na(cut))
165
+    {
131 166
         cut <- nFactor
167
+    }
168
+
132 169
     return(patternMatch4Parallel(Ptot=BySet$P, nP=nFactor, nSets=nSets, cnt=cut,
133 170
         minNS=minNS, bySet=TRUE))
134 171
 }
135 172
 
136
-runFinalPhase <- function(simulationName, allDataSets, matchedPatternSets, ...)
137
-{
173
+runFinalPhase <- function(simulationName, allDataSets, consensusPatterns, nCores, ...)
174
+{    
175
+    if (length(dim(consensusPatterns)) != 2)
176
+    {
177
+        stop("consensusPatterns must be a matrix")
178
+    }
179
+
180
+    # find data files if providing consensus patterns
181
+    fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="")
182
+    allDataSets <- list.files(full.names=TRUE, pattern=fileSig)
183
+
184
+    if (is.na(nCores))
185
+    {
186
+        nCores <- length(allDataSets)
187
+    }
188
+    registerDoParallel(cores=nCores)
189
+
138 190
     # generate seeds for parallelization
139 191
     nut <- generateSeeds(chains=length(allDataSets), seed=-1)
140 192
 
141 193
     # final number of factors
142
-    nFactorFinal <- nrow(matchedPatternSets[[1]])
194
+    nFactorFinal <- nrow(consensusPatterns)
143 195
 
144 196
     # run fixed CoGAPS
145 197
     finalResult <- foreach(i=1:length(allDataSets)) %dopar%
... ...
@@ -150,7 +202,7 @@ runFinalPhase <- function(simulationName, allDataSets, matchedPatternSets, ...)
150 202
 
151 203
         # run CoGAPS with fixed patterns
152 204
         cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="")
153
-        CoGAPS(sampleD, sampleS, fixedPatterns=matchedPatternSets[[1]],
205
+        CoGAPS(sampleD, sampleS, fixedPatterns=consensusPatterns,
154 206
             nFactor=nFactorFinal, seed=nut[i], checkpointFile=cptFileName,
155 207
             whichMatrixFixed='P', ...)
156 208
 
... ...
@@ -158,11 +210,10 @@ runFinalPhase <- function(simulationName, allDataSets, matchedPatternSets, ...)
158 210
     return(finalResult)
159 211
 }
160 212
 
161
-postFinalPhase <- function(finalResult, matchedPatternSets)
213
+postFinalPhase <- function(finalResult, consensusPatterns)
162 214
 {
163
-    Aresult <- postFixed4Parallel(finalResult, matchedPatternSets[[1]])
164
-    finalResult <- list("Amean"=Aresult$A, "Asd"=Aresult$Asd,
165
-        "Pmean"=matchedPatternSets, "PbySet"=matchedPatternSets[["PBySet"]])
215
+    Aresult <- postFixed4Parallel(finalResult, consensusPatterns)
216
+    finalResult <- list("Amean"=Aresult$A, "Asd"=Aresult$Asd,"Pmean"=consensusPatterns)
166 217
     class(finalResult) <- append(class(finalResult), "CoGAPS")
167 218
     return(finalResult)
168 219
 }
... ...
@@ -11,8 +11,8 @@
11 11
 #' @param nullGenes logical indicating gene adjustment
12 12
 #' @return gene similiarity statistic
13 13
 #' @examples
14
-#' data('SimpSim')
15
-#' calcGeneGSStat(SimpSim.result$Amean, SimpSim.result$Asd, GStoGenes=GSets[[1]],
14
+#' data("SimpSim")
15
+#' calcGeneGSStat(SimpSim.result$Amean, SimpSim.result$Asd, GSGenes=GSets[[1]],
16 16
 #' numPerm=500)
17 17
 #' @export
18 18
 calcGeneGSStat  <- function(Amean, Asd, GSGenes, numPerm, Pw=rep(1,ncol(Amean)),
... ...
@@ -67,8 +67,8 @@ nullGenes=FALSE)
67 67
 #' @return A vector of length GSGenes containing the p-values of set membership
68 68
 #' for each gene containined in the set specified in GSGenes.
69 69
 #' @examples
70
-#' data('SimpSim')
71
-#' computeGeneGSProb(SimpSim.result$Amean, SimpSim.result$Asd, GStoGenes=GSets[[1]],
70
+#' data("SimpSim")
71
+#' computeGeneGSProb(SimpSim.result$Amean, SimpSim.result$Asd, GSGenes=GSets[[1]],
72 72
 #' numPerm=500)
73 73
 #' @export
74 74
 computeGeneGSProb <- function(Amean, Asd, GSGenes, Pw=rep(1,ncol(Amean)),
75 75
new file mode 100644
... ...
@@ -0,0 +1,121 @@
1
+#' cellMatchR
2
+#' @export
3
+#'
4
+#' @param Atot a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet}
5
+#' @param nSets number of parallel sets used to generate \code{Atot}
6
+#' @param cnt  number of branches at which to cut dendrogram
7
+#' @param minNS minimum of individual set contributions a cluster must contain
8
+#' @param maxNS maximum of individual set contributions a cluster must contain
9
+#' @param ignore.NA logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE.
10
+#' @param bySet logical indicating whether to return list of matched set solutions from \code{Atot}
11
+#' @param plotDen plot
12
+#' @param ... additional parameters for \code{agnes}
13
+#' @return a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each
14
+#' concensus pattern is also returned.
15
+cellMatchR <- function(Atot, nSets, cnt, minNS=NA, maxNS=NA,
16
+ignore.NA=FALSE, bySet=FALSE, plotDen=FALSE, ...)
17
+{
18
+    if (is.na(minNS))
19
+    {
20
+        minNS <- nSets / 2
21
+    }
22
+    if (is.na(maxNS))
23
+    {
24
+        maxNS <- nSets + minNS
25
+    }
26
+    if (!ignore.NA)
27
+    {
28
+        if (anyNA(Atot))
29
+        {
30
+            warning(paste("Non-sparse matrixes produced. Reducing the number",
31
+                "of patterns asked for and rerun."))
32
+        }
33
+    }
34
+    else
35
+    {
36
+        Atot <- Atot[complete.cases(Atot),]
37
+    }
38
+
39
+    corcut <- function(Atot, minNS, cnt, cluster.method)
40
+    {
41
+        corr.dist <- cor(Atot)
42
+        corr.dist <- 1-corr.dist
43
+
44
+        clust <- agnes(x=corr.dist, diss=TRUE, cluster.method)
45
+        #clust=fastcluster::hclust(dist(corr.dist))
46
+        cut <- cutree(as.hclust(clust),k=cnt)
47
+
48
+        cls <- sort(unique(cut))
49
+        cMNs <- matrix(ncol=cnt,nrow=dim(Atot)[1])
50
+        colnames(cMNs) <- cls
51
+        rownames(cMNs) <- rownames(Atot)
52
+
53
+        RtoMeanPattern <- list()
54
+        AByClust <- list()
55
+        for (i in cls)
56
+        {
57
+            if (!is.null(dim(Atot[,cut == i])))
58
+            {
59
+                if (dim(Atot[,cut == i])[2] >= minNS)
60
+                {
61
+                    cMNs[,i] <- rowMeans(Atot[,cut==i])
62
+                    AByClust[[i]] <- Atot[,cut==i]
63
+                    nIN <- sum(cut==i)
64
+                    RtoMeanPattern[[i]] <- sapply(1:nIN, function(j) 
65
+                        round(cor(x=Atot[,cut==i][,j], y=cMNs[,i]),3))
66
+                }
67
+            }
68
+        }
69
+
70
+        AByClust[sapply(AByClust,is.null)] <- NULL
71
+        RtoMeanPattern[sapply(RtoMeanPattern,is.null)] <- NULL
72
+        return(list("RtoMeanPattern"=RtoMeanPattern, "AByClust"=AByClust))
73
+    }   
74
+
75
+    cc <- corcut(Atot, minNS, cnt, cluster.method)
76
+
77
+    ### split by maxNS
78
+    indx <- which(unlist(lapply(cc$AByClust, function(x) dim(x)[1] > maxNS)))
79
+    i <- 1
80
+    while (length(indx) > 0)
81
+    { 
82
+        icc <- corcut(cc$AByClust[[indx[1]]], minNS, 2, cluster.method)
83
+        if (length(icc[[2]]) == 0)
84
+        {
85
+            indx <- indx[-1]
86
+            next
87
+        }
88
+        else
89
+        {
90
+            cc$AByClust[[indx[1]]] <- icc[[2]][[1]]
91
+            cc$RtoMeanPattern[[indx[1]]] <- icc[[1]][[1]]
92
+            if (length(icc[[2]]) > 1)
93
+            {
94
+                cc$AByClust<-append(cc$AByClust,icc[[2]][2])
95
+                cc$RtoMeanPattern<-append(cc$RtoMeanPattern,icc[[1]][2])
96
+            } 
97
+            indx <- which(unlist(lapply(cc$AByClust,function(x) dim(x)[1]>maxNS)))
98
+        }
99
+    }
100
+
101
+    #weighted.mean(AByClustDrop[[1]],RtoMPDrop[[1]])
102
+    AByCDSWavg <- t(sapply(1:length(cc$AByClust), function(z)
103
+        apply(cc$AByClust[[z]], 1, function(x)
104
+            weighted.mean(x, (cc$RtoMeanPattern[[z]])^3))))
105
+    rownames(AByCDSWavg) <- lapply(1:length(cc$AByClust), function(x)
106
+        paste("Pattern",x))
107
+    #scale As
108
+    Amax <- apply(AByCDSWavg, 1, max)
109
+    AByCDSWavgScaled <- t(sapply(1:dim(AByCDSWavg)[1], function(x)
110
+        AByCDSWavg[x,] / Amax[x]))
111
+    rownames(AByCDSWavgScaled) <- rownames(AByCDSWavg)
112
+
113
+    if (bySet)
114
+    {  
115
+        return(list("consenusAs"=t(AByCDSWavgScaled),"ABySet"=cc))
116
+    }
117
+    else
118
+    {
119
+        return(AByCDSWavgScaled)
120
+    }
121
+}
... ...
@@ -36,4 +36,4 @@ createGWCoGAPSSets <- function(D, S, nSets, simulationName)
36 36
             ".RData", sep=""));
37 37
     }
38 38
     return(simulationName)
39
-}
39
+}
40 40
\ No newline at end of file
41 41
new file mode 100644
... ...
@@ -0,0 +1,57 @@
1
+#' Create Gene Sets for scCoGAPS
2
+#' @export
3
+#'
4
+#' @description factors whole genome data into randomly generated sets for indexing
5
+#' @param D data matrix
6
+#' @param nSets number of sets to partition the data into
7
+#' @param simulationName name used to identify files created by this simulation
8
+#' @param samplingRatio vector of relative quantities to use for sampling celltypes
9
+#' @param anotionObj vector of same length as number of columns of D 
10
+#' @param path character string indicating were to save resulting data objects. default is current working dir
11
+#' @return simulationName used to identify saved files
12
+#' @examples
13
+#' data(SimpSim)
14
+#' createscCoGAPSSets(SimpSim.D, nSets=2, simulationName="example")
15
+createscCoGAPSSets <- function(D, nSets, simulationName, samplingRatio=NULL,
16
+path="", anotionObj=NULL)
17
+{
18
+    # check gene names
19
+    if (length(unique(colnames(D))) != length(colnames(D)))
20
+    {
21
+        warning("Cell identifiers not unique!")
22
+    }
23
+
24
+    # partition data by sampling random sets of cells
25
+    cells <- 1:ncol(D)
26
+    setSize <- floor(length(cells) / nSets)
27
+    for (set in 1:nSets)
28
+    {
29
+        if (is.null(samplingRatio))
30
+        {
31
+            # sample cell names
32
+            sampleSize <- ifelse(set == nSets, length(cells), setSize)
33
+            cellset <- sample(cells, sampleSize, replace=FALSE)
34
+            cells <- cells[!(cells %in% cellset)]
35
+        }
36
+        else
37
+        {
38
+            if (length(unique(anotionObj)) != length(samplingRatio))
39
+            {
40
+                warning("Not all celltypes will be sampled from.")
41
+            }
42
+            ct.indx <- lapply(unique(anotionObj), function(x) which(anotionObj == x))
43
+            cellset <- lapply(unique(anotionObj), function(x)
44
+                sample(colnames(D)[ct.indx[[x]]], samplingRatio[x],replace=TRUE))
45
+        }
46
+
47
+        # partition data
48
+        sampleD <- D[,cellset]
49
+        #log transform 
50
+        sampleD <- log2(sampleD+1)
51
+        # generate S
52
+        sampleS <- pmax(.1*sampleD, .1)
53
+        save(sampleD, sampleS, file=paste0(path,simulationName, "_partition_",
54
+            set,".RData"));
55
+    }
56
+    return(simulationName)
57
+}
... ...
@@ -8,70 +8,79 @@
8 8
 #' @param full logical indicating whether to return the ranks of each gene for each pattern
9 9
 #' @return By default a non-overlapping list of genes associated with each \code{lp}. If \code{full=TRUE} a data.frame of
10 10
 #' genes rankings with a column for each \code{lp} will also be returned.
11
+#' @export
11 12
 patternMarkers <- function(Amatrix=NA, scaledPmatrix=FALSE, Pmatrix=NA,
12 13
 threshold="all", lp=NA, full=FALSE)
13 14
 {
14
-    if(scaledPmatrix==FALSE)
15
+    if (scaledPmatrix==FALSE)
15 16
     {
16
-        if(!is.na(Pmatrix)){
17
-        pscale <- apply(Pmatrix,1,max)   # rescale p's to have max 1
18
-        Amatrix <- sweep(Amatrix, 2, pscale, FUN="*")   # rescale A in accordance with p's having max 1
17
+        if(!is.na(Pmatrix))
18
+        {
19
+            pscale <- apply(Pmatrix,1,max)   # rescale p's to have max 1
20
+            Amatrix <- sweep(Amatrix, 2, pscale, FUN="*")   # rescale A in accordance with p's having max 1
21
+        }
22
+        else
23
+        {
24
+            warning("P values must be provided if not already scaled")
19 25
         }
20
-        else(warning("P values must be provided if not already scaled"))
21 26
     }
22 27
 
23 28
     # find the A with the highest magnitude
24 29
     Arowmax <- t(apply(Amatrix, 1, function(x) x/max(x)))
25
-    pmax<-apply(Amatrix, 1, max)
26
-    # determine which genes are most associated with each pattern
27
-    ssranks<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix))#list()
28
-    ssgenes<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=NULL)
29
-    nP=dim(Amatrix)[2]
30
-    if(!is.na(lp))
30
+
31
+    if (!is.na(lp))
31 32
     {
32
-        if(length(lp)!=dim(Amatrix)[2])
33
+        if (length(lp) != dim(Amatrix)[2])
33 34
         {
34 35
             warning("lp length must equal the number of columns of the Amatrix")
35 36
         }
36
-        for (i in 1:nP)
37
-        {
38
-            sstat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp)))
39
-            ssranks[order(sstat),i] <- 1:length(sstat)
40
-            ssgenes[,i]<-names(sort(sstat,decreasing=FALSE))
41
-        }
37
+        sstat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp)))
38
+        ssranks <-rank(sstat)
39
+        ssgenes.th <- names(sort(sstat,decreasing=FALSE,na.last=TRUE))
42 40
     }
43 41
     else
44 42
     {
45
-        for(i in 1:nP)
43
+        # determine which genes are most associated with each pattern
44
+
45
+        sstat<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix))
46
+        ssranks<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix))#list()
47
+        ssgenes<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=NULL)
48
+        for (i in 1:nP)
46 49
         {
47 50
             lp <- rep(0,dim(Amatrix)[2])
48 51
             lp[i] <- 1
49
-            sstat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp)))
50
-            ssranks[order(sstat),i] <- 1:length(sstat)
51
-            ssgenes[,i]<-names(sort(sstat,decreasing=FALSE))
52
+            sstat[,i] <- unlist(apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp))))
53
+            ssranks[,i]<-rank(sstat[,i])
54
+            ssgenes[,i]<-names(sort(sstat[,i],decreasing=FALSE,na.last=TRUE))
55
+        }
56
+        if (threshold=="cut")
57
+        {
58
+            geneThresh <- sapply(1:nP,function(x) min(which(ssranks[ssgenes[,x],x] > apply(ssranks[ssgenes[,x],],1,min))))
59
+            ssgenes.th <- sapply(1:nP,function(x) ssgenes[1:geneThresh[x],x])
60
+        }
61
+        else if (threshold=="all")
62
+        {
63
+            pIndx<-unlist(apply(sstat,1,which.min))
64
+            gBYp <- list()
65
+            for(i in sort(unique(pIndx)))
66
+            {
67
+                gBYp[[i]]<-sapply(strsplit(names(pIndx[pIndx==i]),"[.]"),function(x) x[[1]][1])
68
+            }
69
+            ssgenes.th <- lapply(1:max(sort(unique(pIndx))), function(x)
70
+                ssgenes[which(ssgenes[,x] %in% gBYp[[x]]),x])
71
+        }
72
+        else
73
+        {
74
+            stop("Threshold arguement not viable option")
52 75
         }
53 76
     }
54
-
55
-    if(threshold=="cut")
56
-    {
57
-        geneThresh <- sapply(1:nP,function(x) min(which(ssranks[ssgenes[,x],x] > apply(ssranks[ssgenes[,x],],1,min))))
58
-        ssgenes.th <- sapply(1:nP,function(x) ssgenes[1:geneThresh[x],x])
59
-        #geneThresh <- apply(sweep(ssranks,1,t(apply(ssranks, 1, min)),"-"),2,function(x) which(x==0))
60
-        #ssgenes.th <- lapply(geneThresh,names)
61
-    }
62
-    else if (threshold=="all")
77
+    if (full)
63 78
     {
64
-        pIndx<-apply(ssranks,1,which.min)
65
-        gBYp <- lapply(sort(unique(pIndx)),function(x) names(pIndx[pIndx==x]))
66
-        ssgenes.th <- lapply(1:nP, function(x) ssgenes[which(ssgenes[,x] %in% gBYp[[x]]),x])
79
+        return(list("PatternMarkers"=ssgenes.th, "PatternRanks"=ssranks,
80
+            "PatternMarkerScores"=sstat))
67 81
     }
68 82
     else
69 83
     {
70
-        stop("Threshold arguement not viable option")
71
-    }
72
-
73
-    if (full)
74
-        return(list("PatternMarkers"=ssgenes.th,"PatternRanks"=ssranks))
75
-    else
76 84
         return("PatternMarkers"=ssgenes.th)
85
+    }
77 86
 }
... ...
@@ -1,129 +1,129 @@
1 1
 #' patternMatch4Parallel
2 2
 #'
3
-#' @param Ptot a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet}
3
+#' @param Ptot a matrix containing the total by set estimates of Pmean output
4
+#' from \code{reOrderBySet}
4 5
 #' @param nSets number of parallel sets used to generate \code{Ptot}
5 6
 #' @param cnt  number of branches at which to cut dendrogram
6 7
 #' @param minNS minimum of individual set contributions a cluster must contain
8
+#' @param maxNS max of individual set contributions a cluster must contain.
9
+#' default is nSets+minNS
7 10
 #' @param cluster.method the agglomeration method to be used for clustering
8
-#' @param ignore.NA logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE.
9
-#' @param bySet logical indicating whether to return list of matched set solutions from \code{Ptot}
11
+#' @param ignore.NA logical indicating whether or not to ignore NAs from
12
+#' potential over dimensionalization. Default is FALSE.
13
+#' @param bySet logical indicating whether to return list of matched set
14
+#' solutions from \code{Ptot}
10 15
 #' @param ... additional parameters for \code{agnes}
11
-#' @return a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each
16
+#' @return a matrix of concensus patterns by samples. If \code{bySet=TRUE} then
17
+#' a list of the set contributions to each
12 18
 #' concensus pattern is also returned.
13 19
 #' @seealso \code{\link{agnes}}
14
-patternMatch4Parallel <- function(Ptot, nSets, cnt, minNS, 
20
+#' @export
21
+patternMatch4Parallel <- function(Ptot, nSets, cnt, minNS=NA, maxNS=NULL, 
15 22
 cluster.method="complete", ignore.NA=FALSE, bySet=FALSE, ...)
16 23
 {
17
-    if (!is.null(minNS))
18
-        minNS=nSets/2
19
-
24
+    if (is.na(minNS))
25
+    {
26
+        minNS <- ceiling(nSets / 2)
27
+    }
28
+    if (is.null(maxNS))
29
+    {
30
+        maxNS=nSets+minNS
31
+    }
20 32
     if (ignore.NA==FALSE & anyNA(Ptot))
21
-        warning('Non-sparse matrixes produced. Reducing the number of patterns asked for and rerun.')
33
+    {
34
+        warning(paste("Non-sparse matrixes produced. Reducing the number of",
35
+            "patterns asked for and rerun."))
36
+    }
22 37
     if (ignore.NA==TRUE)
38
+    {
23 39
         Ptot <- Ptot[complete.cases(Ptot),]
40
+    }
24 41
 
25
-    # corr dist
26
-    corr.dist=cor(t(Ptot))
27
-    corr.dist=1-corr.dist
28
-    # cluster
29
-    #library(cluster)
30
-    clust=agnes(x=corr.dist,diss=TRUE,method=cluster.method)
31
-    cut=cutree(as.hclust(clust),k=cnt)
32
-    #save.image(file=paste("CoGAPS.",nP,"P.",nS,"Set.CorrClustCut",cnt,".RData"))
42
+    corcut <-function(Ptot,minNS,cnt,cluster.method)
43
+    {
44
+        # corr dist
45
+        corr.dist <- cor(t(Ptot))
46
+        corr.dist <- 1-corr.dist
47
+        # cluster
48
+        #library(cluster)
49
+        clust <- agnes(x=corr.dist, diss=TRUE, method=cluster.method)
50
+        cut <- cutree(as.hclust(clust), k=cnt)
51
+        #save.image(file=paste("CoGAPS.", nP, "P.", nS, "Set.CorrClustCut",
52
+            #cnt,".RData"))
33 53
 
34
-    #drop n<4 and get weighted Avg
35
-    cls=sort(unique(cut))
36
-    cMNs=matrix(nrow=cnt,ncol=dim(Ptot)[2])
37
-    rownames(cMNs)=cls
38
-    colnames(cMNs)=colnames(Ptot)
54
+        cls <- sort(unique(cut))
55
+        cMNs <- matrix(nrow=cnt, ncol=dim(Ptot)[2])
56
+        rownames(cMNs) <- cls
57
+        colnames(cMNs) <- colnames(Ptot)
39 58
 
40
-    RtoMeanPattern <- list()
41
-    PByClust <- list()
42
-    for(i in cls)
43
-    {
44
-        if (is.null(dim(Ptot[cut == i, ]))==TRUE)
59
+        RtoMeanPattern <- list()
60
+        PByClust <- list()
61
+        for (i in cls)
45 62
         {
46
-            cMNs[i,] <- Ptot[cut == i, ]
47
-            RtoMeanPattern[[i]] <- rep(1,length(Ptot[cut == i, ]))
48
-            PByClust[[i]] <- t(as.matrix(Ptot[cut == i, ]))
49
-        }
50
-        else
51
-        {
52
-            cMNs[i,]=colMeans(Ptot[cut==i,])
53
-            PByClust[[i]] <- Ptot[cut==i,]
54
-            nIN=sum(cut==i)
55
-            RtoMeanPattern[[i]] <- sapply(1:nIN,function(j) {round(cor(x=Ptot[cut==i,][j,],y=cMNs[i,]),3)})
63
+            if (!is.null(dim(Ptot[cut == i,])))
64
+            {
65
+                if (dim(Ptot[cut == i,])[1] >= minNS)
66
+                {
67
+                    cMNs[i,] <- colMeans(Ptot[cut==i,])
68
+                    PByClust[[i]] <- Ptot[cut==i,]
69
+                    nIN <- sum(cut==i)
70
+                    RtoMeanPattern[[i]] <- sapply(1:nIN, function(j)
71
+                        round(cor(x=Ptot[cut==i,][j,], y=cMNs[i,]),3)
72
+                    )
73
+                }
74
+            }
56 75
         }
57
-    }
58 76
 
59
-    #drop n<minNS 
60
-    PByClustDrop <- list()
61
-    RtoMPDrop <- list()
62
-    for(i in cls)
63
-    {
64
-        if (is.null(dim(PByClust[[i]])) == TRUE)
65
-            next
66
-        if (dim(PByClust[[i]])[1] < minNS)
67
-        {
68
-            next
69
-        }
70
-        else
71
-        {
72
-            #indx <- which(RtoMeanPattern[[i]]>.7,arr.ind = TRUE)
73
-            PByClustDrop <- append(PByClustDrop,list(PByClust[[i]]))
74
-            RtoMPDrop <- append(RtoMPDrop,list(RtoMeanPattern[[i]]))
75
-        }
76
-    }
77
+        PByClust[sapply(PByClust,is.null)] <- NULL
78
+        RtoMeanPattern[sapply(RtoMeanPattern,is.null)] <- NULL
79
+        return(list("RtoMeanPattern"=RtoMeanPattern, "PByClust"=PByClust))
80
+    }    
77 81
 
78
-    ### split by corr  (build in drop if below minNS)
79
-    PByCDS <- list()
80
-    RtoMPDS <- list()
81
-    for (j in 1:length(PByClustDrop))
82
+    cc <- corcut(Ptot,minNS,cnt,cluster.method)
83
+
84
+    ### split by maxNS
85
+    indx <- which(unlist(lapply(cc$PByClust,function(x) dim(x)[1]>maxNS)))
86
+    while(length(indx) > 0)
82 87
     {
83
-        if (is.null(dim(PByClustDrop[[j]]))==TRUE)
88
+        icc <- corcut(cc$PByClust[[indx[1]]],minNS,2,cluster.method)
89
+        if (length(icc[[2]])==0)
84 90
         {
85
-            next
91
+          indx <- indx[-1]
92
+          next
86 93
         }
87
-        if (dim(PByClustDrop[[j]])[1]<minNS+nSets)
88
-        {
89
-            PByCDS <- append(PByCDS,PByClustDrop[j])
90
-            RtoMPDS <- append(RtoMPDS,RtoMPDrop[j])
91
-        }
92
-        if (dim(PByClustDrop[[j]])[1]>=minNS+nSets)
94
+        else
93 95
         {
94
-            corr.distPBCD=cor(t(PByClustDrop[[j]]))
95
-            corr.distPBCD=1-corr.distPBCD
96
-            clustPBCD=agnes(x=corr.distPBCD,diss=TRUE,method="complete")
97
-            cutPBCD=cutree(as.hclust(clustPBCD),k=2)
98
-            g1 <- PByClustDrop[[j]][cutPBCD==1,]
99
-            PByCDS <- append(PByCDS,list(g1))
100
-            RtoMPDS <- append(RtoMPDS,list(sapply(1:dim(g1)[1],function(z) round(cor(x=g1[z,],y=colMeans(PByClustDrop[[j]][cutPBCD==1,])),3))))
101
-            g2 <- PByClustDrop[[j]][cutPBCD==2,]
102
-            if (is.null(dim(g2)[1])==FALSE)
96
+            cc$PByClust[[indx[1]]] <- icc[[2]][[1]]
97
+            cc$RtoMeanPattern[[indx[1]]] <- icc[[1]][[1]]
98
+            if (length(icc[[2]]) > 1)
103 99
             {
104
-                PByCDS <- append(PByCDS,list(g2))
105
-                RtoMPDS <- append(RtoMPDS,list(sapply(1:dim(g2)[1],function(z) round(cor(x=g2[z,],y=colMeans(PByClustDrop[[j]][cutPBCD==2,])),3))))
106
-            }
100
+                cc$PByClust <- append(cc$PByClust,icc[[2]][2])
101
+                cc$RtoMeanPattern <- append(cc$RtoMeanPattern,icc[[1]][2])
102
+            } 
103
+            indx <- which(unlist(lapply(cc$PByClust, function(x)
104
+                dim(x)[1] > maxNS)))
107 105
         }
108 106
     }
109 107
 
110
-    #weighted.mean(PByClustDrop[[1]],RtoMPDrop[[1]])
111
-    PByCDSWavg<- t(sapply(1:length(PByCDS),function(z) apply(PByCDS[[z]],2,function(x) weighted.mean(x,(RtoMPDS[[z]])^3))))
112
-    rownames(PByCDSWavg) <- lapply(1:length(PByCDS),function(x) paste("Pattern",x))
108
+    #weighted.mean
109
+    PByCDSWavg <- t(sapply(1:length(cc$PByClust), function(z)
110
+        apply(cc$PByClust[[z]],2, function(x)
111
+            weighted.mean(x, (cc$RtoMeanPattern[[z]])^3))))
112
+    rownames(PByCDSWavg) <- lapply(1:length(cc$PByClust), function(x)
113
+        paste("Pattern", x))
113 114
 
114 115
     #scale ps
115 116
     Pmax <- apply(PByCDSWavg,1,max)
116
-    PByCDSWavgScaled <- t(sapply(1:dim(PByCDSWavg)[1],function(x) PByCDSWavg[x,]/Pmax[x]))
117
+    PByCDSWavgScaled <- t(sapply(1:dim(PByCDSWavg)[1], function(x)
118
+        PByCDSWavg[x,] / Pmax[x]))
117 119
     rownames(PByCDSWavgScaled) <- rownames(PByCDSWavg)
118 120
 
119
-    if(bySet)
121
+    if (bySet)
120 122
     {
121
-        # return by set and final
122
-        PBySet<-PByCDS
123
-        return(list("consenusPatterns"=PByCDSWavgScaled,"PBySet"=PBySet,"RtoMPDS"=RtoMPDS))
123
+        return(list("consenusPatterns"=PByCDSWavgScaled, "PBySet"=cc))
124 124
     }
125 125
     else
126 126
     {
127
-        return(PByCDSWavgScaled)
127
+        return("consenusPatterns"=PByCDSWavgScaled)
128 128
     }
129 129
 }
... ...
@@ -13,7 +13,7 @@
13 13
 #' @return heatmap of the \code{data} values for the \code{patternMarkers}
14 14
 #' @seealso  \code{\link{heatmap.2}}
15 15
 plotPatternMarkers <- function(data=NA, patternMarkers=NA, patternPalette=NA,
16
-sampleNames=NA, samplePalette=NULL, colDenogram=TRUE, heatmapCol="bluered",
16
+sampleNames=NA, samplePalette=NULL, colDenogram=TRUE, heatmapCol,
17 17
 scale='row', ...)
18 18
 {
19 19
     if (is.null(samplePalette))
... ...
@@ -1,27 +1,61 @@
1 1
 #' Post Processing of Parallel Output
2 2
 #'
3 3
 #' @param AP.fixed output of parallel gapsMapRun calls with same FP
4
-#' @param setPs data.frame with rows giving fixed patterns for P used as input
4
+#' @param setValues data.frame with rows giving fixed patterns for P used as input
5 5
 #' for gapsMapRun
6
+#' @param setMatrix which matrix, A or P
6 7
 #' @return list of two data.frames containing the A matrix estimates or their
7 8
 #' corresponding standard deviations from output of parallel CoGAPS
8
-postFixed4Parallel <- function(AP.fixed, setPs)
9
+postFixed4Parallel <- function(AP.fixed, setValues, setMatrix="P")
9 10
 {
10
-    ASummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Amean))
11
-    Asd <- do.call(rbind,lapply(AP.fixed, function(x) x$Asd))
12
-    #PSummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Pmean))
13
-    PSummary <- AP.fixed[[1]]$Pmean
11
+    if (setMatrix=="P")
12
+    {
13
+        ASummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Amean))
14
+        Asd <- do.call(rbind,lapply(AP.fixed, function(x) x$Asd))
15
+        #PSummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Pmean))
16
+        PSummary <- AP.fixed[[1]]$Pmean
14 17
 
15
-    Pmax <- apply(PSummary,1,max)
16
-    Pneu <- sweep(PSummary,1,Pmax,FUN="/")
17
-    Aneu <- sweep(ASummary,2,Pmax,FUN="*")
18
+        Pmax <- apply(PSummary,1,max)
19
+        Pneu <- sweep(PSummary,1,Pmax,FUN="/")
20
+        Aneu <- sweep(ASummary,2,Pmax,FUN="*")
18 21
 
19
-    X <- apply(Pneu,1,range)
20
-    Y <- apply(setPs,1,range)
21
-    colnames(X) <- colnames(Y)
22
-    if (all.equal(X,Y,tolerance=0.01) != TRUE)
23
-        warning("Patterns do not match fixed values.")
22
+        X <- apply(Pneu,1,range)
23
+        Y <- apply(setValues,1,range)
24
+        colnames(X) <- colnames(Y)
25
+        if (!all.equal(X,Y,tolerance=0.01))
26
+        {
27
+            warning("Patterns do not match fixed values.")
28
+        }
29
+
30
+        As4fixPs<-list("A"=Aneu,"Asd"=Asd)
31
+        return(As4fixPs)
32
+    }
33
+    else if (setMatrix=="A")
34
+    {
35
+        PSummary <- do.call(cbind, lapply(AP.fixed, function(x) x$Pmean))
36
+        Psd <- do.call(cbind, lapply(AP.fixed, function(x) x$Psd))
37
+        #PSummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Pmean))
38
+        ASummary <- AP.fixed[[1]]$Amean
39
+
40
+        Amax <- apply(ASummary,2,max)
41
+        Aneu <- sweep(ASummary,2,Amax,FUN="/")
42
+        Pneu <- sweep(PSummary,1,Amax,FUN="*")
43
+
44
+        X <- apply(Aneu, 2, range)
45
+        Y <- apply(setValues, 2, range)
46
+        rownames(X) <- rownames(Y)
47
+        colnames(X) <- colnames(Y)
48
+        if (!all.equal(X, Y, tolerance=0.01))
49
+        {
50
+            warning("As do not match fixed values.")
51
+        }
52
+
53
+        Ps4fixAs <- list("P"=Pneu,"Psd"=Psd)
54
+        return(Ps4fixAs)
55
+    }
56
+    else
57
+    {
58
+        warning("setMatrix can only take values of 'A' or 'P'")
59
+    }
24 60
 
25
-    As4fixPs<-list("A"=Aneu,"Asd"=Asd)
26
-    return(As4fixPs)
27 61
 }
28 62
new file mode 100644
... ...
@@ -0,0 +1,26 @@
1
+#' Post Processing of Parallel Output
2
+#'
3
+#' @param AP.fixed output of parallel gapsMapRun calls with same FP
4
+#' @param setAs data.frame with rows giving fixed patterns for A used as input
5
+#' for gapsMapRun
6
+#' @return list of two data.frames containing the A matrix estimates or their
7
+#' corresponding standard deviations from output of parallel CoGAPS
8
+postFixed4SC <- function(AP.fixed, setAs)
9
+{
10
+    ASummary <- AP.fixed[[1]]$Amean
11
+    PSummary <- do.call(cbind,lapply(AP.fixed, function(x) x$Pmean))
12
+    Psd <- do.call(cbind,lapply(AP.fixed, function(x) x$Psd))
13
+
14
+    Amax <- apply(ASummary,2,max)
15
+    Aneu <- sweep(ASummary,2,Amax,FUN="/")
16
+    Pneu <- sweep(PSummary,1,Amax,FUN="*")
17
+
18
+    X <- apply(Aneu,2,range)
19
+    Y <- apply(setAs,2,range)
20
+    colnames(X) <- colnames(Y)
21
+    if (all.equal(X,Y,tolerance=0.01) != TRUE)
22
+        warning("Patterns do not match fixed values.")
23
+
24
+    Ps4fixAs<-list("P"=Pneu,"Psd"=Psd)
25
+    return(Ps4fixAs)
26
+}
0 27
new file mode 100644
... ...
@@ -0,0 +1,206 @@
1
+#' scCoGAPS
2
+#' @export
3
+#'
4
+#' @details calls the C++ MCMC code and performs Bayesian
5
+#' matrix factorization returning the two matrices that reconstruct
6
+#' the data matrix for whole genome data;
7
+#' @param simulationName name of this simulation
8
+#' @param nFactor number of patterns (basis vectors, metagenes), which must be
9
+#' greater than or equal to the number of rows of FP
10
+#' @param nCores number of cores for parallelization. If left to the default NA, nCores = nSets.
11
+#' @param cut number of branches at which to cut dendrogram used in patternMatch4singleCell
12
+#' @param minNS minimum of individual set contributions a cluster must contain
13
+#' @param manualMatch logical indicating whether or not to stop after initial phase for manual pattern matching
14
+#' @param consensusAs fixed pattern matrix to be used to ensure reciprocity of A weights accross sets 
15
+#' @param ... additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}
16
+#' @return list of A and P estimates
17
+scCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, manualMatch=FALSE, consensusAs=NULL, ...)
18
+{
19
+    if (!is.null(list(...)$checkpointFile))
20
+    {
21
+        stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter")
22
+    }
23
+
24
+    if (is.null(consensusAs))
25
+    {
26
+        allDataSets <- sc_preInitialPhase(simulationName, nCores)
27
+        initialResult <- sc_runInitialPhase(simulationName, allDataSets, nFactor, ...)
28
+        if (manualMatch)
29
+        {
30
+            saveRDS(initialResult,file=paste(simulationName,"_initial.rds", sep=""))
31
+            stop("Please provide concensus gene weights upon restarting.")
32
+        }
33
+        matchedAmplitudes <- sc_postInitialPhase(initialResult, length(allDataSets), cut, minNS)
34
+        consensusAs <- matchedAmplitudes[[1]]
35
+        save(consensusAs, file=paste(simulationName, "_matched_As.RData", sep=""))
36
+    } 
37
+    finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, nCores, ...)
38
+    return(sc_postFinalPhase(finalResult, consensusAs))
39
+}
40
+
41
+#' Restart a scCoGAPS run from a Checkpoint
42
+#'
43
+#' @inheritParams GWCoGAPS
44
+#' @return list of A and P estimates
45
+#' @importFrom utils file_test
46
+scCoGapsFromCheckpoint <- function(simulationName, nCores, cut=NA, minNS=NA, ...)
47
+{
48
+    # find data files
49
+    allDataSets <- sc_preInitialPhase(simulationName, nCores)
50
+
51
+    # figure out phase from file signature
52
+    initialCpts <- list.files(full.names=TRUE, pattern=paste(simulationName,
53
+            "_initial_cpt_[0-9]+.out", sep=""))
54
+    finalCpts <- list.files(full.names=TRUE, pattern=paste(simulationName,
55
+            "_final_cpt_[0-9]+.out", sep=""))
56
+
57
+    if (length(finalCpts))
58
+    {
59
+        finalResult <- foreach(i=1:length(allDataSets)) %dopar%
60
+        {
61
+            # load data set and shift values so gene minimum is zero
62
+            load(allDataSets[[i]])
63
+            sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x)
64
+                pmin(0,min(x))))
65
+    
66
+            # run CoGAPS with fixed patterns
67
+            cptFileName <- paste(simulationName, "_final_cpt_", i, ".out",
68
+                sep="")
69
+            CoGapsFromCheckpoint(sampleD, sampleS, cptFileName)
70
+        }
71
+        load(paste(simulationName, "_matched_As.RData", sep=""))
72
+    }
73
+    else if (file_test("-f", paste(simulationName, "_matched_As.RData", sep="")))
74
+    {
75
+        load(paste(simulationName, "_matched_As.RData", sep=""))
76
+        finalResult <- sc_runFinalPhase(simulationName, allDataSets,
77
+            consensusAs, nCores, ...)
78
+    }
79
+    else if (length(initialCpts))
80
+    {
81
+        # initial phase - always needs to be run to get matchedAmplitudes
82
+        initialResult <- foreach(i=1:length(allDataSets)) %dopar%
83
+        {
84
+            # load data set and shift values so gene minimum is zero
85
+            load(allDataSets[[i]])
86
+            sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x)
87
+                pmin(0,min(x))))
88
+
89
+            # run CoGAPS from checkpoint
90
+            cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out",
91
+                sep="")
92
+            CoGapsFromCheckpoint(sampleD, sampleS, cptFileName)
93
+        }
94
+        matchedAmplitudes <- sc_postInitialPhase(initialResult,
95
+            length(allDataSets), cut, minNS)
96
+        consensusAs <- matchedAmplitudes[[1]]
97
+        save(consensusAs, file=paste(simulationName, "_matched_As.RData",
98
+            sep=""))
99
+        finalResult <- sc_runFinalPhase(simulationName, allDataSets,
100
+            consensusAs, nCores, ...)
101
+    }
102
+    else
103
+    {
104
+        stop("no checkpoint files found")
105
+    }
106
+    return(sc_postFinalPhase(finalResult, consensusAs))
107
+}
108
+
109
+sc_preInitialPhase <- function(simulationName, nCores)
110
+{
111
+    # find data files
112
+    fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="")
113
+    allDataSets <- list.files(full.names=TRUE, pattern=fileSig)
114
+
115
+    # establish the number of cores that we are able to use
116
+    if (is.na(nCores))
117
+    {
118
+        nCores <- length(allDataSets)
119
+    }
120
+    registerDoParallel(cores=nCores)
121
+    return(allDataSets)
122
+}
123
+
124
+sc_runInitialPhase <- function(simulationName, allDataSets, nFactor, ...)
125
+{
126
+    #generate seeds for parallelization
127
+    nut <- generateSeeds(chains=length(allDataSets), seed=-1)
128
+
129
+    # run CoGAPS for each set
130
+    initialResult <- foreach(i=1:length(allDataSets)) %dopar%
131
+    {
132
+        # load data set and shift values so gene minimum is zero
133
+        load(allDataSets[[i]])
134
+        sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x))))
135
+
136
+        # run CoGAPS without any fixed patterns
137
+        cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="")
138
+        CoGAPS(sampleD, sampleS, nFactor=nFactor, seed=nut[i],
139
+            checkpointFile=cptFileName, singleCellRNASeq=TRUE, ...)
140
+    }
141
+    return(initialResult)
142
+}
143
+
144
+sc_postInitialPhase <- function(initialResult, nSets, cut, minNS)
145
+{
146
+    nFactor <- ncol(initialResult[[1]]$Amean)
147
+    BySet <- reOrderBySet(AP=initialResult, nFactor=nFactor, nSets=nSets,match="A")
148
+
149
+    #run postpattern match function
150
+    if (is.na(cut))
151
+    {
152
+        cut <- nFactor
153
+    }
154
+
155
+    return(cellMatchR(Atot=BySet$A, nSets=nSets, cnt=cut, minNS=minNS,
156
+        bySet=TRUE))
157
+}
158
+
159
+sc_runFinalPhase <- function(simulationName, allDataSets, consensusAs, nCores, ...)
160
+{    
161
+    if (length(dim(consensusAs)) != 2)
162
+    {
163
+        stop("consensusAs must be a matrix")
164
+    }
165
+
166
+    # find data files if providing consensus patterns
167
+    fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="")
168
+    allDataSets <- list.files(full.names=TRUE, pattern=fileSig)
169
+
170
+    if (is.na(nCores))
171
+    {
172
+        nCores <- length(allDataSets)
173
+    }
174
+    registerDoParallel(cores=nCores)
175
+
176
+    # generate seeds for parallelization
177
+    nut <- generateSeeds(chains=length(allDataSets), seed=-1)
178
+
179
+    # final number of factors
180
+    nFactorFinal <- ncol(consensusAs)
181
+
182
+    # run fixed CoGAPS
183
+    finalResult <- foreach(i=1:length(allDataSets)) %dopar%
184
+    {
185
+        # load data set and shift values so gene minimum is zero
186
+        load(allDataSets[[i]])
187
+        sampleD <- sweep(sampleD, 1, apply(sampleD, 1, function(x) pmin(0,min(x))))
188
+
189
+        # run CoGAPS with fixed patterns
190
+        cptFileName <- paste(simulationName, "_final_cpt_", i, ".out", sep="")
191
+        CoGAPS(sampleD, sampleS, fixedPatterns=consensusAs,
192
+            nFactor=nFactorFinal, seed=nut[i], checkpointFile=cptFileName,
193
+            whichMatrixFixed='A', singleCellRNASeq=TRUE, ...)
194
+
195
+    }
196
+    return(finalResult)
197
+}
198
+
199
+sc_postFinalPhase <- function(finalResult, consensusAs)
200
+{
201
+    Aresult <- postFixed4Parallel(finalResult, consensusAs, setMatrix="A")
202
+    finalResult <- list("Pmean"=Aresult$P, "Psd"=Aresult$Psd,"Amean"=consensusAs)
203
+    class(finalResult) <- append(class(finalResult), "CoGAPS")
204
+    return(finalResult)
205
+}
206
+
... ...
@@ -1,5 +1,8 @@
1 1
 # CoGAPS
2 2
 
3
+[![Bioc](https://bioconductor.org/images/logo_bioconductor.gif)](https://bioconductor.org/packages/CoGAPS)
4
+[![downloads](https://bioconductor.org/shields/downloads/CancerInSilico.svg)](https://bioconductor.org/packages/CoGAPS)
5
+
3 6
 [![Travis-CI Build Status](https://travis-ci.org/CoGAPS/CoGAPS.svg?branch=master)](https://travis-ci.org/CoGAPS/CoGAPS)
4 7
 [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/CoGAPS/CoGAPS?branch=master&svg=true)](https://ci.appveyor.com/project/CoGAPS/CoGAPS)
5 8
 [![Coverage Status](https://img.shields.io/codecov/c/github/CoGAPS/CoGAPS/master.svg)](https://codecov.io/github/CoGAPS/CoGAPS?branch=master)
... ...
@@ -42,6 +42,5 @@ artifacts:
42 42
 
43 43
 notifications:
44 44
   - provider: Slack
45
-    auth_token:
46
-        secure: kwWklRUn6l8enJHTEE1OtUmlx+qMLfr3WBWG/fjConXijD8BAAZb2I+lL7cdoEIACu3w==
47
-    channel: cogaps
45
+    incoming_webhook:
46
+        secure: https://hooks.slack.com/services/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
48 47
\ No newline at end of file
... ...
@@ -136,8 +136,7 @@ AtomicProposal AtomicSupport::proposeExchange() const
136 136
 
137 137
     // calculate new mass
138 138
     float pupper = gaps::random::p_gamma(mass1 + mass2, 2.f, 1.f / mLambda);
139
-    float newMass = gaps::random::q_gamma(gaps::random::uniform(0.f, pupper),
140
-        2.f, 1.f / mLambda);
139
+    float newMass = gaps::random::inverseGammaSample(0.f, pupper, 2.f, 1.f / mLambda);
141 140
 
142 141
     // calculate mass changes
143 142
     float delta1 = mass1 > mass2 ? newMass - mass1 : mass2 - newMass;
... ...
@@ -9,8 +9,7 @@ CoGAPS(D, S, nFactor = 7, nEquil = 1000, nSample = 1000,
9 9
   maxGibbmassA = 100, maxGibbmassP = 100, seed = -1, messages = TRUE,
10 10
   singleCellRNASeq = FALSE, whichMatrixFixed = "N",
11 11
   fixedPatterns = matrix(0), checkpointInterval = 0,
12
-  checkpointFile = "gaps_checkpoint.out", pumpThreshold = "unique",
13
-  nPumpSamples = 0, ...)
12
+  checkpointFile = "gaps_checkpoint.out", ...)
14 13
 }
15 14
 \arguments{
16 15
 \item{D}{data matrix}
... ...
@@ -52,10 +51,6 @@ the fixed patterns}
52 51
 
53 52
 \item{checkpointFile}{name of the checkpoint file}
54 53
 
55
-\item{pumpThreshold}{type of threshold for pump statistic}
56
-
57
-\item{nPumpSamples}{number of samples used in pump statistic}
58
-
59 54
 \item{...}{keeps backwards compatibility with arguments from older versions}
60 55
 }
61 56
 \value{
... ...
@@ -4,7 +4,8 @@
4 4
 \alias{GWCoGAPS}
5 5
 \title{GWCoGAPS}
6 6
 \usage{
7
-GWCoGAPS(simulationName, nFactor, nCores = NA, cut = NA, minNS = NA, ...)
7
+GWCoGAPS(simulationName, nFactor, nCores = NA, cut = NA, minNS = NA,
8
+  manualMatch = FALSE, consensusPatterns = NULL, ...)
8 9
 }
9 10
 \arguments{
10 11
 \item{simulationName}{name of this simulation}
... ...
@@ -18,6 +19,10 @@ greater than or equal to the number of rows of FP}
18 19
 
19 20
 \item{minNS}{minimum of individual set contributions a cluster must contain}
20 21
 
22
+\item{manualMatch}{logical indicating whether or not to stop after initial phase for manual pattern matching}
23
+
24
+\item{consensusPatterns}{fixed pattern matrix to be used to ensure reciprocity of A weights accross sets}
25
+
21 26
 \item{...}{additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}}
22 27
 }
23 28
 \value{
... ...
@@ -35,7 +40,7 @@ the data matrix for whole genome data;
35 40
 data(SimpSim)
36 41
 sim_name <- "example"
37 42
 createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name)
38
-result <- GWCoGAPS(sim_name, nFactor=3, nEquil=1000, nSample=1000)
43
+result <- GWCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200)
39 44
 }
40 45
 \seealso{
41 46
 \code{\link{gapsRun}}, \code{\link{patternMatch4Parallel}}, and \code{\link{gapsMapRun}}
... ...
@@ -4,8 +4,7 @@
4 4
 \alias{GWCoGapsFromCheckpoint}
5 5
 \title{Restart a GWCoGaps Run from Checkpoint}
6 6
 \usage{
7
-GWCoGapsFromCheckpoint(simulationName, nCores = NA, cut = NA, minNS = NA,
8
-  ...)
7
+GWCoGapsFromCheckpoint(simulationName, nCores, cut = NA, minNS = NA, ...)
9 8
 }
10 9
 \arguments{
11 10
 \item{simulationName}{name of this simulation}
... ...
@@ -24,4 +23,11 @@ list of A and P estimates
24 23
 \description{
25 24
 Restart a GWCoGaps Run from Checkpoint
26 25
 }
26
+\examples{
27
+data(SimpSim)
28
+sim_name <- "example"
29
+createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name)
30
+trash <- GWCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200)
31
+result <- GWCoGapsFromCheckpoint(sim_name, 2)
32
+}
27 33
 
... ...
@@ -32,8 +32,8 @@ listed in a gene set behaves like other genes in the set within
32 32
 the given data set
33 33
 }
34 34
 \examples{
35
-data('SimpSim')
36
-calcGeneGSStat(SimpSim.result$Amean, SimpSim.result$Asd, GStoGenes=GSets[[1]],
35
+data("SimpSim")
36
+calcGeneGSStat(SimpSim.result$Amean, SimpSim.result$Asd, GSGenes=GSets[[1]],
37 37
 numPerm=500)
38 38
 }
39 39
 
40 40
new file mode 100644
... ...
@@ -0,0 +1,36 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/cellMatchR.R
3
+\name{cellMatchR}
4
+\alias{cellMatchR}
5
+\title{cellMatchR}
6
+\usage{
7
+cellMatchR(Atot, nSets, cnt, minNS = NA, maxNS = NA, ignore.NA = FALSE,
8
+  bySet = FALSE, plotDen = FALSE, ...)
9
+}
10
+\arguments{
11
+\item{Atot}{a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet}}
12
+
13
+\item{nSets}{number of parallel sets used to generate \code{Atot}}
14
+
15
+\item{cnt}{number of branches at which to cut dendrogram}
16
+
17
+\item{minNS}{minimum of individual set contributions a cluster must contain}
18
+
19
+\item{maxNS}{maximum of individual set contributions a cluster must contain}
20
+
21
+\item{ignore.NA}{logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE.}
22
+
23
+\item{bySet}{logical indicating whether to return list of matched set solutions from \code{Atot}}
24
+
25
+\item{plotDen}{plot}
26
+
27
+\item{...}{additional parameters for \code{agnes}}
28
+}
29
+\value{
30
+a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each
31
+concensus pattern is also returned.
32
+}
33
+\description{
34
+cellMatchR
35
+}
36
+
... ...
@@ -35,8 +35,8 @@ comparing the inferred activity of that gene to the average activity of the
35 35
 set.
36 36
 }
37 37
 \examples{
38
-data('SimpSim')
39
-computeGeneGSProb(SimpSim.result$Amean, SimpSim.result$Asd, GStoGenes=GSets[[1]],
38
+data("SimpSim")
39
+computeGeneGSProb(SimpSim.result$Amean, SimpSim.result$Asd, GSGenes=GSets[[1]],
40 40
 numPerm=500)
41 41
 }
42 42
 
43 43
new file mode 100644
... ...
@@ -0,0 +1,33 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/createscCoGAPSSets.R
3
+\name{createscCoGAPSSets}
4
+\alias{createscCoGAPSSets}
5
+\title{Create Gene Sets for scCoGAPS}
6
+\usage{
7
+createscCoGAPSSets(D, nSets, simulationName, samplingRatio = NULL,
8
+  path = "", anotionObj = NULL)
9
+}