... | ... |
@@ -1,9 +1,10 @@ |
1 | 1 |
Package: CoGAPS |
2 |
-Version: 2.7.0 |
|
3 |
-Date: 2014-08-23 |
|
2 |
+Version: 2.99.0 |
|
3 |
+Date: 2018-04-24 |
|
4 | 4 |
Title: Coordinated Gene Activity in Pattern Sets |
5 | 5 |
Author: Thomas Sherman, Wai-shing Lee, Conor Kelton, Ondrej Maxian, Jacob Carey, |
6 |
- Genevieve Stein-O'Brien, Michael Considine, Maggie Wodicka, John Stansfield, Shawn Sivy, Carlo Colantuoni, Alexander Favorov, Mike Ochs, Elana Fertig |
|
6 |
+ Genevieve Stein-O'Brien, Michael Considine, Maggie Wodicka, John Stansfield, |
|
7 |
+ Shawn Sivy, Carlo Colantuoni, Alexander Favorov, Mike Ochs, Elana Fertig |
|
7 | 8 |
Description: Coordinated Gene Activity in Pattern Sets (CoGAPS) |
8 | 9 |
implements a Bayesian MCMC matrix factorization algorithm, |
9 | 10 |
GAPS, and links it to gene set statistic methods to infer biological |
... | ... |
@@ -41,4 +42,4 @@ biocViews: GeneExpression, Transcription, GeneSetEnrichment, |
41 | 42 |
DifferentialExpression, Bayesian, Clustering, TimeCourse, RNASeq, Microarray, |
42 | 43 |
MultipleComparison, DimensionReduction |
43 | 44 |
NeedsCompilation: yes |
44 |
-RoxygenNote: 6.0.1 |
|
45 |
+RoxygenNote: 5.0.1 |
... | ... |
@@ -24,8 +24,6 @@ |
24 | 24 |
#' @param fixedPatterns matrix of fixed values in either A or P matrix |
25 | 25 |
#' @param checkpointInterval time (in seconds) between creating a checkpoint |
26 | 26 |
#' @param checkpointFile name of the checkpoint file |
27 |
-#' @param pumpThreshold type of threshold for pump statistic |
|
28 |
-#' @param nPumpSamples number of samples used in pump statistic |
|
29 | 27 |
#' @param ... keeps backwards compatibility with arguments from older versions |
30 | 28 |
#' @return list with A and P matrix estimates |
31 | 29 |
#' @importFrom methods new |
... | ... |
@@ -37,8 +35,7 @@ CoGAPS <- function(D, S, nFactor=7, nEquil=1000, nSample=1000, nOutputs=1000, |
37 | 35 |
nSnapshots=0, alphaA=0.01, alphaP=0.01, maxGibbmassA=100, maxGibbmassP=100, |
38 | 36 |
seed=-1, messages=TRUE, singleCellRNASeq=FALSE, whichMatrixFixed='N', |
39 | 37 |
fixedPatterns=matrix(0), checkpointInterval=0, |
40 |
-checkpointFile="gaps_checkpoint.out", pumpThreshold="unique", |
|
41 |
-nPumpSamples=0, ...) |
|
38 |
+checkpointFile="gaps_checkpoint.out", ...) |
|
42 | 39 |
{ |
43 | 40 |
# get v2 arguments |
44 | 41 |
oldArgs <- list(...) |
... | ... |
@@ -57,6 +54,14 @@ nPumpSamples=0, ...) |
57 | 54 |
if (missing(S) & !is.null(oldArgs$unc)) |
58 | 55 |
S <- oldArgs$unc |
59 | 56 |
|
57 |
+ # get pump arguments - hidden for now from user |
|
58 |
+ pumpThreshold <- "unique" |
|
59 |
+ nPumpSamples <- 0 |
|
60 |
+ if (!is.null(list(...)$pumpThreshold)) |
|
61 |
+ pumpThreshold <- list(...)$pumpThreshold |
|
62 |
+ if (!is.null(list(...)$nPumpSamples)) |
|
63 |
+ pumpThreshold <- list(...)$nPumpSamples |
|
64 |
+ |
|
60 | 65 |
# check arguments |
61 | 66 |
if (class(D) != "matrix" | class(S) != "matrix") |
62 | 67 |
stop('D and S must be matrices') |
... | ... |
@@ -149,7 +154,7 @@ output_atomic=FALSE, fixedBinProbs=FALSE, fixedDomain="N", sampleSnapshots=TRUE, |
149 | 154 |
numSnapshots=100, alphaA=0.01, nMaxA=100000, max_gibbmass_paraA=100.0, |
150 | 155 |
alphaP=0.01, nMaxP=100000, max_gibbmass_paraP=100.0, seed=-1, messages=TRUE) |
151 | 156 |
{ |
152 |
- #warning('gapsRun is deprecated with v3.0, use CoGAPS') |
|
157 |
+ warning('gapsRun is deprecated with v3.0, use CoGAPS') |
|
153 | 158 |
CoGAPS(D, S, nFactor=nFactor, nEquil=nEquil, nSample=nSample, |
154 | 159 |
nOutputs=nOutR, nSnapshots=ifelse(sampleSnapshots,numSnapshots,0), |
155 | 160 |
alphaA=alphaA, alphaP=alphaP, maxGibbmassA=max_gibbmass_paraA, |
... | ... |
@@ -179,7 +184,7 @@ sampleSnapshots=TRUE, numSnapshots=100, alphaA=0.01, nMaxA=100000, |
179 | 184 |
max_gibbmass_paraA=100.0, alphaP=0.01, nMaxP=100000, max_gibbmass_paraP=100.0, |
180 | 185 |
seed=-1, messages=TRUE) |
181 | 186 |
{ |
182 |
- #warning('gapsMapRun is deprecated with v3.0, use CoGaps') |
|
187 |
+ warning('gapsMapRun is deprecated with v3.0, use CoGaps') |
|
183 | 188 |
CoGAPS(D, S, nFactor=nFactor, nEquil=nEquil, nSample=nSample, |
184 | 189 |
nOutputs=nOutR, nSnapshots=ifelse(sampleSnapshots,numSnapshots,0), |
185 | 190 |
alphaA=alphaA, alphaP=alphaP, maxGibbmassA=max_gibbmass_paraA, |
... | ... |
@@ -192,7 +197,7 @@ v2CoGAPS <- function(result, ...) |
192 | 197 |
{ |
193 | 198 |
if (!is.null(list(...)$GStoGenes)) |
194 | 199 |
{ |
195 |
- #warning('GStoGenes is deprecated with v3.0, see CoGAPS documentation') |
|
200 |
+ warning('GStoGenes is deprecated with v3.0, see CoGAPS documentation') |
|
196 | 201 |
if (is.null(list(...)$plot) | list(...)$plot) |
197 | 202 |
{ |
198 | 203 |
plotGAPS(result$Amean, result$Pmean) |
... | ... |
@@ -18,16 +18,21 @@ |
18 | 18 |
#' data(SimpSim) |
19 | 19 |
#' sim_name <- "example" |
20 | 20 |
#' createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name) |
21 |
-#' result <- GWCoGAPS(sim_name, nFactor=3, nEquil=1000, nSample=1000) |
|
21 |
+#' result <- GWCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200) |
|
22 | 22 |
#' @export |
23 | 23 |
GWCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, manualMatch=FALSE, consensusPatterns=NULL, ...) |
24 | 24 |
{ |
25 | 25 |
if (!is.null(list(...)$checkpointFile)) |
26 |
+ { |
|
26 | 27 |
stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter") |
27 |
- if (is.null(consensusPatterns)){ |
|
28 |
+ } |
|
29 |
+ |
|
30 |
+ if (is.null(consensusPatterns)) |
|
31 |
+ { |
|
28 | 32 |
allDataSets <- preInitialPhase(simulationName, nCores) |
29 | 33 |
initialResult <- runInitialPhase(simulationName, allDataSets, nFactor, ...) |
30 |
- if(manualMatch){ |
|
34 |
+ if (manualMatch) |
|
35 |
+ { |
|
31 | 36 |
saveRDS(initialResult,file=paste(simulationName,"_initial.rds", sep="")) |
32 | 37 |
stop("Please provide consensus patterns upon restarting.") |
33 | 38 |
} |
... | ... |
@@ -107,7 +112,9 @@ preInitialPhase <- function(simulationName, nCores) |
107 | 112 |
|
108 | 113 |
# establish the number of cores that we are able to use |
109 | 114 |
if (is.na(nCores)) |
115 |
+ { |
|
110 | 116 |
nCores <- length(allDataSets) |
117 |
+ } |
|
111 | 118 |
registerDoParallel(cores=nCores) |
112 | 119 |
return(allDataSets) |
113 | 120 |
} |
... | ... |
@@ -139,21 +146,29 @@ postInitialPhase <- function(initialResult, nSets, cut, minNS) |
139 | 146 |
|
140 | 147 |
#run postpattern match function |
141 | 148 |
if (is.na(cut)) |
149 |
+ { |
|
142 | 150 |
cut <- nFactor |
151 |
+ } |
|
152 |
+ |
|
143 | 153 |
return(patternMatch4Parallel(Ptot=BySet$P, nP=nFactor, nSets=nSets, cnt=cut, |
144 | 154 |
minNS=minNS, bySet=TRUE)) |
145 | 155 |
} |
146 | 156 |
|
147 | 157 |
runFinalPhase <- function(simulationName, allDataSets, consensusPatterns, nCores, ...) |
148 | 158 |
{ |
149 |
- if(length(dim(consensusPatterns))!=2){stop("consensusPatterns must be a matrix")} |
|
159 |
+ if (length(dim(consensusPatterns)) != 2) |
|
160 |
+ { |
|
161 |
+ stop("consensusPatterns must be a matrix") |
|
162 |
+ } |
|
150 | 163 |
|
151 | 164 |
# find data files if providing consensus patterns |
152 | 165 |
fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="") |
153 | 166 |
allDataSets <- list.files(full.names=TRUE, pattern=fileSig) |
154 | 167 |
|
155 | 168 |
if (is.na(nCores)) |
156 |
- nCores <- length(allDataSets) |
|
169 |
+ { |
|
170 |
+ nCores <- length(allDataSets) |
|
171 |
+ } |
|
157 | 172 |
registerDoParallel(cores=nCores) |
158 | 173 |
|
159 | 174 |
# generate seeds for parallelization |
... | ... |
@@ -1,95 +1,121 @@ |
1 |
- |
|
2 |
-#' patternMatch4Parallel |
|
1 |
+#' cellMatchR |
|
2 |
+#' @export |
|
3 | 3 |
#' |
4 | 4 |
#' @param Atot a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet} |
5 | 5 |
#' @param nSets number of parallel sets used to generate \code{Atot} |
6 | 6 |
#' @param cnt number of branches at which to cut dendrogram |
7 | 7 |
#' @param minNS minimum of individual set contributions a cluster must contain |
8 |
+#' @param maxNS maximum of individual set contributions a cluster must contain |
|
8 | 9 |
#' @param ignore.NA logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE. |
9 | 10 |
#' @param bySet logical indicating whether to return list of matched set solutions from \code{Atot} |
10 |
-#' @param R2mean |
|
11 |
-#' @param |
|
11 |
+#' @param plotDen plot |
|
12 | 12 |
#' @param ... additional parameters for \code{agnes} |
13 | 13 |
#' @return a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each |
14 | 14 |
#' concensus pattern is also returned. |
15 |
-#' @export |
|
16 |
-#' @seealso \code{\link{fastcluster}} |
|
17 |
-#' |
|
18 |
-#' |
|
15 |
+cellMatchR <- function(Atot, nSets, cnt, minNS=NA, maxNS=NA, |
|
16 |
+ignore.NA=FALSE, bySet=FALSE, plotDen=FALSE, ...) |
|
17 |
+{ |
|
18 |
+ if (is.na(minNS)) |
|
19 |
+ { |
|
20 |
+ minNS <- nSets / 2 |
|
21 |
+ } |
|
22 |
+ if (is.na(maxNS)) |
|
23 |
+ { |
|
24 |
+ maxNS <- nSets + minNS |
|
25 |
+ } |
|
26 |
+ if (!ignore.NA) |
|
27 |
+ { |
|
28 |
+ if (anyNA(Atot)) |
|
29 |
+ { |
|
30 |
+ warning(paste("Non-sparse matrixes produced. Reducing the number", |
|
31 |
+ "of patterns asked for and rerun.")) |
|
32 |
+ } |
|
33 |
+ } |
|
34 |
+ else |
|
35 |
+ { |
|
36 |
+ Atot <- Atot[complete.cases(Atot),] |
|
37 |
+ } |
|
19 | 38 |
|
20 |
-cellMatchR <- function(Atot,nSets, cnt, minNS=NULL, maxNS=NULL, ignore.NA=FALSE, bySet=FALSE, plotDen=FALSE,...){ |
|
39 |
+ corcut <- function(Atot, minNS, cnt, cluster.method) |
|
40 |
+ { |
|
41 |
+ corr.dist <- cor(Atot) |
|
42 |
+ corr.dist <- 1-corr.dist |
|
21 | 43 |
|
22 |
- if(is.null(minNS)){minNS=nSets/2} |
|
23 |
- if(is.null(maxNS)){maxNS=nSets+minNS} |
|
44 |
+ clust <- agnes(x=corr.dist, diss=TRUE, cluster.method) |
|
45 |
+ #clust=fastcluster::hclust(dist(corr.dist)) |
|
46 |
+ cut <- cutree(as.hclust(clust),k=cnt) |
|
24 | 47 |
|
25 |
- if(ignore.NA==FALSE){if(anyNA(Atot)){ |
|
26 |
- warning('Non-sparse matrixes produced. Reducing the number of patterns asked for and rerun.') |
|
27 |
- }} |
|
28 |
- if(ignore.NA==TRUE){Atot<-Atot[complete.cases(Atot),]} |
|
48 |
+ cls <- sort(unique(cut)) |
|
49 |
+ cMNs <- matrix(ncol=cnt,nrow=dim(Atot)[1]) |
|
50 |
+ colnames(cMNs) <- cls |
|
51 |
+ rownames(cMNs) <- rownames(Atot) |
|
29 | 52 |
|
30 |
-corcut<-function(Atot,minNS,cnt,cluster.method){ |
|
31 |
- corr.dist=cor(Atot) |
|
32 |
- corr.dist=1-corr.dist |
|
53 |
+ RtoMeanPattern <- list() |
|
54 |
+ AByClust <- list() |
|
55 |
+ for (i in cls) |
|
56 |
+ { |
|
57 |
+ if (!is.null(dim(Atot[,cut == i]))) |
|
58 |
+ { |
|
59 |
+ if (dim(Atot[,cut == i])[2] >= minNS) |
|
60 |
+ { |
|
61 |
+ cMNs[,i] <- rowMeans(Atot[,cut==i]) |
|
62 |
+ AByClust[[i]] <- Atot[,cut==i] |
|
63 |
+ nIN <- sum(cut==i) |
|
64 |
+ RtoMeanPattern[[i]] <- sapply(1:nIN, function(j) |
|
65 |
+ round(cor(x=Atot[,cut==i][,j], y=cMNs[,i]),3)) |
|
66 |
+ } |
|
67 |
+ } |
|
68 |
+ } |
|
33 | 69 |
|
34 |
- clust=agnes(x=corr.dist,diss=TRUE,cluster.method) |
|
35 |
- #clust=fastcluster::hclust(dist(corr.dist)) |
|
36 |
- cut=cutree(as.hclust(clust),k=cnt) |
|
70 |
+ AByClust[sapply(AByClust,is.null)] <- NULL |
|
71 |
+ RtoMeanPattern[sapply(RtoMeanPattern,is.null)] <- NULL |
|
72 |
+ return(list("RtoMeanPattern"=RtoMeanPattern, "AByClust"=AByClust)) |
|
73 |
+ } |
|
37 | 74 |
|
38 |
- cls=sort(unique(cut)) |
|
39 |
- cMNs=matrix(ncol=cnt,nrow=dim(Atot)[1]) |
|
40 |
- colnames(cMNs)=cls |
|
41 |
- rownames(cMNs)=rownames(Atot) |
|
75 |
+ cc <- corcut(Atot, minNS, cnt, cluster.method) |
|
42 | 76 |
|
43 |
- RtoMeanPattern <- list() |
|
44 |
- AByClust <- list() |
|
45 |
- for(i in cls){ |
|
46 |
- if (is.null(dim(Atot[,cut == i]))==TRUE){ |
|
47 |
- next |
|
48 |
- } else if(dim(Atot[,cut == i])[2] < minNS){ |
|
77 |
+ ### split by maxNS |
|
78 |
+ indx <- which(unlist(lapply(cc$AByClust, function(x) dim(x)[1] > maxNS))) |
|
79 |
+ i <- 1 |
|
80 |
+ while (length(indx) > 0) |
|
81 |
+ { |
|
82 |
+ icc <- corcut(cc$AByClust[[indx[1]]], minNS, 2, cluster.method) |
|
83 |
+ if (length(icc[[2]]) == 0) |
|
84 |
+ { |
|
85 |
+ indx <- indx[-1] |
|
49 | 86 |
next |
50 |
- } else{ |
|
51 |
- cMNs[,i]=rowMeans(Atot[,cut==i]) |
|
52 |
- AByClust[[i]] <- Atot[,cut==i] |
|
53 |
- nIN=sum(cut==i) |
|
54 |
- RtoMeanPattern[[i]] <- sapply(1:nIN,function(j) {round(cor(x=Atot[,cut==i][,j],y=cMNs[,i]),3)}) |
|
55 | 87 |
} |
56 |
- } |
|
57 |
- AByClust[sapply(AByClust,is.null)]<-NULL |
|
58 |
- RtoMeanPattern[sapply(RtoMeanPattern,is.null)]<-NULL |
|
59 |
- return(list("RtoMeanPattern"=RtoMeanPattern,"AByClust"=AByClust)) |
|
60 |
- } |
|
61 |
- cc<-corcut(Atot,minNS,cnt,cluster.method) |
|
62 |
- |
|
63 |
- ### split by maxNS |
|
64 |
- indx<-which(unlist(lapply(cc$AByClust,function(x) dim(x)[1]>maxNS))) |
|
65 |
- i<-1 |
|
66 |
- while(length(indx)>0){ |
|
67 |
- icc<-corcut(cc$AByClust[[indx[1]]],minNS,2,cluster.method) |
|
68 |
- if(length(icc[[2]])==0){ |
|
69 |
- indx<-indx[-1] |
|
70 |
- next |
|
71 |
- } else{ |
|
72 |
- cc$AByClust[[indx[1]]]<-icc[[2]][[1]] |
|
73 |
- cc$RtoMeanPattern[[indx[1]]]<-icc[[1]][[1]] |
|
74 |
- if(length(icc[[2]])>1){ |
|
88 |
+ else |
|
89 |
+ { |
|
90 |
+ cc$AByClust[[indx[1]]] <- icc[[2]][[1]] |
|
91 |
+ cc$RtoMeanPattern[[indx[1]]] <- icc[[1]][[1]] |
|
92 |
+ if (length(icc[[2]]) > 1) |
|
93 |
+ { |
|
75 | 94 |
cc$AByClust<-append(cc$AByClust,icc[[2]][2]) |
76 | 95 |
cc$RtoMeanPattern<-append(cc$RtoMeanPattern,icc[[1]][2]) |
77 |
- } |
|
78 |
- indx<-which(unlist(lapply(cc$AByClust,function(x) dim(x)[1]>maxNS))) |
|
79 |
- } |
|
96 |
+ } |
|
97 |
+ indx <- which(unlist(lapply(cc$AByClust,function(x) dim(x)[1]>maxNS))) |
|
98 |
+ } |
|
80 | 99 |
} |
81 | 100 |
|
101 |
+ #weighted.mean(AByClustDrop[[1]],RtoMPDrop[[1]]) |
|
102 |
+ AByCDSWavg <- t(sapply(1:length(cc$AByClust), function(z) |
|
103 |
+ apply(cc$AByClust[[z]], 1, function(x) |
|
104 |
+ weighted.mean(x, (cc$RtoMeanPattern[[z]])^3)))) |
|
105 |
+ rownames(AByCDSWavg) <- lapply(1:length(cc$AByClust), function(x) |
|
106 |
+ paste("Pattern",x)) |
|
107 |
+ #scale As |
|
108 |
+ Amax <- apply(AByCDSWavg, 1, max) |
|
109 |
+ AByCDSWavgScaled <- t(sapply(1:dim(AByCDSWavg)[1], function(x) |
|
110 |
+ AByCDSWavg[x,] / Amax[x])) |
|
111 |
+ rownames(AByCDSWavgScaled) <- rownames(AByCDSWavg) |
|
82 | 112 |
|
83 |
-#weighted.mean(AByClustDrop[[1]],RtoMPDrop[[1]]) |
|
84 |
-AByCDSWavg<- t(sapply(1:length(cc$AByClust),function(z) apply(cc$AByClust[[z]],1,function(x) weighted.mean(x,(cc$RtoMeanPattern[[z]])^3)))) |
|
85 |
-rownames(AByCDSWavg) <- lapply(1:length(cc$AByClust),function(x) paste("Pattern",x)) |
|
86 |
-#scale As |
|
87 |
-Amax <- apply(AByCDSWavg,1,max) |
|
88 |
-AByCDSWavgScaled <- t(sapply(1:dim(AByCDSWavg)[1],function(x) AByCDSWavg[x,]/Amax[x])) |
|
89 |
-rownames(AByCDSWavgScaled) <- rownames(AByCDSWavg) |
|
90 |
- |
|
91 |
- if(bySet){ |
|
92 |
- return(list("consenusAs"=t(AByCDSWavgScaled),"ABySet"=cc)) |
|
93 |
- } else {return(AByCDSWavgScaled)} |
|
94 |
- |
|
113 |
+ if (bySet) |
|
114 |
+ { |
|
115 |
+ return(list("consenusAs"=t(AByCDSWavgScaled),"ABySet"=cc)) |
|
116 |
+ } |
|
117 |
+ else |
|
118 |
+ { |
|
119 |
+ return(AByCDSWavgScaled) |
|
120 |
+ } |
|
95 | 121 |
} |
... | ... |
@@ -1,42 +1,39 @@ |
1 |
-#' createGWCoGAPSSets |
|
1 |
+#' Create Gene Sets for GWCoGAPS |
|
2 | 2 |
#' |
3 |
-#'\code{createGWCoGAPSSets} factors whole genome data into randomly generated sets for indexing; |
|
3 |
+#' @details factors whole genome data into randomly generated sets for indexing |
|
4 | 4 |
#' |
5 |
-#'@param data data matrix with unique rownames |
|
6 |
-#'@param nSets number of sets for parallelization |
|
7 |
-#'@param outRDA name of output file |
|
8 |
-#'@param keep logical indicating whether or not to save gene set list. Default is TRUE. |
|
9 |
-#'@param path character string indicating were to save resulting data objects |
|
10 |
-#'@export |
|
11 |
-#'@return list with randomly generated sets of genes from whole genome data |
|
12 |
-#'@examples \dontrun{ |
|
13 |
-#'createGWCoGAPSSet(D,nSets=nSets) |
|
14 |
-#'} |
|
15 |
-#' |
|
16 |
- |
|
17 |
-createGWCoGAPSSets <- function(D, S, nSets, simulationName, path="") |
|
5 |
+#' @param D data matrix |
|
6 |
+#' @param S uncertainty matrix |
|
7 |
+#' @param nSets number of sets to partition the data into |
|
8 |
+#' @param simulationName name used to identify files created by this simulation |
|
9 |
+#' @return simulationName used to identify saved files |
|
10 |
+#' @examples |
|
11 |
+#' data(SimpSim) |
|
12 |
+#' createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, "example") |
|
13 |
+#' @export |
|
14 |
+createGWCoGAPSSets <- function(D, S, nSets, simulationName) |
|
18 | 15 |
{ |
19 | 16 |
# check gene names |
20 |
- if (length(unique(colnames(D))) != length(colnames(D))) |
|
17 |
+ if (length(unique(rownames(D))) != length(rownames(D))) |
|
21 | 18 |
{ |
22 |
- warning("Cell identifiers not unique!") |
|
19 |
+ warning("Gene identifiers not unique!") |
|
23 | 20 |
} |
24 | 21 |
|
25 |
- # partition data by sampling random sets of cells |
|
22 |
+ # partition data by sampling random sets of genes |
|
26 | 23 |
genes <- 1:nrow(D) |
27 | 24 |
setSize <- floor(length(genes) / nSets) |
28 | 25 |
for (set in 1:nSets) |
29 | 26 |
{ |
30 |
- |
|
31 |
- # sample genes |
|
32 |
- sampleSize <- ifelse(set == nSets, length(genes), setSize) |
|
33 |
- geneset <- sample(genes, sampleSize, replace=FALSE) |
|
34 |
- genes <- genes[!(genes %in% geneset)] |
|
27 |
+ # sample gene names |
|
28 |
+ sampleSize <- ifelse(set == nSets, length(genes), setSize) |
|
29 |
+ geneSet <- sample(genes, sampleSize, replace=FALSE) |
|
30 |
+ genes <- genes[!(genes %in% geneSet)] |
|
31 |
+ |
|
35 | 32 |
# partition data |
36 |
- sampleD <- D[geneset,] |
|
37 |
- sampleS <- S[geneset,] |
|
38 |
- save(sampleD, sampleS, file=paste(path, simulationName, "_partition_", set, |
|
33 |
+ sampleD <- D[geneSet,] |
|
34 |
+ sampleS <- S[geneSet,] |
|
35 |
+ save(sampleD, sampleS, file=paste(simulationName, "_partition_", set, |
|
39 | 36 |
".RData", sep="")); |
40 | 37 |
} |
41 | 38 |
return(simulationName) |
42 |
-} |
|
39 |
+} |
|
43 | 40 |
\ No newline at end of file |
... | ... |
@@ -1,9 +1,8 @@ |
1 | 1 |
#' Create Gene Sets for scCoGAPS |
2 |
+#' @export |
|
2 | 3 |
#' |
3 |
-#' @details factors whole genome data into randomly generated sets for indexing |
|
4 |
-#' |
|
4 |
+#' @description factors whole genome data into randomly generated sets for indexing |
|
5 | 5 |
#' @param D data matrix |
6 |
-#' @param S uncertainty matrix |
|
7 | 6 |
#' @param nSets number of sets to partition the data into |
8 | 7 |
#' @param simulationName name used to identify files created by this simulation |
9 | 8 |
#' @param samplingRatio vector of relative quantities to use for sampling celltypes |
... | ... |
@@ -12,9 +11,9 @@ |
12 | 11 |
#' @return simulationName used to identify saved files |
13 | 12 |
#' @examples |
14 | 13 |
#' data(SimpSim) |
15 |
-#' createscCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, "example") |
|
16 |
-#' @export |
|
17 |
-createscCoGAPSSets <- function(D, nSets, simulationName,samplingRatio=NULL,path="",anotionObj=NULL) |
|
14 |
+#' createscCoGAPSSets(SimpSim.D, nSets=2, simulationName="example") |
|
15 |
+createscCoGAPSSets <- function(D, nSets, simulationName, samplingRatio=NULL, |
|
16 |
+path="", anotionObj=NULL) |
|
18 | 17 |
{ |
19 | 18 |
# check gene names |
20 | 19 |
if (length(unique(colnames(D))) != length(colnames(D))) |
... | ... |
@@ -27,16 +26,22 @@ createscCoGAPSSets <- function(D, nSets, simulationName,samplingRatio=NULL,path= |
27 | 26 |
setSize <- floor(length(cells) / nSets) |
28 | 27 |
for (set in 1:nSets) |
29 | 28 |
{ |
30 |
- |
|
31 |
- if(is.null(samplingRatio)){ |
|
32 |
- # sample cell names |
|
29 |
+ if (is.null(samplingRatio)) |
|
30 |
+ { |
|
31 |
+ # sample cell names |
|
33 | 32 |
sampleSize <- ifelse(set == nSets, length(cells), setSize) |
34 | 33 |
cellset <- sample(cells, sampleSize, replace=FALSE) |
35 | 34 |
cells <- cells[!(cells %in% cellset)] |
36 |
- } else { |
|
37 |
- if(length(unique(anotionObj))!=length(samplingRatio)){warning("Not all celltypes will be sampled from.")} |
|
38 |
- ct.indx<-lapply(unique(anotionObj),function(x) which(anotionObj == x)) |
|
39 |
- cellset<-sample(colnames(D)[ct.indx[[x]]], samplingRatio[x],replace=TRUE) |
|
35 |
+ } |
|
36 |
+ else |
|
37 |
+ { |
|
38 |
+ if (length(unique(anotionObj)) != length(samplingRatio)) |
|
39 |
+ { |
|
40 |
+ warning("Not all celltypes will be sampled from.") |
|
41 |
+ } |
|
42 |
+ ct.indx <- lapply(unique(anotionObj), function(x) which(anotionObj == x)) |
|
43 |
+ cellset <- lapply(unique(anotionObj), function(x) |
|
44 |
+ sample(colnames(D)[ct.indx[[x]]], samplingRatio[x],replace=TRUE)) |
|
40 | 45 |
} |
41 | 46 |
|
42 | 47 |
# partition data |
... | ... |
@@ -45,7 +50,8 @@ createscCoGAPSSets <- function(D, nSets, simulationName,samplingRatio=NULL,path= |
45 | 50 |
sampleD <- log2(sampleD+1) |
46 | 51 |
# generate S |
47 | 52 |
sampleS <- pmax(.1*sampleD, .1) |
48 |
- save(sampleD, sampleS, file=paste0(path,simulationName, "_partition_", set,".RData")); |
|
53 |
+ save(sampleD, sampleS, file=paste0(path,simulationName, "_partition_", |
|
54 |
+ set,".RData")); |
|
49 | 55 |
} |
50 | 56 |
return(simulationName) |
51 | 57 |
} |
... | ... |
@@ -12,56 +12,75 @@ |
12 | 12 |
patternMarkers <- function(Amatrix=NA, scaledPmatrix=FALSE, Pmatrix=NA, |
13 | 13 |
threshold="all", lp=NA, full=FALSE) |
14 | 14 |
{ |
15 |
- if(scaledPmatrix==FALSE) |
|
15 |
+ if (scaledPmatrix==FALSE) |
|
16 | 16 |
{ |
17 |
- if(!is.na(Pmatrix)){ |
|
18 |
- pscale <- apply(Pmatrix,1,max) # rescale p's to have max 1 |
|
19 |
- Amatrix <- sweep(Amatrix, 2, pscale, FUN="*") # rescale A in accordance with p's having max 1 |
|
17 |
+ if(!is.na(Pmatrix)) |
|
18 |
+ { |
|
19 |
+ pscale <- apply(Pmatrix,1,max) # rescale p's to have max 1 |
|
20 |
+ Amatrix <- sweep(Amatrix, 2, pscale, FUN="*") # rescale A in accordance with p's having max 1 |
|
21 |
+ } |
|
22 |
+ else |
|
23 |
+ { |
|
24 |
+ warning("P values must be provided if not already scaled") |
|
20 | 25 |
} |
21 |
- else(warning("P values must be provided if not already scaled")) |
|
22 | 26 |
} |
23 | 27 |
|
24 | 28 |
# find the A with the highest magnitude |
25 | 29 |
Arowmax <- t(apply(Amatrix, 1, function(x) x/max(x))) |
26 | 30 |
|
27 |
- if(!is.na(lp)) |
|
31 |
+ if (!is.na(lp)) |
|
28 | 32 |
{ |
29 |
- if(length(lp)!=dim(Amatrix)[2]){ |
|
33 |
+ if (length(lp) != dim(Amatrix)[2]) |
|
34 |
+ { |
|
30 | 35 |
warning("lp length must equal the number of columns of the Amatrix") |
31 | 36 |
} |
32 | 37 |
sstat <- apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp))) |
33 | 38 |
ssranks <-rank(sstat) |
34 |
- ssgenes.th <-names(sort(sstat,decreasing=FALSE,na.last=TRUE)) |
|
35 |
- } else { |
|
39 |
+ ssgenes.th <- names(sort(sstat,decreasing=FALSE,na.last=TRUE)) |
|
40 |
+ } |
|
41 |
+ else |
|
42 |
+ { |
|
36 | 43 |
# determine which genes are most associated with each pattern |
37 | 44 |
|
38 | 45 |
sstat<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix)) |
39 | 46 |
ssranks<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=dimnames(Amatrix))#list() |
40 | 47 |
ssgenes<-matrix(NA, nrow=nrow(Amatrix), ncol=ncol(Amatrix),dimnames=NULL) |
41 |
- for(i in 1:nP){ |
|
48 |
+ for (i in 1:nP) |
|
49 |
+ { |
|
42 | 50 |
lp <- rep(0,dim(Amatrix)[2]) |
43 | 51 |
lp[i] <- 1 |
44 | 52 |
sstat[,i] <- unlist(apply(Arowmax, 1, function(x) sqrt(t(x-lp)%*%(x-lp)))) |
45 | 53 |
ssranks[,i]<-rank(sstat[,i]) |
46 | 54 |
ssgenes[,i]<-names(sort(sstat[,i],decreasing=FALSE,na.last=TRUE)) |
47 | 55 |
} |
48 |
- if(threshold=="cut"){ |
|
56 |
+ if (threshold=="cut") |
|
57 |
+ { |
|
49 | 58 |
geneThresh <- sapply(1:nP,function(x) min(which(ssranks[ssgenes[,x],x] > apply(ssranks[ssgenes[,x],],1,min)))) |
50 | 59 |
ssgenes.th <- sapply(1:nP,function(x) ssgenes[1:geneThresh[x],x]) |
51 | 60 |
} |
52 |
- else if (threshold=="all"){ |
|
61 |
+ else if (threshold=="all") |
|
62 |
+ { |
|
53 | 63 |
pIndx<-unlist(apply(sstat,1,which.min)) |
54 | 64 |
gBYp <- list() |
55 |
- for(i in sort(unique(pIndx))){ |
|
65 |
+ for(i in sort(unique(pIndx))) |
|
66 |
+ { |
|
56 | 67 |
gBYp[[i]]<-sapply(strsplit(names(pIndx[pIndx==i]),"[.]"),function(x) x[[1]][1]) |
57 | 68 |
} |
58 |
- ssgenes.th <- lapply(1:max(sort(unique(pIndx))), function(x) { |
|
59 |
- ssgenes[which(ssgenes[,x] %in% gBYp[[x]]),x] |
|
60 |
- }) |
|
61 |
- } else{stop("Threshold arguement not viable option")} |
|
69 |
+ ssgenes.th <- lapply(1:max(sort(unique(pIndx))), function(x) |
|
70 |
+ ssgenes[which(ssgenes[,x] %in% gBYp[[x]]),x]) |
|
71 |
+ } |
|
72 |
+ else |
|
73 |
+ { |
|
74 |
+ stop("Threshold arguement not viable option") |
|
75 |
+ } |
|
62 | 76 |
} |
63 | 77 |
if (full) |
64 |
- return(list("PatternMarkers"=ssgenes.th,"PatternRanks"=ssranks,"PatternMarkerScores"=sstat)) |
|
78 |
+ { |
|
79 |
+ return(list("PatternMarkers"=ssgenes.th, "PatternRanks"=ssranks, |
|
80 |
+ "PatternMarkerScores"=sstat)) |
|
81 |
+ } |
|
65 | 82 |
else |
83 |
+ { |
|
66 | 84 |
return("PatternMarkers"=ssgenes.th) |
85 |
+ } |
|
67 | 86 |
} |
... | ... |
@@ -1,96 +1,129 @@ |
1 | 1 |
#' patternMatch4Parallel |
2 | 2 |
#' |
3 |
-#' @param Ptot a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet} |
|
3 |
+#' @param Ptot a matrix containing the total by set estimates of Pmean output |
|
4 |
+#' from \code{reOrderBySet} |
|
4 | 5 |
#' @param nSets number of parallel sets used to generate \code{Ptot} |
5 | 6 |
#' @param cnt number of branches at which to cut dendrogram |
6 | 7 |
#' @param minNS minimum of individual set contributions a cluster must contain |
7 |
-#' @param maxNS max of individual set contributions a cluster must contain. default is nSets+minNS |
|
8 |
+#' @param maxNS max of individual set contributions a cluster must contain. |
|
9 |
+#' default is nSets+minNS |
|
8 | 10 |
#' @param cluster.method the agglomeration method to be used for clustering |
9 |
-#' @param ignore.NA logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE. |
|
10 |
-#' @param bySet logical indicating whether to return list of matched set solutions from \code{Ptot} |
|
11 |
+#' @param ignore.NA logical indicating whether or not to ignore NAs from |
|
12 |
+#' potential over dimensionalization. Default is FALSE. |
|
13 |
+#' @param bySet logical indicating whether to return list of matched set |
|
14 |
+#' solutions from \code{Ptot} |
|
11 | 15 |
#' @param ... additional parameters for \code{agnes} |
12 |
-#' @return a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each |
|
16 |
+#' @return a matrix of concensus patterns by samples. If \code{bySet=TRUE} then |
|
17 |
+#' a list of the set contributions to each |
|
13 | 18 |
#' concensus pattern is also returned. |
14 | 19 |
#' @seealso \code{\link{agnes}} |
15 | 20 |
#' @export |
16 |
-patternMatch4Parallel <- function(Ptot, nSets, cnt, minNS=NULL, maxNS=NULL, |
|
17 |
- cluster.method="complete", ignore.NA=FALSE, bySet=FALSE, ...){ |
|
18 |
- |
|
19 |
- if (is.null(minNS)){minNS=ceiling(nSets/2)} |
|
20 |
- if (is.null(maxNS)){maxNS=nSets+minNS} |
|
21 |
- |
|
21 |
+patternMatch4Parallel <- function(Ptot, nSets, cnt, minNS=NA, maxNS=NULL, |
|
22 |
+cluster.method="complete", ignore.NA=FALSE, bySet=FALSE, ...) |
|
23 |
+{ |
|
24 |
+ if (is.na(minNS)) |
|
25 |
+ { |
|
26 |
+ minNS <- ceiling(nSets / 2) |
|
27 |
+ } |
|
28 |
+ if (is.null(maxNS)) |
|
29 |
+ { |
|
30 |
+ maxNS=nSets+minNS |
|
31 |
+ } |
|
22 | 32 |
if (ignore.NA==FALSE & anyNA(Ptot)) |
23 |
- warning('Non-sparse matrixes produced. Reducing the number of patterns asked for and rerun.') |
|
33 |
+ { |
|
34 |
+ warning(paste("Non-sparse matrixes produced. Reducing the number of", |
|
35 |
+ "patterns asked for and rerun.")) |
|
36 |
+ } |
|
24 | 37 |
if (ignore.NA==TRUE) |
38 |
+ { |
|
25 | 39 |
Ptot <- Ptot[complete.cases(Ptot),] |
40 |
+ } |
|
26 | 41 |
|
27 |
- corcut<-function(Ptot,minNS,cnt,cluster.method){ |
|
28 |
- # corr dist |
|
29 |
- corr.dist=cor(t(Ptot)) |
|
30 |
- corr.dist=1-corr.dist |
|
31 |
- # cluster |
|
32 |
- #library(cluster) |
|
33 |
- clust=agnes(x=corr.dist,diss=TRUE,method=cluster.method) |
|
34 |
- cut=cutree(as.hclust(clust),k=cnt) |
|
35 |
- #save.image(file=paste("CoGAPS.",nP,"P.",nS,"Set.CorrClustCut",cnt,".RData")) |
|
42 |
+ corcut <-function(Ptot,minNS,cnt,cluster.method) |
|
43 |
+ { |
|
44 |
+ # corr dist |
|
45 |
+ corr.dist <- cor(t(Ptot)) |
|
46 |
+ corr.dist <- 1-corr.dist |
|
47 |
+ # cluster |
|
48 |
+ #library(cluster) |
|
49 |
+ clust <- agnes(x=corr.dist, diss=TRUE, method=cluster.method) |
|
50 |
+ cut <- cutree(as.hclust(clust), k=cnt) |
|
51 |
+ #save.image(file=paste("CoGAPS.", nP, "P.", nS, "Set.CorrClustCut", |
|
52 |
+ #cnt,".RData")) |
|
36 | 53 |
|
37 |
- cls=sort(unique(cut)) |
|
38 |
- cMNs=matrix(nrow=cnt,ncol=dim(Ptot)[2]) |
|
39 |
- rownames(cMNs)=cls |
|
40 |
- colnames(cMNs)=colnames(Ptot) |
|
54 |
+ cls <- sort(unique(cut)) |
|
55 |
+ cMNs <- matrix(nrow=cnt, ncol=dim(Ptot)[2]) |
|
56 |
+ rownames(cMNs) <- cls |
|
57 |
+ colnames(cMNs) <- colnames(Ptot) |
|
41 | 58 |
|
42 |
- RtoMeanPattern <- list() |
|
43 |
- PByClust <- list() |
|
44 |
- for(i in cls){ |
|
45 |
- if (is.null(dim(Ptot[cut == i, ]))==TRUE){ |
|
46 |
- next |
|
47 |
- } else if(dim(Ptot[cut == i, ])[1]< minNS){ |
|
48 |
- next |
|
49 |
- } else { |
|
50 |
- cMNs[i,]=colMeans(Ptot[cut==i,]) |
|
51 |
- PByClust[[i]] <- Ptot[cut==i,] |
|
52 |
- nIN=sum(cut==i) |
|
53 |
- RtoMeanPattern[[i]] <- sapply(1:nIN,function(j) {round(cor(x=Ptot[cut==i,][j,],y=cMNs[i,]),3)}) |
|
59 |
+ RtoMeanPattern <- list() |
|
60 |
+ PByClust <- list() |
|
61 |
+ for (i in cls) |
|
62 |
+ { |
|
63 |
+ if (!is.null(dim(Ptot[cut == i,]))) |
|
64 |
+ { |
|
65 |
+ if (dim(Ptot[cut == i,])[1] >= minNS) |
|
66 |
+ { |
|
67 |
+ cMNs[i,] <- colMeans(Ptot[cut==i,]) |
|
68 |
+ PByClust[[i]] <- Ptot[cut==i,] |
|
69 |
+ nIN <- sum(cut==i) |
|
70 |
+ RtoMeanPattern[[i]] <- sapply(1:nIN, function(j) |
|
71 |
+ round(cor(x=Ptot[cut==i,][j,], y=cMNs[i,]),3) |
|
72 |
+ ) |
|
73 |
+ } |
|
74 |
+ } |
|
54 | 75 |
} |
55 |
- } |
|
56 |
- PByClust[sapply(PByClust,is.null)]<-NULL |
|
57 |
- RtoMeanPattern[sapply(RtoMeanPattern,is.null)]<-NULL |
|
58 |
- return(list("RtoMeanPattern"=RtoMeanPattern,"PByClust"=PByClust)) |
|
76 |
+ |
|
77 |
+ PByClust[sapply(PByClust,is.null)] <- NULL |
|
78 |
+ RtoMeanPattern[sapply(RtoMeanPattern,is.null)] <- NULL |
|
79 |
+ return(list("RtoMeanPattern"=RtoMeanPattern, "PByClust"=PByClust)) |
|
59 | 80 |
} |
60 | 81 |
|
61 |
- cc<-corcut(Ptot,minNS,cnt,cluster.method) |
|
82 |
+ cc <- corcut(Ptot,minNS,cnt,cluster.method) |
|
62 | 83 |
|
63 | 84 |
### split by maxNS |
64 |
- indx<-which(unlist(lapply(cc$PByClust,function(x) dim(x)[1]>maxNS))) |
|
65 |
- while(length(indx)>0){ |
|
66 |
- icc<-corcut(cc$PByClust[[indx[1]]],minNS,2,cluster.method) |
|
67 |
- if(length(icc[[2]])==0){ |
|
68 |
- indx<-indx[-1] |
|
69 |
- next |
|
70 |
- } else{ |
|
71 |
- cc$PByClust[[indx[1]]]<-icc[[2]][[1]] |
|
72 |
- cc$RtoMeanPattern[[indx[1]]]<-icc[[1]][[1]] |
|
73 |
- if(length(icc[[2]])>1){ |
|
74 |
- cc$PByClust<-append(cc$PByClust,icc[[2]][2]) |
|
75 |
- cc$RtoMeanPattern<-append(cc$RtoMeanPattern,icc[[1]][2]) |
|
76 |
- } |
|
77 |
- indx<-which(unlist(lapply(cc$PByClust,function(x) dim(x)[1]>maxNS))) |
|
78 |
- } |
|
85 |
+ indx <- which(unlist(lapply(cc$PByClust,function(x) dim(x)[1]>maxNS))) |
|
86 |
+ while(length(indx) > 0) |
|
87 |
+ { |
|
88 |
+ icc <- corcut(cc$PByClust[[indx[1]]],minNS,2,cluster.method) |
|
89 |
+ if (length(icc[[2]])==0) |
|
90 |
+ { |
|
91 |
+ indx <- indx[-1] |
|
92 |
+ next |
|
93 |
+ } |
|
94 |
+ else |
|
95 |
+ { |
|
96 |
+ cc$PByClust[[indx[1]]] <- icc[[2]][[1]] |
|
97 |
+ cc$RtoMeanPattern[[indx[1]]] <- icc[[1]][[1]] |
|
98 |
+ if (length(icc[[2]]) > 1) |
|
99 |
+ { |
|
100 |
+ cc$PByClust <- append(cc$PByClust,icc[[2]][2]) |
|
101 |
+ cc$RtoMeanPattern <- append(cc$RtoMeanPattern,icc[[1]][2]) |
|
102 |
+ } |
|
103 |
+ indx <- which(unlist(lapply(cc$PByClust, function(x) |
|
104 |
+ dim(x)[1] > maxNS))) |
|
105 |
+ } |
|
79 | 106 |
} |
80 | 107 |
|
81 | 108 |
#weighted.mean |
82 |
- PByCDSWavg<- t(sapply(1:length(cc$PByClust),function(z) apply(cc$PByClust[[z]],2,function(x) |
|
83 |
- weighted.mean(x,(cc$RtoMeanPattern[[z]])^3)))) |
|
84 |
- rownames(PByCDSWavg) <- lapply(1:length(cc$PByClust),function(x) paste("Pattern",x)) |
|
109 |
+ PByCDSWavg <- t(sapply(1:length(cc$PByClust), function(z) |
|
110 |
+ apply(cc$PByClust[[z]],2, function(x) |
|
111 |
+ weighted.mean(x, (cc$RtoMeanPattern[[z]])^3)))) |
|
112 |
+ rownames(PByCDSWavg) <- lapply(1:length(cc$PByClust), function(x) |
|
113 |
+ paste("Pattern", x)) |
|
85 | 114 |
|
86 | 115 |
#scale ps |
87 | 116 |
Pmax <- apply(PByCDSWavg,1,max) |
88 |
- PByCDSWavgScaled <- t(sapply(1:dim(PByCDSWavg)[1],function(x) PByCDSWavg[x,]/Pmax[x])) |
|
117 |
+ PByCDSWavgScaled <- t(sapply(1:dim(PByCDSWavg)[1], function(x) |
|
118 |
+ PByCDSWavg[x,] / Pmax[x])) |
|
89 | 119 |
rownames(PByCDSWavgScaled) <- rownames(PByCDSWavg) |
90 | 120 |
|
91 |
- if(bySet){ |
|
92 |
- return(list("consenusPatterns"=PByCDSWavgScaled,"PBySet"=cc)) |
|
93 |
- } else { |
|
121 |
+ if (bySet) |
|
122 |
+ { |
|
123 |
+ return(list("consenusPatterns"=PByCDSWavgScaled, "PBySet"=cc)) |
|
124 |
+ } |
|
125 |
+ else |
|
126 |
+ { |
|
94 | 127 |
return("consenusPatterns"=PByCDSWavgScaled) |
95 | 128 |
} |
96 | 129 |
} |
... | ... |
@@ -12,51 +12,56 @@ |
12 | 12 |
#' concensus pattern is also returned. |
13 | 13 |
#' @seealso \code{\link{agnes}} |
14 | 14 |
#' @export |
15 |
-#' |
|
16 |
-#' |
|
17 |
- |
|
18 | 15 |
patternMatch4singleCell <- function(Atot, nSets, cnt, minNS, |
19 | 16 |
cluster.method="complete", ignore.NA=FALSE, bySet=FALSE, ...) |
20 | 17 |
{ |
21 |
- if (!is.null(minNS)) |
|
22 |
- minNS=nSets/2 |
|
23 |
- |
|
24 |
- if (ignore.NA==FALSE & anyNA(Atot)) |
|
25 |
- warning('Non-sparse matrixes produced. Reducing the number of patterns asked for and rerun.') |
|
26 |
- if (ignore.NA==TRUE) |
|
18 |
+ if (is.na(minNS)) |
|
19 |
+ { |
|
20 |
+ minNS <- nSets/2 |
|
21 |
+ } |
|
22 |
+ if (!ignore.NA & anyNA(Atot)) |
|
23 |
+ { |
|
24 |
+ warning(paste("Non-sparse matrixes produced. Reducing the number of", |
|
25 |
+ "patterns asked for and rerun")) |
|
26 |
+ } |
|
27 |
+ if (ignore.NA) |
|
28 |
+ { |
|
27 | 29 |
Atot <- Atot[complete.cases(Atot),] |
30 |
+ } |
|
28 | 31 |
|
29 | 32 |
# corr dist |
30 |
- corr.dist=cor(Atot) |
|
31 |
- corr.dist=1-corr.dist |
|
33 |
+ corr.dist <- cor(Atot) |
|
34 |
+ corr.dist <- 1 - corr.dist |
|
32 | 35 |
# cluster |
33 | 36 |
#library(cluster) |
34 |
- clust=agnes(x=corr.dist,diss=TRUE,method=cluster.method) |
|
35 |
- cut=cutree(as.hclust(clust),k=cnt) |
|
37 |
+ clust <- agnes(x=corr.dist, diss=TRUE, method=cluster.method) |
|
38 |
+ cut <- cutree(as.hclust(clust), k=cnt) |
|
36 | 39 |
#save.image(file=paste("CoGAPS.",nP,"P.",nS,"Set.CorrClustCut",cnt,".RData")) |
37 | 40 |
|
38 | 41 |
#drop n<4 and get weighted Avg |
39 |
- cls=sort(unique(cut)) |
|
40 |
- cMNs=matrix(nrow=cnt,ncol=dim(Atot)[1]) |
|
41 |
- rownames(cMNs)=cls |
|
42 |
- colnames(cMNs)=colnames(Atot) |
|
42 |
+ cls <- sort(unique(cut)) |
|
43 |
+ cMNs <- matrix(nrow=cnt, ncol=dim(Atot)[1]) |
|
44 |
+ rownames(cMNs) <- cls |
|
45 |
+ colnames(cMNs) <- rownames(Atot) |
|
43 | 46 |
|
44 | 47 |
RtoMeanPattern <- list() |
45 | 48 |
PByClust <- list() |
46 | 49 |
for(i in cls) |
47 | 50 |
{ |
48 |
- if (is.null(dim(Atot[,cut == i ]))==TRUE) |
|
51 |
+ if (is.null(dim(Atot[,cut == i]))) |
|
49 | 52 |
{ |
50 |
- cMNs[i,] <- Atot[,cut == i ] |
|
53 |
+ cMNs[i,] <- Atot[,cut == i] |
|
51 | 54 |
RtoMeanPattern[[i]] <- rep(1,length(Atot[,cut == i ])) |
52 | 55 |
PByClust[[i]] <- t(as.matrix(Atot[,cut == i ])) |
53 | 56 |
} |
54 | 57 |
else |
55 | 58 |
{ |
56 |
- cMNs[i,]=colMeans(Atot[,cut==i]) |
|
59 |
+ cMNs[i,] <- rowMeans(Atot[,cut==i]) |
|
57 | 60 |
PByClust[[i]] <- Atot[,cut==i] |
58 |
- nIN=sum(cut==i) |
|
59 |
- RtoMeanPattern[[i]] <- sapply(1:nIN,function(j) {round(cor(x=Atot[,cut==i][j,],y=cMNs[i,]),3)}) |
|
61 |
+ nIN <- sum(cut==i) |
|
62 |
+ RtoMeanPattern[[i]] <- sapply(1:nIN, function(j) |
|
63 |
+ round(cor(x=Atot[,cut==i][,j], y=cMNs[i,]), 3) |
|
64 |
+ ) |
|
60 | 65 |
} |
61 | 66 |
} |
62 | 67 |
|
... | ... |
@@ -65,17 +70,14 @@ cluster.method="complete", ignore.NA=FALSE, bySet=FALSE, ...) |
65 | 70 |
RtoMPDrop <- list() |
66 | 71 |
for(i in cls) |
67 | 72 |
{ |
68 |
- if (is.null(dim(PByClust[[i]])) == TRUE) |
|
69 |
- next |
|
70 |
- if (dim(PByClust[[i]])[1] < minNS) |
|
73 |
+ if (!is.null(dim(PByClust[[i]]))) |
|
71 | 74 |
{ |
72 |
- next |
|
73 |
- } |
|
74 |
- else |
|
75 |
- { |
|
76 |
- #indx <- which(RtoMeanPattern[[i]]>.7,arr.ind = TRUE) |
|
77 |
- PByClustDrop <- append(PByClustDrop,list(PByClust[[i]])) |
|
78 |
- RtoMPDrop <- append(RtoMPDrop,list(RtoMeanPattern[[i]])) |
|
75 |
+ if (dim(PByClust[[i]])[1] >= minNS) |
|
76 |
+ { |
|
77 |
+ #indx <- which(RtoMeanPattern[[i]]>.7,arr.ind = TRUE) |
|
78 |
+ PByClustDrop <- append(PByClustDrop,list(PByClust[[i]])) |
|
79 |
+ RtoMPDrop <- append(RtoMPDrop,list(RtoMeanPattern[[i]])) |
|
80 |
+ } |
|
79 | 81 |
} |
80 | 82 |
} |
81 | 83 |
|
... | ... |
@@ -84,43 +86,48 @@ cluster.method="complete", ignore.NA=FALSE, bySet=FALSE, ...) |
84 | 86 |
RtoMPDS <- list() |
85 | 87 |
for (j in 1:length(PByClustDrop)) |
86 | 88 |
{ |
87 |
- if (is.null(dim(PByClustDrop[[j]]))==TRUE) |
|
89 |
+ if (is.null(dim(PByClustDrop[[j]]))) |
|
88 | 90 |
{ |
89 | 91 |
next |
90 | 92 |
} |
91 |
- if (dim(PByClustDrop[[j]])[1]<minNS+nSets) |
|
93 |
+ if (dim(PByClustDrop[[j]])[1] < minNS + nSets) |
|
92 | 94 |
{ |
93 |
- PByCDS <- append(PByCDS,PByClustDrop[j]) |
|
94 |
- RtoMPDS <- append(RtoMPDS,RtoMPDrop[j]) |
|
95 |
+ PByCDS <- append(PByCDS, PByClustDrop[j]) |
|
96 |
+ RtoMPDS <- append(RtoMPDS, RtoMPDrop[j]) |
|
95 | 97 |
} |
96 |
- if (dim(PByClustDrop[[j]])[1]>=minNS+nSets) |
|
98 |
+ if (dim(PByClustDrop[[j]])[1] >= minNS + nSets) |
|
97 | 99 |
{ |
98 |
- corr.distPBCD=cor(t(PByClustDrop[[j]])) |
|
99 |
- corr.distPBCD=1-corr.distPBCD |
|
100 |
- clustPBCD=agnes(x=corr.distPBCD,diss=TRUE,method="complete") |
|
101 |
- cutPBCD=cutree(as.hclust(clustPBCD),k=2) |
|
100 |
+ corr.distPBCD <- cor(t(PByClustDrop[[j]])) |
|
101 |
+ corr.distPBCD <- 1-corr.distPBCD |
|
102 |
+ clustPBCD <- agnes(x=corr.distPBCD, diss=TRUE, method="complete") |
|
103 |
+ cutPBCD <- cutree(as.hclust(clustPBCD),k=2) |
|
102 | 104 |
g1 <- PByClustDrop[[j]][cutPBCD==1,] |
103 | 105 |
PByCDS <- append(PByCDS,list(g1)) |
104 |
- RtoMPDS <- append(RtoMPDS,list(sapply(1:dim(g1)[1],function(z) round(cor(x=g1[z,],y=colMeans(PByClustDrop[[j]][cutPBCD==1,])),3)))) |
|
106 |
+ RtoMPDS <- append(RtoMPDS,list(sapply(1:dim(g1)[1], function(z) |
|
107 |
+ round(cor(x=g1[z,], y=colMeans(PByClustDrop[[j]][cutPBCD==1,])),3)))) |
|
105 | 108 |
g2 <- PByClustDrop[[j]][cutPBCD==2,] |
106 |
- if (is.null(dim(g2)[1])==FALSE) |
|
109 |
+ if (!is.null(dim(g2)[1])) |
|
107 | 110 |
{ |
108 | 111 |
PByCDS <- append(PByCDS,list(g2)) |
109 |
- RtoMPDS <- append(RtoMPDS,list(sapply(1:dim(g2)[1],function(z) round(cor(x=g2[z,],y=colMeans(PByClustDrop[[j]][cutPBCD==2,])),3)))) |
|
112 |
+ RtoMPDS <- append(RtoMPDS,list(sapply(1:dim(g2)[1], function(z) |
|
113 |
+ round(cor(x=g2[z,], y=colMeans(PByClustDrop[[j]][cutPBCD==2,])),3)))) |
|
110 | 114 |
} |
111 | 115 |
} |
112 | 116 |
} |
113 | 117 |
|
114 | 118 |
#weighted.mean(PByClustDrop[[1]],RtoMPDrop[[1]]) |
115 |
- PByCDSWavg<- t(sapply(1:length(PByCDS),function(z) apply(PByCDS[[z]],2,function(x) weighted.mean(x,(RtoMPDS[[z]])^3)))) |
|
116 |
- rownames(PByCDSWavg) <- lapply(1:length(PByCDS),function(x) paste("Pattern",x)) |
|
119 |
+ PByCDSWavg <- t(sapply(1:length(PByCDS),function(z) apply(PByCDS[[z]],2, |
|
120 |
+ function(x) weighted.mean(x,(RtoMPDS[[z]])^3)))) |
|
121 |
+ rownames(PByCDSWavg) <- lapply(1:length(PByCDS), |
|
122 |
+ function(x) paste("Pattern",x)) |
|
117 | 123 |
|
118 | 124 |
#scale ps |
119 | 125 |
Pmax <- apply(PByCDSWavg,1,max) |
120 |
- PByCDSWavgScaled <- t(sapply(1:dim(PByCDSWavg)[1],function(x) PByCDSWavg[x,]/Pmax[x])) |
|
126 |
+ PByCDSWavgScaled <- t(sapply(1:dim(PByCDSWavg)[1], |
|
127 |
+ function(x) PByCDSWavg[x,] / Pmax[x])) |
|
121 | 128 |
rownames(PByCDSWavgScaled) <- rownames(PByCDSWavg) |
122 | 129 |
|
123 |
- if(bySet) |
|
130 |
+ if (bySet) |
|
124 | 131 |
{ |
125 | 132 |
# return by set and final |
126 | 133 |
PBySet<-PByCDS |
... | ... |
@@ -1,13 +1,15 @@ |
1 | 1 |
#' Post Processing of Parallel Output |
2 | 2 |
#' |
3 | 3 |
#' @param AP.fixed output of parallel gapsMapRun calls with same FP |
4 |
-#' @param setPs data.frame with rows giving fixed patterns for P used as input |
|
4 |
+#' @param setValues data.frame with rows giving fixed patterns for P used as input |
|
5 | 5 |
#' for gapsMapRun |
6 |
+#' @param setMatrix which matrix, A or P |
|
6 | 7 |
#' @return list of two data.frames containing the A matrix estimates or their |
7 | 8 |
#' corresponding standard deviations from output of parallel CoGAPS |
8 | 9 |
postFixed4Parallel <- function(AP.fixed, setValues, setMatrix="P") |
9 | 10 |
{ |
10 |
- if(setMatrix=="P"){ |
|
11 |
+ if (setMatrix=="P") |
|
12 |
+ { |
|
11 | 13 |
ASummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Amean)) |
12 | 14 |
Asd <- do.call(rbind,lapply(AP.fixed, function(x) x$Asd)) |
13 | 15 |
#PSummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Pmean)) |
... | ... |
@@ -18,14 +20,16 @@ postFixed4Parallel <- function(AP.fixed, setValues, setMatrix="P") |
18 | 20 |
Aneu <- sweep(ASummary,2,Pmax,FUN="*") |
19 | 21 |
|
20 | 22 |
X <- apply(Pneu,1,range) |
21 |
- Y <- apply(setPs,1,range) |
|
23 |
+ Y <- apply(setValues,1,range) |
|
22 | 24 |
colnames(X) <- colnames(Y) |
23 | 25 |
if (all.equal(X,Y,tolerance=0.01) != TRUE) |
24 | 26 |
warning("Patterns do not match fixed values.") |
25 | 27 |
|
26 | 28 |
As4fixPs<-list("A"=Aneu,"Asd"=Asd) |
27 | 29 |
return(As4fixPs) |
28 |
- } else if(setMatrix=="A"){ |
|
30 |
+ } |
|
31 |
+ else if (setMatrix=="A") |
|
32 |
+ { |
|
29 | 33 |
PSummary <- do.call(cbind,lapply(AP.fixed, function(x) x$Pmean)) |
30 | 34 |
Psd <- do.call(cbind,lapply(AP.fixed, function(x) x$Psd)) |
31 | 35 |
#PSummary <- do.call(rbind,lapply(AP.fixed, function(x) x$Pmean)) |
... | ... |
@@ -36,14 +40,16 @@ postFixed4Parallel <- function(AP.fixed, setValues, setMatrix="P") |
36 | 40 |
Pneu <- sweep(PSummary,1,Amax,FUN="*") |
37 | 41 |
|
38 | 42 |
X <- apply(Aneu,2,range) |
39 |
- Y <- apply(setAs,2,range) |
|
43 |
+ Y <- apply(setValues,2,range) |
|
40 | 44 |
colnames(X) <- colnames(Y) |
41 | 45 |
if (all.equal(X,Y,tolerance=0.01) != TRUE) |
42 | 46 |
warning("As do not match fixed values.") |
43 | 47 |
|
44 | 48 |
Ps4fixAs<-list("P"=Pneu,"Psd"=Psd) |
45 | 49 |
return(Ps4fixAs) |
46 |
- } else{ |
|
50 |
+ } |
|
51 |
+ else |
|
52 |
+ { |
|
47 | 53 |
warning("setMatrix can only take values of 'A' or 'P'") |
48 | 54 |
} |
49 | 55 |
|
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
#' Post Processing of Parallel Output |
2 | 2 |
#' |
3 | 3 |
#' @param AP.fixed output of parallel gapsMapRun calls with same FP |
4 |
-#' @param setPs data.frame with rows giving fixed patterns for P used as input |
|
4 |
+#' @param setAs data.frame with rows giving fixed patterns for A used as input |
|
5 | 5 |
#' for gapsMapRun |
6 | 6 |
#' @return list of two data.frames containing the A matrix estimates or their |
7 | 7 |
#' corresponding standard deviations from output of parallel CoGAPS |
... | ... |
@@ -17,38 +17,43 @@ |
17 | 17 |
#' @examples |
18 | 18 |
#' data(SimpSim) |
19 | 19 |
#' sim_name <- "example" |
20 |
-#' createscCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name) |
|
21 |
-#' result <- scCoGAPS(sim_name, nFactor=3, nEquil=1000, nSample=1000) |
|
20 |
+#' createscCoGAPSSets(SimpSim.D, nSets=2, simulationName=sim_name) |
|
21 |
+#' result <- scCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200) |
|
22 | 22 |
#' @export |
23 | 23 |
scCoGAPS <- function(simulationName, nFactor, nCores=NA, cut=NA, minNS=NA, manualMatch=FALSE, consensusAs=NULL, ...) |
24 | 24 |
{ |
25 | 25 |
if (!is.null(list(...)$checkpointFile)) |
26 |
+ { |
|
26 | 27 |
stop("checkpoint file name automatically set in GWCoGAPS - don't pass this parameter") |
27 |
- if (is.null(consensusAs)){ |
|
28 |
- allDataSets <- preInitialPhase(simulationName, nCores) |
|
29 |
- initialResult <- runInitialPhase(simulationName, allDataSets, nFactor, ...) |
|
30 |
- if(manualMatch){ |
|
28 |
+ } |
|
29 |
+ |
|
30 |
+ if (is.null(consensusAs)) |
|
31 |
+ { |
|
32 |
+ allDataSets <- sc_preInitialPhase(simulationName, nCores) |
|
33 |
+ initialResult <- sc_runInitialPhase(simulationName, allDataSets, nFactor, ...) |
|
34 |
+ if (manualMatch) |
|
35 |
+ { |
|
31 | 36 |
saveRDS(initialResult,file=paste(simulationName,"_initial.rds", sep="")) |
32 | 37 |
stop("Please provide concensus gene weights upon restarting.") |
33 | 38 |
} |
34 |
- matchedAmplitudes <- postInitialPhase(initialResult, length(allDataSets), cut, minNS) |
|
39 |
+ matchedAmplitudes <- sc_postInitialPhase(initialResult, length(allDataSets), cut, minNS) |
|
35 | 40 |
#save(matchedAmplitudes, file=paste(simulationName, "_matched_As.RData", sep="")) |
36 | 41 |
consensusAs<-t(matchedAmplitudes[[1]]) |
37 | 42 |
} |
38 |
- finalResult <- runFinalPhase(simulationName, allDataSets, consensusAs, nCores, ...) |
|
39 |
- return(postFinalPhase(finalResult, consensusAs)) |
|
43 |
+ finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, nCores, ...) |
|
44 |
+ return(sc_postFinalPhase(finalResult, consensusAs)) |
|
40 | 45 |
} |
41 | 46 |
|
42 |
-#' Restart a GWCoGaps Run from Checkpoint |
|
47 |
+#' Restart a scCoGAPS run from a Checkpoint |
|
43 | 48 |
#' |
44 | 49 |
#' @inheritParams GWCoGAPS |
45 | 50 |
#' @return list of A and P estimates |
46 | 51 |
#' @importFrom utils file_test |
47 | 52 |
#' @export |
48 |
-GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, ...) |
|
53 |
+scCoGAPSFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, ...) |
|
49 | 54 |
{ |
50 | 55 |
# find data files |
51 |
- allDataSets <- preInitialPhase(simulationName, nCores) |
|
56 |
+ allDataSets <- sc_preInitialPhase(simulationName, nCores) |
|
52 | 57 |
|
53 | 58 |
# figure out phase from file signature |
54 | 59 |
initialCpts <- list.files(full.names=TRUE, pattern=paste(simulationName, "_initial_cpt_[0-9]+.out", sep="")) |
... | ... |
@@ -72,7 +77,7 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, |
72 | 77 |
{ |
73 | 78 |
load(paste(simulationName, "_matched_As.RData", sep="")) |
74 | 79 |
consensusAs<-matchedAmplitudes[[1]] |
75 |
- finalResult <- runFinalPhase(simulationName, allDataSets, consensusAs, ...) |
|
80 |
+ finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, ...) |
|
76 | 81 |
} |
77 | 82 |
else if (length(initialCpts)) |
78 | 83 |
{ |
... | ... |
@@ -87,19 +92,19 @@ GWCoGapsFromCheckpoint <- function(simulationName, nCores=NA, cut=NA, minNS=NA, |
87 | 92 |
cptFileName <- paste(simulationName, "_initial_cpt_", i, ".out", sep="") |
88 | 93 |
CoGapsFromCheckpoint(sampleD, sampleS, cptFileName) |
89 | 94 |
} |
90 |
- matchedAmplitudes <- postInitialPhase(initialResult, length(allDataSets), cut, minNS) |
|
95 |
+ matchedAmplitudes <- sc_postInitialPhase(initialResult, length(allDataSets), cut, minNS) |
|
91 | 96 |
save(matchedAmplitudes, file=paste(simulationName, "_matched_As.RData", sep="")) |
92 | 97 |
consensusAs<-matchedAmplitudes[[1]] |
93 |
- finalResult <- runFinalPhase(simulationName, allDataSets, consensusAs, ...) |
|
98 |
+ finalResult <- sc_runFinalPhase(simulationName, allDataSets, consensusAs, ...) |
|
94 | 99 |
} |
95 | 100 |
else |
96 | 101 |
{ |
97 | 102 |
stop("no checkpoint files found") |
98 | 103 |
} |
99 |
- return(postFinalPhase(finalResult, matchedAmplitudes)) |
|
104 |
+ return(sc_postFinalPhase(finalResult, matchedAmplitudes)) |
|
100 | 105 |
} |
101 | 106 |
|
102 |
-preInitialPhase <- function(simulationName, nCores) |
|
107 |
+sc_preInitialPhase <- function(simulationName, nCores) |
|
103 | 108 |
{ |
104 | 109 |
# find data files |
105 | 110 |
fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="") |
... | ... |
@@ -107,12 +112,14 @@ preInitialPhase <- function(simulationName, nCores) |
107 | 112 |
|
108 | 113 |
# establish the number of cores that we are able to use |
109 | 114 |
if (is.na(nCores)) |
115 |
+ { |
|
110 | 116 |
nCores <- length(allDataSets) |
117 |
+ } |
|
111 | 118 |
registerDoParallel(cores=nCores) |
112 | 119 |
return(allDataSets) |
113 | 120 |
} |
114 | 121 |
|
115 |
-runInitialPhase <- function(simulationName, allDataSets, nFactor, ...) |
|
122 |
+sc_runInitialPhase <- function(simulationName, allDataSets, nFactor, ...) |
|
116 | 123 |
{ |
117 | 124 |
#generate seeds for parallelization |
118 | 125 |
nut <- generateSeeds(chains=length(allDataSets), seed=-1) |
... | ... |
@@ -132,28 +139,36 @@ runInitialPhase <- function(simulationName, allDataSets, nFactor, ...) |
132 | 139 |
return(initialResult) |
133 | 140 |
} |
134 | 141 |
|
135 |
-postInitialPhase <- function(initialResult, nSets, cut, minNS) |
|
142 |
+sc_postInitialPhase <- function(initialResult, nSets, cut, minNS) |
|
136 | 143 |
{ |
137 | 144 |
nFactor <- ncol(initialResult[[1]]$Amean) |
138 | 145 |
BySet <- reOrderBySet(AP=initialResult, nFactor=nFactor, nSets=nSets,match="A") |
139 | 146 |
|
140 | 147 |
#run postpattern match function |
141 | 148 |
if (is.na(cut)) |
149 |
+ { |
|
142 | 150 |
cut <- nFactor |
143 |
- return(patternMatch4singleCell(Ptot=BySet$A, nP=nFactor, nSets=nSets, cnt=cut, |
|
151 |
+ } |
|
152 |
+ |
|
153 |
+ return(patternMatch4singleCell(Atot=BySet$A, nP=nFactor, nSets=nSets, cnt=cut, |
|
144 | 154 |
minNS=minNS, bySet=TRUE)) |
145 | 155 |
} |
146 | 156 |
|
147 |
-runFinalPhase <- function(simulationName, allDataSets, consensusAs, nCores, ...) |
|
157 |
+sc_runFinalPhase <- function(simulationName, allDataSets, consensusAs, nCores, ...) |
|
148 | 158 |
{ |
149 |
- if(length(dim(consensusAs))!=2){stop("consensusAs must be a matrix")} |
|
159 |
+ if (length(dim(consensusAs)) != 2) |
|
160 |
+ { |
|
161 |
+ stop("consensusAs must be a matrix") |
|
162 |
+ } |
|
150 | 163 |
|
151 | 164 |
# find data files if providing consensus patterns |
152 | 165 |
fileSig <- paste(simulationName, "_partition_[0-9]+.RData", sep="") |
153 | 166 |
allDataSets <- list.files(full.names=TRUE, pattern=fileSig) |
154 | 167 |
|
155 | 168 |
if (is.na(nCores)) |
156 |
- nCores <- length(allDataSets) |
|
169 |
+ { |
|
170 |
+ nCores <- length(allDataSets) |
|
171 |
+ } |
|
157 | 172 |
registerDoParallel(cores=nCores) |
158 | 173 |
|
159 | 174 |
# generate seeds for parallelization |
... | ... |
@@ -179,7 +194,7 @@ runFinalPhase <- function(simulationName, allDataSets, consensusAs, nCores, ...) |
179 | 194 |
return(finalResult) |
180 | 195 |
} |
181 | 196 |
|
182 |
-postFinalPhase <- function(finalResult, consensusAs) |
|
197 |
+sc_postFinalPhase <- function(finalResult, consensusAs) |
|
183 | 198 |
{ |
184 | 199 |
Aresult <- postFixed4Parallel(finalResult, consensusAs) |
185 | 200 |
finalResult <- list("Pmean"=Aresult$P, "Psd"=Aresult$Psd,"Amean"=consensusAs) |
... | ... |
@@ -18,13 +18,14 @@ Date: \tab 2018-01-24\cr |
18 | 18 |
License: \tab LGPL\cr |
19 | 19 |
} |
20 | 20 |
} |
21 |
+\author{ |
|
22 |
+Maintainer: Elana J. Fertig \email{ejfertig@jhmi.edu}, |
|
23 |
+ Michael F. Ochs \email{ochsm@tcnj.edu} |
|
24 |
+} |
|
21 | 25 |
\references{ |
22 | 26 |
Fertig EJ, Ding J, Favorov AV, Parmigiani G, Ochs MF. |
23 | 27 |
CoGAPS: an R/C++ package to identify patterns and biological |
24 | 28 |
process activity in transcriptomic data. |
25 | 29 |
Bioinformatics. 2010 Nov 1;26(21):2792-3 |
26 | 30 |
} |
27 |
-\author{ |
|
28 |
-Maintainer: Elana J. Fertig \email{ejfertig@jhmi.edu}, |
|
29 |
- Michael F. Ochs \email{ochsm@tcnj.edu} |
|
30 |
-} |
|
31 |
+ |
... | ... |
@@ -9,8 +9,7 @@ CoGAPS(D, S, nFactor = 7, nEquil = 1000, nSample = 1000, |
9 | 9 |
maxGibbmassA = 100, maxGibbmassP = 100, seed = -1, messages = TRUE, |
10 | 10 |
singleCellRNASeq = FALSE, whichMatrixFixed = "N", |
11 | 11 |
fixedPatterns = matrix(0), checkpointInterval = 0, |
12 |
- checkpointFile = "gaps_checkpoint.out", pumpThreshold = "unique", |
|
13 |
- nPumpSamples = 0, ...) |
|
12 |
+ checkpointFile = "gaps_checkpoint.out", ...) |
|
14 | 13 |
} |
15 | 14 |
\arguments{ |
16 | 15 |
\item{D}{data matrix} |
... | ... |
@@ -52,10 +51,6 @@ the fixed patterns} |
52 | 51 |
|
53 | 52 |
\item{checkpointFile}{name of the checkpoint file} |
54 | 53 |
|
55 |
-\item{pumpThreshold}{type of threshold for pump statistic} |
|
56 |
- |
|
57 |
-\item{nPumpSamples}{number of samples used in pump statistic} |
|
58 |
- |
|
59 | 54 |
\item{...}{keeps backwards compatibility with arguments from older versions} |
60 | 55 |
} |
61 | 56 |
\value{ |
... | ... |
@@ -73,3 +68,4 @@ the data matrix |
73 | 68 |
data(SimpSim) |
74 | 69 |
result <- CoGAPS(SimpSim.D, SimpSim.S, nFactor=3, nOutputs=250) |
75 | 70 |
} |
71 |
+ |
... | ... |
@@ -40,8 +40,9 @@ the data matrix for whole genome data; |
40 | 40 |
data(SimpSim) |
41 | 41 |
sim_name <- "example" |
42 | 42 |
createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name) |
43 |
-result <- GWCoGAPS(sim_name, nFactor=3, nEquil=1000, nSample=1000) |
|
43 |
+result <- GWCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200) |
|
44 | 44 |
} |
45 | 45 |
\seealso{ |
46 | 46 |
\code{\link{gapsRun}}, \code{\link{patternMatch4Parallel}}, and \code{\link{gapsMapRun}} |
47 | 47 |
} |
48 |
+ |
... | ... |
@@ -1,13 +1,9 @@ |
1 | 1 |
% Generated by roxygen2: do not edit by hand |
2 |
-% Please edit documentation in R/GWCoGAPS.R, R/scCoGAPS.R |
|
2 |
+% Please edit documentation in R/GWCoGAPS.R |
|
3 | 3 |
\name{GWCoGapsFromCheckpoint} |
4 | 4 |
\alias{GWCoGapsFromCheckpoint} |
5 |
-\alias{GWCoGapsFromCheckpoint} |
|
6 | 5 |
\title{Restart a GWCoGaps Run from Checkpoint} |
7 | 6 |
\usage{ |
8 |
-GWCoGapsFromCheckpoint(simulationName, nCores = NA, cut = NA, minNS = NA, |
|
9 |
- ...) |
|
10 |
- |
|
11 | 7 |
GWCoGapsFromCheckpoint(simulationName, nCores = NA, cut = NA, minNS = NA, |
12 | 8 |
...) |
13 | 9 |
} |
... | ... |
@@ -23,12 +19,9 @@ GWCoGapsFromCheckpoint(simulationName, nCores = NA, cut = NA, minNS = NA, |
23 | 19 |
\item{...}{additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}} |
24 | 20 |
} |
25 | 21 |
\value{ |
26 |
-list of A and P estimates |
|
27 |
- |
|
28 | 22 |
list of A and P estimates |
29 | 23 |
} |
30 | 24 |
\description{ |
31 |
-Restart a GWCoGaps Run from Checkpoint |
|
32 |
- |
|
33 | 25 |
Restart a GWCoGaps Run from Checkpoint |
34 | 26 |
} |
27 |
+ |
... | ... |
@@ -2,10 +2,10 @@ |
2 | 2 |
% Please edit documentation in R/cellMatchR.R |
3 | 3 |
\name{cellMatchR} |
4 | 4 |
\alias{cellMatchR} |
5 |
-\title{patternMatch4Parallel} |
|
5 |
+\title{cellMatchR} |
|
6 | 6 |
\usage{ |
7 |
-cellMatchR(Atot, nSets, cnt, minNS = NULL, maxNS = NULL, |
|
8 |
- ignore.NA = FALSE, bySet = FALSE, plotDen = FALSE, ...) |
|
7 |
+cellMatchR(Atot, nSets, cnt, minNS = NA, maxNS = NA, ignore.NA = FALSE, |
|
8 |
+ bySet = FALSE, plotDen = FALSE, ...) |
|
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 | 11 |
\item{Atot}{a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet}} |
... | ... |
@@ -16,21 +16,21 @@ cellMatchR(Atot, nSets, cnt, minNS = NULL, maxNS = NULL, |
16 | 16 |
|
17 | 17 |
\item{minNS}{minimum of individual set contributions a cluster must contain} |
18 | 18 |
|
19 |
+\item{maxNS}{maximum of individual set contributions a cluster must contain} |
|
20 |
+ |
|
19 | 21 |
\item{ignore.NA}{logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE.} |
20 | 22 |
|
21 | 23 |
\item{bySet}{logical indicating whether to return list of matched set solutions from \code{Atot}} |
22 | 24 |
|
23 |
-\item{...}{additional parameters for \code{agnes}} |
|
25 |
+\item{plotDen}{plot} |
|
24 | 26 |
|
25 |
-\item{R2mean}{} |
|
27 |
+\item{...}{additional parameters for \code{agnes}} |
|
26 | 28 |
} |
27 | 29 |
\value{ |
28 | 30 |
a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each |
29 | 31 |
concensus pattern is also returned. |
30 | 32 |
} |
31 | 33 |
\description{ |
32 |
-patternMatch4Parallel |
|
33 |
-} |
|
34 |
-\seealso{ |
|
35 |
-\code{\link{fastcluster}} |
|
34 |
+cellMatchR |
|
36 | 35 |
} |
36 |
+ |
... | ... |
@@ -2,30 +2,30 @@ |
2 | 2 |
% Please edit documentation in R/createGWCoGAPSSets.R |
3 | 3 |
\name{createGWCoGAPSSets} |
4 | 4 |
\alias{createGWCoGAPSSets} |
5 |
-\title{createGWCoGAPSSets} |
|
5 |
+\title{Create Gene Sets for GWCoGAPS} |
|
6 | 6 |
\usage{ |
7 |
-createGWCoGAPSSets(D, S, nSets, simulationName, path = "") |
|
7 |
+createGWCoGAPSSets(D, S, nSets, simulationName) |
|
8 | 8 |
} |
9 | 9 |
\arguments{ |
10 |
-\item{nSets}{number of sets for parallelization} |
|
10 |
+\item{D}{data matrix} |
|
11 | 11 |
|
12 |
-\item{path}{character string indicating were to save resulting data objects} |
|
12 |
+\item{S}{uncertainty matrix} |
|
13 | 13 |
|
14 |
-\item{data}{data matrix with unique rownames} |
|
14 |
+\item{nSets}{number of sets to partition the data into} |
|
15 | 15 |
|
16 |
-\item{outRDA}{name of output file} |
|
17 |
- |
|
18 |
-\item{keep}{logical indicating whether or not to save gene set list. Default is TRUE.} |
|
16 |
+\item{simulationName}{name used to identify files created by this simulation} |
|
19 | 17 |
} |
20 | 18 |
\value{ |
21 |
-list with randomly generated sets of genes from whole genome data |
|
19 |
+simulationName used to identify saved files |
|
22 | 20 |
} |
23 | 21 |
\description{ |
24 |
-\code{createGWCoGAPSSets} factors whole genome data into randomly generated sets for indexing; |
|
22 |
+Create Gene Sets for GWCoGAPS |
|
23 |
+} |
|
24 |
+\details{ |
|
25 |
+factors whole genome data into randomly generated sets for indexing |
|
25 | 26 |
} |
26 | 27 |
\examples{ |
27 |
-\dontrun{ |
|
28 |
-createGWCoGAPSSet(D,nSets=nSets) |
|
28 |
+data(SimpSim) |
|
29 |
+createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, "example") |
|
29 | 30 |
} |
30 | 31 |
|
31 |
-} |
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
\title{Create Gene Sets for scCoGAPS} |
6 | 6 |
\usage{ |
7 | 7 |
createscCoGAPSSets(D, nSets, simulationName, samplingRatio = NULL, |
8 |
- path = "") |
|
8 |
+ path = "", anotionObj = NULL) |
|
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 | 11 |
\item{D}{data matrix} |
... | ... |
@@ -18,20 +18,16 @@ createscCoGAPSSets(D, nSets, simulationName, samplingRatio = NULL, |
18 | 18 |
|
19 | 19 |
\item{path}{character string indicating were to save resulting data objects. default is current working dir} |
20 | 20 |
|
21 |
-\item{S}{uncertainty matrix} |
|
22 |
- |
|
23 |
-\item{annotionObj}{vector of same length as number of columns of D} |
|
21 |
+\item{anotionObj}{vector of same length as number of columns of D} |
|
24 | 22 |
} |
25 | 23 |
\value{ |
26 | 24 |
simulationName used to identify saved files |
27 | 25 |
} |
28 | 26 |
\description{ |
29 |
-Create Gene Sets for scCoGAPS |
|
30 |
-} |
|
31 |
-\details{ |
|
32 | 27 |
factors whole genome data into randomly generated sets for indexing |
33 | 28 |
} |
34 | 29 |
\examples{ |
35 | 30 |
data(SimpSim) |
36 |
-createscCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, "example") |
|
31 |
+createscCoGAPSSets(SimpSim.D, nSets=2, simulationName="example") |
|
37 | 32 |
} |
33 |
+ |
... | ... |
@@ -4,11 +4,12 @@ |
4 | 4 |
\alias{patternMatch4Parallel} |
5 | 5 |
\title{patternMatch4Parallel} |
6 | 6 |
\usage{ |
7 |
-patternMatch4Parallel(Ptot, nSets, cnt, minNS = NULL, maxNS = NULL, |
|
7 |
+patternMatch4Parallel(Ptot, nSets, cnt, minNS = NA, maxNS = NULL, |
|
8 | 8 |
cluster.method = "complete", ignore.NA = FALSE, bySet = FALSE, ...) |
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 |
-\item{Ptot}{a matrix containing the total by set estimates of Pmean output from \code{reOrderBySet}} |
|
11 |
+\item{Ptot}{a matrix containing the total by set estimates of Pmean output |
|
12 |
+from \code{reOrderBySet}} |
|
12 | 13 |
|
13 | 14 |
\item{nSets}{number of parallel sets used to generate \code{Ptot}} |
14 | 15 |
|
... | ... |
@@ -16,18 +17,22 @@ patternMatch4Parallel(Ptot, nSets, cnt, minNS = NULL, maxNS = NULL, |
16 | 17 |
|
17 | 18 |
\item{minNS}{minimum of individual set contributions a cluster must contain} |
18 | 19 |
|
19 |
-\item{maxNS}{max of individual set contributions a cluster must contain. default is nSets+minNS} |
|
20 |
+\item{maxNS}{max of individual set contributions a cluster must contain. |
|
21 |
+default is nSets+minNS} |
|
20 | 22 |
|
21 | 23 |
\item{cluster.method}{the agglomeration method to be used for clustering} |
22 | 24 |
|
23 |
-\item{ignore.NA}{logical indicating whether or not to ignore NAs from potential over dimensionalization. Default is FALSE.} |
|
25 |
+\item{ignore.NA}{logical indicating whether or not to ignore NAs from |
|
26 |
+potential over dimensionalization. Default is FALSE.} |
|
24 | 27 |
|
25 |
-\item{bySet}{logical indicating whether to return list of matched set solutions from \code{Ptot}} |
|
28 |
+\item{bySet}{logical indicating whether to return list of matched set |
|
29 |
+solutions from \code{Ptot}} |
|
26 | 30 |
|
27 | 31 |
\item{...}{additional parameters for \code{agnes}} |
28 | 32 |
} |
29 | 33 |
\value{ |
30 |
-a matrix of concensus patterns by samples. If \code{bySet=TRUE} then a list of the set contributions to each |
|
34 |
+a matrix of concensus patterns by samples. If \code{bySet=TRUE} then |
|
35 |
+a list of the set contributions to each |
|
31 | 36 |
concensus pattern is also returned. |
32 | 37 |
} |
33 | 38 |
\description{ |
... | ... |
@@ -36,3 +41,4 @@ patternMatch4Parallel |
36 | 41 |
\seealso{ |
37 | 42 |
\code{\link{agnes}} |
38 | 43 |
} |
44 |
+ |
... | ... |
@@ -4,13 +4,15 @@ |
4 | 4 |
\alias{postFixed4Parallel} |
5 | 5 |
\title{Post Processing of Parallel Output} |
6 | 6 |
\usage{ |
7 |
-postFixed4Parallel(AP.fixed, setPs) |
|
7 |
+postFixed4Parallel(AP.fixed, setValues, setMatrix = "P") |
|
8 | 8 |
} |
9 | 9 |
\arguments{ |
10 | 10 |
\item{AP.fixed}{output of parallel gapsMapRun calls with same FP} |
11 | 11 |
|
12 |
-\item{setPs}{data.frame with rows giving fixed patterns for P used as input |
|
12 |
+\item{setValues}{data.frame with rows giving fixed patterns for P used as input |
|
13 | 13 |
for gapsMapRun} |
14 |
+ |
|
15 |
+\item{setMatrix}{which matrix, A or P} |
|
14 | 16 |
} |
15 | 17 |
\value{ |
16 | 18 |
list of two data.frames containing the A matrix estimates or their |
... | ... |
@@ -19,3 +21,4 @@ corresponding standard deviations from output of parallel CoGAPS |
19 | 21 |
\description{ |
20 | 22 |
Post Processing of Parallel Output |
21 | 23 |
} |
24 |
+ |
... | ... |
@@ -9,7 +9,7 @@ postFixed4SC(AP.fixed, setAs) |
9 | 9 |
\arguments{ |
10 | 10 |
\item{AP.fixed}{output of parallel gapsMapRun calls with same FP} |
11 | 11 |
|
12 |
-\item{setPs}{data.frame with rows giving fixed patterns for P used as input |
|
12 |
+\item{setAs}{data.frame with rows giving fixed patterns for A used as input |
|
13 | 13 |
for gapsMapRun} |
14 | 14 |
} |
15 | 15 |
\value{ |
... | ... |
@@ -19,3 +19,4 @@ corresponding standard deviations from output of parallel CoGAPS |
19 | 19 |
\description{ |
20 | 20 |
Post Processing of Parallel Output |
21 | 21 |
} |
22 |
+ |
... | ... |
@@ -39,9 +39,10 @@ the data matrix for whole genome data; |
39 | 39 |
\examples{ |
40 | 40 |
data(SimpSim) |
41 | 41 |
sim_name <- "example" |
42 |
-createscCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, sim_name) |
|
43 |
-result <- scCoGAPS(sim_name, nFactor=3, nEquil=1000, nSample=1000) |
|
42 |
+createscCoGAPSSets(SimpSim.D, nSets=2, simulationName=sim_name) |
|
43 |
+result <- scCoGAPS(sim_name, nFactor=3, nEquil=200, nSample=200) |
|
44 | 44 |
} |
45 | 45 |
\seealso{ |
46 | 46 |
\code{\link{gapsRun}}, \code{\link{patternMatch4singleCell}}, and \code{\link{gapsMapRun}} |
47 | 47 |
} |
48 |
+ |
48 | 49 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,27 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/scCoGAPS.R |
|
3 |
+\name{scCoGAPSFromCheckpoint} |
|
4 |
+\alias{scCoGAPSFromCheckpoint} |
|
5 |
+\title{Restart a scCoGAPS run from a Checkpoint} |
|
6 |
+\usage{ |
|
7 |
+scCoGAPSFromCheckpoint(simulationName, nCores = NA, cut = NA, minNS = NA, |
|
8 |
+ ...) |
|
9 |
+} |
|
10 |
+\arguments{ |
|
11 |
+\item{simulationName}{name of this simulation} |
|
12 |
+ |
|
13 |
+\item{nCores}{number of cores for parallelization. If left to the default NA, nCores = nSets.} |
|
14 |
+ |
|
15 |
+\item{cut}{number of branches at which to cut dendrogram used in patternMatch4Parallel} |
|
16 |
+ |
|
17 |
+\item{minNS}{minimum of individual set contributions a cluster must contain} |
|
18 |
+ |
|
19 |
+\item{...}{additional parameters to be fed into \code{gapsRun} and \code{gapsMapRun}} |
|
20 |
+} |
|
21 |
+\value{ |
|
22 |
+list of A and P estimates |
|
23 |
+} |
|
24 |
+\description{ |
|
25 |
+Restart a scCoGAPS run from a Checkpoint |
|
26 |
+} |
|
27 |
+ |
... | ... |
@@ -12,6 +12,10 @@ output: |
12 | 12 |
BiocStyle::html_document |
13 | 13 |
--- |
14 | 14 |
|
15 |
+```{r include=FALSE, cache=FALSE} |
|
16 |
+library(CoGAPS) |
|
17 |
+``` |
|
18 |
+ |
|
15 | 19 |
# Introduction |
16 | 20 |
|
17 | 21 |
NMF algorithms associate gene expression changes with biological processes (e.g., time-course dynamics or disease subtypes). Compared with univariate gene associations, the relative gene weights of NMF solutions do not clearly identify gene biomarkers for these processes. Therefore, we developed a novel PatternMarkers statistic to extract genes uniquely representative of NMF patterns for enhanced visualization and subsequent biological validation. Finding unbiased gene markers with PatternMarkers requires whole-genome data. However, NMF algorithms typically do not converge for the tens of thousands of genes in genome-wide profiling. Therefore, we also developed GWCoGAPS to simultaneously cut runtime and ensure convergence for whole genome NMF with a sparse, MCMC NMF algorithm CoGAPS. The software includes a shiny application PatternMatcher to compare NMF patterns. |
... | ... |
@@ -23,59 +27,44 @@ pipeline for genome wide NMF analysis. |
23 | 27 |
|
24 | 28 |
## GWCoGAPS |
25 | 29 |
|
26 |
-### Methods |
|
27 |
-The GWCoGAPS algorithm is run by calling the GWCoGAPS function in the CoGAPS |
|
28 |
-R package as follows: |
|
30 |
+Before calling *GWCoGAPS*, the user must partition the data into random subsets |
|
31 |
+of genes. In this example we use the sample data provided in the package. |
|
29 | 32 |
|
30 | 33 |
```{r} |
31 |
-library(CoGAPS) |
|
34 |
+data(SimpSim) |
|
35 |
+simulationName <- "example" |
|
36 |
+createGWCoGAPSSets(SimpSim.D, SimpSim.S, nSets=2, simulationName) |
|
32 | 37 |
``` |
33 |
-*** |
|
34 |
-GWCoGAPS(D, S, nFactor, nSets, nCores, saveBySetResults, fname, |
|
35 |
- PatternsMatchFN = patternMatch4Parallel, Cut, minNS, ...) |
|
36 |
-*** |
|
37 | 38 |
|
38 |
-**Input Arguments** |
|
39 |
-The inputs that must be set each time are only the nSets, nFactor, and data and standard deviation matrices, with all other inputs having default values. Additional inputs to the gapsRun function, as outlined previously, can be used to taylor the analysis based on the expected dimensionality of the data. GWCoGAPS specific arguments are as follows: |
|
40 |
- |
|
41 |
-\begin{description} |
|
42 |
-\item[D]{data matrix} |
|
43 |
-\item[S]{uncertainty matrix (std devs for chi-squared of Log Likelihood)} |
|
44 |
-\item[nFactor]{number of patterns (basis vectors, metagenes), which must be |
|
45 |
-greater than or equal to the number of rows of FP} |
|
46 |
-\item[nSets]{number of sets for parallelization} |
|
47 |
-\item[nCores]{number of cores for parallelization. If left to the default NA, nCores = nSets. } |
|
48 |
-\item[saveBySetResults]{logical indicating whether to save by intermediary by set results. Default is FALSE.} |
|
49 |
-\item[fname]{character string used to label file output. Default is "GWCoGAPS.out"} |
|
50 |
-\item[PatternsMatchFN]{function to use for pattern matching across sets. Default is patternMatch4Parallel.} |
|
51 |
-\item[Cut]{number of branches at which to cut dendrogram used in patternMatch4Parallel} |
|
52 |
-\item[minNS]{minimum of individual set contributions a cluster must contain used in patternMatch4Parallel} |
|
53 |
-\end{description} |
|
39 |
+This will break up our data into 2 separate *.RData* files which we will pass |
|
40 |
+to *GWCoGAPS* by providing the *simulationName* parameter of |
|
41 |
+*createGWCoGAPSSets*. |
|
54 | 42 |
|
55 |
-\par Once the GWCoGAPS algorithm has been run, the inferred patterns and corresponding amplitudes can processed in the same way as output from gapsRun, gapsMapsRun, or CoGAPS. |
|
43 |
+```{r} |
|
44 |
+#result <- GWCoGAPS(simulationName, nFactor=3, nEquil=500, nSample=500) |
|
45 |
+``` |
|
56 | 46 |
|
57 |
-### Example: Simulated data |
|
47 |
+Note that *nFactor* is required by *GWCoGAPS* but *nEquil* and *nSample* are |
|
48 |
+optional. Any parameters passed in the ... argument of *GWCoGAPS* will be |
|
49 |
+forwarded to the internal call to *CoGAPS*. |
|
58 | 50 |
|
59 |
-\par In this example, we use the same simulated data in SimpSim (SimpSim.D), as previously described, with three known patterns (SimpSim.P) and corresponding amplitude (SimpSim.A) with specified activity in two gene sets (GSets). |
|
51 |
+There are additional optional parameters that can be passed to *GWCoGAPS*: |
|
60 | 52 |
|
61 |
-*** |
|
62 |
-library('CoGAPS') |
|
63 |
-data('SimpSim') |
|
64 |
-GWCoGAPS(SimpSim.D, SimpSim.S, nFactor=3, nCores=NA, nSets=2, |
|
65 |
- fname="test" ,PatternsMatchFN = postCoGAPSPatternMatch, |
|
66 |
- sampleSnapshots = "TRUE", numSnapshots = 3) |
|
67 |
-plotGAPS(AP.Fixed$A, AP.Fixed$P, 'ModSimFigs') |
|
68 |
-*** |
|
53 |
+*nCores* - number of cores for parallelization, if not set then nCores = nSets |
|
54 |
+*cut* - number of branches at which to cut dendrogram used in |
|
55 |
+ *patternMatch4Parallel* |
|
56 |
+*minNS* - minimum of individual set contributions a cluster must contain used |
|
57 |
+ in *patternMatch4Parallel* |
|
58 |
+*manualMatch* - logical indicating whether or not to stop after initial phase |
|
59 |
+ for manual pattern matching |
|
60 |
+*consensusPatterns* - fixed pattern matrix to be used to ensure reciprocity of |
|
61 |
+ A weights accross sets |
|
69 | 62 |
|
70 |
-Figure \ref{fig:ModSim} shows the results from plotting the GWCoGAPS estimates of ${\bf{A}}$ and ${\bf{P}}$ using \texttt{plotGAPS}. |
|
71 |
-\begin{figure}[ht] |
|
72 |
- \begin{center} |
|
73 |
- \includegraphics[width=0.45\linewidth]{ModSimFigs-Amplitude}\includegraphics[width=0.45\linewidth]{ModSimFigs-Patterns} |
|
74 |
- \end{center} |
|
75 |
- \caption{Results from GWCoGAPS on simulated data set with known true patterns.} |
|
76 |
- \label{fig:ModSim} |
|
77 |
-\end{figure} |
|
63 |
+Once the GWCoGAPS algorithm has been run, the inferred patterns and corresponding amplitudes can processed in the same way as output from CoGAPS. |
|
78 | 64 |
|
65 |
+```{r} |
|
66 |
+#plotGaps(result$Amean, result$Pmean) |
|
67 |
+``` |
|
79 | 68 |
|
80 | 69 |
## PatternMatcher Shiny App |
81 | 70 |
|
82 | 71 |
deleted file mode 100644 |
... | ... |
@@ -1,233 +0,0 @@ |
1 |
-title: "GWCoGAPS and PatternMarkers Vignette" |
|
2 |
-author: "Genevieve L. Stein-O'Brien" |
|
3 |
-date: \today |
|
4 |
-output: BiocStyle::pdf_document |
|
5 |
-bibliography: AppNote.bib |
|
6 |
-vignette: > |
|
7 |
- %\VignetteIndexEntry{Overview of CNPBayes package} |
|
8 |
- %\VignetteEngine{knitr::rmarkdown} |
|
9 |
- %\usepackage[utf8]{inputenc} |
|
10 |
- |
|
11 |
-# Introduction |
|
12 |
- |
|
13 |
-NMF algorithms associate gene expression changes with biological processes (e.g., time-course dynamics or disease subtypes). Compared with univariate gene associations, the relative gene weights of NMF solutions do not clearly identify gene biomarkers for these processes. Therefore, we developed a novel PatternMarkers statistic to extract genes uniquely representative of NMF patterns for enhanced visualization and subsequent biological validation. Finding unbiased gene markers with PatternMarkers requires whole-genome data. However, NMF algorithms typically do not converge for the tens of thousands of genes in genome-wide profiling. Therefore, we also developed GWCoGAPS to simultaneously cut runtime and ensure convergence for whole genome NMF with a sparse, MCMC NMF algorithm CoGAPS. The software includes a shiny application PatternMatcher to compare NMF patterns. |
|
14 |
- |
|
15 |
-# Executing GWCoGAPS |
|
16 |
- |
|
17 |
-In this chapter, we describe how to run both the GWCoGAPS algorithm and a manual |
|
18 |
-pipeline for genome wide NMF analysis. |
|
19 |
- |
|
20 |
-## GWCoGAPS |
|
21 |
- |
|
22 |
-### Methods |
|
23 |
-The GWCoGAPS algorithm is run by calling the GWCoGAPS function in the CoGAPS |
|
24 |
-R package as follows: |
|
25 |
- |
|
26 |
- |
|
27 |
-```r |
|
28 |
-library(CoGAPS) |
|
29 |
-GWCoGAPS(D, S, nFactor, nSets, nCores, saveBySetResults, fname, |
|
30 |
- PatternsMatchFN = patternMatch4Parallel, Cut, minNS, ...) |
|
31 |
-``` |
|
32 |
- |
|
33 |
-**Input Arguments** |
|
34 |
-The inputs that must be set each time are only the nSets, nFactor, and data and standard deviation matrices, with all other inputs having default values. Additional inputs to the gapsRun function, as outlined previously, can be used to taylor the analysis based on the expected dimensionality of the data. GWCoGAPS specific arguments are as follows: |
|
35 |
- |
|
36 |
-\begin{description} |
|
37 |
-\item[D]{data matrix} |
|
38 |
-\item[S]{uncertainty matrix (std devs for chi-squared of Log Likelihood)} |
|
39 |
-\item[nFactor]{number of patterns (basis vectors, metagenes), which must be |
|
40 |
-greater than or equal to the number of rows of FP} |
|
41 |
-\item[nSets]{number of sets for parallelization} |
|
42 |
-\item[nCores]{number of cores for parallelization. If left to the default NA, nCores = nSets. } |
|
43 |
-\item[saveBySetResults]{logical indicating whether to save by intermediary by set results. Default is FALSE.} |
|
44 |
-\item[fname]{character string used to label file output. Default is "GWCoGAPS.out"} |
|
45 |
-\item[PatternsMatchFN]{function to use for pattern matching across sets. Default is patternMatch4Parallel.} |
|
46 |
-\item[Cut]{number of branches at which to cut dendrogram used in patternMatch4Parallel} |
|
47 |
-\item[minNS]{minimum of individual set contributions a cluster must contain used in patternMatch4Parallel} |
|
48 |
-\end{description} |
|
49 |
- |
|
50 |
-\par Once the GWCoGAPS algorithm has been run, the inferred patterns and corresponding amplitudes can processed in the same way as output from gapsRun, gapsMapsRun, or CoGAPS. |
|
51 |
- |
|
52 |
-### Example: Simulated data |
|
53 |
- |
|
54 |
-\par In this example, we use the same simulated data in SimpSim (SimpSim.D), as previously described, with three known patterns (SimpSim.P) and corresponding amplitude (SimpSim.A) with specified activity in two gene sets (GSets). |
|
55 |
- |
|
56 |
- |
|
57 |
-```r |
|
58 |
-library('CoGAPS') |
|
59 |
-data('SimpSim') |
|
60 |
-GWCoGAPS(SimpSim.D, SimpSim.S, nFactor=3, nCores=NA, nSets=2, |
|
61 |
- fname="test" ,PatternsMatchFN = postCoGAPSPatternMatch, |
|
62 |
- sampleSnapshots = "TRUE", numSnapshots = 3) |
|
63 |
-plotGAPS(AP.Fixed$A, AP.Fixed$P, 'ModSimFigs') |
|
64 |
-``` |
|
65 |
- |
|
66 |
- |
|
67 |
-## Manual Pipeline |
|
68 |
- |
|
69 |
-The functions that compose the core of the GWCoGAPS algorithm are also provided in the CoGAPS R package and can be used for a manual pipeline of the same analysis or to create a custom pipeline. Additionally, a web based shiny application is included for visualizing and editing the pattern matching process. |
|
70 |
- |
|
71 |
-### PatternMatcher Shiny App |
|
72 |
- |
|
73 |
-**Input Arguments** |
|
74 |
-The inputs that must be set each time is the PBySet. Additionally, the order and sample.color inputs are only available when calling patternMatcher in an interactive R session. |
|
75 |
- |
|
76 |
-\begin{description} |
|
77 |
-\item[PBySet]{list of matched set solutions for the Pmatrix from an NMF algorithm} |
|
78 |
-\item[out]{optional name for saving output} |
|
79 |
-\item[order]{optional vector indicating order of samples for plotting. Default is NULL.} |
|
80 |
-\item[sample.color]{optional vector of colors of same lenght as colnames. Default is NULL.} |
|
81 |
-\end{description} |
|
82 |
- |
|
83 |
- |
|
84 |
- |
|
85 |
-### Example: Simulated data} |
|
86 |
- |
|
87 |
-\par This example will manually generate the same result as calling the GWCoGAPS function as given in the example in the previous section. |
|
88 |
- |
|
89 |
- |
|
90 |
-```r |
|
91 |
-data('SimpSim') |
|
92 |
-D<-SimpSim.D |
|
93 |
-S<-SimpSim.S |
|
94 |
-nFactor=3 |
|
95 |
-nSets<-2 |
|
96 |
-nCores=NA |
|
97 |
-numSnapshots=3 |
|
98 |
-saveBySetResults=TRUE |
|
99 |
-fname="test" |
|
100 |
-minNS=NA |
|
101 |
- |
|
102 |
-# break the data into sets |
|
103 |
-genesInSets<-createGWCoGAPSSets(data=D, nSets=nSets, keep=FALSE) |
|
104 |
- |
|
105 |
-#generate seeds for parallelization |
|
106 |
-nut<-generateSeeds(chains=nSets, seed=-1) |
|
107 |
- |
|
108 |
-# establish the number of cores that you are able to use |
|
109 |
-if(is.na(nCores)){nCores<-nSets} |
|
110 |
-registerDoParallel(cores=nCores) |
|
111 |
- |
|
112 |
-# run CoGAPS for each set |
|
113 |
-AP <- foreach(i=1:nSets) %dopar% { |
|
114 |
- D <- as.matrix(D[genesInSets[[i]],]) |
|
115 |
- S <- as.matrix(S[genesInSets[[i]],]) |
|
116 |
- gapsRun(D=D, S=S, nFactor=nFactor,seed=nut[i],numSnapshots=numSnapshots) |
|
117 |
-} |
|
118 |
- |
|
119 |
-BySet<-reOrderBySet(AP=AP,nFactor=nFactor,nSets=nSets) |
|
120 |
-matchedPs<-patternMatch4Parallel(Ptot=BySet$P,nP=nFactor,nS=nSets,cnt=nFactor,minNS=minNS,bySet=TRUE) |
|
121 |
- |
|
122 |
-# use shiny for pattern matching |
|
123 |
-selectPBySet<-PatternMatcher(PBySet=matchedPs[["PBySet"]]) |
|
124 |
-library(reshape2) |
|
125 |
-selectPBySet<-dcast(selectPBySet, BySet ~ Samples) |
|
126 |
-rownames(selectPBySet)<-selectPBySet$BySet |
|
127 |
-selectPBySet<-as.matrix(selectPBySet[,-1]) |
|
128 |
-matchedPs<-patternMatch4Parallel(Ptot=selectPBySet,nP=nFactor,nS=nSets,cnt=nFactor,minNS=minNS,bySet=FALSE) |
|
129 |
- |
|
130 |
-#generate seeds for parallelization |
|
131 |
-nut<-generateSeeds(chains=nSets, seed=-1) |
|
132 |
-#final number of factors |
|
133 |
-nFactorFinal<-dim(matchedPs)[1] |
|
134 |
- |
|
135 |
-# run fixed CoGAPS |
|
136 |
-Fixed <- foreach(i=1:nSets) %dopar% { |
|
137 |
- D <- as.matrix(D[genesInSets[[i]],]) |
|
138 |
- S <- as.matrix(S[genesInSets[[i]],]) |
|
139 |
- AP <- gapsMapRun(D, S, FP=matchedPs, nFactor=nFactorFinal, fixedMatrix = "P",seed=nut[i],numSnapshots=numSnapshots) |
|
140 |
-} |
|
141 |
- |
|
142 |
-#extract A and Asds |
|
143 |
-As4fixPs <- postFixed(AP.fixed=Fixed,setPs=matchedPs) |
|
144 |
-``` |
|
145 |
- |
|
146 |
- |
|
147 |
-# PatternMarkers |
|
148 |
- |
|
149 |
-In this chapter, we describe the PatternMarkers statistic. |
|
150 |
- |
|
151 |
-## PatternMarkers |
|
152 |
- |
|
153 |
-The PatternMarkers statistic finds the genes most uniquely associated with a given pattern or linear combination of patterns by computing |
|
154 |
- |
|
155 |
-\begin{equation} |
|
156 |
-\sqrt{\left(\bf{A}_{i}-l_{p}\right)^{\textit{t}} \left(\bf{A}_{i}-l_{p}\right)} |
|
157 |
-\label{eq:PatternMarkers} |
|
158 |
-\end{equation} |
|
159 |
- |
|
160 |
-where are the elements of the $\bf{A}$ matrix for the $\textit{i}^{th}$ gene scaled to have a maximum of one and \textit{l} is the $\textit{p}^{th}$ user specified norm. The genes are then ranked. In the case where \textit{l} is the identity vector, the ranking is run separately for each of the K patterns. Unique sets are generated by thresholding using the first gene to have a lower ranking, i.e. better fit to, another patterns. |
|
161 |
- |
|
162 |
-### Methods |
|
163 |
- |
|
164 |
-The PatternMarkers statistic is run by calling thepatternMarkers function in the CoGAPS R package as follows: |
|
165 |
- |
|
166 |
- |
|
167 |
-```r |
|
168 |
- patternMarkers(Amatrix = AP$Amean, scaledPmatrix = FALSE, Pmatrix = NA, |
|
169 |
- threshold = "cut", lp = NA, full = FALSE, ...) |
|
170 |
-``` |
|
171 |
- |
|
172 |
-**Input Arguments** |
|
173 |
-\begin{description} |
|
174 |
-\item[Amatrix]{A matrix of genes by weights resulting from CoGAPS or other NMF decomposition} |
|
175 |
-\item[scaledPmatrix]{logical indicating whether the corresponding pattern matrix was fixed to have max 1 during decomposition} |
|
176 |
-\item[Pmatrix]{the corresponding Pmatrix (patterns X samples) for the provided Amatrix (genes x patterns). This must be supplied if scaledPmatrix is FALSE.} |
|
177 |
-\item[threshold]{the type of threshold to be used. The default "cut" will thresholding by the first gene to have a lower ranking, i.e. better fit to, a pattern. Alternatively, threshold="all" will return all of the genes in rank order for each pattern.} |
|
178 |
-\item[lp]{a vector of weights for each pattern to be used for finding markers. If NA markers for each pattern of the A matrix will be used.} |
|
179 |
-\item[full]{logical indicating whether to return the ranks of each gene for each pattern} |
|
180 |
-\end{description} |
|
181 |
- |
|
182 |
-Once the PatternMarkers statistic has been run, a heatmap of each markers expression level can be displayed using the plotPatternMarkers function as follows: |
|
183 |
- |
|
184 |
- |
|
185 |
-```r |
|
186 |
-plotPatternMarkers(data = NA, patternMarkers = PatternMarkers, |
|
187 |
- patternPalette = NA, sampleNames = NA, samplePalette = NA, |
|
188 |
- colDenogram = TRUE, heatmapCol = "bluered", scale = "row", ...) |
|
189 |
-``` |
|
190 |
- |
|
191 |
-**Input Arguments** |
|
192 |
-\begin{description} |
|
193 |
-\item[data]{the dataset from which the patterns where generated} |
|
194 |
-\item[patternMarkers]{the list of genes generated from the patternMarkers function} |
|
195 |
-\item[patternPalette]{a vector indicating what color should be used for each pattern} |
|
196 |
-\item[sampleNames]{names of the samples to use for labeling } |
|
197 |
-\item[samplePalette]{a vector indicating what color should be used for each sample} |
|
198 |
-\end{description} |
|
199 |
- |
|
200 |
-### Example: Simulated data |
|
201 |
- |
|
202 |
- |
|
203 |
-```r |
|
204 |
-# GWCoGAPS output |
|
205 |
-PatternMarkers<-patternMarkers(Amatrix=AP.fixed$A,scaledPmatrix=TRUE,threshold="cut") |
|
206 |
-plotPatternMarkers(data=D,patternMarkers=PatternMarkers,patternPalette=c("grey","navy","orange")) |
|
207 |
-# manual pipeline output |
|
208 |
-PatternMarkers<-patternMarkers(Amatrix=As4fixPs$A,scaledPmatrix=TRUE,threshold="cut") |
|
209 |
-plotPatternMarkers(data=D,patternMarkers=PatternMarkers,patternPalette=c("grey","navy","orange")) |
|
210 |
-``` |
|
211 |
- |
|
212 |
-Figure \ref{fig:PM1} shows the results from running plotPatternMarkers on the PatternMarkers generated from the the GWCoGAPS (left) and manual pipeline (right) results generated from the simulated data as previously illustrated. |
|
213 |
-\begin{figure}[h] |
|
214 |
-\begin{center} |
|
215 |
-\includegraphics[width=0.4\linewidth]{GWCoGAPSPMs} |
|
216 |
-\includegraphics[width=0.4\linewidth]{ManPipePMs} |
|
217 |
-\end{center} |
|
218 |
-\caption{Heatmap of PatternMarkers expression levels for simulated data.} |
|
219 |
-\label{fig:PM1} |
|
220 |
-\end{figure} |
|
221 |
- |
|
222 |
-# Feedback |
|
223 |