Goal is to initialize container for A and B of the right dimension and avoid unnecessary read / writes
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52852 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -73,8 +73,8 @@ construct <- function(filenames, |
73 | 73 |
cnSet <- new("CNSet", |
74 | 74 |
alleleA=initializeBigMatrix(name="A", nr, nc), |
75 | 75 |
alleleB=initializeBigMatrix(name="B", nr, nc), |
76 |
-## call=initializeBigMatrix(name="call", nr, nc), |
|
77 |
-## callProbability=initializeBigMatrix(name="callPr", nr,nc), |
|
76 |
+ call=initializeBigMatrix(name="call", nr, nc), |
|
77 |
+ callProbability=initializeBigMatrix(name="callPr", nr,nc), |
|
78 | 78 |
annotation=cdfName, |
79 | 79 |
batch=batch) |
80 | 80 |
sampleNames(cnSet) <- sns |
... | ... |
@@ -116,27 +116,64 @@ genotype <- function(filenames, |
116 | 116 |
if(missing(cdfName)) stop("must specify cdfName") |
117 | 117 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
118 | 118 |
if(missing(sns)) sns <- basename(filenames) |
119 |
- callSet <- construct(filenames=filenames, |
|
120 |
- cdfName=cdfName, |
|
121 |
- copynumber=TRUE, |
|
122 |
- sns=sns, |
|
123 |
- verbose=verbose, |
|
124 |
- batch=batch) |
|
125 |
- FUN <- ifelse(is.lds, "snprma2", "snprma") |
|
126 |
- snprmaFxn <- function(FUN,...){ |
|
127 |
- switch(FUN, |
|
128 |
- snprma=snprma(...), |
|
129 |
- snprma2=snprma2(...)) |
|
130 |
- } |
|
131 |
- snprmaRes <- snprmaFxn(FUN, |
|
132 |
- filenames=filenames, |
|
133 |
- mixtureSampleSize=mixtureSampleSize, |
|
134 |
- fitMixture=TRUE, |
|
135 |
- eps=eps, |
|
136 |
- verbose=verbose, |
|
137 |
- seed=seed, |
|
138 |
- cdfName=cdfName, |
|
139 |
- sns=sns) |
|
119 |
+## callSet <- construct(filenames=filenames, |
|
120 |
+## cdfName=cdfName, |
|
121 |
+## copynumber=TRUE, |
|
122 |
+## sns=sns, |
|
123 |
+## verbose=verbose, |
|
124 |
+## batch=batch) |
|
125 |
+## FUN <- ifelse(is.lds, "snprma2", "snprma") |
|
126 |
+## snprmaFxn <- function(FUN,...){ |
|
127 |
+## switch(FUN, |
|
128 |
+## snprma=snprma(...), |
|
129 |
+## snprma2=snprma2(...)) |
|
130 |
+## } |
|
131 |
+## snprmaRes <- snprmaFxn(FUN, |
|
132 |
+## filenames=filenames, |
|
133 |
+## mixtureSampleSize=mixtureSampleSize, |
|
134 |
+## fitMixture=TRUE, |
|
135 |
+## eps=eps, |
|
136 |
+## verbose=verbose, |
|
137 |
+## seed=seed, |
|
138 |
+## cdfName=cdfName, |
|
139 |
+## sns=sns) |
|
140 |
+ ##--------------------------------------------------------------------------- |
|
141 |
+ ## |
|
142 |
+ ## from snprma2. Goal is to initialize A and B with appropriate dimension for snps+nps |
|
143 |
+ ## |
|
144 |
+ ##--------------------------------------------------------------------------- |
|
145 |
+ if (missing(sns)) sns <- basename(filenames) |
|
146 |
+ if (missing(cdfName)) |
|
147 |
+ cdfName <- read.celfile.header(filenames[1])[["cdfName"]] |
|
148 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
149 |
+ stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
150 |
+ if(verbose) message("Loading annotations and mixture model parameters.") |
|
151 |
+ obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
152 |
+ pnsa <- getVarInEnv("pnsa") |
|
153 |
+ pnsb <- getVarInEnv("pnsb") |
|
154 |
+ gns <- getVarInEnv("gns") |
|
155 |
+ rm(list=obj, envir=.crlmmPkgEnv) |
|
156 |
+ rm(obj) |
|
157 |
+ if(verbose) message("Initializing objects.") |
|
158 |
+ mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
159 |
+ SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
|
160 |
+ SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
|
161 |
+## A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer") |
|
162 |
+## B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer") |
|
163 |
+ featureData <- getFeatureData(cdfName, copynumber=TRUE) |
|
164 |
+ nr <- nrow(featureData); nc <- length(sns) |
|
165 |
+ A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer") |
|
166 |
+ B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer") |
|
167 |
+ rownames(A) <- rownames(B) <- featureNames(featureData) |
|
168 |
+ |
|
169 |
+ sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
170 |
+ if(verbose) message("Processing ", length(filenames), " files.") |
|
171 |
+ ocLapply(sampleBatches, rsprocessCEL, filenames=filenames, |
|
172 |
+ fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR, |
|
173 |
+ mixtureParams=mixtureParams, eps=eps, seed=seed, |
|
174 |
+ mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
|
175 |
+ neededPkgs=c("crlmm", pkgname)) |
|
176 |
+ |
|
140 | 177 |
gns <- snprmaRes[["gns"]] |
141 | 178 |
snp.I <- isSnp(callSet) |
142 | 179 |
is.snp <- which(snp.I) |
... | ... |
@@ -242,6 +279,69 @@ genotype <- function(filenames, |
242 | 279 |
return(callSet) |
243 | 280 |
} |
244 | 281 |
|
282 |
+rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
|
283 |
+ mixtureParams, eps, seed, mixtureSampleSize, |
|
284 |
+ pkgname){ |
|
285 |
+ obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
286 |
+ obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
287 |
+ obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
288 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
289 |
+ pnsa <- getVarInEnv("pnsa") |
|
290 |
+ pnsb <- getVarInEnv("pnsb") |
|
291 |
+ fid <- getVarInEnv("fid") |
|
292 |
+ reference <- getVarInEnv("reference") |
|
293 |
+ aIndex <- getVarInEnv("aIndex") |
|
294 |
+ bIndex <- getVarInEnv("bIndex") |
|
295 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
296 |
+ theKnots <- getVarInEnv("theKnots") |
|
297 |
+ gns <- getVarInEnv("gns") |
|
298 |
+ rm(list=c(obj1, obj2, obj3), envir=.crlmmPkgEnv) |
|
299 |
+ rm(obj1, obj2, obj3) |
|
300 |
+ |
|
301 |
+ ## for mixture |
|
302 |
+ set.seed(seed) |
|
303 |
+ idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
304 |
+ ##for skewness. no need to do everything |
|
305 |
+ idx2 <- sample(length(fid), 10^5) |
|
306 |
+ |
|
307 |
+ open(A) |
|
308 |
+ open(B) |
|
309 |
+ open(SKW) |
|
310 |
+ open(mixtureParams) |
|
311 |
+ open(SNR) |
|
312 |
+ |
|
313 |
+ for (k in i){ |
|
314 |
+ y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
315 |
+ x <- log2(y[idx2]) |
|
316 |
+ SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
317 |
+ rm(x) |
|
318 |
+ y <- normalize.quantiles.use.target(y, target=reference) |
|
319 |
+ A[, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
320 |
+ B[, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
321 |
+ rm(y) |
|
322 |
+ if(fitMixture){ |
|
323 |
+ S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN |
|
324 |
+ M <- log2(A[idx, k])-log2(B[idx, k]) |
|
325 |
+ tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
326 |
+ rm(S, M) |
|
327 |
+ mixtureParams[, k] <- tmp[["coef"]] |
|
328 |
+ SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
329 |
+ rm(tmp) |
|
330 |
+ } else { |
|
331 |
+ mixtureParams[, k] <- NA |
|
332 |
+ SNR[k] <- NA |
|
333 |
+ } |
|
334 |
+ } |
|
335 |
+ close(A) |
|
336 |
+ close(B) |
|
337 |
+ close(SKW) |
|
338 |
+ close(mixtureParams) |
|
339 |
+ close(SNR) |
|
340 |
+ rm(list=ls()) |
|
341 |
+ gc(verbose=FALSE) |
|
342 |
+ TRUE |
|
343 |
+} |
|
344 |
+ |
|
245 | 345 |
genotype2 <- function(){ |
246 | 346 |
.Defunct(msg="The genotype2 function has been deprecated. The function genotype should be used instead. genotype will support large data using ff provided that the ff package is loaded.") |
247 | 347 |
} |
... | ... |
@@ -15,7 +15,7 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
15 | 15 |
message(strwrap(msg)) |
16 | 16 |
stop("Package ", pkgname, " could not be found.") |
17 | 17 |
} |
18 |
- |
|
18 |
+ |
|
19 | 19 |
if(verbose) message("Loading annotations and mixture model parameters.") |
20 | 20 |
loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
21 | 21 |
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
... | ... |
@@ -30,7 +30,7 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
30 | 30 |
SMEDIAN <- getVarInEnv("SMEDIAN") |
31 | 31 |
theKnots <- getVarInEnv("theKnots") |
32 | 32 |
gns <- getVarInEnv("gns") |
33 |
- |
|
33 |
+ |
|
34 | 34 |
##We will read each cel file, summarize, and run EM one by one |
35 | 35 |
##We will save parameters of EM to use later |
36 | 36 |
mixtureParams <- matrix(0, 4, length(filenames)) |
... | ... |
@@ -39,7 +39,7 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
39 | 39 |
## mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames)) |
40 | 40 |
## SNR <- initializeBigVector("crlmmSNR-", length(filenames), "numeric") |
41 | 41 |
## SKW <- initializeBigVector("crlmmSKW-", length(filenames), "numeric") |
42 |
- |
|
42 |
+ |
|
43 | 43 |
## This is the sample for the fitting of splines |
44 | 44 |
## BC: I like better the idea of the user passing the seed, |
45 | 45 |
## because this might intefere with other analyses |
... | ... |
@@ -51,15 +51,15 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
51 | 51 |
##f is the correction. we save to avoid recomputing |
52 | 52 |
A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
53 | 53 |
B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
54 |
- |
|
54 |
+ |
|
55 | 55 |
if(verbose){ |
56 | 56 |
message("Processing ", length(filenames), " files.") |
57 | 57 |
pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
58 | 58 |
} |
59 |
- |
|
59 |
+ |
|
60 | 60 |
##for skewness. no need to do everything |
61 | 61 |
idx2 <- sample(length(fid), 10^5) |
62 |
- |
|
62 |
+ |
|
63 | 63 |
##We start looping throug cel files |
64 | 64 |
for(i in seq(along=filenames)){ |
65 | 65 |
y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
... | ... |
@@ -73,10 +73,10 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
73 | 73 |
if(fitMixture){ |
74 | 74 |
S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
75 | 75 |
M <- log2(A[idx, i])-log2(B[idx, i]) |
76 |
- |
|
76 |
+ |
|
77 | 77 |
##we need to test the choice of eps.. it is not the max diff between funcs |
78 | 78 |
tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
79 |
- |
|
79 |
+ |
|
80 | 80 |
mixtureParams[, i] <- tmp[["coef"]] |
81 | 81 |
SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
82 | 82 |
} |
... | ... |
@@ -97,12 +97,12 @@ fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=1 |
97 | 97 |
mus <- append(quantile(M, c(1, 5)/6, names=FALSE), 0, 1) |
98 | 98 |
sigmas <- rep(mad(c(M[M<mus[1]]-mus[1], M[M>mus[3]]-mus[3])), 3) |
99 | 99 |
sigmas[2] <- sigmas[2]/2 |
100 |
- |
|
100 |
+ |
|
101 | 101 |
weights <- apply(cbind(mus, sigmas), 1, function(p) dnorm(M, p[1], p[2])) |
102 | 102 |
previousF1 <- -Inf |
103 | 103 |
change <- eps+1 |
104 | 104 |
it <- 0 |
105 |
- |
|
105 |
+ |
|
106 | 106 |
if(verbose) message("Max change must be under ", eps, ".") |
107 | 107 |
matS <- stupidSplineBasis(S, knots) |
108 | 108 |
while (change > eps & it < maxit){ |
... | ... |
@@ -112,19 +112,19 @@ fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=1 |
112 | 112 |
LogLik <- rowSums(z) |
113 | 113 |
z <- sweep(z, 1, LogLik, "/") |
114 | 114 |
probs <- colMeans(z) |
115 |
- |
|
115 |
+ |
|
116 | 116 |
## M |
117 | 117 |
fit1 <- crossprod(chol2inv(chol(crossprod(sweep(matS, 1, z[, 1], FUN="*"), matS))), crossprod(matS, z[, 1]*M)) |
118 |
- |
|
118 |
+ |
|
119 | 119 |
fit2 <- sum(z[, 2]*M)/sum(z[, 2]) |
120 | 120 |
F1 <- matS%*%fit1 |
121 | 121 |
sigmas[c(1, 3)] <- sqrt(sum(z[, 1]*(M-F1)^2)/sum(z[, 1])) |
122 | 122 |
sigmas[2] <- sqrt(sum(z[, 2]*(M-fit2)^2)/sum(z[, 2])) |
123 |
- |
|
123 |
+ |
|
124 | 124 |
weights[, 1] <- dnorm(M, F1, sigmas[1]) |
125 | 125 |
weights[, 2] <- dnorm(M, fit2, sigmas[2]) |
126 | 126 |
weights[, 3] <- dnorm(M, -F1, sigmas[3]) |
127 |
- |
|
127 |
+ |
|
128 | 128 |
change <- max(abs(F1-previousF1)) |
129 | 129 |
previousF1 <- F1 |
130 | 130 |
if(verbose) message("Iter ", it, ": ", change, ".") |
... | ... |
@@ -140,7 +140,7 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
140 | 140 |
cdfName <- read.celfile.header(filenames[1])[["cdfName"]] |
141 | 141 |
pkgname <- getCrlmmAnnotationName(cdfName) |
142 | 142 |
stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
143 |
- |
|
143 |
+ |
|
144 | 144 |
if(verbose) message("Loading annotations and mixture model parameters.") |
145 | 145 |
obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
146 | 146 |
pnsa <- getVarInEnv("pnsa") |
... | ... |
@@ -148,14 +148,14 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
148 | 148 |
gns <- getVarInEnv("gns") |
149 | 149 |
rm(list=obj, envir=.crlmmPkgEnv) |
150 | 150 |
rm(obj) |
151 |
- |
|
151 |
+ |
|
152 | 152 |
##We will read each cel file, summarize, and run EM one by one |
153 | 153 |
##We will save parameters of EM to use later |
154 | 154 |
if(verbose) message("Initializing objects.") |
155 | 155 |
mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
156 | 156 |
SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
157 | 157 |
SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
158 |
- |
|
158 |
+ |
|
159 | 159 |
## This is the sample for the fitting of splines |
160 | 160 |
## BC: I like better the idea of the user passing the seed, |
161 | 161 |
## because this might intefere with other analyses |
... | ... |
@@ -180,7 +180,7 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
180 | 180 |
close(SKW) |
181 | 181 |
close(A) |
182 | 182 |
close(B) |
183 |
- |
|
183 |
+ |
|
184 | 184 |
list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
185 | 185 |
} |
186 | 186 |
|
... | ... |
@@ -188,7 +188,7 @@ snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
188 | 188 |
processCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
189 | 189 |
mixtureParams, eps, seed, mixtureSampleSize, |
190 | 190 |
pkgname){ |
191 |
- |
|
191 |
+ |
|
192 | 192 |
obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
193 | 193 |
obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
194 | 194 |
obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
... | ... |
@@ -226,7 +226,7 @@ processCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
226 | 226 |
A[, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
227 | 227 |
B[, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
228 | 228 |
rm(y) |
229 |
- |
|
229 |
+ |
|
230 | 230 |
if(fitMixture){ |
231 | 231 |
S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN |
232 | 232 |
M <- log2(A[idx, k])-log2(B[idx, k]) |