git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52855 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -117,27 +117,6 @@ genotype <- function(filenames, |
117 | 117 |
if(missing(cdfName)) stop("must specify cdfName") |
118 | 118 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
119 | 119 |
if(missing(sns)) sns <- basename(filenames) |
120 |
-## callSet <- construct(filenames=filenames, |
|
121 |
-## cdfName=cdfName, |
|
122 |
-## copynumber=TRUE, |
|
123 |
-## sns=sns, |
|
124 |
-## verbose=verbose, |
|
125 |
-## batch=batch) |
|
126 |
-## FUN <- ifelse(is.lds, "snprma2", "snprma") |
|
127 |
-## snprmaFxn <- function(FUN,...){ |
|
128 |
-## switch(FUN, |
|
129 |
-## snprma=snprma(...), |
|
130 |
-## snprma2=snprma2(...)) |
|
131 |
-## } |
|
132 |
-## snprmaRes <- snprmaFxn(FUN, |
|
133 |
-## filenames=filenames, |
|
134 |
-## mixtureSampleSize=mixtureSampleSize, |
|
135 |
-## fitMixture=TRUE, |
|
136 |
-## eps=eps, |
|
137 |
-## verbose=verbose, |
|
138 |
-## seed=seed, |
|
139 |
-## cdfName=cdfName, |
|
140 |
-## sns=sns) |
|
141 | 120 |
##--------------------------------------------------------------------------- |
142 | 121 |
## |
143 | 122 |
## from snprma2. Goal is to initialize A and B with appropriate dimension for snps+nps |
... | ... |
@@ -159,14 +138,11 @@ genotype <- function(filenames, |
159 | 138 |
mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
160 | 139 |
SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
161 | 140 |
SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
162 |
-## A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer") |
|
163 |
-## B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer") |
|
164 | 141 |
featureData <- getFeatureData(cdfName, copynumber=TRUE) |
165 | 142 |
nr <- nrow(featureData); nc <- length(sns) |
166 | 143 |
A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer") |
167 | 144 |
B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer") |
168 | 145 |
rownames(A) <- rownames(B) <- featureNames(featureData) |
169 |
- |
|
170 | 146 |
sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
171 | 147 |
if(verbose) message("Processing ", length(filenames), " files.") |
172 | 148 |
ocLapply(sampleBatches, rsprocessCEL, filenames=filenames, |
... | ... |
@@ -174,11 +150,13 @@ genotype <- function(filenames, |
174 | 150 |
mixtureParams=mixtureParams, eps=eps, seed=seed, |
175 | 151 |
mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
176 | 152 |
neededPkgs=c("crlmm", pkgname)) |
177 |
- |
|
178 | 153 |
## Now we initialize a CNSet object, cloning A and B |
179 |
- message("Cloning A and B intensities") |
|
154 |
+ message("Cloning A and B objects") |
|
155 |
+ ## The clones will be used to store calls and confidence scores |
|
180 | 156 |
open(A) |
181 | 157 |
open(B) |
158 |
+ ##user do |
|
159 |
+ ## options("ffbatchbytes"=XXX) to make this efficient |
|
182 | 160 |
call <- clone(A) |
183 | 161 |
callProbability=clone(B) |
184 | 162 |
close(A) |
... | ... |
@@ -193,7 +171,9 @@ genotype <- function(filenames, |
193 | 171 |
batch=batch) |
194 | 172 |
if(!missing(sns)){ |
195 | 173 |
sampleNames(cnSet) <- sns |
174 |
+ open(A) |
|
196 | 175 |
protocolData <- annotatedDataFrameFrom(A(cnSet), byrow=FALSE) |
176 |
+ close(A) |
|
197 | 177 |
} else { |
198 | 178 |
sampleNames(cnSet) <- basename(filenames) |
199 | 179 |
protocolData <- getProtocolData.Affy(filenames) |
... | ... |
@@ -205,132 +185,36 @@ genotype <- function(filenames, |
205 | 185 |
phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd) |
206 | 186 |
stopifnot(validObject(cnSet)) |
207 | 187 |
snp.I <- isSnp(cnSet) |
208 |
- ##gns <- snprmaRes[["gns"]] |
|
209 |
-## snp.I <- isSnp(callSet) |
|
210 |
-## is.snp <- which(snp.I) |
|
211 |
-## snp.index <- match(featureNames(callSet)[is.snp], gns) |
|
212 |
-## stopifnot(identical(featureNames(callSet)[is.snp], gns[snp.index])) |
|
213 |
-#### is.snp <- isSnp(callSet) |
|
214 |
-#### snp.index <- which(is.snp) |
|
215 |
-## if(is.lds) open(callSet) |
|
216 |
-## mixtureParams <- matrix(NA, 4, length(filenames)) |
|
217 |
-## ##message("Saving snprmaRes file") |
|
218 |
-## ##save(snprmaRes, file=file.path(outdir, "snprmaRes.rda")) |
|
219 |
-## if(verbose) message("Finished preprocessing.") |
|
220 |
-## gns <- snprmaRes[["gns"]] |
|
221 |
-## snp.I <- isSnp(callSet) |
|
222 |
-## is.snp <- which(snp.I) |
|
223 |
-## snp.index <- match(featureNames(callSet)[is.snp], gns) |
|
224 |
-## stopifnot(identical(featureNames(callSet)[is.snp], gns[snp.index])) |
|
225 |
-## if(is.lds) open(callSet) |
|
226 |
-## mixtureParams <- matrix(NA, 4, length(filenames)) |
|
227 |
-## if(is.lds){ |
|
228 |
-## open(snprmaRes[["A"]]) |
|
229 |
-## open(snprmaRes[["B"]]) |
|
230 |
-## open(snprmaRes[["SNR"]]) |
|
231 |
-## open(snprmaRes[["mixtureParams"]]) |
|
232 |
-## bb <- getOption("ffbatchbytes") |
|
233 |
-## ffcolapply(A(callSet)[is.snp, i1:i2] <- snprmaRes[["A"]][snp.index, i1:i2], X=snprmaRes[["A"]], |
|
234 |
-## BATCHBYTES=bb) |
|
235 |
-## ffcolapply(B(callSet)[is.snp, i1:i2] <- snprmaRes[["B"]][snp.index, i1:i2], X=snprmaRes[["B"]], |
|
236 |
-## BATCHBYTES=bb) |
|
237 |
-## } else{ |
|
238 |
-## A(callSet)[is.snp, ] <- snprmaRes[["A"]][snp.index, ] |
|
239 |
-## B(callSet)[is.snp, ] <- snprmaRes[["B"]][snp.index, ] |
|
240 |
-## } |
|
241 |
-## pData(callSet)$SKW <- snprmaRes[["SKW"]] |
|
242 |
-## pData(callSet)$SNR <- snprmaRes[["SNR"]] |
|
243 | 188 |
pData(cnSet)$SKW <- SKW |
244 | 189 |
pData(cnSet)$SNR <- SNR |
245 |
-## mixtureParams <- snprmaRes$mixtureParams |
|
246 | 190 |
np.index <- which(!snp.I) |
247 | 191 |
if(verbose) message("Normalizing nonpolymorphic markers") |
248 | 192 |
FUN <- ifelse(is.lds, "cnrma2", "cnrma") |
249 |
- ## main purpose is to update 'alleleA' |
|
250 |
- cnrmaFxn <- function(FUN,...){ |
|
251 |
- switch(FUN, |
|
252 |
- cnrma=cnrma(...), |
|
253 |
- cnrma2=cnrma2(...)) |
|
254 |
- } |
|
255 |
- ## consider passing only A for NPs. |
|
256 | 193 |
if(verbose) message("Quantile normalizing nonpolymorphic markers") |
257 |
-## AA <- cnrmaFxn(FUN, A=A(cnSet), |
|
258 |
-## filenames=filenames, |
|
259 |
-## row.names=featureNames(cnSet)[np.index], |
|
260 |
-## cdfName=cdfName, |
|
261 |
-## sns=sns, |
|
262 |
-## seed=seed, |
|
263 |
-## verbose=verbose) |
|
264 | 194 |
cnrma2(A=A(cnSet), |
265 | 195 |
filenames=filenames, |
266 | 196 |
row.names=featureNames(cnSet)[np.index], |
267 | 197 |
cdfName=cdfName, |
268 |
- sns=sns, |
|
198 |
+ sns=sampleNames(cnSet), |
|
269 | 199 |
seed=seed, |
270 | 200 |
verbose=verbose) |
271 |
- ##if(!is.lds) A(callSet) <- AA |
|
272 |
- ##rm(AA) |
|
273 |
-## FUN <- ifelse(is.lds, "crlmmGT2", "crlmmGT") |
|
274 |
-## ## genotyping |
|
275 |
-## crlmmGTfxn <- function(FUN,...){ |
|
276 |
-## switch(FUN, |
|
277 |
-## crlmmGT2=crlmmGT2(...), |
|
278 |
-## crlmmGT=crlmmGT(...)) |
|
279 |
-## } |
|
280 |
-## if(verbose) message("Running crlmmGT2") |
|
281 |
-## tmp <- crlmmGTfxn(FUN, |
|
282 |
-## A=snprmaRes[["A"]], |
|
283 |
-## B=snprmaRes[["B"]], |
|
284 |
-## SNR=snprmaRes[["SNR"]], |
|
285 |
-## mixtureParams=snprmaRes[["mixtureParams"]], |
|
286 |
-## cdfName=cdfName, |
|
287 |
-## row.names=NULL, |
|
288 |
-## col.names=sampleNames(callSet), |
|
289 |
-## SNRMin=SNRMin, |
|
290 |
-## recallMin=recallMin, |
|
291 |
-## recallRegMin=recallRegMin, |
|
292 |
-## gender=gender, |
|
293 |
-## verbose=verbose, |
|
294 |
-## returnParams=returnParams, |
|
295 |
-## badSNP=badSNP) |
|
296 |
- rscrlmmGT2(A=calls(cnSet), |
|
297 |
- B=callsProbability(cnSet), |
|
298 |
- SNR=SNR, |
|
299 |
- mixtureParams=mixtureParams, |
|
300 |
- cdfName=cdfName, |
|
301 |
- row.names=NULL, |
|
302 |
- col.names=sampleNames(callSet), |
|
303 |
- SNRMin=SNRMin, |
|
304 |
- recallMin=recallMin, |
|
305 |
- recallRegMin=recallRegMin, |
|
306 |
- gender=gender, |
|
307 |
- verbose=verbose, |
|
308 |
- returnParams=returnParams, |
|
309 |
- badSNP=badSNP) |
|
201 |
+ tmp <- rscrlmmGT2(A=calls(cnSet), |
|
202 |
+ B=snpCallProbability(cnSet), |
|
203 |
+ SNR=SNR, |
|
204 |
+ mixtureParams=mixtureParams, |
|
205 |
+ cdfName=cdfName, |
|
206 |
+ row.names=NULL, |
|
207 |
+ col.names=sampleNames(cnSet), |
|
208 |
+ SNRMin=SNRMin, |
|
209 |
+ recallMin=recallMin, |
|
210 |
+ recallRegMin=recallRegMin, |
|
211 |
+ gender=gender, |
|
212 |
+ verbose=verbose, |
|
213 |
+ returnParams=returnParams, |
|
214 |
+ badSNP=badSNP) |
|
310 | 215 |
if(verbose) message("Genotyping finished. Updating container with genotype calls and confidence scores.") |
311 |
- if(is.lds){ |
|
312 |
- open(tmp[["calls"]]) |
|
313 |
- open(tmp[["confs"]]) |
|
314 |
- ffcolapply(snpCall(callSet)[is.snp, i1:i2] <- tmp[["calls"]][snp.index, i1:i2], X=tmp[["calls"]], |
|
315 |
- BATCHBYTES=bb) |
|
316 |
- ffcolapply(snpCallProbability(callSet)[is.snp, i1:i2] <- tmp[["confs"]][snp.index, i1:i2], X=tmp[["confs"]], |
|
317 |
- BATCHBYTES=bb) |
|
318 |
- close(tmp[["calls"]]) |
|
319 |
- close(tmp[["confs"]]) |
|
320 |
- } else { |
|
321 |
- calls(callSet)[is.snp, ] <- tmp[["calls"]][snp.index, ] |
|
322 |
- snpCallProbability(callSet)[is.snp, ] <- tmp[["confs"]][snp.index, ] |
|
323 |
- } |
|
324 |
- message("Finished updating. Cleaning up.") |
|
325 |
- callSet$gender <- tmp$gender |
|
326 |
- if(is.lds){ |
|
327 |
- delete(snprmaRes[["A"]]) |
|
328 |
- delete(snprmaRes[["B"]]) |
|
329 |
- delete(snprmaRes[["mixtureParams"]]) |
|
330 |
- rm(snprmaRes) |
|
331 |
- } |
|
332 |
- close(callSet) |
|
333 |
- return(callSet) |
|
216 |
+ cnSet$gender <- tmp$gender |
|
217 |
+ return(cnSet) |
|
334 | 218 |
} |
335 | 219 |
|
336 | 220 |
rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
... | ... |
@@ -1240,11 +1124,13 @@ processCEL2 <- function(i, filenames, row.names, A, seed, cdfName, pkgname){ |
1240 | 1124 |
np.index <- match(row.names, rownames(A)) |
1241 | 1125 |
gns <- names(fid) |
1242 | 1126 |
set.seed(seed) |
1127 |
+ open(A) |
|
1243 | 1128 |
##idx2 <- sample(length(fid), 10^5) |
1244 | 1129 |
for (k in i){ |
1245 | 1130 |
y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
1246 | 1131 |
A[np.index, k] <- as.integer(normalize.quantiles.use.target(y, target=reference)) |
1247 | 1132 |
} |
1133 |
+ close(A) |
|
1248 | 1134 |
return(TRUE) |
1249 | 1135 |
} |
1250 | 1136 |
|
... | ... |
@@ -599,29 +599,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
599 | 599 |
} |
600 | 600 |
|
601 | 601 |
|
602 |
-## 2 issues with crlmmGT2: |
|
603 |
-## 1. it overwrites what is in A and B with calls and confidence scores |
|
604 |
-## - should be an option to clone. Then overwrite the clone. |
|
605 |
-## 2. the dimensions are not correct for copy number analysis. |
|
606 |
-## this would be fine if there were a way to virtually rbind to ff objects, but |
|
607 |
-## this is not currently possible. |
|
608 |
- |
|
609 |
-## current approach: |
|
610 |
-## i. Matt and I copy a subset of A (snps only) and a subset of B to new ff objects |
|
611 |
-## -- this is one read and one write for A and for B (2 reads/2 writes) |
|
612 |
-## ii. run crlmmGT2 |
|
613 |
-## - calls / confidence scores written to A/B (2 reads / 2 writes) |
|
614 |
-## iii. copy results from crlmmGT2 into the full-size container for calls/confidence scores |
|
615 |
-## (2 reads / 2 writes) |
|
616 |
-## cost: 6 reads / 6 writes |
|
617 |
-## |
|
618 |
-## preferred alternative: |
|
619 |
-## i. clone A and B. (2 writes ) * initializing matrix for calls and confidence scores should be done by the clone |
|
620 |
-## 2. pass cloned A and B to crlmmGT2. |
|
621 |
-## 3. crlmmGT2 would work even though A and B contains more than just SNPs |
|
622 |
-## (2 reads / 2 writes) |
|
623 |
-## 4. results are directly usable -- no need to do anything |
|
624 |
-## 4 reads / 2 writes OR ( 4 reads/ 4writes if clone requires reading) |
|
602 |
+## a copy of crlmmGT2, except changes noted below by *RS* comments |
|
625 | 603 |
rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
626 | 604 |
col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
627 | 605 |
SNRMin=5, recallMin=10, recallRegMin=1000, |
... | ... |
@@ -676,7 +654,8 @@ rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
676 | 654 |
keepIndex[which(gender[keepIndex]==2)], |
677 | 655 |
keepIndex[which(gender[keepIndex]==1)]) |
678 | 656 |
|
679 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
657 |
+ ## move this statement further down |
|
658 |
+ ##if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
680 | 659 |
|
681 | 660 |
## call C |
682 | 661 |
fIndex <- which(gender==2) |
... | ... |
@@ -685,11 +664,18 @@ rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
685 | 664 |
## different here |
686 | 665 |
## use gtypeCallerR in batches |
687 | 666 |
if(is.null(row.names) & is.null(rownames(A))){ |
688 |
- |
|
667 |
+ ##verify that A has only snps. otherwise, calling function must pass rownames |
|
668 |
+ message("rownames not available. Assuming only SNPs in A") |
|
669 |
+ snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
670 |
+ } else{ |
|
671 |
+ loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
672 |
+ gns <- getVarInEnv("gns") |
|
673 |
+ index <- match(gns, rownames(A)) |
|
674 |
+ snpBatches <- splitIndicesByLength(index, ocProbesets()) |
|
689 | 675 |
} |
690 |
- snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
676 |
+ NR <- length(unlist(snpBatches)) |
|
677 |
+ if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
691 | 678 |
newparamsBatch <- vector("list", length(snpBatches)) |
692 |
- |
|
693 | 679 |
process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
694 | 680 |
YIndex, A, B, mixtureParams, fIndex, mIndex, |
695 | 681 |
params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){ |
... | ... |
@@ -71,8 +71,8 @@ pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m" |
71 | 71 |
if(getRversion() < "2.13.0"){ |
72 | 72 |
rpath <- getRversion() |
73 | 73 |
} else rpath <- "trunk" |
74 |
-##outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="") |
|
75 |
-outdir <- "/home/bst/student/rscharpf/tmp" |
|
74 |
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette2", sep="") |
|
75 |
+##outdir <- "/home/bst/student/rscharpf/tmp" |
|
76 | 76 |
dir.create(outdir, recursive=TRUE, showWarnings=FALSE) |
77 | 77 |
@ |
78 | 78 |
|
... | ... |
@@ -158,14 +158,15 @@ be run interactively. |
158 | 158 |
|
159 | 159 |
|
160 | 160 |
<<LDS_genotype>>= |
161 |
-if(!file.exists(file.path(outdir, "cnSet.rda"))){ |
|
162 |
- gtSet <- checkExists("gtSet", |
|
163 |
- .path=outdir, |
|
164 |
- .FUN=genotype, |
|
165 |
- filenames=celFiles, |
|
166 |
- cdfName=cdfName, |
|
167 |
- batch=batch) |
|
168 |
-} |
|
161 |
+##if(!file.exists(file.path(outdir, "cnSet.rda"))){ |
|
162 |
+system.time(checkExists("gtSet", |
|
163 |
+ .path=outdir, |
|
164 |
+ .FUN=genotype, |
|
165 |
+ filenames=celFiles, |
|
166 |
+ cdfName=cdfName, |
|
167 |
+ batch=batch)) |
|
168 |
+q("no") |
|
169 |
+##} |
|
169 | 170 |
@ |
170 | 171 |
|
171 | 172 |
The value returned by genotype is an instance of the class |