Browse code

add open and close statements inside processCEL2. Fix NR in rscrlmmGT2

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@52855 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 16/02/2011 15:59:37
Showing3 changed files

... ...
@@ -117,27 +117,6 @@ genotype <- function(filenames,
117 117
 	if(missing(cdfName)) stop("must specify cdfName")
118 118
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
119 119
 	if(missing(sns)) sns <- basename(filenames)
120
-##	callSet <- construct(filenames=filenames,
121
-##			     cdfName=cdfName,
122
-##			     copynumber=TRUE,
123
-##			     sns=sns,
124
-##			     verbose=verbose,
125
-##			     batch=batch)
126
-##	FUN <- ifelse(is.lds, "snprma2", "snprma")
127
-##	snprmaFxn <- function(FUN,...){
128
-##		switch(FUN,
129
-##		       snprma=snprma(...),
130
-##		       snprma2=snprma2(...))
131
-##	}
132
-##	snprmaRes <- snprmaFxn(FUN,
133
-##			       filenames=filenames,
134
-##			       mixtureSampleSize=mixtureSampleSize,
135
-##			       fitMixture=TRUE,
136
-##			       eps=eps,
137
-##			       verbose=verbose,
138
-##			       seed=seed,
139
-##			       cdfName=cdfName,
140
-##			       sns=sns)
141 120
 	##---------------------------------------------------------------------------
142 121
 	##
143 122
 	## from snprma2.  Goal is to initialize A and B with appropriate dimension for snps+nps
... ...
@@ -159,14 +138,11 @@ genotype <- function(filenames,
159 138
 	mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
160 139
 	SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
161 140
 	SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
162
-##	A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
163
-##	B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")
164 141
 	featureData <- getFeatureData(cdfName, copynumber=TRUE)
165 142
 	nr <- nrow(featureData); nc <- length(sns)
166 143
 	A <- initializeBigMatrix("crlmmA-", nr, length(filenames), "integer")
167 144
 	B <- initializeBigMatrix("crlmmB-", nr, length(filenames), "integer")
168 145
 	rownames(A) <- rownames(B) <- featureNames(featureData)
169
-
170 146
 	sampleBatches <- splitIndicesByNode(seq(along=filenames))
171 147
 	if(verbose) message("Processing ", length(filenames), " files.")
172 148
 	ocLapply(sampleBatches, rsprocessCEL, filenames=filenames,
... ...
@@ -174,11 +150,13 @@ genotype <- function(filenames,
174 150
 		 mixtureParams=mixtureParams, eps=eps, seed=seed,
175 151
 		 mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
176 152
 		 neededPkgs=c("crlmm", pkgname))
177
-
178 153
 	## Now we initialize a CNSet object, cloning A and B
179
-	message("Cloning A and B intensities")
154
+	message("Cloning A and B objects")
155
+	## The clones will be used to store calls and confidence scores
180 156
 	open(A)
181 157
 	open(B)
158
+	##user do
159
+	## options("ffbatchbytes"=XXX) to make this efficient
182 160
 	call <- clone(A)
183 161
 	callProbability=clone(B)
184 162
 	close(A)
... ...
@@ -193,7 +171,9 @@ genotype <- function(filenames,
193 171
 		     batch=batch)
194 172
 	if(!missing(sns)){
195 173
 		sampleNames(cnSet) <- sns
174
+		open(A)
196 175
 		protocolData <- annotatedDataFrameFrom(A(cnSet), byrow=FALSE)
176
+		close(A)
197 177
 	} else {
198 178
 		sampleNames(cnSet) <- basename(filenames)
199 179
 		protocolData <- getProtocolData.Affy(filenames)
... ...
@@ -205,132 +185,36 @@ genotype <- function(filenames,
205 185
 	phenoData(cnSet) <- new("AnnotatedDataFrame", data=pd)
206 186
 	stopifnot(validObject(cnSet))
207 187
 	snp.I <- isSnp(cnSet)
208
-	##gns <- snprmaRes[["gns"]]
209
-##	snp.I <- isSnp(callSet)
210
-##	is.snp <- which(snp.I)
211
-##	snp.index <- match(featureNames(callSet)[is.snp], gns)
212
-##	stopifnot(identical(featureNames(callSet)[is.snp], gns[snp.index]))
213
-####	is.snp <- isSnp(callSet)
214
-####	snp.index <- which(is.snp)
215
-##	if(is.lds) open(callSet)
216
-##	mixtureParams <- matrix(NA, 4, length(filenames))
217
-##	##message("Saving snprmaRes file")
218
-##	##save(snprmaRes, file=file.path(outdir, "snprmaRes.rda"))
219
-##	if(verbose) message("Finished preprocessing.")
220
-##	gns <- snprmaRes[["gns"]]
221
-##	snp.I <- isSnp(callSet)
222
-##	is.snp <- which(snp.I)
223
-##	snp.index <- match(featureNames(callSet)[is.snp], gns)
224
-##	stopifnot(identical(featureNames(callSet)[is.snp], gns[snp.index]))
225
-##	if(is.lds) open(callSet)
226
-##	mixtureParams <- matrix(NA, 4, length(filenames))
227
-##	if(is.lds){
228
-##		open(snprmaRes[["A"]])
229
-##		open(snprmaRes[["B"]])
230
-##		open(snprmaRes[["SNR"]])
231
-##		open(snprmaRes[["mixtureParams"]])
232
-##		bb <- getOption("ffbatchbytes")
233
-##		ffcolapply(A(callSet)[is.snp, i1:i2] <- snprmaRes[["A"]][snp.index, i1:i2], X=snprmaRes[["A"]],
234
-##			   BATCHBYTES=bb)
235
-##		ffcolapply(B(callSet)[is.snp, i1:i2] <- snprmaRes[["B"]][snp.index, i1:i2], X=snprmaRes[["B"]],
236
-##			   BATCHBYTES=bb)
237
-##	} else{
238
-##		A(callSet)[is.snp, ] <- snprmaRes[["A"]][snp.index, ]
239
-##		B(callSet)[is.snp, ] <- snprmaRes[["B"]][snp.index, ]
240
-##	}
241
-##	pData(callSet)$SKW <- snprmaRes[["SKW"]]
242
-##	pData(callSet)$SNR <- snprmaRes[["SNR"]]
243 188
 	pData(cnSet)$SKW <- SKW
244 189
 	pData(cnSet)$SNR <- SNR
245
-##	mixtureParams <- snprmaRes$mixtureParams
246 190
 	np.index <- which(!snp.I)
247 191
 	if(verbose) message("Normalizing nonpolymorphic markers")
248 192
 	FUN <- ifelse(is.lds, "cnrma2", "cnrma")
249
-	## main purpose is to update 'alleleA'
250
-	cnrmaFxn <- function(FUN,...){
251
-		switch(FUN,
252
-		       cnrma=cnrma(...),
253
-		       cnrma2=cnrma2(...))
254
-	}
255
-	## consider passing only A for NPs.
256 193
 	if(verbose) message("Quantile normalizing nonpolymorphic markers")
257
-##	AA <- cnrmaFxn(FUN, A=A(cnSet),
258
-##		       filenames=filenames,
259
-##		       row.names=featureNames(cnSet)[np.index],
260
-##		       cdfName=cdfName,
261
-##		       sns=sns,
262
-##		       seed=seed,
263
-##		       verbose=verbose)
264 194
 	cnrma2(A=A(cnSet),
265 195
 	       filenames=filenames,
266 196
 	       row.names=featureNames(cnSet)[np.index],
267 197
 	       cdfName=cdfName,
268
-	       sns=sns,
198
+	       sns=sampleNames(cnSet),
269 199
 	       seed=seed,
270 200
 	       verbose=verbose)
271
-	##if(!is.lds) A(callSet) <- AA
272
-	##rm(AA)
273
-##	FUN <- ifelse(is.lds, "crlmmGT2", "crlmmGT")
274
-##	## genotyping
275
-##	crlmmGTfxn <- function(FUN,...){
276
-##		switch(FUN,
277
-##		       crlmmGT2=crlmmGT2(...),
278
-##		       crlmmGT=crlmmGT(...))
279
-##	}
280
-##	if(verbose) message("Running crlmmGT2")
281
-##	tmp <- crlmmGTfxn(FUN,
282
-##			  A=snprmaRes[["A"]],
283
-##			  B=snprmaRes[["B"]],
284
-##			  SNR=snprmaRes[["SNR"]],
285
-##			  mixtureParams=snprmaRes[["mixtureParams"]],
286
-##			  cdfName=cdfName,
287
-##			  row.names=NULL,
288
-##			  col.names=sampleNames(callSet),
289
-##			  SNRMin=SNRMin,
290
-##			  recallMin=recallMin,
291
-##			  recallRegMin=recallRegMin,
292
-##			  gender=gender,
293
-##			  verbose=verbose,
294
-##			  returnParams=returnParams,
295
-##			  badSNP=badSNP)
296
-	rscrlmmGT2(A=calls(cnSet),
297
-		   B=callsProbability(cnSet),
298
-		   SNR=SNR,
299
-		   mixtureParams=mixtureParams,
300
-		   cdfName=cdfName,
301
-		   row.names=NULL,
302
-		   col.names=sampleNames(callSet),
303
-		   SNRMin=SNRMin,
304
-		   recallMin=recallMin,
305
-		   recallRegMin=recallRegMin,
306
-		   gender=gender,
307
-		   verbose=verbose,
308
-		   returnParams=returnParams,
309
-		   badSNP=badSNP)
201
+	tmp <- rscrlmmGT2(A=calls(cnSet),
202
+			  B=snpCallProbability(cnSet),
203
+			  SNR=SNR,
204
+			  mixtureParams=mixtureParams,
205
+			  cdfName=cdfName,
206
+			  row.names=NULL,
207
+			  col.names=sampleNames(cnSet),
208
+			  SNRMin=SNRMin,
209
+			  recallMin=recallMin,
210
+			  recallRegMin=recallRegMin,
211
+			  gender=gender,
212
+			  verbose=verbose,
213
+			  returnParams=returnParams,
214
+			  badSNP=badSNP)
310 215
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
311
-	if(is.lds){
312
-		open(tmp[["calls"]])
313
-		open(tmp[["confs"]])
314
-		ffcolapply(snpCall(callSet)[is.snp, i1:i2] <- tmp[["calls"]][snp.index, i1:i2], X=tmp[["calls"]],
315
-			   BATCHBYTES=bb)
316
-		ffcolapply(snpCallProbability(callSet)[is.snp, i1:i2] <- tmp[["confs"]][snp.index, i1:i2], X=tmp[["confs"]],
317
-			   BATCHBYTES=bb)
318
-		close(tmp[["calls"]])
319
-		close(tmp[["confs"]])
320
-	} else {
321
-		calls(callSet)[is.snp, ] <- tmp[["calls"]][snp.index, ]
322
-		snpCallProbability(callSet)[is.snp, ] <- tmp[["confs"]][snp.index, ]
323
-	}
324
-	message("Finished updating.  Cleaning up.")
325
-	callSet$gender <- tmp$gender
326
-	if(is.lds){
327
-		delete(snprmaRes[["A"]])
328
-		delete(snprmaRes[["B"]])
329
-		delete(snprmaRes[["mixtureParams"]])
330
-		rm(snprmaRes)
331
-	}
332
-	close(callSet)
333
-	return(callSet)
216
+	cnSet$gender <- tmp$gender
217
+	return(cnSet)
334 218
 }
335 219
 
336 220
 rsprocessCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
... ...
@@ -1240,11 +1124,13 @@ processCEL2 <- function(i, filenames, row.names, A, seed, cdfName, pkgname){
1240 1124
 	np.index <- match(row.names, rownames(A))
1241 1125
 	gns <- names(fid)
1242 1126
 	set.seed(seed)
1127
+	open(A)
1243 1128
 	##idx2 <- sample(length(fid), 10^5)
1244 1129
 	for (k in i){
1245 1130
 		y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
1246 1131
 		A[np.index, k] <- as.integer(normalize.quantiles.use.target(y, target=reference))
1247 1132
 	}
1133
+	close(A)
1248 1134
 	return(TRUE)
1249 1135
 }
1250 1136
 
... ...
@@ -599,29 +599,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
599 599
 }
600 600
 
601 601
 
602
-## 2 issues with crlmmGT2:
603
-## 1.  it overwrites what is in A and B with calls and confidence scores
604
-##     - should be an option to clone.  Then overwrite the clone.
605
-## 2.  the dimensions are not correct for copy number analysis.
606
-##     this would be fine if there were a way to virtually rbind to ff objects, but
607
-##     this is not currently possible.
608
-
609
-## current approach:
610
-## i. Matt and I copy a subset of A (snps only) and a subset of B to new ff objects
611
-##   -- this is one read and one write for A and for B (2 reads/2 writes)
612
-## ii.   run crlmmGT2
613
-##        - calls / confidence scores written to A/B (2 reads / 2 writes)
614
-## iii. copy results from crlmmGT2 into the full-size container for calls/confidence scores
615
-##           (2 reads / 2 writes)
616
-## cost:  6 reads / 6 writes
617
-##
618
-## preferred alternative:
619
-## i.  clone A and B.  (2 writes )  * initializing matrix for calls and confidence scores should be done by the clone
620
-## 2.  pass cloned A and B to crlmmGT2.
621
-## 3.  crlmmGT2 would work even though A and B contains more than just SNPs
622
-##          (2 reads / 2 writes)
623
-## 4.  results are directly usable -- no need to do anything
624
-##     4 reads / 2 writes  OR  ( 4 reads/ 4writes if clone requires reading)
602
+## a copy of crlmmGT2, except changes noted below by *RS* comments
625 603
 rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
626 604
 		       col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
627 605
 		       SNRMin=5, recallMin=10, recallRegMin=1000,
... ...
@@ -676,7 +654,8 @@ rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
676 654
                    keepIndex[which(gender[keepIndex]==2)],
677 655
                    keepIndex[which(gender[keepIndex]==1)])
678 656
 
679
-  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
657
+  ## move this statement further down
658
+  ##if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
680 659
 
681 660
   ## call C
682 661
   fIndex <- which(gender==2)
... ...
@@ -685,11 +664,18 @@ rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
685 664
   ## different here
686 665
   ## use gtypeCallerR in batches
687 666
   if(is.null(row.names) & is.null(rownames(A))){
688
-
667
+	  ##verify that A has only snps.  otherwise, calling function must pass rownames
668
+	  message("rownames not available.  Assuming only SNPs in A")
669
+	  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
670
+  } else{
671
+	  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
672
+	  gns <- getVarInEnv("gns")
673
+	  index <- match(gns, rownames(A))
674
+	  snpBatches <- splitIndicesByLength(index, ocProbesets())
689 675
   }
690
-  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
676
+  NR <- length(unlist(snpBatches))
677
+  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
691 678
   newparamsBatch <- vector("list", length(snpBatches))
692
-
693 679
   process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
694 680
                        YIndex, A, B, mixtureParams, fIndex, mIndex,
695 681
                        params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
... ...
@@ -71,8 +71,8 @@ pathToCels <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m"
71 71
 if(getRversion() < "2.13.0"){
72 72
 	rpath <- getRversion()
73 73
 } else rpath <- "trunk"
74
-##outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="")
75
-outdir <- "/home/bst/student/rscharpf/tmp"
74
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette2", sep="")
75
+##outdir <- "/home/bst/student/rscharpf/tmp"
76 76
 dir.create(outdir, recursive=TRUE, showWarnings=FALSE)
77 77
 @
78 78
 
... ...
@@ -158,14 +158,15 @@ be run interactively.
158 158
 
159 159
 
160 160
 <<LDS_genotype>>=
161
-if(!file.exists(file.path(outdir, "cnSet.rda"))){
162
-	gtSet <- checkExists("gtSet",
163
-			     .path=outdir,
164
-			     .FUN=genotype,
165
-			     filenames=celFiles,
166
-			     cdfName=cdfName,
167
-			     batch=batch)
168
-}
161
+##if(!file.exists(file.path(outdir, "cnSet.rda"))){
162
+system.time(checkExists("gtSet",
163
+			.path=outdir,
164
+			.FUN=genotype,
165
+			filenames=celFiles,
166
+			cdfName=cdfName,
167
+			batch=batch))
168
+q("no")
169
+##}
169 170
 @
170 171
 
171 172
 The value returned by genotype is an instance of the class