Browse code

write to calls and callProbability slots in processIDAT. Remove corresponding writes from genotypeInf.

Overall, the changes should save one read step, reducing I/O for big
datasets.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54166 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/03/2011 02:40:18
Showing1 changed files

... ...
@@ -1160,6 +1160,8 @@ preprocessInf <- function(cnSet,
1160 1160
 		 useTarget=useTarget,
1161 1161
 		 A=A(cnSet),
1162 1162
 		 B=B(cnSet),
1163
+		 GT=calls(cnSet),
1164
+		 GTP=snpCallProbability(cnSet),
1163 1165
 		 SKW=cnSet$SKW,
1164 1166
 		 SNR=cnSet$SNR,
1165 1167
 		 mixtureParams=mixtureParams,
... ...
@@ -1178,22 +1180,22 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
1178 1180
 			badSNP=0.7,
1179 1181
 			gender=NULL,
1180 1182
 			DF=6){
1181
-	is.snp = isSnp(cnSet)
1182
-	snp.index = which(is.snp)
1183
-	narrays = ncol(cnSet)
1184
-	open(A(cnSet))
1185
-	open(B(cnSet))
1186
-	open(snpCall(cnSet))
1187
-	open(snpCallProbability(cnSet))
1188
-	## crlmmGT2 overwrites the normalized intensities with calls and confidenceScores
1189
-	message("Writing to call and callProbability slots")
1190
-	for(j in 1:ncol(cnSet)){
1191
-		snpCall(cnSet)[snp.index, j] <- as.integer(A(cnSet)[snp.index, j])
1192
-		snpCallProbability(cnSet)[snp.index, j] <- as.integer(B(cnSet)[snp.index, j])
1193
-	}
1194
-	message("Writing complete.  Begin genotyping...")
1195
-	close(A(cnSet))
1196
-	close(B(cnSet))
1183
+##	is.snp = isSnp(cnSet)
1184
+##	snp.index = which(is.snp)
1185
+##	narrays = ncol(cnSet)
1186
+##	open(A(cnSet))
1187
+##	open(B(cnSet))
1188
+##	open(snpCall(cnSet))
1189
+##	open(snpCallProbability(cnSet))
1190
+##	## crlmmGT2 overwrites the normalized intensities with calls and confidenceScores
1191
+##	message("Writing to call and callProbability slots")
1192
+##	for(j in 1:ncol(cnSet)){
1193
+##		snpCall(cnSet)[snp.index, j] <- as.integer(A(cnSet)[snp.index, j])
1194
+##		snpCallProbability(cnSet)[snp.index, j] <- as.integer(B(cnSet)[snp.index, j])
1195
+##	}
1196
+##	message("Writing complete.  Begin genotyping...")
1197
+##	close(A(cnSet))
1198
+##	close(B(cnSet))
1197 1199
 	tmp <- rscrlmmGT2(A=snpCall(cnSet),
1198 1200
 			  B=snpCallProbability(cnSet),
1199 1201
 			  SNR=cnSet$SNR,
... ...
@@ -1316,81 +1318,64 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
1316 1318
 			sns,
1317 1319
 			stripNorm=TRUE,
1318 1320
 			useTarget=TRUE,
1319
-			A, B, SKW, SNR, mixtureParams, is.snp) { #, outdir=".") {
1321
+			A, B,
1322
+			GT,
1323
+			GTP,
1324
+			SKW, SNR, mixtureParams, is.snp) { #, outdir=".") {
1320 1325
 	message("Processing sample stratum ", stratum, " of ", length(sampleBatches))
1321 1326
 	sel <- sampleBatches[[stratum]]
1322 1327
         if(length(path)>= length(sel)) path = path[sel]
1323
-	##message("RS:... processIDAT:  calling readIdatFiles")
1324 1328
         RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel],
1325 1329
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
1326 1330
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose)
1327 1331
         XY = RGtoXY(RG, chipType=cdfName)
1328
-#        open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
1329
-#        delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero);
1330 1332
         rm(RG)
1331 1333
         gc()
1332 1334
         if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
1333
-
1334
-	##message("RS:... processIDAT: calling preprocessInfinium2")
1335 1335
         res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1336 1336
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir)
1337
-#							    save.it=save.it, snpFile=snpFile, cnFile=cnFile)
1338
-#        open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
1339
-#        delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero);
1340 1337
         rm(XY)
1341 1338
         gc()
1342
-	    if(verbose) message("Finished preprocessing.")
1339
+	if(verbose) message("Finished preprocessing.")
1343 1340
         snp.index = which(is.snp)
1344 1341
 	np.index = which(!is.snp)
1345
-
1346
-#        open(res[["A"]])
1347
-#        open(res[["B"]])
1348
-#        open(res[["SKW"]])
1349
-#        open(res[["SNR"]])
1350
-#        open(res[["mixtureParams"]])
1351
-# remove this line: bb = ocProbesets()*length(sns)*8
1352
-# Add these
1353
-	##segfault came after 'Finished preprocessing', but the processIDAT loop was not yet complete
1354 1342
 	open(A); open(B);
1343
+	open(GT); open(GTP)
1355 1344
 	Amatrix <- res[["A"]]
1356 1345
 	Bmatrix <- res[["B"]]
1346
+
1347
+	## Amatrix and Bmatrix are ordinary matrices--not ff objects.
1348
+	## By writing columns of a ordinary matrix to GT and GTP, we
1349
+	## save one read step later on.  GT and GTP will be updated to
1350
+	## calls and call probabilities by the crlmmGT2 function. The A
1351
+	## and B slots will retain the normalized intensities for the
1352
+	## A and B alleles
1357 1353
 	for(j in seq_along(sel)){
1358 1354
 		jj <- sel[j]
1359 1355
 		A[snp.index, jj] <- Amatrix[, j]
1356
+		GT[snp.index, jj] <- Amatrix[, j]
1360 1357
 		B[snp.index, jj] <- Bmatrix[, j]
1358
+		GTP[snp.index, jj] <- Bmatrix[, j]
1361 1359
 	}
1362
-		##ffcolapply(A[snp.index,sel][,i1:i2] <- res[["A"]][,i1:i2], X=res[["A"]])
1363
-		##ffcolapply(B[snp.index,sel][,i1:i2] <- res[["B"]][,i1:i2], X=res[["B"]])
1364
-##	}
1365
-#   ffrowapply(A[snp.index,][i1:i2, sel] <- res[["A"]][i1:i2, ], X=res[["A"]], BATCHBYTES=bb)
1366
-#	ffrowapply(B[snp.index,][i1:i2, sel] <- res[["B"]][i1:i2, ], X=res[["B"]], BATCHBYTES=bb)
1367
-	    if(length(np.index)>0) {
1368
-#			for (j in 1:length(sel)) {
1369
-		    cnAmatrix <- res[["cnAB"]]$A
1370
-		    cnBmatrix <- res[["cnAB"]]$B
1371
-		    for(j in seq_along(sel)){
1372
-			    jj <- sel[j]
1373
-			    A[np.index, jj] <- cnAmatrix[, j]
1374
-			    B[np.index, jj] <- cnBmatrix[, j]
1375
-		    }
1376
-
1377
-##			ffcolapply(A[np.index,sel][,i1:i2] <- res[["cnAB"]]$A[,i1:i2], X=res[["cnAB"]]$A)
1378
-##			ffcolapply(B[np.index,sel][,i1:i2] <- res[["cnAB"]]$B[,i1:i2], X=res[["cnAB"]]$B)
1379
-#            A[np.index, sel[j]] = res[["cnAB"]]$A[,j]
1380
-#            B[np.index, sel[j]] = res[["cnAB"]]$B[,j]
1381
-#          }
1360
+	if(length(np.index)>0) {
1361
+		cnAmatrix <- res[["cnAB"]]$A
1362
+		cnBmatrix <- res[["cnAB"]]$B
1363
+		for(j in seq_along(sel)){
1364
+			jj <- sel[j]
1365
+			A[np.index, jj] <- cnAmatrix[, j]
1366
+			GT[np.index, jj] <- cnAmatrix[, j]
1367
+			B[np.index, jj] <- cnBmatrix[, j]
1368
+			GTP[np.index, jj] <- cnBmatrix[, j]
1369
+		}
1382 1370
         }
1383
-#        delete(res[["A"]]); delete(res[["B"]])
1384 1371
 	open(SKW); open(SNR); open(mixtureParams)
1385 1372
 	SKW[sel] = res[["SKW"]][]
1386 1373
 	SNR[sel] = res[["SNR"]][]
1387 1374
 	mixtureParams[, sel] = res[["mixtureParams"]][]
1388
-        close(A)
1389
-        close(B)
1390
-        close(SNR)
1391
-        close(SKW)
1375
+        close(A); close(B)
1376
+	close(GT); close(GTP)
1377
+        close(SNR); close(SKW)
1392 1378
         close(mixtureParams)
1393
-#        delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
1394 1379
         rm(res)
1395 1380
 	gc()
1396 1381
         TRUE