Browse code

updated genotype and crlmmIlluminaRS functions. suppressing integer overflow warnings that do not appear to be relevant

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45348 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 19/03/2010 16:00:25
Showing 7 changed files

... ...
@@ -477,3 +477,10 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error.
477 477
 
478 478
 ** a few updates to initializeBigMatrix
479 479
 ** show, [ defined for CNSetLM
480
+
481
+2010-03-18 R.Scharpf committed version 1.5.38
482
+
483
+** import snpCall, snpCallProbability, snpCall<-, snpCallProbability<-
484
+   from Biobase
485
+** updates to genotype and crlmmIlluminaRS
486
+** class union of ff_matrix, matrix, and ffdf
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.5.38
4
+Version: 1.5.39
5 5
 Date: 2010-02-05
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -14,7 +14,9 @@ importMethodsFrom(Biobase, annotation, "annotation<-",
14 14
                   fData, featureData, "featureData<-", featureNames,
15 15
                   fvarMetadata, fvarLabels, pData, phenoData,
16 16
                   "phenoData<-", protocolData, "protocolData<-",
17
-                  pubMedIds, rowMedians, sampleNames, storageMode,
17
+                  pubMedIds, rowMedians, sampleNames, snpCall,
18
+                  snpCallProbability,
19
+		  "snpCall<-", "snpCallProbability<-", storageMode,
18 20
                   "storageMode<-", updateObject, varLabels)
19 21
 
20 22
 importFrom(Biobase, assayDataElement, assayDataElementNames,
... ...
@@ -2,6 +2,7 @@ setOldClass("ffdf")
2 2
 setOldClass("ff_matrix")
3 3
 ##setClassUnion("matrix_or_ff", c("matrix", "ff_matrix"))
4 4
 setClassUnion("list_or_ffdf", c("list", "ffdf"))
5
+setClassUnion("ff_or_matrix", c("ff_matrix", "matrix", "ffdf"))
5 6
 setClass("CNSetLM", contains="CNSet", representation(lM="list_or_ffdf"))
6 7
 setMethod("initialize", "CNSetLM", function(.Object, CA=new("matrix"), CB=new("matrix"), lM=new("list"), ...){
7 8
 	.Object <- callNextMethod(.Object, CA=CA, CB=CB, lM=lM, ...)
... ...
@@ -92,10 +92,14 @@ setMethod("open", "AlleleSet", function(con, ...){
92 92
 ##setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value))
93 93
 ##setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value))
94 94
 ##setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability"))
95
-genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
96
-		     fitMixture=TRUE,
97
-		     eps=0.1, verbose=TRUE,
98
-		     seed=1, sns, copynumber=FALSE,
95
+genotype <- function(filenames,
96
+		     cdfName,
97
+		     mixtureSampleSize=10^5,
98
+		     eps=0.1,
99
+		     verbose=TRUE,
100
+		     seed=1,
101
+		     sns,
102
+		     copynumber=FALSE,
99 103
 		     probs=rep(1/3, 3),
100 104
 		     DF=6,
101 105
 		     SNRMin=5,
... ...
@@ -117,10 +121,12 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
117 121
 	mixtureParams <- matrix(NA, 4, length(filenames))
118 122
 	snp.index <- which(isSnp(callSet)==1)
119 123
 	batches <- splitIndicesByLength(1:ncol(callSet), ocSamples())
124
+	iter <- 1
120 125
 	for(j in batches){
126
+		if(verbose) message("Batch ", iter, " of ", length(batches))
121 127
 		snprmaRes <- snprma(filenames=filenames[j],
122 128
 				    mixtureSampleSize=mixtureSampleSize,
123
-				    fitMixture=fitMixture,
129
+				    fitMixture=TRUE,
124 130
 				    eps=eps,
125 131
 				    verbose=verbose,
126 132
 				    seed=seed,
... ...
@@ -129,27 +135,25 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
129 135
 		stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
130 136
 		pData(callSet)$SKW[j] <- snprmaRes$SKW
131 137
 		pData(callSet)$SNR[j] <- snprmaRes$SNR
132
-		A(callSet)[snp.index, j] <- snprmaRes[["A"]]
133
-		B(callSet)[snp.index, j] <- snprmaRes[["B"]]
138
+		suppressWarnings(A(callSet)[snp.index, j] <- snprmaRes[["A"]])
139
+		suppressWarnings(B(callSet)[snp.index, j] <- snprmaRes[["B"]])
134 140
 		mixtureParams[, j] <- snprmaRes$mixtureParams
135 141
 		rm(snprmaRes); gc()
136
-	}
137
-	## remove snprmaRes and garbage collect before quantile-normalizing NP probes 
138
-	if(copynumber){
139
-		np.index <- which(isSnp(callSet) == 0)
140
-		cnrmaRes <- cnrma(filenames=filenames[j],
141
-				  cdfName=cdfName,
142
-				  row.names=featureNames(callSet)[np.index],				  
143
-				  sns=sns[j],
144
-				  seed=seed,
145
-				  verbose=verbose)
146
-		stopifnot(identical(featureNames(callSet)[np.index], rownames(cnrmaRes)))
147
-		A(callSet)[np.index, j] <- cnrmaRes
148
-		rm(cnrmaRes); gc()
149
-	}
150
-	for(j in batches){
151
-		tmp <- crlmmGT(A=A(callSet)[snp.index, j],
152
-			       B=B(callSet)[snp.index, j],
142
+		if(copynumber){
143
+			np.index <- which(isSnp(callSet) == 0)
144
+			cnrmaRes <- cnrma(filenames=filenames[j],
145
+					  cdfName=cdfName,
146
+					  row.names=featureNames(callSet)[np.index],				  
147
+					  sns=sns[j],
148
+					  seed=seed,
149
+					  verbose=verbose)
150
+			stopifnot(identical(featureNames(callSet)[np.index], rownames(cnrmaRes)))
151
+			A(callSet)[np.index, j] <- cnrmaRes
152
+			rm(cnrmaRes); gc()
153
+		}
154
+		## as.matrix needed when ffdf is used
155
+		tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index, j]),
156
+			       B=as.matrix(B(callSet)[snp.index, j]),
153 157
 			       SNR=callSet$SNR[j],
154 158
 			       mixtureParams=mixtureParams[, j],
155 159
 			       cdfName=annotation(callSet),
... ...
@@ -164,11 +168,11 @@ genotype <- function(filenames, cdfName, mixtureSampleSize=10^5,
164 168
 			       verbose=verbose,
165 169
 			       returnParams=returnParams,
166 170
 			       badSNP=badSNP)
167
-		snpCall(callSet)[snp.index, j] <- tmp[["calls"]]
168
-		snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]
171
+		suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
172
+		suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
169 173
 		callSet$gender[j] <- tmp$gender
170
-		rm(tmp); gc()
171
-	}	
174
+		iter <- iter+1
175
+	}
172 176
 	return(callSet)
173 177
 }
174 178
 
... ...
@@ -267,17 +271,11 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
267 271
 	if(missing(cdfName)) stop("must specify cdfName")
268 272
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
269 273
 	if(missing(sns)) sns <- basename(arrayNames)
270
-	RG <- constructRG(filenames=arrayNames,
271
-			  cdfName=cdfName,
272
-			  sns=sns,
273
-			  verbose=verbose,
274
-			  fileExt=fileExt,
275
-			  sep=sep,
276
-			  sampleSheet=sampleSheet,
277
-			  arrayInfoColNames=arrayInfoColNames)
278 274
 	batches <- splitIndicesByLength(seq(along=arrayNames), ocSamples())
275
+	k <- 1
279 276
 	for(j in batches){
280
-		tmp <- readIdatFiles(sampleSheet=sampleSheet[j, ],
277
+		if(verbose) message("Batch ", k, " of ", length(batches))
278
+		RG <- readIdatFiles(sampleSheet=sampleSheet[j, ],
281 279
 				     arrayNames=arrayNames[j],
282 280
 				     ids=ids,
283 281
 				     path=path,
... ...
@@ -286,55 +284,9 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
286 284
 				     sep=sep,
287 285
 				     fileExt=fileExt,
288 286
 				     saveDate=TRUE)
289
-		if(nrow(tmp) != nrow(RG)){
290
-			##RS: I don't understand why the IDATS for the
291
-			##same platform do not have necessarily have
292
-			##the same length
293
-			tmp <- tmp[featureNames(tmp) %in% featureNames(RG), ]
294
-		}
295
-		index <- match(featureNames(tmp), featureNames(RG))
296
-		assayData(RG)$R[index,j] <- assayData(tmp)$R
297
-		assayData(RG)$G[index, j] <- assayData(tmp)$G
298
-		assayData(RG)$zero[index, j] <- assayData(tmp)$zero
299
-		pData(RG)[j, ] <- pData(tmp)
300
-		annotation(RG) <- annotation(tmp)
301
-		pData(protocolData(RG))[j, ] <- pData(protocolData(tmp))
302
-		rm(tmp); gc()
303
-	}
304
-	message("Saving RG")
305
-	save(RG, file=file.path(ldPath(), "RG.rda"))
306
-	k <- 1
307
-	for(j in batches){
308
-		##MR: Might want to to nonpolymorphic markers in a
309
-		##separate step and make that optional
310
-		tmp <- RGtoXY(RG[, j], chipType=cdfName)
311
-		if(k == 1){
312
-			##initialize XYSet
313
-			nc <- ncol(tmp)
314
-			nr <- nrow(tmp)
315
-			XY <- new("NChannelSet",
316
-				  X=initializeBigMatrix(name="X", nr, nc),
317
-				  Y=initializeBigMatrix(name="Y", nr, nc),
318
-				  zero=initializeBigMatrix(name="zero", nr, nc),
319
-				  experimentData=experimentData(RG),
320
-				  phenoData=phenoData(RG),
321
-				  protocolData=protocolData(RG),
322
-				  annotation=annotation(RG))
323
-		}
324
-		storageMode(XY) <- "environment"
325
-		assayData(XY)$X[, j] <- assayData(tmp)$X
326
-		assayData(XY)$Y[, j] <- assayData(tmp)$Y
327
-		assayData(XY)$zero[, j] <- assayData(tmp)$zero
328
-		k <- k+1
329
-		rm(tmp); gc()
330
-	}
331
-	annotation(XY) <- cdfName
332
-	message("Saving XY")
333
-	save(XY, file=file.path(ldPath(), "XY.rda"))
334
-	mixtureParams <- matrix(NA, 4, length(filenames))
335
-	k <- 1
336
-	for(j in batches){
337
-		res <- preprocessInfinium2(XY[, j],
287
+		XY <- RGtoXY(RG, chipType=cdfName)
288
+		rm(RG); gc()
289
+		res <- preprocessInfinium2(XY,
338 290
 					   mixtureSampleSize=mixtureSampleSize,
339 291
 					   fitMixture=TRUE,
340 292
 					   verbose=verbose,
... ...
@@ -344,40 +296,55 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
344 296
 					   sns=sns[j],
345 297
 					   stripNorm=stripNorm,
346 298
 					   useTarget=useTarget)
347
-		if(k==1){
348
-			## MR: number of rows should be number of SNPs + number of nonpolymorphic markers.
349
-			##  Here, I'm just using the # of rows returned from the above function
299
+		## MR: number of rows should be number of SNPs + number of nonpolymorphic markers.
300
+		##  Here, I'm just using the # of rows returned from the above function
301
+		if(k == 1){
302
+			if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
350 303
 			callSet <- new("SnpSuperSet",
351
-				       alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=ncol(XY)),
352
-				       alleleB=initializeBigMatrix(name="B", nr=nrow(res[[2]]), nc=ncol(XY)),
353
-				       call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=ncol(XY)),
354
-				       callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=ncol(XY)),
355
-				       protocolData=protocolData(XY),
304
+				       alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
305
+				       alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
306
+				       call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
307
+				       callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
356 308
 				       experimentData=experimentData(XY),
357
-				       phenoData=phenoData(XY),
358 309
 				       annotation=annotation(XY))
359
-			featureNames(callSet) <- res[["gns"]]
360 310
 			sampleNames(callSet) <- sns
311
+			phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet,
312
+							   arrayNames=sns,
313
+							   arrayInfoColNames=arrayInfoColNames)
314
+			pD <- data.frame(matrix(NA, length(sns), 1),
315
+					 row.names=sns)
316
+			colnames(pD) <- "ScanDate"
317
+			protocolData(callSet) <- new("AnnotatedDataFrame", data=pD)
318
+			pData(protocolData(callSet))[j, ] <- pData(protocolData(XY))
319
+			featureNames(callSet) <- res[["gns"]]
320
+			pData(callSet)$SKW <- rep(NA, length(sns))
321
+			pData(callSet)$SNR <- rep(NA, length(sns))
322
+			pData(callSet)$gender <- rep(NA, length(sns))
323
+			##sampleNames(callSet) <- sns
324
+		}
325
+		if(k > 1 & nrow(res[[1]]) != nrow(callSet)){
326
+			##RS: I don't understand why the IDATS for the
327
+			##same platform potentially have different lengths
328
+			res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ]
329
+			res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ]
361 330
 		}
362
-		## MR: we need to define a snp.index
363
-		A(callSet)[, j] <- res[["A"]]
364
-		B(callSet)[, j] <- res[["B"]]
331
+		## MR: we need to define a snp.index vs np.index
332
+		snp.index <- match(res$gns, featureNames(callSet))		
333
+		suppressWarnings(A(callSet)[snp.index, j] <- res[["A"]])
334
+		suppressWarnings(B(callSet)[snp.index, j] <- res[["B"]])
365 335
 		pData(callSet)$SKW[j] <- res$SKW
366 336
 		pData(callSet)$SNR[j] <- res$SNR
367
-		mixtureParams[, j] <- res$mixtureParams
337
+		##mixtureParams[, j] <- res$mixtureParams
338
+		mixtureParams <- res$mixtureParams
368 339
 		rm(res); gc()
369
-	}
370
-	message("Saving callSe")
371
-	save(callSet, file=file.path(ldPath(), "callSet.rda"))	
372
-	for(j in batches){
373 340
 		##MR:  edit snp.index
374
-		snp.index <- 1:nrow(callSet)
375
-		tmp <- crlmmGT(A=A(callSet)[snp.index, j],
376
-			       B=B(callSet)[snp.index, j],
341
+		##snp.index <- 1:nrow(callSet)
342
+		tmp <- crlmmGT(A=as.matrix(A(callSet)[, j]),
343
+			       B=as.matrix(B(callSet)[, j]),
377 344
 			       SNR=callSet$SNR[j],
378
-			       mixtureParams=mixtureParams[, j],
345
+			       mixtureParams=mixtureParams,
379 346
 			       cdfName=annotation(callSet),
380
-			       row.names=featureNames(callSet)[snp.index],
347
+			       row.names=featureNames(callSet),
381 348
 			       col.names=sampleNames(callSet)[j],
382 349
 			       probs=probs,
383 350
 			       DF=DF,
... ...
@@ -388,11 +355,12 @@ crlmmIlluminaRS <- function(sampleSheet=NULL,
388 355
 			       verbose=verbose,
389 356
 			       returnParams=returnParams,
390 357
 			       badSNP=badSNP)
391
-		snpCall(callSet)[snp.index, j] <- tmp[["calls"]]
392
-		## many zeros here (?)
393
-		snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]
358
+		suppressWarnings(snpCall(callSet)[, j] <- tmp[["calls"]])
359
+		## MR: many zeros in the conf. scores (?)
360
+		suppressWarnings(snpCallProbability(callSet)[, j] <- tmp[["confs"]])
394 361
 		callSet$gender[j] <- tmp$gender
395 362
 		rm(tmp); gc()
363
+		k <- k+1
396 364
 	}
397 365
 	return(callSet)
398 366
 }
... ...
@@ -1,3 +1,15 @@
1
+setReplaceMethod("snpCall", c("SnpSuperSet", "ff_or_matrix"),
2
+                 function(object, ..., value)
3
+{
4
+    assayDataElementReplace(object, "call", value)
5
+})
6
+setReplaceMethod("snpCallProbability", c("SnpSuperSet", "ff_or_matrix"),
7
+                 function(object, ..., value)
8
+{
9
+    assayDataElementReplace(object, "callProbability", value)
10
+})
11
+
12
+
1 13
 ## Method("initialize", "AlleleSet",
2 14
 ##        function(.Object,
3 15
 ##                 assayData = assayDataNew(alleleA=alleleA,
... ...
@@ -236,6 +236,7 @@ initializeBigMatrix <- function(name, nr, nc, vmode="integer"){
236 236
 			results <- createFF(name=name,
237 237
 					    dim=c(nr, nc),
238 238
 					    vmode=vmode)
239
+			results[,] <- NA
239 240
 		}
240 241
 	}  else results <- matrix(NA, nr, nc)
241 242
 	return(results)