Browse code

updated copynumber vignette in inst/scripts. Added example datasets

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@47121 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 25/05/2010 13:07:43
Showing 13 changed files

... ...
@@ -519,3 +519,6 @@ function (which expects ff objects and supports parallel processing)
519 519
 2010-04-24 B Carvalho committed version 1.7.1
520 520
 ** fixed bug in gender prediction that cause a spike in memory usage
521 521
 
522
+2010-05-25 R. Scharpf committed version 1.7.2
523
+** added example dataset
524
+** fixed vignette: inst/scripts/copynumber.Rnw
... ...
@@ -1,8 +1,8 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.7.1
5
-Date: 2010-04-16
4
+Version: 1.7.2
5
+Date: 2010-05-25
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms
... ...
@@ -23,7 +23,8 @@ Imports: affyio (>= 1.15.2),
23 23
          utils
24 24
 Suggests: hapmapsnp6,
25 25
           genomewidesnp6Crlmm (>= 1.0.2),
26
-          snpMatrix
26
+          snpMatrix,
27
+	  ellipse
27 28
 Collate: AllGenerics.R
28 29
 	 AllClasses.R
29 30
 	 methods-CNSet.R
... ...
@@ -33,6 +34,7 @@ Collate: AllGenerics.R
33 34
          crlmm-functions.R
34 35
          crlmm-illumina.R
35 36
 	 snprma-functions.R
37
+	 plot-methods.R
36 38
          utils.R
37 39
          zzz.R
38 40
 LazyLoad: yes
... ...
@@ -70,6 +70,7 @@ export(crlmm,
70 70
        batch,
71 71
        crlmmCopynumber2)
72 72
 ##export(initializeParamObject, biasAdjNP2)
73
+##export(construct)
73 74
 
74 75
 
75 76
 
... ...
@@ -8,3 +8,16 @@ setMethod("initialize", "CNSetLM", function(.Object, lM=new("list"), ...){
8 8
 	.Object@lM <- lM
9 9
 	.Object <- callNextMethod(.Object, ...)
10 10
 })
11
+
12
+setValidity("CNSetLM",
13
+	    function(object){
14
+		    if(!"batch" %in% varLabels(protocolData(object)))
15
+			    return("'batch' not defined in protocolData")
16
+		    if(!"chromosome" %in% fvarLabels(object))
17
+			    return("'chromosome' not defined in featureData")
18
+		    if(!"position" %in% fvarLabels(object))
19
+			    return("'position' not defined in featureData")
20
+		    if(!"isSnp" %in% fvarLabels(object))
21
+			    return("'isSnp' not defined in featureData")
22
+		    return(TRUE)
23
+	    })
... ...
@@ -24,7 +24,6 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
24 24
 	gns <- getVarInEnv("gns")
25 25
 	path <- system.file("extdata", package=paste(cdfName, "Crlmm", sep=""))
26 26
 	load(file.path(path, "snpProbes.rda"))
27
-
28 27
 	if(copynumber){
29 28
 		load(file.path(path, "cnProbes.rda"))
30 29
 		cnProbes <- get("cnProbes")
... ...
@@ -40,7 +39,6 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
40 39
 	M[index, "isSnp"] <- 1L
41 40
 	index <- which(is.na(M[, "isSnp"]))
42 41
 	M[index, "isSnp"] <- 1L
43
-
44 42
 	if(copynumber){
45 43
 		index <- match(rownames(cnProbes), rownames(M)) #only snp probes in M get assigned position
46 44
 		M[index, "position"] <- cnProbes[, grep("pos", colnames(cnProbes))]
... ...
@@ -54,29 +52,42 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){
54 52
 	##crlmmOpts$snpRange <- range(snpIndex)
55 53
 	##crlmmOpts$npRange <- range(npIndex)
56 54
 }
55
+
57 56
 construct <- function(filenames, cdfName, copynumber=FALSE,
58
-		      sns, verbose=TRUE){
59
-	if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability")
57
+		      sns, verbose=TRUE, batch){
58
+	if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability, CA, CB")
60 59
 	if(missing(sns)) sns <- basename(filenames)
61 60
 	protocolData <- getProtocolData.Affy(filenames)
61
+	if(missing(batch)){
62
+		protocolData$batch <- as.numeric(as.factor(protocolData$ScanDate))
63
+	} else 	{
64
+		if(length(batch) != length(filenames))
65
+			stop("batch variable must be the same length as the filenames")
66
+		protocolData$batch <- batch
67
+	}
68
+	rownames(pData(protocolData)) <- NULL
62 69
 	featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber)
63 70
 	nr <- nrow(featureData); nc <- length(filenames)
64
-	callSet <- new("SnpSuperSet", 
71
+	callSet <- new("CNSetLM", 
65 72
 		       alleleA=initializeBigMatrix(name="A", nr, nc),
66 73
 		       alleleB=initializeBigMatrix(name="B", nr, nc),
67 74
 		       call=initializeBigMatrix(name="call", nr, nc),
68 75
 		       callProbability=initializeBigMatrix(name="callPr", nr,nc),
76
+		       CA=initializeBigMatrix(name="CA", nr, nc),
77
+		       CB=initializeBigMatrix(name="CB", nr, nc),
78
+		       protocolData=protocolData,
69 79
 		       featureData=featureData,
70 80
 		       annotation=cdfName)
71 81
 	pd <- data.frame(matrix(NA, nc, 3), row.names=sns)
72 82
 	colnames(pd)=c("SKW", "SNR", "gender")
73 83
 	phenoData(callSet) <- new("AnnotatedDataFrame", data=pd)
74
-	protocolData(callSet) <- protocolData
75
-	sampleNames(callSet) <- sns
84
+	rownames(pData(callSet)) <- NULL
85
+	phenoData(callSet)$sampleNames <- sns
76 86
 	return(callSet)
77 87
 }
78 88
 
79 89
 
90
+
80 91
 genotype <- function(filenames,
81 92
 		     cdfName,
82 93
 		     batch,
... ...
@@ -97,27 +108,20 @@ genotype <- function(filenames,
97 108
 	if(missing(cdfName)) stop("must specify cdfName")
98 109
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
99 110
 	if(missing(sns)) sns <- basename(filenames)
100
-	if(missing(batch)){
101
-		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
102
-	} else {
103
-		if(length(batch) != length(filenames))
104
-			stop("batch variable must be the same length as the filenames")
105
-	}	
106 111
 	## callSet contains potentially very big matrices
107 112
 	## More big matrices are created within snprma, that will then be removed.
108 113
 	callSet <- construct(filenames=filenames,
109 114
 			     cdfName=cdfName,
110 115
 			     copynumber=copynumber,
111 116
 			     sns=sns,
112
-			     verbose=verbose)
113
-	if(missing(batch)){
114
-		protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate))
115
-	}
117
+			     verbose=verbose,
118
+			     batch=batch)
116 119
 	mixtureParams <- matrix(NA, 4, length(filenames))
117 120
 	snp.index <- which(isSnp(callSet)==1)
118 121
 	batches <- splitIndicesByLength(1:ncol(callSet), ocSamples())
119 122
 	iter <- 1
120
-	for(j in batches){
123
+##	for(j in batches){
124
+	j <- unlist(batches)
121 125
 		if(verbose) message("Batch ", iter, " of ", length(batches))
122 126
 		snprmaRes <- snprma(filenames=filenames[j],
123 127
 				    mixtureSampleSize=mixtureSampleSize,
... ...
@@ -167,7 +171,7 @@ genotype <- function(filenames,
167 171
 		snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]]
168 172
 		callSet$gender[j] <- tmp$gender
169 173
 		iter <- iter+1
170
-	}
174
+##	}
171 175
 	return(callSet)
172 176
 }
173 177
 
... ...
@@ -208,23 +212,15 @@ genotype2 <- function(filenames,
208 212
 	}
209 213
 	if(missing(cdfName)) stop("must specify cdfName")
210 214
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
211
-	if(missing(batch)){
212
-		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
213
-	} else {
214
-		if(length(batch) != length(filenames))
215
-			stop("batch variable must be the same length as the filenames")
216
-	}
217 215
 	if(missing(sns)) sns <- basename(filenames)
218 216
 	## callSet contains potentially very big matrices
219 217
 	callSet <- construct(filenames=filenames,
220 218
 			     cdfName=cdfName,
221 219
 			     copynumber=TRUE,
222 220
 			     sns=sns,
223
-			     verbose=verbose)
224
-	if(missing(batch)){
225
-		protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate))
226
-	}
227
-	if(!missing(batch)) protocolData(callSet)$batch <- batch
221
+			     verbose=verbose,
222
+			     batch=batch)
223
+	##lM(callSet) <- initializeParamObject(list(featureNames(callSet), unique(protocolData(callSet)$batch)))
228 224
 	mixtureParams <- matrix(NA, 4, length(filenames))
229 225
 	snp.index <- which(isSnp(callSet)==1)
230 226
 	snprmaRes <- snprma2(filenames=filenames,
... ...
@@ -242,16 +238,23 @@ genotype2 <- function(filenames,
242 238
 	open(snprmaRes[["SKW"]])
243 239
 	open(snprmaRes[["mixtureParams"]])
244 240
 	if(verbose) message("Updating elements of callSet")
245
-	bb = ocProbesets()*ncol(A)*8
246
-	ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb)
247
-	ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb)
241
+##	bb = ocProbesets()*ncol(A)*8
242
+##	ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb)
243
+##	ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb)
244
+	##batches <- splitIndicesByLength(1:nrow(snprmaRes[["A"]]), ocProbesets())
245
+	AA <- A(callSet)
246
+	BB <- B(callSet)
247
+	for(j in 1:ncol(callSet)){
248
+		AA[snp.index, j] <- snprmaRes[["A"]][, j]
249
+		BB[snp.index, j] <- snprmaRes[["B"]][, j]
250
+	}
248 251
 	if(verbose) message("Finished updating elements of callSet")
249 252
 	stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
250 253
 	pData(callSet)$SKW <- snprmaRes$SKW
251 254
 	pData(callSet)$SNR <- snprmaRes$SNR
252 255
 	mixtureParams <- snprmaRes$mixtureParams
253 256
 	np.index <- which(isSnp(callSet) == 0)
254
-	cnrmaRes <- cnrma2(A=A(callSet),
257
+	cnrmaRes <- cnrma2(A=AA,
255 258
 			   filenames=filenames,
256 259
 			   row.names=featureNames(callSet)[np.index],
257 260
 			   cdfName=cdfName,
... ...
@@ -261,32 +264,169 @@ genotype2 <- function(filenames,
261 264
 	rm(cnrmaRes); gc()
262 265
 	## as.matrix needed when ffdf is used
263 266
 	tmp <- crlmmGT2(A=snprmaRes[["A"]],
264
-			   B=snprmaRes[["B"]],
265
-			   SNR=snprmaRes[["SNR"]],
266
-			   mixtureParams=snprmaRes[["mixtureParams"]],
267
-			   cdfName=cdfName,
268
-			   row.names=NULL, ##featureNames(callSet),##[snp.index],
269
-			   col.names=sampleNames(callSet),
270
-			   probs=probs,
271
-			   DF=DF,
272
-			   SNRMin=SNRMin,
273
-			   recallMin=recallMin,
274
-			   recallRegMin=recallRegMin,
275
-			   gender=gender,
276
-			   verbose=verbose,
277
-			   returnParams=returnParams,
278
-			   badSNP=badSNP)
267
+			B=snprmaRes[["B"]],
268
+			SNR=snprmaRes[["SNR"]],
269
+			mixtureParams=snprmaRes[["mixtureParams"]],
270
+			cdfName=cdfName,
271
+			row.names=NULL, ##featureNames(callSet),##[snp.index],
272
+			col.names=sampleNames(callSet),
273
+			probs=probs,
274
+			DF=DF,
275
+			SNRMin=SNRMin,
276
+			recallMin=recallMin,
277
+			recallRegMin=recallRegMin,
278
+			gender=gender,
279
+			verbose=verbose,
280
+			returnParams=returnParams,
281
+			badSNP=badSNP)
282
+	if(verbose) message("Leaving crlmmGT2")
279 283
 	open(tmp[["calls"]])
280 284
 	open(tmp[["confs"]])
281
-	bb = ocProbesets()*ncol(A)*8
282
-	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
283
-	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
285
+	##bb = ocProbesets()*ncol(A)*8
286
+	GT <- snpCall(callSet)
287
+	GTP <- snpCallProbability(callSet)
288
+	for(j in 1:ncol(callSet)){
289
+		GT[snp.index, j] <- tmp[["calls"]][, j]
290
+		GTP[snp.index, j] <- tmp[["confs"]][, j]
291
+	}
292
+##	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
293
+##	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
284 294
 	callSet$gender <- tmp$gender
285
-	cnSet <- as(callSet, "CNSetLM")
286
-	return(cnSet)
295
+	return(callSet)
296
+}
297
+
298
+
299
+
300
+genotype3 <- function(filenames,
301
+		      cdfName,
302
+		      batch,
303
+		      mixtureSampleSize=10^5,
304
+		      eps=0.1,
305
+		      verbose=TRUE,
306
+		      seed=1,
307
+		      sns,
308
+		      copynumber=FALSE,
309
+		      probs=rep(1/3, 3),
310
+		      DF=6,
311
+		      SNRMin=5,
312
+		      recallMin=10,
313
+		      recallRegMin=1000,
314
+		      gender=NULL,
315
+		      returnParams=TRUE,
316
+		      badSNP=0.7){
317
+	if(!isPackageLoaded("ff")) stop("Must load package 'ff'")
318
+	if(!copynumber){
319
+		callSet <- crlmm2(filenames=filenames,
320
+				  cdfName=cdfName,
321
+				  mixtureSampleSize=mixtureSampleSize,
322
+				  eps=eps,
323
+				  verbose=verbose,
324
+				  sns=sns,
325
+				  probs=probs,
326
+				  DF=DF,
327
+				  SNRMin=SNRMin,
328
+				  recallMin=recallMin,
329
+				  recallRegMin=recallRegMin,
330
+				  gender=gender,
331
+				  returnParams=returnParams,
332
+				  badSNP=badSNP)
333
+		return(callSet)
334
+	}
335
+	if(missing(cdfName)) stop("must specify cdfName")
336
+	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
337
+	if(missing(batch)){
338
+		warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch.  The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.")
339
+	} else {
340
+		if(length(batch) != length(filenames))
341
+			stop("batch variable must be the same length as the filenames")
342
+	}
343
+	if(missing(sns)) sns <- basename(filenames)
344
+	## callSet contains potentially very big matrices
345
+	callSet <- construct(filenames=filenames,
346
+			     cdfName=cdfName,
347
+			     copynumber=TRUE,
348
+			     sns=sns,
349
+			     verbose=verbose)
350
+	if(missing(batch)){
351
+		protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate))
352
+	}
353
+	if(!missing(batch)) protocolData(callSet)$batch <- batch
354
+	##lM(callSet) <- initializeParamObject(list(featureNames(callSet), unique(protocolData(callSet)$batch)))
355
+	mixtureParams <- matrix(NA, 4, length(filenames))
356
+	snp.index <- which(isSnp(callSet)==1)
357
+##	snprmaRes <- snprma2(filenames=filenames,
358
+##			       mixtureSampleSize=mixtureSampleSize,
359
+##			       fitMixture=TRUE,
360
+##			       eps=eps,
361
+##			       verbose=verbose,
362
+##			       seed=seed,
363
+##			       cdfName=cdfName,
364
+##			       sns=sns)
365
+##	if(verbose) message("Finished preprocessing.")
366
+##	open(snprmaRes[["A"]])
367
+##	open(snprmaRes[["B"]])
368
+##	open(snprmaRes[["SNR"]])
369
+##	open(snprmaRes[["SKW"]])
370
+##	open(snprmaRes[["mixtureParams"]])
371
+##	if(verbose) message("Updating elements of callSet")
372
+####	bb = ocProbesets()*ncol(A)*8
373
+####	ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb)
374
+####	ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb)
375
+##	##batches <- splitIndicesByLength(1:nrow(snprmaRes[["A"]]), ocProbesets())
376
+##	for(j in 1:ncol(callSet)){
377
+##		A(callSet)[snp.index, j] <- snprmaRes[["A"]][, j]
378
+##		B(callSet)[snp.index, j] <- snprmaRes[["B"]][, j]
379
+##	}
380
+##	if(verbose) message("Finished updating elements of callSet")
381
+##	stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns))
382
+##	pData(callSet)$SKW <- snprmaRes$SKW
383
+##	pData(callSet)$SNR <- snprmaRes$SNR
384
+##	mixtureParams <- snprmaRes$mixtureParams
385
+	np.index <- which(isSnp(callSet) == 0)
386
+	cnrmaRes <- cnrma2(A=A(callSet),
387
+			   filenames=filenames,
388
+			   row.names=featureNames(callSet)[np.index],
389
+			   cdfName=cdfName,
390
+			   sns=sns,
391
+			   seed=seed,
392
+			   verbose=verbose)
393
+##	if(verbose) message("Entering crlmmGT2...")
394
+##	rm(cnrmaRes); gc()
395
+##	## as.matrix needed when ffdf is used
396
+##	tmp <- crlmmGT2(A=snprmaRes[["A"]],
397
+##			B=snprmaRes[["B"]],
398
+##			SNR=snprmaRes[["SNR"]],
399
+##			mixtureParams=snprmaRes[["mixtureParams"]],
400
+##			cdfName=cdfName,
401
+##			row.names=NULL, ##featureNames(callSet),##[snp.index],
402
+##			col.names=sampleNames(callSet),
403
+##			probs=probs,
404
+##			DF=DF,
405
+##			SNRMin=SNRMin,
406
+##			recallMin=recallMin,
407
+##			recallRegMin=recallRegMin,
408
+##			gender=gender,
409
+##			verbose=verbose,
410
+##			returnParams=returnParams,
411
+##			badSNP=badSNP)
412
+##	if(verbose) message("Leaving crlmmGT2")
413
+##	open(tmp[["calls"]])
414
+##	open(tmp[["confs"]])
415
+##	##bb = ocProbesets()*ncol(A)*8
416
+##	for(j in 1:ncol(callSet)){
417
+##		snpCall(callSet)[snp.index, j] <- tmp[["calls"]][, j]
418
+##		snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]][, j]
419
+##	}
420
+####	ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb)
421
+####	ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb)
422
+##	callSet$gender <- tmp$gender
423
+####	cnSet <- as(callSet, "CNSetLM")
424
+##	return(callSet)
425
+	return(cnrmaRes)
287 426
 }
288 427
 
289 428
 
429
+
290 430
 ##---------------------------------------------------------------------------
291 431
 ##---------------------------------------------------------------------------
292 432
 ## For Illumina
... ...
@@ -758,6 +898,7 @@ predictGender <- function(res, cdfName="genomewidesnp6", SNRMin=5){
758 898
 }
759 899
 
760 900
 
901
+##replace with release/crlmm/R/cnrma-functions
761 902
 crlmmCopynumber <- function(object,
762 903
 			    chromosome=1:23,
763 904
 			    which.batches,
... ...
@@ -776,7 +917,8 @@ crlmmCopynumber <- function(object,
776 917
 			    MIN.PHI=2^3,
777 918
 			    THR.NU.PHI=TRUE,
778 919
 			    thresholdCopynumber=TRUE){
779
-	if(isPackageLoaded("ff")){
920
+	ffIsLoaded <- class(calls(object))[[1]] == "ff"
921
+	if(ffIsLoaded){
780 922
 		open(object)
781 923
 		open(object$SKW)
782 924
 		open(object$SNR)
... ...
@@ -787,7 +929,7 @@ crlmmCopynumber <- function(object,
787 929
 	stopifnot("isSnp" %in% fvarLabels(object))
788 930
 	##batch <- object$batch
789 931
 	batch <- batch(object)
790
-	if(isPackageLoaded("ff")) {
932
+	if(ffIsLoaded) {
791 933
 		open(object$SNR)
792 934
 		SNR <- object$SNR[, ]
793 935
 	} else SNR <-  object$SNR
... ...
@@ -844,13 +986,19 @@ crlmmCopynumber <- function(object,
844 986
 			fvarLabels(tmp)[13:14] <- paste(c("phiPrimeA", "phiPrimeB"), batchName, sep=".")
845 987
 			fvarLabels(tmp) <- gsub("_", ".", fvarLabels(tmp))
846 988
 			##fvarLabels(tmp) <- gsub("\\.[1-9]", "", fvarLabels(tmp))
847
-			fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))]
848
-			jj <- match(fvarLabels(tmp), names(lM(object)))
849
-			lM(object)[row.index, jj] <- fData(tmp)
850
-			##labels.asis <- fvarLabels(tmp)
851
-			##labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis)
852
-			##k <- match(labels.asis, colnames(lM(object)))
853
-			##lM(object)[row.index, k] <- fData(tmp)
989
+			if(ffIsLoaded){
990
+				fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))]
991
+				jj <- match(fvarLabels(tmp), names(lM(object)))
992
+				lM(object)[row.index, jj] <- fData(tmp)
993
+			} else {
994
+				nms <- paste(names(lM(object)), batchName, sep=".")
995
+				fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% nms]
996
+				for(k in seq_along(fvarLabels(tmp))){
997
+					kk <- match(fvarLabels(tmp)[k], paste(names(lM(object)), batchName, sep="."))
998
+					column <- match(batchName, colnames(lM(object)[[k]]))
999
+					lM(object)[[k]][row.index, column] <- fData(tmp)[, k]
1000
+				}
1001
+			}			
854 1002
 			rm(tmp); gc()
855 1003
 			ii <- ii+1
856 1004
 		}
... ...
@@ -879,12 +1027,15 @@ crlmmCopynumber2 <- function(object,
879 1027
 	stopifnot("chromosome" %in% fvarLabels(object))
880 1028
 	stopifnot("position" %in% fvarLabels(object))
881 1029
 	stopifnot("isSnp" %in% fvarLabels(object))
1030
+	lM(object) <- initializeParamObject(list(featureNames(object), unique(protocolData(object)$batch)))
882 1031
 	batch <- batch(object)
883
-	XIndex.snps <- (1:nrow(object))[chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object))]
1032
+	XIndex.snps <- which(chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object)))
884 1033
 	##YIndex.snps <- (1:nrow(object))[chromosome(object) == 24 & isSnp(object)]
885
-	XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))]
886
-	autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))]
887
-	autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))]	
1034
+	XIndex.nps <- which(chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object)))
1035
+	##autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))]
1036
+	autosomeIndex.snps <- which(chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object)))
1037
+	autosomeIndex.nps <- which(chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object)))
1038
+	##autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))]	
888 1039
 
889 1040
 	## Do chromosome X in batches
890 1041
 	Ns <- initializeBigMatrix("Ns", nrow(object), 5)
... ...
@@ -1017,8 +1168,8 @@ fit.lm1 <- function(idxBatch,
1017 1168
 	open(snpflags)
1018 1169
 	open(normal)
1019 1170
 
1020
-	corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object))))
1021
-	flags <- nuA <- nuB <- phiA <- phiB <- corr
1171
+	corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object))))
1172
+	flags <- nuA <- nuB <- phiA <- phiB <- corrAB
1022 1173
 
1023 1174
 	normal.snps <- normal[snps, ]
1024 1175
 	cB <- cA <- matrix(NA, length(snps), ncol(object))
... ...
@@ -1053,9 +1204,9 @@ fit.lm1 <- function(idxBatch,
1053 1204
 		tau2B[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 1], DF.PRIOR)
1054 1205
 		sig2B[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 3], DF.PRIOR)
1055 1206
 
1056
-		corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR)
1057
-		corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR)
1058
-		corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR)
1207
+		corrAB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR)
1208
+		corrAA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR)
1209
+		corrBB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR)
1059 1210
 
1060 1211
 		##---------------------------------------------------------------------------
1061 1212
 		## Impute sufficient statistics for unobserved genotypes (plate-specific)
... ...
@@ -1067,7 +1218,8 @@ fit.lm1 <- function(idxBatch,
1067 1218
 		size <- min(5000, length(index.complete))
1068 1219
 		if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE)
1069 1220
 		if(length(index.complete) < 200){
1070
-			stop("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1221
+			warning("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1222
+			return()
1071 1223
 		}
1072 1224
 		index <- vector("list", 3)
1073 1225
 		index[[1]] <- which(Ns[, 1] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 3] >= MIN.OBS))
... ...
@@ -1105,6 +1257,7 @@ fit.lm1 <- function(idxBatch,
1105 1257
 		flags[, J] <- rowSums(Ns == 0) > 0
1106 1258
 		##flags[, J] <- index[[1]] | index[[2]] | index[[3]] | rowSums(
1107 1259
 
1260
+		## we're regressing on the medians using the standard errors (hence the division by N) as weights
1108 1261
 		##formerly coefs()
1109 1262
 		Np <- Ns
1110 1263
 		Np[Np < 1] <- 1
... ...
@@ -1176,6 +1329,16 @@ fit.lm1 <- function(idxBatch,
1176 1329
 	tmp <- physical(lM(object))$phiB
1177 1330
 	tmp[snps, ] <- phiB
1178 1331
 	lM(object)$phiB <- tmp
1332
+	tmp <- physical(lM(object))$corrAB
1333
+	tmp[snps, ] <- corrAB
1334
+	lM(object)$corrAB <- tmp
1335
+	tmp <- physical(lM(object))$corrAA
1336
+	tmp[snps, ] <- corrAA
1337
+	lM(object)$corrAA <- tmp
1338
+	tmp <- physical(lM(object))$corrBB
1339
+	tmp[snps, ] <- corrBB
1340
+	lM(object)$corrAB <- tmp
1341
+	
1179 1342
 	lapply(assayData(object), close)
1180 1343
 	lapply(lM(object), close)
1181 1344
 	TRUE
... ...
@@ -1380,13 +1543,14 @@ fit.lm3 <- function(idxBatch,
1380 1543
 		##---------------------------------------------------------------------------
1381 1544
 		## Impute sufficient statistics for unobserved genotypes (plate-specific)
1382 1545
 		##---------------------------------------------------------------------------
1383
-		index <- apply(Ns.F, 2, function(x, MIN.OBS) which(x > MIN.OBS), MIN.OBS)
1546
+		index <- apply(Ns.F, 2, function(x, MIN.OBS) which(x >= MIN.OBS), MIN.OBS)
1384 1547
 		correct.orderA <- muA.F[, 1] > muA.F[, 3]
1385 1548
 		correct.orderB <- muB.F[, 3] > muB.F[, 1]
1386 1549
 		index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]])))
1387 1550
 		size <- min(5000, length(index.complete))
1388 1551
 		if(length(index.complete) < 200){
1389
-			stop("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1552
+			warning("fewer than 200 snps pass criteria for predicting the sufficient statistics")
1553
+			return()
1390 1554
 		}
1391 1555
 		if(size==5000) index.complete <- sample(index.complete, size)
1392 1556
 		index <- vector("list", 3)
... ...
@@ -1520,6 +1684,16 @@ fit.lm3 <- function(idxBatch,
1520 1684
 	tmp <- physical(lM(object))$phiPrimeB
1521 1685
 	tmp[snps, ] <- phiB2
1522 1686
 	lM(object)$phiPrimeB <- tmp
1687
+	
1688
+	tmp <- physical(lM(object))$corrAB
1689
+	tmp[snps, ] <- corrAB
1690
+	lM(object)$corrAB <- tmp
1691
+	tmp <- physical(lM(object))$corrAA
1692
+	tmp[snps, ] <- corrAA
1693
+	lM(object)$corrAA <- tmp
1694
+	tmp <- physical(lM(object))$corrBB
1695
+	tmp[snps, ] <- corrBB
1696
+	lM(object)$corrAB <- tmp	
1523 1697
 ##	lM(object)$tau2A[snps, ] <- tau2A
1524 1698
 ##	lM(object)$tau2B[snps, ] <- tau2B
1525 1699
 ##	lM(object)$sig2A[snps, ] <- sig2A
... ...
@@ -54,7 +54,6 @@ readIdatFiles <- function(sampleSheet=NULL,
54 54
                }
55 55
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
56 56
        }
57
-       
58 57
        narrays = length(arrayNames)
59 58
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
60 59
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
... ...
@@ -360,7 +359,6 @@ readIdatFiles2 <- function(sampleSheet=NULL,
360 359
 
361 360
 ## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
362 361
 readIDAT <- function(idatFile){
363
-
364 362
   fileSize <- file.info(idatFile)$size
365 363
 
366 364
   tempCon <- file(idatFile,"rb")
... ...
@@ -91,18 +91,34 @@ setMethod("computeCopynumber", "CNSet",
91 91
 	}
92 92
 	object
93 93
 })
94
-
95
-
96 94
 setMethod("copyNumber", "CNSet", function(object){
97 95
 	I <- isSnp(object)
96
+	ffIsLoaded <- class(calls(object))[[1]]=="ff"
98 97
 	CA <- CA(object)
99 98
 	CB <- CB(object)
99
+	if(ffIsLoaded){
100
+		open(CA)
101
+		open(CB)
102
+		CA <- CA[,]
103
+		CB <- CB[,]
104
+	}
100 105
 	CN <- CA + CB
101 106
 	##For nonpolymorphic probes, CA is the total copy number
102 107
 	CN[!I, ] <- CA(object)[!I, ]
108
+	CN <- CN/100
103 109
 	CN
104 110
 })
105 111
 
112
+##setMethod("copyNumber", "CNSet", function(object){
113
+##	I <- isSnp(object)
114
+##	CA <- CA(object)
115
+##	CB <- CB(object)
116
+##	CN <- CA + CB
117
+##	##For nonpolymorphic probes, CA is the total copy number
118
+##	CN[!I, ] <- CA(object)[!I, ]
119
+##	CN
120
+##})
121
+
106 122
 
107 123
 setMethod("ellipse", "CNSet", function(x, copynumber, batch, ...){
108 124
 	ellipse.CNSet(x, copynumber, batch, ...)
109 125
new file mode 100644
110 126
Binary files /dev/null and b/data/sample.CNSetLM.rda differ
... ...
@@ -13,6 +13,7 @@
13 13
 \newcommand{\Rclass}[1]{{\textit{#1}}}
14 14
 \newcommand{\oligo}{\Rpackage{oligo }}
15 15
 \newcommand{\R}{\textsf{R}}
16
+\newcommand{\crlmm}{\Rpackage{crlmm}}
16 17
 \usepackage[margin=1in]{geometry}
17 18
 
18 19
 \begin{document}
... ...
@@ -22,8 +23,7 @@
22 23
 \maketitle
23 24
 
24 25
 <<setup, echo=FALSE, results=hide>>=
25
-options(continue=" ")
26
-options(prompt="R> ")
26
+options(continue=" ", width=70)
27 27
 @ 
28 28
 
29 29
 %\section{Estimating copy number}
... ...
@@ -43,168 +43,304 @@ options(prompt="R> ")
43 43
 CRLMM supports the following platforms:
44 44
 
45 45
 <<cdfname>>=
46
+library(oligoClasses)
46 47
 library(crlmm)
47 48
 crlmm:::validCdfNames()
49
+cdfName <- "genomewidesnp6"
48 50
 @ 
49 51
 
50 52
 \paragraph{Preprocess and genotype.}
51 53
 
52
-Provide the complete path for the filenames:
54
+In the following code chunk, we provide the complete path to the
55
+Affymetrix CEL files, assign a path to store the normalized
56
+intensities and genotype data, and define a 'batch' variable. The
57
+batch variable will later be useful when we estimate copy number.  We
58
+typically use the 96 well chemistry plate or the scan date of the
59
+array as a surrogate for batch. Adjusting by date or chemistry plate
60
+can be helpful for limiting the influence of batch effects.  
53 61
 
54 62
 <<celfiles>>=
55 63
 celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL")
56
-## CEPH and Yoruban files
64
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/crlmm"
65
+if(!file.exists(outdir)) dir.create(outdir)
66
+## CEPH and Yoruban
57 67
 batch <- substr(basename(celFiles), 13, 13)
58 68
 celFiles <- celFiles[batch %in% c("C", "Y")]
59 69
 batch <- batch[batch %in% c("C", "Y")]
70
+stopifnot(length(batch) == length(celFiles))
60 71
 @ 
61 72
 
62
-While genotyping with crlmm can be performed using a small number of
63
-samples, copy number estimation requires at least 10 samples --
64
-preferably all of the samples that were processed together on the same
65
-plate.  Crlmm does not use a reference dataset when estimating model
66
-parameters because of large batch effects \citep{Scharpf2009}. The
67
-quantile normalization performed as part of the preprocessing of the raw
68
-data is insufficient for removing batch effects.  Processing a reference
69
-dataset, such as HapMap samples, along with the experimental data will
70
-not improve copy number estimation for the experimental dataset, and
71
-should not be used as a means to increase the sample size. Furthermore,
72
-processing a reference dataset without acknowledging that these samples
73
-were derived from a different batch can result in incorrect copy number
74
-estimates in both the experimental and reference datasets.  The
75
-appropriate way to acknowledge batch is to supply the batch name for
76
-each sample to be processed in the argument to \Robject{cnOptions}:
77
-
78
-<<celfiles>>=
79
-cnOpts <- cnOptions(cdfName="genomewidesnp6",
80
-		    outdir="/thumper/ctsa/beaty/scharpf/crlmmOut/hapmap",
81
-		    batch=batch)
82
-str(cnOpts)
83
-stopifnot(length(cnOpts$batch) == length(celFiles))
84
-names(cnOpts$batch) <- basename(celFiles)
73
+The preprocessing steps for copy number estimation includes quantile
74
+normalization of the polymorphic and nonpolymorphic markers and a
75
+summarization step whereby the quantile normalized intenities are
76
+summarized for each locus.  As the markers at polymorphic loci on the
77
+Affymetrix 6.0 platform are identical, we summarize the intensities by
78
+the median. For the nonpolymorphic markers, no summarization step is
79
+performs.  Next, the \crlmm{} package provides a genotype call and a
80
+confidence score at each polymorphic locus.  Unless the dataset is small
81
+(e.g., fewer than 50 samples), we suggest installing and loading the
82
+\R{} package \Rpackage{ff}.  The function \Rfunction{genotype} checks to
83
+see whether the \Rpackage{ff} is loaded.  If loaded, the normalized
84
+intensities and genotype are stored as \Robject{ff} objects on disk.
85
+Otherwise, the genotypes and normalized intensities are stored in
86
+matrices.  A word of caution: the \Rfunction{genotype} function requires
87
+a potentially large amount of RAM even when the \R{} package
88
+\Rpackage{ff} is loaded.  Users with large datasets may want to skip
89
+this part.  For the 180 HapMap samples processed in this vignette, the
90
+\Rfunction{genotype} function required 25MB RAM. The next two code
91
+chunks should be submitted as batch jobs.
92
+
93
+<<genotype>>=
94
+if(!exists("cnSet")){
95
+	if(!file.exists(file.path(outdir, "cnSet.rda"))){
96
+		##ops <- options(error=recover)
97
+		cnSet <- genotype(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch)
98
+		save(cnSet, file=file.path(outdir, "cnSet.rda"))
99
+	} else load(file.path(outdir, "cnSet.rda"))
100
+}
85 101
 @ 
86 102
 
87
-\noindent The next code chunk quantile normalizes the samples to a
88
-target reference distribution, uses the crlmm algorithm to genotype, and
89
-then estimates the copy number for each batch. Currently processing the
90
-180 HapMap cel files will require less than 20G of RAM. We are working
91
-on methods to reduce the memory footprint.
103
+A more memory efficient approach to preprocessing and genotyping is
104
+implemented in the \R{} function \Rfunction{genotype2}.  The arguments
105
+to this function are similar to \Rfunction{genotype} but requires
106
+explicit loading of the \R{} package \Rpackage{ff} prior to calling
107
+the function. The functions \Rfunction{ocProbesets} and
108
+\Rfunction{ocSamples} can be used to manage how many probesets and
109
+samples are to be loaded into memory and processed.  Using the default
110
+settings, the following code required 1.9Mb RAM to process 180 CEL
111
+files.
112
+
113
+<<genotype2>>=
114
+library(ff)
115
+ld.path <- file.path(outdir, "ffobjs")
116
+if(!file.exists(ld.path)) dir.create(ld.path)
117
+ldPath(ld.path)
118
+if(!exists("cnSet2")){
119
+	if(!file.exists(file.path(outdir, "cnSet2.rda"))){
120
+		cnSet2 <- genotype2(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch)
121
+		save(cnSet2, file=file.path(outdir, "cnSet2.rda"))
122
+	} else load(file.path(outdir, "cnSet2.rda"))
123
+}
124
+@ 
92 125
 
93
-<<preprocessAndGenotype, echo=TRUE>>=
94
-if(FALSE) crlmmCopynumber(celFiles, cnOpts) 
95
-load(file.path(cnOpts[["outdir"]], "cnSet_21.rda"))
126
+The objects returned by the \Rfunction{genotype} and
127
+\Rfunction{genotype2} differ in size as the former returns an object
128
+with matrices in the assay data slot, whereas the latter return an
129
+object with pointers to files on disk.  The functions \Rfunction{open}
130
+and \Rfunction{close} can be used to open and close the connections
131
+stored in the assay data of the \Robject{cnSet2} object, respectively.
132
+Subsetting the \Robject{ff} object pulls the data from disk into
133
+memory.  As subsetting operations pull the data from disk to memory,
134
+these operations must be performed with care.  In particular,
135
+subsetting the \Robject{cnSet2} will subset each assay data element
136
+and this can be slow.  The preferred approach would be to extract the
137
+assay data element that is needed, and then subset this function
138
+object as illustrated below for the genotype calls.
139
+
140
+<<check>>=
141
+class(calls(cnSet))
142
+dims(cnSet)
143
+class(calls(cnSet2))
144
+dims(cnSet2)
145
+print(object.size(cnSet), units="Mb")
146
+print(object.size(cnSet2), units="Mb")
147
+if(FALSE){
148
+	replicate(5, system.time(gt1 <- calls(cnSet)[, 1:50])[[1]])
149
+	open(calls(cnSet2))
150
+	replicate(5, system.time(gt2 <- calls(cnSet2)[, 1:50])[[1]])
151
+}
152
+gt1 <- calls(cnSet)[, 1:50]
153
+gt2 <- calls(cnSet2)[, 1:50]
154
+all.equal(gt1, gt2)
96 155
 @ 
97 156
 
98
-The following R objects are created from crlmmCopynumber:
157
+\noindent With \Robject{ff} objects the additional I/O time required
158
+for extracting data is substantial and will increase for larger
159
+datasets.  Patience is required.
160
+
161
+Note that for the Affymetrix 6.0 platform the assay data elements each
162
+have a row dimension corresponding to the total number of polymorphic
163
+and nonpolymorphic markers interrogated by the Affymetrix 6.0 platform.
164
+A consequence of keeping the rows of the assay data elements the same
165
+for all of the statistical summaries is that the matrix used to store
166
+genotype calls is larger than necessary.  Also, note the additional
167
+overhead of some operations when using \Robject{ff} objects.  For
168
+instance, the posterior probabilities for the CRLMM genotype calls are
169
+represented as integers. The accessor \Robject{snpCallProbability} can
170
+be used to access these confidence scores.  When stored as matrices,
171
+converting the integer representation back to the probability scale is
172
+straightforward as shown below.  However, for the \Robject{ff} objects
173
+we must first bring the data into memory. To bring all the scores into
174
+memory, one could use the function \Rfunction{[,]} but this could be
175
+slow and require a lot of RAM depending on the size of the dataset.
176
+Instead, we suggest pulling only the needed rows and columns from
177
+memory. In the following example, we convert the integer scores to
178
+probabilities for the CEPH samples.  As genotype confidence scores are
179
+not applicable to the nonpolymorphic markers, we extract only the
180
+polymorphic markers using the \Rfunction{isSnp} function.
181
+
182
+<<assayData>>=
183
+rows <- which(isSnp(cnSet))
184
+cols <- which(batch == "C")
185
+posterior.prob <- tryCatch(integerScoreToProbability(snpCallProbability),
186
+			    error=function(e) print("This will not work for an ff object."))
187
+posterior.prob1 <- integerScoreToProbability(snpCallProbability(cnSet)[rows, cols])
188
+posterior.prob2 <- integerScoreToProbability(snpCallProbability(cnSet2)[rows, cols])
189
+all.equal(posterior.prob1, posterior.prob2)
190
+@ 
99 191
 
100
-<<crlmmSetListObjects>>=
101
-fns <- list.files(cnOpts[["outdir"]], pattern="cnSet", full.name=TRUE)
102
-basename(fns)[1:5]
192
+Next, we obtain locus-level estimates of copy number by fitting a linear
193
+model to each SNP. A variable named 'batch' must be indicated in the
194
+\Robject{protocolData} of the \Robject{cnSet} object. As the inverse
195
+variance of the within-genotype normalized intensities are used as
196
+weights in the linear model (and hence the design matrix in the
197
+regression changes for each SNP), the time is linear with the number of
198
+markers on the array. Copy number estimation at nonpolymorphic markers
199
+and polymorphic markers with unobserved genotypes is more difficult. We
200
+refer the interested reader to the technical report \citep{Scharpf2009}.
201
+Again, we peform the copy number estimation using the matrix version and
202
+the ff version in parallel and encourage users with large datasets to
203
+explore the latter. As with the preprocessing and genotyping, the
204
+following code should be submitted as part of the batch job as it is too
205
+slow for interactive analysis.
206
+
207
+<<cn, echo=FALSE, eval=FALSE>>=
208
+##Matrix format
209
+cnSet <- crlmmCopynumber(cnSet)
103 210
 @ 
104 211
 
212
+<<save.cnset, echo=FALSE, eval=FALSE>>=
213
+save(cnSet, file=file.path(outdir, "cnSet.rda"))
214
+@ 
105 215
 
106
-The above algorithm for estimating copy number is predicated on the
107
-assumption that most samples within a batch have copy number 2 at any
108
-given locus.  For common copy number variants, this assumption may not
109
-hold.  An additional iteration using a bias correction provides
110
-additional robustness to this assumption.  Set the \Robject{bias.adj}
111
-argument to \Robject{TRUE}:
216
+<<cn.ff, echo=FALSE, eval=FALSE>>=
217
+rm(cnSet); gc()
218
+ocProbesets(75e3)
219
+##ff objects
220
+system.time(cnSet2 <- crlmmCopynumber2(cnSet2))
221
+save(cnSet2, file=file.path(outdir, "cnSet2.rda"))
222
+@ 
112 223
 
113
-<<biasAdjustment, eval=FALSE>>=
114
-cnOpts[["bias.adj"]] <- TRUE
115
-if(FALSE) crlmmCopynumber(celFiles, cnOpts)
224
+<<timings, eval=FALSE>>=
225
+tryCatch(print(paste("Time for matrix version:", time1)), error=function(e) print("timing for CN estimation not available"))
226
+tryCatch(print(paste("Time for ff version:", time2)), error=function(e) print("timing for CN estimation not available"))
227
+rm(cnSet2); gc()
116 228
 @ 
117 229
 
230
+
118 231
 \section{Accessors}
119 232
 
120
-\subsection{Assay data accessors}
233
+For the remainder of this vignette, we illustrate accessors and
234
+visualizations using the sample object provided in the
235
+\Rpackage{crlmm} package.
236
+
237
+<<sampleset>>=
238
+data(sample.CNSetLM)
239
+x <- sample.CNSetLM
240
+@ 
121 241
 
122
-\paragraph{\Robject{ABset}:  quantile normalized intensities}
123 242
 
124
-An object of class \Robject{ABset} is stored in the first element of the
125
-\Robject{crlmmSetList} object. The following accessors may be of use:
243
+\subsection{Quantile-normalized intensities}
126 244
 
127 245
 Accessors for the quantile normalized intensities for the A allele at
128 246
 polymorphic loci:
129 247
 
130 248
 <<accessors>>=
131
-a <- A(cnSet)[isSnp(cnSet), ]
249
+snp.index <- which(isSnp(x))
250
+np.index <- which(!isSnp(x))
251
+a <- (A(x))[snp.index, ]
132 252
 dim(a)
133 253
 @ 
134 254
 
135
-The quantile-normalized intensities for nonpolymorphic loci are obtained by:
255
+The extra set of parentheses surrounding \Robject{A(cnSet2)} above is
256
+added to emphasize the appropriate order of operations. Subsetting the
257
+entire \Robject{x} object in the following code should be avoided for
258
+large datasets.
259
+
260
+<<eval=FALSE>>=
261
+a <- A(x[snp.index, ])
262
+@ 
263
+
264
+The quantile-normalized intensities for nonpolymorphic loci are obtained
265
+by:
136 266
 
137 267
 <<>>=
138
-npIntensities <- A(cnSet)[!isSnp(cnSet), ]
268
+npIntensities <- (A(x))[np.index, ]
139 269
 @ 
140 270
 
141 271
 Quantile normalized intensities for the B allele at polymorphic loci:
142 272
 
143 273
 <<B.polymorphic>>=
144
-b.snps <- B(cnSet[isSnp(cnSet), ])
274
+b.snps <- (B(x))[snp.index, ]
145 275
 @ 
146 276
 
147 277
 Note that NAs are recorded in the 'B' assay data element for
148 278
 nonpolymorphic loci:
149 279
 
150 280
 <<B.NAs>>=
151
-all(is.na(B(cnSet[!isSnp(cnSet), ])))
281
+all(is.na((B(x))[np.index, ]))
282
+@ 
283
+
284
+<<clean, echo=FALSE>>=
285
+rm(b.snps, a, npIntensities); gc()
152 286
 @ 
153 287
 
154 288
 \paragraph{\Robject{SnpSet}: Genotype calls and confidence scores}
155 289
 
156 290
 Genotype calls:
157 291
 <<genotypes>>=
158
-genotypes <- snpCall(cnSet)
292
+genotypes <- (snpCall(x))[snp.index, ]
159 293
 @ 
160 294
 Confidence scores of the genotype calls:
161 295
 <<confidenceScores>>=
162
-genotypeConf <- confs(cnSet[isSnp(cnSet), ])
296
+genotypeConf <- integerScoreToProbability(snpCallProbability(x)[snp.index[1:10], ])
163 297
 @ 
164 298
 
165 299
 \paragraph{\Robject{CopyNumberSet}: allele-specific copy number}
166 300
 
167 301
 Allele-specific copy number at polymorphic loci:
168 302
 <<ca>>=
169
-ca <- CA(cnSet[isSnp(cnSet), ])
303
+ca <- CA(x[snp.index, ])/100
170 304
 @ 
171 305
 
172 306
 Total copy number at nonpolymorphic loci:
173 307
 <<ca>>=
174
-cn.nonpolymorphic <- CA(cnSet[!isSnp(cnSet), ])
308
+cn.nonpolymorphic <- CA(x[np.index, ])/100
175 309
 @ 
176 310
 
177 311
 Total copy number at both polymorphic and nonpolymorphic loci:
178 312
 <<totalCopynumber>>=
179
-cn <- copyNumber(cnSet)
313
+cn <- copyNumber(x)
314
+apply(cn, 2, median, na.rm=TRUE)
180 315
 @ 
181 316
 
182 317
 \subsection{Other accessors}
183 318
 
184
-Information on physical position and chromosome can be accessed by the
185
-following accessors:
319
+Information on physical position and chromosome can be accessed as follows:
186 320
 
187
-<<positionChromosome, eval=FALSE>>=
188
-xx <- position(cnSet)
189
-yy <- chromosome(cnSet)
321
+<<positionChromosome>>=
322
+xx <- position(x)
323
+yy <- chromosome(x)
190 324
 @ 
191 325
 
192
-There are many parameters computed during copy number estimation that
193
-are at present stored in the \Robject{featureData} slot.  In particular,
194
-the estimation procedure fits a linear model to the normalized
195
-intensities for each allele. These parameters are not generally meant to
196
-be extracted by the user; for now we just mention where they are stored.
326
+Parameters from the linear model used to estimate copy number are
327
+stored in the slot \Robject{lM}. 
197 328
 
198 329
 <<copynumberParameters>>=
199
-fvarLabels(cnSet)
330
+names(lM(x))
331
+dim(lM(x)[[1]])
200 332
 @ 
201 333
 
334
+
335
+
336
+\section{Suggested visualizations}
337
+
202 338
 \paragraph{SNR.}
203 339
 
204 340
 A histogram of the signal to noise ratio for the HapMap samples:
205 341
 
206 342
 <<plotSnr, fig=TRUE, include=FALSE>>=
207
-hist(cnSet$SNR, xlab="SNR", main="")
343
+hist(x$SNR, xlab="SNR", main="", breaks=25)
208 344
 @ 
209 345
 
210 346
 \begin{figure}
... ...
@@ -213,22 +349,7 @@ hist(cnSet$SNR, xlab="SNR", main="")
213 349
   \caption{Signal to noise ratios for the HapMap samples.}
214 350
 \end{figure}
215 351
 
216
-\paragraph{Data/Batch.}
217
-For Affymetrix 6.0, we currently suggest excluding or flagging samples
218
-with a signal to noise ratio less than 5.  Adjusting by date or
219
-chemistry plate can be helpful for limiting the influence of batch
220
-effects.  Ideally, one would have 70+ files in a given batch. Here we
221
-make a table of date versus ancestry (batch):
222
-
223
-<<specifyBatch, eval=FALSE, echo=FALSE>>=
224
-table(format(as.POSIXlt(protocolData(cnSet)[["ScanDate"]]), "%d %b %Y"), cnSet$batch)
225
-@ 
226
-
227
-As all of these samples were run on the first week of March, we would
228
-expect that any systematic artifacts to the intensities that develop
229
-over time to be minimal (a best case scenario).  
230 352
 
231
-\section{Suggested visualizations}
232 353
 
233 354
 \paragraph{One sample at a time: locus-level estimates}
234 355
 
... ...
@@ -240,24 +361,24 @@ area of research.
240 361
 
241 362
 <<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>=
242 363
 par(las=1, mar=c(4, 5, 4, 2))
243
-plot(position(cnSet), copyNumber(cnSet)[, 1], pch=".", 
244
-     cex=2, xaxt="n", col="grey20", ylim=c(0,6), 
364
+plot(position(x), copyNumber(x)[, 1], pch=".", 
365
+     cex=2, xaxt="n", col="grey20", ylim=c(0,4), 
245 366
      ylab="copy number", xlab="physical position (Mb)",
246
-     main=paste(sampleNames(cnSet)[1], ", CHR:", unique(chromosome(cnSet))))
247
-points(position(cnSet)[!isSnp(cnSet)], copyNumber(cnSet)[!isSnp(cnSet), 1],
367
+     main=paste(sampleNames(x)[1], ", CHR:", unique(chromosome(x))))
368
+points(position(x)[!isSnp(x)], copyNumber(x)[!isSnp(x), 1],
248 369
        pch=".", cex=2, col="lightblue")
249
-axis(1, at=pretty(range(position(cnSet))), labels=pretty(range(position(cnSet)))/1e6)
370
+axis(1, at=pretty(range(position(x))), labels=pretty(range(position(x)))/1e6)
250 371
 @ 
251 372
 
252 373
 <<idiogram, eval=FALSE, echo=FALSE>>=
253 374
 require(SNPchip)
254
-plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE)
375
+plotCytoband(22, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE)
255 376
 @ 
256 377
 
257 378
 \begin{figure}
258 379
   \includegraphics[width=0.9\textwidth]{copynumber-oneSample}
259 380
   \caption{\label{fig:oneSample} Total copy number (y-axis) for
260
-    chromosome 22 plotted against physical position (x-axis) for one
381
+    chromosome 1 plotted against physical position (x-axis) for one
261 382
     sample.  Estimates at nonpolymorphic loci are plotted in light
262 383
     blue.}
263 384
 \end{figure}
... ...
@@ -267,16 +388,16 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE)
267 388
 %ask <- FALSE
268 389
 %op <- par(mfrow=c(3, 1), las=1, mar=c(1, 4, 1, 1), oma=c(3, 1, 1, 1), ask=ask)
269 390
 %##Put fit on the copy number scale
270
-%cns <- copyNumber(cnSet)
391
+%cns <- copyNumber(cnSet2)
271 392
 %cnState <- hmmPredictions - as.integer(1)
272
-%xlim <- c(10*1e6, max(position(cnSet)))
393
+%xlim <- c(10*1e6, max(position(cnSet2)))
273 394
 %cols <- brewer.pal(8, "Dark2")[1:4]
274 395
 %for(j in 1:3){
275
-%	plot(position(cnSet), cnState[, j], pch=".", col=cols[2], xaxt="n", 
396
+%	plot(position(cnSet2), cnState[, j], pch=".", col=cols[2], xaxt="n", 
276 397
 %	     ylab="copy number", xlab="Physical position (Mb)", type="s", lwd=2,
277 398
 %	     ylim=c(0,6), xlim=xlim)
278
-%	points(position(cnSet), cns[, j], pch=".", col=cols[3])
279
-%	lines(position(cnSet), cnState[,j], lwd=2, col=cols[2])
399
+%	points(position(cnSet2), cns[, j], pch=".", col=cols[3])
400
+%	lines(position(cnSet2), cnState[,j], lwd=2, col=cols[2])
280 401
 %	axis(1, at=pretty(position(cnSet)), 
281 402
 %	     labels=pretty(position(cnSet))/1e6)
282 403
 %	abline(h=c(1,3), lty=2, col=cols[1])	
... ...
@@ -300,52 +421,58 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE)
300 421
 \clearpage
301 422
 \paragraph{One SNP at a time}
302 423
 
303
-Scatterplots of the A and B allele intensities (log-scale) can be useful
304
-for assessing the biallelic genotype calls.  The following code chunk is
305
-displayed in Figure \ref{fig:genotypeCalls}.
306
-
307
-<<genotypeCalls, fig=TRUE, width=8, height=8, include=FALSE>>=
308
-myScatter <- function(object, add=FALSE, ...){
309
-	A <- log2(A(object))
310
-	B <- log2(B(object))
311
-	if(!add){
312
-		plot(A, B, ...)
313
-	} else{
314
-		points(A, B, ...)
315
-	}
316
-}
317
-index <- which(isSnp(cnSet))[1:9]
318
-xlim <- ylim <- c(6.5,13)
319
-par(mfrow=c(3,3), las=1, pty="s", ask=FALSE, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1))
320
-for(i in index){
321
-	gt <- calls(cnSet)[i, ]
322
-	if(i != 89){
323
-		myScatter(cnSet[i, ], 
324
-			  pch=pch, 
325
-			  col=colors[snpCall(cnSet)[i, ]], 
326
-			  bg=colors[snpCall(cnSet)[i, ]], cex=cex,
327
-			  xlim=xlim, ylim=ylim)
328
-		mtext("A", 1, outer=TRUE, line=1)
329
-		mtext("B", 2, outer=TRUE, line=1)	
330
-		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="C", lwd=2, col="black")
331
-		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="Y", lwd=2, col="grey50")
332
-	} else {
333
-		plot(0:1, xlim=c(0,1), ylim=c(0,1), type="n", xaxt="n", yaxt="n")
334
-		legend("center",
335
-		       legend=c("CN = 2, CEPH", "CN = 2, Yoruban"),
336
-		       col=c("black", "grey50"), lwd=2, bty="n")
337
-	}
338
-}
424
+Scatterplots of the A and B allele intensities (log-scale) can be
425
+useful for assessing the biallelic genotype calls.  This section of
426
+the vignette is currently under development.
427
+% The following code chunk is
428
+%displayed in Figure \ref{fig:prediction}.
429
+
430
+<<predictionRegions, fig=TRUE, width=8, height=8, include=FALSE, eval=FALSE>>=
431
+i <- snp.index[1]
432
+#plotCNSetLM=crlmm:::plotCNSetLM
433
+##trace(plotCNSetLM, browser)
434
+plot(i, x, copynumber=2)
435
+##myScatter <- function(object, add=FALSE, ...){
436
+##	A <- log2(A(object))
437
+##	B <- log2(B(object))
438
+##	if(!add){
439
+##		plot(A, B, ...)
440
+##	} else{
441
+##		points(A, B, ...)
442
+##	}
443
+##}
444
+##index <- which(isSnp(cnSet))[1:9]
445
+##xlim <- ylim <- c(6.5,13)
446
+##par(mfrow=c(3,3), las=1, pty="s", ask=FALSE, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1))
447
+##for(i in index){
448
+##	gt <- calls(cnSet)[i, ]
449
+##	if(i != 89){
450
+##		myScatter(cnSet[i, ], 
451
+##			  pch=pch, 
452
+##			  col=colors[snpCall(cnSet)[i, ]], 
453
+##			  bg=colors[snpCall(cnSet)[i, ]], cex=cex,
454
+##			  xlim=xlim, ylim=ylim)
455
+##		mtext("A", 1, outer=TRUE, line=1)
456
+##		mtext("B", 2, outer=TRUE, line=1)	
457
+##		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="C", lwd=2, col="black")
458
+##		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="Y", lwd=2, col="grey50")
459
+##	} else {
460
+##		plot(0:1, xlim=c(0,1), ylim=c(0,1), type="n", xaxt="n", yaxt="n")
461
+##		legend("center",
462
+##		       legend=c("CN = 2, CEPH", "CN = 2, Yoruban"),
463
+##		       col=c("black", "grey50"), lwd=2, bty="n")
464
+##	}
465
+##}
339 466
 @
340 467
 
341
-\begin{figure}
342
-  \centering
343
-  \includegraphics[width=0.8\textwidth]{copynumber-genotypeCalls}
344
-  \caption{\label{fig:genotypeCalls} Scatterplots of A versus B
345
-    intensities.  Each panel displays a single SNP. The ellipses
346
-    indicate the 95\% probability region for copy number 2 for the CEPH
347
-    (black) and Yoruban subjects (grey).}
348
-\end{figure}
468
+%\begin{figure}
469
+%  \centering
470
+%  \includegraphics[width=0.8\textwidth]{copynumber-predictionRegions}
471
+%  \caption{\label{fig:prediction} Scatterplots of A versus B
472
+%    intensities.  Each panel displays a single SNP. The ellipses
473
+%    indicate the 95\% probability region for copy number 2 for the CEPH
474
+%    (black) and Yoruban subjects (grey).}
475
+%\end{figure}
349 476
 
350 477
 %\section{Details for the copy number estimation procedure}
351 478
 %
352 479
Binary files a/inst/scripts/copynumber.pdf and b/inst/scripts/copynumber.pdf differ
... ...
@@ -4,7 +4,6 @@
4 4
 \alias{CNSetLM}
5 5
 \alias{CNSetLM-class}
6 6
 \alias{[,CNSetLM-method}
7
-\alias{lM}
8 7
 \alias{lM,CNSetLM-method}
9 8
 \alias{lM<-,CNSetLM,list_or_ffdf-method}
10 9
 \alias{open,CNSetLM-method}
11 10
new file mode 100644
... ...
@@ -0,0 +1,21 @@
1
+\name{CNSetLM-methods}
2
+\alias{lM}
3
+
4
+\title{Methods for CNSetLM class}
5
+\description{ Accessors for CNSetLM class}
6
+\usage{
7
+	lM(object)
8
+}
9
+
10
+\arguments{
11
+  \item{object}{\code{CNSetLM}}
12
+}
13
+\details{
14
+	\code{lM} returns a list (or an ffdf object if large data support is enabled) of the parameters estimated from a linear model fit for each SNP.  The parameters are batch and locus-specific.
15
+}
16
+\value{object of class \code{ffdf} or \code{list}}
17
+
18
+\author{R. Scharpf}
19
+
20
+\seealso{\code{\link{crlmmCopynumber}}, \code{\link{crlmmCopynumber2}}, \code{\link{CNSetLM-class}}}
21
+\keyword{manip}
0 22
new file mode 100644
... ...
@@ -0,0 +1,27 @@
1
+\name{sample.CNSetLM}
2
+\alias{sample.CNSetLM}
3
+\docType{data}
4
+\title{
5
+	Dataset of class 'CNSetLM'	
6
+}
7
+\description{
8
+
9
+	The data for 2119 polymorphic and nonpolymorphic markers on
10
+	chromosome 1 for the CEPH and Yoruban HapMap samples.
11
+
12
+}
13
+\usage{data(sample.CNSetLM)}
14
+\format{
15
+
16
+	The data illustrates the \code{\link{CNSetLM-class}}, with
17
+	\code{assayData} containing the quantile-normalized
18
+	intensities for the A and B alleles, genotype calls and
19
+	confidence scores (call and callProbability), and
20
+	allele-specific copy number (CA and CB). The parameters from
21
+	the linear model are stored in the lM slot.
22
+	
23
+}
24
+\examples{
25
+data(sample.CNSetLM)
26
+}
27
+\keyword{datasets}