Browse code

A few edits to copynumber vignette and sample.CNSetLM Rd file

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48965 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 21/08/2010 02:50:25
Showing5 changed files

... ...
@@ -208,7 +208,10 @@ genotype <- function(filenames,
208 208
 	close(callSet)
209 209
 	return(callSet)
210 210
 }
211
-genotypeLD <- genotype2 <- genotype
211
+genotype2 <- function(){
212
+	.Defunct(msg="The genotype2 function has been deprecated. The function genotype should be used instead.  genotype will support large data using ff provided that the ff package is loaded.")
213
+}
214
+genotypeLD <- genotype2
212 215
 
213 216
 rowCovs <- function(x, y, ...){
214 217
 	notna <- !is.na(x)
... ...
@@ -321,34 +321,34 @@ setReplaceMethod("flags", signature=signature(object="CNSet", value="ff_matrix")
321 321
 ##	ellipse.CNSet(x, copynumber, batch, ...)
322 322
 ##})
323 323
 
324
-##ACN.X <- function(object, allele, i, j){
325
-##	acn <- list()
326
-##	batches <- unique(batch(object)[j])
327
-##	for(k in seq_along(batches)){
328
-##		l <- match(batches[k], bns)
329
-##		nuA <- nuA(object)[ii, l]
330
-##		nuB <- nuB(object)[ii, l]
331
-##		phiA <- phiA(object)[ii, l]
332
-##		phiB <- phiB(object)[ii, l]
333
-##		phiPrimeA <- phiPrimeA(object)[ii, l]
334
-##		phiPrimeB <- phiPrimeB(object)[ii, l]
335
-##		if(all(is.na(phiPrimeA))){
336
-##			I <- allele(object, allele)[ii, j]
337
-##			if(allele=="A") acn[[k]] <- 1/phiA*(I - nuA)
338
-##			if(allele=="B") acn[[k]] <- 1/phiB*(I - nuB)
339
-##		} else {
340
-##			A <- A(object)[ii, j]
341
-##			B <- B(object)[ii, j]
342
-##			phistar <- phiPrimeB/phiA
343
-##			tmp <- (B-nuB - phistar*A + phistar*nuA)/phiB
344
-##			cb <- tmp/(1-phistar*phiPrime/phiB)
345
-##			ca <- A-nuA-phiPrimeA*cb/phiA
346
-##			if(allele=="A") acn[[k]] <- ca
347
-##			if(allele=="B") acn[[k]] <- cb
348
-##		}
349
-##	}
350
-##	return(acn)
351
-##}
324
+ACN.X <- function(object, allele, i, j){
325
+	acn <- list()
326
+	batches <- unique(batch(object)[j])
327
+	for(k in seq_along(batches)){
328
+		l <- match(batches[k], bns)
329
+		nuA <- nuA(object)[ii, l]
330
+		nuB <- nuB(object)[ii, l]
331
+		phiA <- phiA(object)[ii, l]
332
+		phiB <- phiB(object)[ii, l]
333
+		phiPrimeA <- phiPrimeA(object)[ii, l]
334
+		phiPrimeB <- phiPrimeB(object)[ii, l]
335
+		if(all(is.na(phiPrimeA))){
336
+			I <- allele(object, allele)[ii, j]
337
+			if(allele=="A") acn[[k]] <- 1/phiA*(I - nuA)
338
+			if(allele=="B") acn[[k]] <- 1/phiB*(I - nuB)
339
+		} else {
340
+			A <- A(object)[ii, j]
341
+			B <- B(object)[ii, j]
342
+			phistar <- phiPrimeB/phiA
343
+			tmp <- (B-nuB - phistar*A + phistar*nuA)/phiB
344
+			cb <- tmp/(1-phistar*phiPrime/phiB)
345
+			ca <- A-nuA-phiPrimeA*cb/phiA
346
+			if(allele=="A") acn[[k]] <- ca
347
+			if(allele=="B") acn[[k]] <- cb
348
+		}
349
+	}
350
+	return(acn)
351
+}
352 352
 
353 353
 
354 354
 ACN <- function(object, allele, i , j){
... ...
@@ -139,13 +139,13 @@ to small to be used for copy number estimation). We wrap the function
139 139
 Sweaving after the initial batch job.
140 140
 
141 141
 <<genotype>>=
142
-gtSet <- checkExists("gtSet.assayData_matrix",
142
+gtSet.tmp <- checkExists("gtSet.assayData_matrix",
143 143
 		     .path=outdir,
144 144
 		     .FUN=genotype,
145 145
 		     filenames=celFiles[1:5],
146 146
 		     cdfName=cdfName,
147 147
 		     batch=batch[1:5])
148
-}
148
+rm(gtSet.tmp); gc()
149 149
 @
150 150
 
151 151
 A more memory efficient approach to preprocessing and genotyping is
... ...
@@ -177,9 +177,9 @@ if(!file.exists(file.path(outdir, "cnSet.rda"))){
177 177
 	gtSet <- checkExists("gtSet",
178 178
 			     .path=outdir,
179 179
 			     .FUN=genotypeLD,
180
-			     filenames=celFiles,
180
+			     filenames=celFiles[1:200],
181 181
 			     cdfName=cdfName,
182
-			     batch=batch)
182
+			     batch=batch[1:200])
183 183
 	class(calls(gtSet))
184 184
 }
185 185
 @
... ...
@@ -243,35 +243,46 @@ At nonpolymorphic loci, either the 'CA' or totalCopynumber method can
243 243
 be used to obtain total copy number.
244 244
 
245 245
 <<ca>>=
246
-cn.nonpolymorphic <- CA(obj, i=which(!isSnp(obj)))
246
+cn.nonpolymorphic <- CA(cnSet, i=which(!isSnp(obj)))
247 247
 @
248 248
 
249 249
 Total copy number at both polymorphic and nonpolymorphic loci:
250 250
 <<totalCopynumber>>=
251 251
 ##cn <- copyNumber(x)
252
-cn <- totalCopyNumber(x, sample(1:nrow(x), 1e4), 1:5)
252
+cn <- totalCopyNumber(cnSet, sample(1:nrow(cnSet), 1e4), 1:5)
253 253
 apply(cn, 2, median, na.rm=TRUE)
254 254
 @
255 255
 
256
-The above estimates can be plotted versus physical position.
256
+The above estimates can be plotted versus physical position. Accessors
257
+for physical position and chromosome are also provided. In the
258
+following codechunk we extract the position and chromosome for the
259
+first 10 markers in the \Robject{cnSet} object.
260
+
261
+<<fdAccessors>>=
262
+position(cnSet)[1:10]
263
+chromosome(cnSet)[1:10]
264
+@
257 265
 
258 266
 
259 267
 \section{The CNSet container}
260 268
 
261 269
 The objects returned by the \Rfunction{genotype} and
262 270
 \Rfunction{crlmmCopynumber} have assay data elements that are pointers
263
-to \Robject{ff} objects stored in the directory \Robject{outdir}.  The
271
+to \Robject{ff} objects stored in the directory \Robject{outdir}.  Had
272
+we not loaded the \Rpackage{ff} prior to running these functions, the
273
+\Robject{AssayData} elements would be ordinary matrices, though the
274
+RAM required for running the algorithm would be substantial. The
264 275
 functions \Rfunction{open} and \Rfunction{close} open and close the
265 276
 connections to the \Robject{assayData} elements. Subsetting an
266
-\Robject{ff} object pulls the data from disk into memory and should
267
-therefore be used with caution. In particular, subsetting the
268
-\Robject{gtSet} would subset each element in the \Robject{assayData}
269
-slot, returning an object of the same class but with
270
-\Robject{assayData} elements that are matrices.  Such an operation can
271
-be exceedingly slow when performed over a network and require
272
-subsantial RAM.  The preferred approach is to extract only the assay
273
-data element that is needed. In the example below, we extract the
274
-genotype calls for the the first 50 samples.
277
+\Robject{ff} object pulls the data from disk into memory and should be
278
+used with caution. In particular, subsetting the \Robject{gtSet} would
279
+subset each element in the \Robject{assayData} slot, returning an
280
+object of the same class but with \Robject{assayData} elements that
281
+are matrices.  Such an operation can be exceedingly slow when
282
+performed over a network and require subsantial RAM.  The preferred
283
+approach is to extract only the assay data element that is needed. In
284
+the example below, we extract the genotype calls for the the first 50
285
+samples.
275 286
 
276 287
 <<check>>=
277 288
 dims(cnSet)
... ...
@@ -295,8 +306,6 @@ slot can be listed as follows.
295 306
 assayDataElementNames(batchStatistics(cnSet))
296 307
 @
297 308
 
298
-
299
-
300 309
 Note that for the Affymetrix 6.0 platform the assay data elements each
301 310
 have a row dimension corresponding to the total number of polymorphic
302 311
 and nonpolymorphic markers interrogated by the Affymetrix 6.0
... ...
@@ -327,74 +336,43 @@ posterior.prob <- tryCatch(i2p(snpCallProbability(cnSet)),
327 336
 posterior.prob2 <- p2i(snpCallProbability(cnSet)[rows, cols])
328 337
 @
329 338
 
330
-Next, we obtain locus-level estimates of copy number.
331
-
332
-<<cn.ff, echo=FALSE, eval=FALSE>>=
333
-rm(cnSet); gc()
334
-ocProbesets(75e3)
335
-##ff objects
336
-system.time(cnSet2 <- crlmmCopynumber2(cnSet2))
337
-save(cnSet2, file=file.path(outdir, "cnSet2.rda"))
338
-@
339
-
340
-<<timings, eval=FALSE>>=
341
-tryCatch(print(paste("Time for matrix version:", time1)), error=function(e) print("timing for CN estimation not available"))
342
-tryCatch(print(paste("Time for ff version:", time2)), error=function(e) print("timing for CN estimation not available"))
343
-rm(cnSet2); gc()
344
-@
345
-
346
-
347
-\section{Accessors}
348
-
349
-For the remainder of this vignette, we illustrate accessors and
350
-visualizations using the sample object provided in the
351
-\Rpackage{crlmm} package.
352
-
353
-<<sampleset>>=
354
-data(sample.CNSetLM)
355
-x <- sample.CNSetLM
356
-@
357
-
358
-
359
-\subsection{Quantile-normalized intensities}
360
-
361 339
 Accessors for the quantile normalized intensities for the A allele at
362 340
 polymorphic loci:
363 341
 
364 342
 <<accessors>>=
365
-snp.index <- which(isSnp(x))
366
-np.index <- which(!isSnp(x))
367
-a <- (A(x))[snp.index, ]
343
+snp.index <- which(isSnp(cnSet))
344
+np.index <- which(!isSnp(cnSet))
345
+a <- (A(cnSet))[snp.index, ]
368 346
 dim(a)
369 347
 @
370 348
 
371 349
 The extra set of parentheses surrounding \Robject{A(cnSet2)} above is
372 350
 added to emphasize the appropriate order of operations. Subsetting the
373
-entire \Robject{x} object in the following code should be avoided for
374
-large datasets.
351
+entire \Robject{cnSet} object in the following, unevaluated codechunk
352
+should be avoided for large datasets.
375 353
 
376 354
 <<eval=FALSE>>=
377
-a <- A(x[snp.index, ])
355
+a <- A(cnSet[snp.index, ])
378 356
 @
379 357
 
380
-The quantile-normalized intensities for nonpolymorphic loci are obtained
381
-by:
358
+The quantile-normalized intensities for nonpolymorphic loci are
359
+obtained by:
382 360
 
383 361
 <<>>=
384
-npIntensities <- (A(x))[np.index, ]
362
+npIntensities <- (A(cnSet))[np.index, ]
385 363
 @
386 364
 
387 365
 Quantile normalized intensities for the B allele at polymorphic loci:
388 366
 
389 367
 <<B.polymorphic>>=
390
-b.snps <- (B(x))[snp.index, ]
368
+b.snps <- (B(cnSet))[snp.index, ]
391 369
 @
392 370
 
393 371
 Note that NAs are recorded in the 'B' assay data element for
394 372
 nonpolymorphic loci:
395 373
 
396 374
 <<B.NAs>>=
397
-all(is.na((B(x))[np.index, ]))
375
+all(is.na((B(cnSet))[np.index, ]))
398 376
 @
399 377
 
400 378
 <<clean, echo=FALSE>>=
... ...
@@ -405,11 +383,11 @@ rm(b.snps, a, npIntensities); gc()
405 383
 
406 384
 Genotype calls:
407 385
 <<genotypes>>=
408
-genotypes <- (snpCall(x))[snp.index, ]
386
+genotypes <- (snpCall(cnSet))[snp.index, ]
409 387
 @
410 388
 Confidence scores of the genotype calls:
411 389
 <<confidenceScores>>=
412
-genotypeConf <- integerScoreToProbability(snpCallProbability(x)[snp.index[1:10], ])
390
+genotypeConf <- integerScoreToProbability(snpCallProbability(cnSet)[snp.index[1:10], ])
413 391
 @
414 392
 
415 393
 \paragraph{\Robject{CopyNumberSet}: allele-specific copy number}
... ...
@@ -421,18 +399,18 @@ genotypeConf <- integerScoreToProbability(snpCallProbability(x)[snp.index[1:10],
421 399
 Information on physical position and chromosome can be accessed as follows:
422 400
 
423 401
 <<positionChromosome>>=
424
-xx <- position(x)
402
+xx <- position(cnSet)
425 403
 yy <- chromosome(x)
426 404
 @
427 405
 
428 406
 Parameters from the linear model used to estimate copy number are
429
-stored in the slot \Robject{lM}.
407
+stored in the slot \Robject{batchStatistics}.
430 408
 
431 409
 <<copynumberParameters>>=
432
-names(lM(x))
433
-dim(lM(x)[[1]])
410
+assayDataElementNames(batchStatistics(cnSet))
434 411
 @
435 412
 
413
+TODO: Describe accessors for batch-level summaries.
436 414
 
437 415
 
438 416
 \section{Suggested visualizations}
... ...
@@ -442,7 +420,7 @@ dim(lM(x)[[1]])
442 420
 A histogram of the signal to noise ratio for the HapMap samples:
443 421
 
444 422
 <<plotSnr, fig=TRUE, include=FALSE>>=
445
-hist(x$SNR, xlab="SNR", main="", breaks=25)
423
+hist(cnSet$SNR, xlab="SNR", main="", breaks=25)
446 424
 @
447 425
 
448 426
 \begin{figure}
... ...
@@ -463,13 +441,13 @@ area of research.
463 441
 
464 442
 <<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>=
465 443
 par(las=1, mar=c(4, 5, 4, 2))
466
-plot(position(x), copyNumber(x)[, 1], pch=".",
444
+plot(position(cnSet), copyNumber(cnSet)[, 1], pch=".",
467 445
      cex=2, xaxt="n", col="grey20", ylim=c(0,4),
468 446
      ylab="copy number", xlab="physical position (Mb)",
469
-     main=paste(sampleNames(x)[1], ", CHR:", unique(chromosome(x))))
470
-points(position(x)[!isSnp(x)], copyNumber(x)[!isSnp(x), 1],
447
+     main=paste(sampleNames(cnSet)[1], ", CHR:", unique(chromosome(cnSet))))
448
+points(position(cnSet)[!isSnp(cnSet)], copyNumber(cnSet)[!isSnp(cnSet), 1],
471 449
        pch=".", cex=2, col="lightblue")
472
-axis(1, at=pretty(range(position(x))), labels=pretty(range(position(x)))/1e6)
450
+axis(1, at=pretty(range(position(cnSet))), labels=pretty(range(position(cnSet)))/1e6)
473 451
 @
474 452
 
475 453
 <<idiogram, eval=FALSE, echo=FALSE>>=
... ...
@@ -526,59 +504,6 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE)
526 504
 Scatterplots of the A and B allele intensities (log-scale) can be
527 505
 useful for assessing the biallelic genotype calls.  This section of
528 506
 the vignette is currently under development.
529
-% The following code chunk is
530
-%displayed in Figure \ref{fig:prediction}.
531
-
532
-<<predictionRegions, fig=TRUE, width=8, height=8, include=FALSE, eval=FALSE>>=
533
-i <- snp.index[1]
534
-#plotCNSetLM=crlmm:::plotCNSetLM
535
-##trace(plotCNSetLM, browser)
536
-plot(i, x, copynumber=2)
537
-##myScatter <- function(object, add=FALSE, ...){
538
-##	A <- log2(A(object))
539
-##	B <- log2(B(object))
540
-##	if(!add){
541
-##		plot(A, B, ...)
542
-##	} else{
543
-##		points(A, B, ...)
544
-##	}
545
-##}
546
-##index <- which(isSnp(cnSet))[1:9]
547
-##xlim <- ylim <- c(6.5,13)
548
-##par(mfrow=c(3,3), las=1, pty="s", ask=FALSE, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1))
549
-##for(i in index){
550
-##	gt <- calls(cnSet)[i, ]
551
-##	if(i != 89){
552
-##		myScatter(cnSet[i, ],
553
-##			  pch=pch,
554
-##			  col=colors[snpCall(cnSet)[i, ]],
555
-##			  bg=colors[snpCall(cnSet)[i, ]], cex=cex,
556
-##			  xlim=xlim, ylim=ylim)
557
-##		mtext("A", 1, outer=TRUE, line=1)
558
-##		mtext("B", 2, outer=TRUE, line=1)
559
-##		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="C", lwd=2, col="black")
560
-##		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="Y", lwd=2, col="grey50")
561
-##	} else {
562
-##		plot(0:1, xlim=c(0,1), ylim=c(0,1), type="n", xaxt="n", yaxt="n")
563
-##		legend("center",
564
-##		       legend=c("CN = 2, CEPH", "CN = 2, Yoruban"),
565
-##		       col=c("black", "grey50"), lwd=2, bty="n")
566
-##	}
567
-##}
568
-@
569
-
570
-%\begin{figure}
571
-%  \centering
572
-%  \includegraphics[width=0.8\textwidth]{copynumber-predictionRegions}
573
-%  \caption{\label{fig:prediction} Scatterplots of A versus B
574
-%    intensities.  Each panel displays a single SNP. The ellipses
575
-%    indicate the 95\% probability region for copy number 2 for the CEPH
576
-%    (black) and Yoruban subjects (grey).}
577
-%\end{figure}
578
-
579
-%\section{Details for the copy number estimation procedure}
580
-%
581
-%See the technical report \citep{Scharpf2009}.
582 507
 
583 508
 \section{Session information}
584 509
 <<sessionInfo, results=tex>>=
... ...
@@ -98,6 +98,11 @@ crlmmCopynumber(object, MIN.SAMPLES=10, SNRMin = 5, MIN.OBS = 1,
98 98
     \code{MIN.NU} and \code{MIN.PHI} are ignored. When TRUE, background
99 99
     (nu) and slope (phi) coefficients below MIN.NU and MIN.PHI are set
100 100
     to MIN.NU and MIN.PHI, respectively.}
101
+
102
+  \item{type}{ Character string vector that must be one or more of
103
+    "SNP", "NP", "X.SNP", or "X.NP". Type refers to a set of
104
+    markers. See details below}
105
+    
101 106
  }
102 107
   
103 108
  \references{
... ...
@@ -133,6 +138,19 @@ crlmmCopynumber(object, MIN.SAMPLES=10, SNRMin = 5, MIN.OBS = 1,
133 138
 	The functions \code{crlmmCopynumberLD} and
134 139
 	\code{crlmmCopynumber2} have been deprecated.
135 140
 
141
+	The argument \code{type} can be used to specify a subset of
142
+	markers for which the copy number estimation algorithm is run.
143
+	One or more of the following possible entries are valid: 'SNP',
144
+	'NP', 'X.SNP', and 'X.NP'.
145
+
146
+	'SNP' referers to autosomal SNPs.
147
+
148
+	'NP' refers to autosomal nonpolymorphic markers.
149
+
150
+	'X.SNP' refers to SNPs on chromosome X.
151
+
152
+	'X.NP' refers to autosomes on chromosome X.
153
+
136 154
 }
137 155
 \author{R. Scharpf}
138 156
 \keyword{manip}
... ...
@@ -31,28 +31,23 @@ data(sample.CNSetLM)
31 31
 cnSet <- as(sample.CNSetLM, "CNSet")
32 32
 all(isCurrent(cnSet)) ## is the cnSet object current?
33 33
 
34
-## updating class CNSetLM using ff objects
35
-library(ff)
36
-path <- system.file("extdata", package="crlmm")
37
-ldPath(path)
38
-data(sample.CNSetLMff)
39
-cnSetff <- as(sample.CNSetLMff, "CNSet")
40
-all(isCurrent(cnSet))
41
-
42
-## a bigger object with multiple batches
43 34
 \dontrun{
44
-	outdir <- "/amber1/scratch/rscharpf/jss/hapmap2"
45
-	load(file.path(outdir, "container.rda"))
46
-	container <- object; rm(object); gc()
47
-	container2 <- as(container, "CNSet")
48
-	all(isCurrent(container2))
49
-	## test replacement methods, subset methods
50
-	table(batch(container2))
51
-	##generates warning ... would need open, close in the '[' method
52
-	invisible(open(nuA(container2)))
53
-	xx <- nu(container2, "A")[1:5, ]
54
-	nuA(container2)[1:5, ] <- xx
55
-	invisible(close(nuA(container2)))
35
+	## updating class CNSetLM using ff objects
36
+	## a bigger object with multiple batches
37
+	if(require(ff)){
38
+		outdir <- "/amber1/scratch/rscharpf/jss/hapmap2"
39
+		load(file.path(outdir, "container.rda"))
40
+		container <- object; rm(object); gc()
41
+		container2 <- as(container, "CNSet")
42
+		all(isCurrent(container2))
43
+		## test replacement methods, subset methods
44
+		table(batch(container2))
45
+		##generates warning ... would need open, close in the '[' method
46
+		invisible(open(nuA(container2)))
47
+		xx <- nu(container2, "A")[1:5, ]
48
+		nuA(container2)[1:5, ] <- xx
49
+		invisible(close(nuA(container2)))
50
+	}
56 51
 }
57 52
 ## --------------------------------------------------
58 53
 ## accessors for the feature-level info