Browse code

Updates to genotype function and copy number accessors.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@50808 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 11/11/2010 04:13:34
Showing4 changed files

... ...
@@ -124,7 +124,6 @@ genotype <- function(filenames,
124 124
 		       snprma=snprma(...),
125 125
 		       snprma2=snprma2(...))
126 126
 	}
127
-
128 127
 	snprmaRes <- snprmaFxn(FUN,
129 128
 			       filenames=filenames,
130 129
 			       mixtureSampleSize=mixtureSampleSize,
... ...
@@ -151,12 +150,18 @@ genotype <- function(filenames,
151 150
 		open(snprmaRes[["B"]])
152 151
 		open(snprmaRes[["SNR"]])
153 152
 		open(snprmaRes[["mixtureParams"]])
154
-		##bb <- getOption("ffbatchbytes")
155
-		message("Writing normalized intensities to callSet")
156
-		for(j in 1:ncol(callSet)){
157
-			A(callSet)[is.snp, j] <- snprmaRes[["A"]][snp.index, j]
158
-			B(callSet)[is.snp, j] <- snprmaRes[["B"]][snp.index, j]
159
-		}
153
+		bb <- getOption("ffbatchbytes")
154
+		ffcolapply(A(callSet)[is.snp, i1:i2] <- snprmaRes[["A"]][snp.index, i1:i2], X=snprmaRes[["A"]],
155
+			   BATCHBYTES=bb)
156
+		ffcolapply(B(callSet)[is.snp, i1:i2] <- snprmaRes[["B"]][snp.index, i1:i2], X=snprmaRes[["B"]],
157
+			   BATCHBYTES=bb)
158
+##		##bb <- getOption("ffbatchbytes")
159
+##		message("Writing normalized intensities to callSet")
160
+##		system.time(
161
+##		for(j in 1:ncol(callSet)){
162
+##			A(callSet)[is.snp, j] <- snprmaRes[["A"]][snp.index, j]
163
+##			B(callSet)[is.snp, j] <- snprmaRes[["B"]][snp.index, j]
164
+##		})
160 165
 		##ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]])##, BATCHBYTES=bb)
161 166
 		##ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]])##, BATCHBYTES=bb)
162 167
 		pData(callSet)$SKW <- snprmaRes[["SKW"]]
... ...
@@ -213,15 +218,19 @@ genotype <- function(filenames,
213 218
 	if(is.lds){
214 219
 		open(tmp[["calls"]])
215 220
 		open(tmp[["confs"]])
216
-		for(j in 1:ncol(callSet)){
217
-			snpCall(callSet)[is.snp, j] <- tmp[["calls"]][snp.index, j]
218
-			snpCallProbability(callSet)[is.snp, j] <- tmp[["confs"]][snp.index, j]
219
-		}
221
+		ffcolapply(snpCall(callSet)[is.snp, i1:i2] <- tmp[["A"]][snp.index, i1:i2], X=tmp[["A"]],
222
+			   BATCHBYTES=bb)
223
+		ffcolapply(snpCallProbability(callSet)[is.snp, i1:i2] <- tmp[["B"]][snp.index, i1:i2], X=tmp[["B"]],
224
+			   BATCHBYTES=bb)
225
+##		for(j in 1:ncol(callSet)){
226
+##			snpCall(callSet)[is.snp, j] <- tmp[["calls"]][snp.index, j]
227
+##			snpCallProbability(callSet)[is.snp, j] <- tmp[["confs"]][snp.index, j]
228
+##		}
220 229
 		##		ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]])#, BATCHBYTES=bb)
221 230
 		##		ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]])#, BATCHBYTES=bb)
222 231
 	} else {
223
-		calls(callSet)[snp.index, ] <- tmp[["calls"]][snp.index, ]
224
-		snpCallProbability(callSet)[snp.index, ] <- tmp[["confs"]][snp.index, ]
232
+		calls(callSet)[is.snp, ] <- tmp[["calls"]][snp.index, ]
233
+		snpCallProbability(callSet)[is.snp, ] <- tmp[["confs"]][snp.index, ]
225 234
 	}
226 235
 	message("Finished updating.  Cleaning up.")
227 236
 	callSet$gender <- tmp$gender
... ...
@@ -358,28 +358,26 @@ C3 <- function(object, allele, marker.index, batch.index, sample.index){
358 358
 		phiB2 <- phiPrimeB(object)[marker.index, l]
359 359
 		phiA <- phiA(object)[marker.index, l]
360 360
 		phiB <- phiB(object)[marker.index, l]
361
-		nuB <- nuB(object)[marker.index, l]
362
-		phiB <- phiB(object)[marker.index, l]
363 361
 		nuA <- nuA(object)[marker.index, l]
364
-		phiA <- phiA(object)[marker.index, l]
362
+		nuB <- nuB(object)[marker.index, l]
365 363
 		IA <- A(object)[marker.index, jj]
366 364
 		IB <- B(object)[marker.index, jj]
367
-		##phistar <- phiB2/phiA
368
-		##tmp <- (IB - nuB - phistar*IA + phistar*nuA)/phiB
369
-		CB <- 1/(1-phiA2*phiB2/(phiA*phiB)) * 1/phiB * (IA-nuB-phiB2/phiA*(IA-nuA))
370
-		##CB <- tmp/(1-phistar*phiA2/phiB)
365
+		phistar <- phiB2/phiA
366
+		tmp <- (IB - nuB - phistar*IA + phistar*nuA)/phiB
367
+		CB <- tmp/(1-phistar*phiA2/phiB)
368
+		##CB <- 1/(1-phiA2*phiB2/(phiA*phiB)) * 1/phiB * (IA-nuB-phiB2/phiA*(IA-nuA))
369
+		CA <- (IA-nuA-phiA2*CB)/phiA
371 370
 		if(allele == "B"){
372 371
 			acn[[k]] <- CB
373 372
 		}
374 373
 		if(allele == "A"){
375
-			ca <- (IA-nuA-phiA2*CB)/phiA
376
-			acn[[k]] <- ca
377
-		}
378
-		if(allele == "AandB"){
379
-			CA <- tmp/(1-phistar*phiA2/phiB)
380
-			CB <- (IA-nuA-phiA2*CB)/phiA
381
-			acn[[k]] <- CA+CB
374
+			acn[[k]] <- CA
382 375
 		}
376
+##		if(allele=="AandB")
377
+##			CA <- tmp/(1-phistar*phiA2/phiB)
378
+##			CB <- (IA-nuA-phiA2*CB)/phiA
379
+##			acn[[k]] <- CA+CB
380
+##		}
383 381
 	}
384 382
 	if(length(acn) > 1){
385 383
 		acn <- do.call("cbind", acn)
... ...
@@ -494,20 +492,15 @@ ACN <- function(object, allele, i , j){
494 492
 			open(phiB(object))
495 493
 			open(B(object))
496 494
 		}
497
-		if(any(!is.snp)){
498
-			acn.index <- which(!is.snp)
499
-			marker.index <- i[!is.snp]
500
-			acn[acn.index, ] <- 0
495
+		if(any(is.auto)){
496
+			auto.index <- which(is.auto)
497
+			marker.index <- i[is.auto]
498
+			acn[auto.index, ] <- C2(object, marker.index, batch.index, j)
501 499
 		}
502
-		if(any(is.snp)){
503
-			if(any(is.auto)){
504
-				## autosomal SNPs
505
-				acn.index <- which(is.auto & is.snp)
506
-				marker.index <- i[is.auto & is.snp]
507
-				acn[acn.index, ] <- C2(object, marker.index, batch.index, j)
508
-			}
509
-			if(any(is.X)){
510
-				if(is.ff){
500
+		if(any(is.X)){
501
+			##Chr X SNPs
502
+			if(any(is.snp)){
503
+				if(is.ff) {
511 504
 					open(phiPrimeA(object))
512 505
 					open(phiPrimeB(object))
513 506
 					open(phiA(object))
... ...
@@ -517,7 +510,7 @@ ACN <- function(object, allele, i , j){
517 510
 				marker.index <- i[is.X & is.snp]
518 511
 				acn.index <- which(is.X & is.snp)
519 512
 				acn[acn.index, ] <- C3(object, allele="B", marker.index, batch.index, j)
520
-				if(is.ff){
513
+				if(is.ff) {
521 514
 					close(phiPrimeA(object))
522 515
 					close(phiPrimeB(object))
523 516
 					close(phiA(object))
... ...
@@ -525,6 +518,11 @@ ACN <- function(object, allele, i , j){
525 518
 					close(A(object))
526 519
 				}
527 520
 			}
521
+			if(any(!is.snp)){
522
+				acn.index <- which(!is.snp)
523
+				marker.index <- i[!is.snp]
524
+				acn[acn.index, ] <- 0
525
+			}
528 526
 		}
529 527
 		if(is.ff){
530 528
 			close(nuB(object))
... ...
@@ -192,8 +192,6 @@ The \Rfunction{crlmmCopynumber} performs the following steps:
192 192
 GT.CONF.THR <- 0.90
193 193
 cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=gtSet, GT.CONF.THR=GT.CONF.THR)
194 194
 invisible(open(cnSet))
195
-q("no")
196
-stop("here")
197 195
 @
198 196
 
199 197
 In an effort to reduce I/O, the \Rpackage{crlmmCopynumber} function no
... ...
@@ -222,7 +220,7 @@ the useful accessors for extracting which markers are SNPs, which are
222 220
 chromosomes, etc.
223 221
 
224 222
 <<ca>>=
225
-snp.index <- which(isSnp(cnSet))
223
+snp.index <- which(isSnp(cnSet) & chromosome(cnSet) < 23)
226 224
 ca <- CA(cnSet, i=snp.index, j=1:5)
227 225
 cb <- CB(cnSet, i=snp.index, j=1:5)
228 226
 ct <- ca+cb
... ...
@@ -239,7 +237,7 @@ At nonpolymorphic loci, either the \Rfunction{CA} or
239 237
 of total copy number.
240 238
 
241 239
 <<nonpolymorphicAutosomes>>=
242
-marker.index <- which(!isSnp(cnSet) & chromosome(cnSet) < 23)
240
+marker.index <- which(!isSnp(cnSet))
243 241
 ct <- CA(cnSet, i=marker.index, j=1:5)
244 242
 ## all zeros
245 243
 stopifnot(all(CB(cnSet, i=marker.index, j=1:5) == 0))
... ...
@@ -249,20 +247,27 @@ stopifnot(all.equal(ct, ct2))
249 247
 
250 248
 Nonpolymorphic markers on X:
251 249
 
250
+\begin{figure}
251
+  \centering
252 252
 <<nonpolymorphicX>>=
253
-## nu and phi are not estimated appropriately for nonpolymorphic X markers.
254 253
 library(RColorBrewer)
255 254
 cols <- brewer.pal(8, "Paired")[3:4]
256 255
 cols <- rep(cols, each=5)
257
-##set.seed(123)
258
-X.markers <- sample(which(!isSnp(cnSet) & chromosome(cnSet) == 23), 20e3)
259
-cn.M <- CA(cnSet, i=X.markers, j=sample(which(cnSet$gender==1), 5))
260
-cn.F <- CA(cnSet, i=X.markers, j=sample(which(cnSet$gender==2), 5))
261
-##phi(cnSet, "A")[X.markers[1:10]]
256
+set.seed(123)
257
+X.markers <- sample(which(chromosome(cnSet)==23 & !isSnp(cnSet)), 10e3)
258
+M <- sample(which(cnSet$gender==1), 5)
259
+F <- sample(which(cnSet$gender==2), 5)
260
+cn.M <- CA(cnSet, i=X.markers, j=M)
261
+cn.F <- CA(cnSet, i=X.markers, j=F)
262 262
 boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", xaxt="n", main="nonpolymorphic markers on X",
263
-	col=cols)
263
+	col=cols, ylim=c(0,5))
264 264
 legend("topleft", fill=unique(cols), legend=c("Male", "Female"))
265 265
 @
266
+\caption{Copy number estimates for nonpolymorphic markers on
267
+  chromosome X.  We assume that the median copy number (across
268
+  samples) at any given nonpolymorphic marker on chromosome X is 1 for
269
+  males and 2 for females.}
270
+\end{figure}
266 271
 
267 272
 
268 273
 Polymorphic markers, X chromosome:
... ...
@@ -270,98 +275,24 @@ Polymorphic markers, X chromosome:
270 275
 \begin{figure}
271 276
   \centering
272 277
 <<polymorphicX, fig=TRUE, width=8, height=4>>=
273
-## copy number estimates on X for SNPs are biased towards small values.
274 278
 X.markers <- sample(which(isSnp(cnSet) & chromosome(cnSet) == 23), 25e3)
275
-
276
-##genomewidesnp6Crlmm is correct.  FeatureData is incorrect.
277
-index <- match(featureNames(cnSet), rownames(snpProbes))
278
-sum(snpProbes[index, "chrom"] != chromosome(cnSet))
279
-
280
-M <- sample(which(cnSet$gender==1), 5)
281
-F <- sample(which(cnSet$gender==2), 5)
282
-
283
-fns <- list.files("~/hapmap_metadata", full.names=T)
284
-dat <- read.delim(fns[2])
285
-sns <- substr(sampleNames(cnSet), 1, 7)
286
-dat <- dat[match(sns, dat$IID), ]
287
-stopifnot(all.equal(as.character(dat$IID), sns))
288
-identical(cnSet$gender, dat$sex)
289
-
290
-cols <- brewer.pal(8, "Paired")[3:4]
291
-cols <- cols[cnSet$gender]
292
-ia <- A(cnSet)[X.markers, ]##c(M, F)]
293
-ib <- B(cnSet)[X.markers, ]##c(M, F)]
294
-i.total <- ia+ib
295
-boxplot(data.frame(log2(i.total)), pch=".", col=cols)
296
-legend("topleft", fill=unique(cols), legend=c("Male", "Female"))
297
-
298
-nus <- nuA(cnSet)[X.markers, ]
299
-phiss <- phiA(cnSet)[X.markers, ]
300
-
301
-
302
-xSet <- cnSet[X.markers, ]
303
-a <- as.matrix(A(xSet))
304
-b <- as.matrix(B(xSet))
305
-gt <- as.matrix(calls(xSet))
306
-nuA <- nu(xSet, "A")
307
-phA <- phi(xSet, "A")
308
-col <- brewer.pal(7, "Accent")[c(1, 4, 7)]
309
-fns.xset <- featureNames(xSet)
310
-
311
-path <- system.file("extdata", package="genomewidesnp6Crlmm")
312
-load(file.path(path, "snpProbes.rda"))
313
-index <- match(fns.xset, rownames(snpProbes))
314
-all(snpProbes[index, "chrom"]==23)
315
-
316
-pkg <- "pd.genomewidesnp.6"
317
-library(pd.genomewidesnp.6)
318
-conn <- db(get(pkg))
319
-sql <- "SELECT man_fsetid, chrom, physical_pos FROM featureSet"
320
-snps <- dbGetQuery(conn, sql)
321
-index <- match(fns.xset, snps$man_fsetid)
322
-all(snps$chrom[index] == "X")
323
-
324
-## are we writing intensities to the wrong location??
325
-
326
-
327
-
328
-###################################################
329
-### chunk number 25: ABscatterplots
330
-###################################################
331
-#line 599 "manuscript.Rnw"
332
-lA <- log2(a)
333
-lB <- log2(b)
334
-cols <- c("blue", "black", "red")
335
-par(las=1, mfrow=c(4,4), mar=rep(0.5,4), oma=c(4,4, 4,4))
336
-for(i in 1:16){
337
-	plot(lB[i, ], lA[i, ], col="grey50", bg=col[gt[i, ]], xaxt="n", yaxt="n", pch=21, cex=0.8,
338
-	     xlim=c(6.5, 12.5), ylim=c(6.5, 12.5), xlab="", ylab="")
339
-	for(CN in 1:3) 	lines(xSet, i, "GIGAS", CN, col=cols[CN], lwd=2, x.axis="B")
340
-}
341
-mtext(expression(log[2](I[B])), 1, outer=TRUE)
342
-par(las=3)
343
-mtext(expression(log[2](I[A])), 2, outer=TRUE)
344
-
345
-
346
-
347
-boxplot(data.frame(ia+ib), pch=".", col=cols)
348
-trace(ACN, browser)
349
-trace(C3, browser)
350 279
 ca.M <- CA(cnSet, i=X.markers, j=M)
351 280
 cb.M <- CB(cnSet, i=X.markers, j=M)
352 281
 ca.F <- CA(cnSet, i=X.markers, j=F)
353 282
 cb.F <- CB(cnSet, i=X.markers, j=F)
354 283
 cn.M <- ca.M+cb.M
355 284
 cn.F <- ca.F+cb.F
356
-tmp <- totalCopynumber(cnSet, i=X.markers, j=c(M, F))
357
-boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", outline=FALSE, col=cols, xaxt="n")
358
-boxplot(data.frame(tmp), pch=".", outline=FALSE, col=cols, xaxt="n")
285
+boxplot(data.frame(cbind(cn.M, cn.F)), pch=".", outline=FALSE, col=cols, xaxt="n", ylim=c(0,5))
286
+legend("topleft", fill=unique(cols), legend=c("Male", "Female"))
359 287
 abline(h=c(1,2))
360 288
 ## alternatively
361 289
 cn.F2 <- totalCopynumber(cnSet, i=X.markers, j=F)
362 290
 stopifnot(all.equal(cn.F, cn.F2))
363 291
 @
364
-\caption{Copy number estimates for polymorphic markers on chromosome X}
292
+\caption{Copy number estimates for polymorphic markers on chromosome
293
+  X.  We assume that the median copy number (across samples) at any
294
+  given polymorphic marker on chromosome X is 1 for males and 2 for
295
+  females.}
365 296
 \end{figure}
366 297
 
367 298
 Accessors for physical position and chromosome are also provided. In
... ...
@@ -441,7 +372,7 @@ rows <- which(isSnp(cnSet))
441 372
 cols <- which(batch == "C")
442 373
 posterior.prob <- tryCatch(i2p(snpCallProbability(cnSet)),
443 374
 			   error=function(e) print("This will not work for an ff object."))
444
-posterior.prob2 <- p2i(snpCallProbability(cnSet)[rows, cols])
375
+##posterior.prob2 <- p2i(snpCallProbability(cnSet)[rows, cols])
445 376
 @
446 377
 
447 378
 Accessors for the quantile normalized intensities for the A allele at
... ...
@@ -450,6 +381,7 @@ polymorphic loci:
450 381
 <<accessors>>=
451 382
 snp.index <- which(isSnp(cnSet))
452 383
 np.index <- which(!isSnp(cnSet))
384
+invisible(open(A(cnSet)))
453 385
 a <- (A(cnSet))[snp.index, ]
454 386
 dim(a)
455 387
 @
... ...
@@ -468,11 +400,13 @@ obtained by:
468 400
 
469 401
 <<>>=
470 402
 npIntensities <- (A(cnSet))[np.index, ]
403
+invisible(close(A(cnSet)))
471 404
 @
472 405
 
473 406
 Quantile normalized intensities for the B allele at polymorphic loci:
474 407
 
475 408
 <<B.polymorphic>>=
409
+invisible(open(B(cnSet)))
476 410
 b.snps <- (B(cnSet))[snp.index, ]
477 411
 @
478 412
 
... ...
@@ -481,6 +415,7 @@ nonpolymorphic loci:
481 415
 
482 416
 <<B.NAs>>=
483 417
 all(is.na((B(cnSet))[np.index, ]))
418
+invisible(close(B(cnSet)))
484 419
 @
485 420
 
486 421
 <<clean, echo=FALSE>>=
... ...
@@ -538,7 +473,7 @@ cn <- totalCopynumber(cnSet, i=marker.index, j=1)
538 473
 x <- position(cnSet)[marker.index]
539 474
 par(las=1, mar=c(4, 5, 4, 2))
540 475
 plot(x, cn, pch=".",
541
-     cex=2, xaxt="n", col="grey60", ylim=c(0,4),
476
+     cex=2, xaxt="n", col="grey60", ylim=c(0,6),
542 477
      ylab="copy number", xlab="physical position (Mb)",
543 478
      main=paste(sampleNames(cnSet)[1], ", CHR: 1"))
544 479
 ##np.index <- which(!isSnp(cnSet)[marker.index])
... ...
@@ -546,7 +481,7 @@ plot(x, cn, pch=".",
546 481
 ##       pch=".", cex=2, col="lightblue")
547 482
 axis(1, at=pretty(x), labels=pretty(x)/1e6)
548 483
 require(SNPchip)
549
-invisible(plotCytoband(1, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE))
484
+invisible(plotCytoband(1, new=FALSE, cytoband.ycoords=c(5.3, 5.9), label.cytoband=FALSE))
550 485
 @
551 486
 
552 487
 \begin{figure}
... ...
@@ -595,7 +530,9 @@ the threshold specified in \Robject{crlmmCopynumber}.
595 530
 
596 531
 <<NAconfidenceScores>>=
597 532
 if(nmissing.ca > 0){
533
+	invisible(open(snpCallProbability(cnSet)))
598 534
 	gt.conf <- i2p(snpCallProbability(cnSet)[autosome.index, sample.index])
535
+	invisible(close(snpCallProbability(cnSet)))
599 536
 	below.thr <- gt.conf < GT.CONF.THR
600 537
 	index.allbelow <- as.integer(which(rowSums(below.thr) == length(sample.index)))
601 538
 	all.equal(index, index.allbelow)
... ...
@@ -628,7 +565,9 @@ that were all below the threshold.
628 565
 
629 566
 <<NAcrlmmConfidenceScore>>=
630 567
 if(nmissing.caX > 0){
568
+	invisible(open(snpCallProbability(cnSet)))
631 569
 	gt.conf <- i2p(snpCallProbability(cnSet)[X.index, sample.index])
570
+	invisible(close(snpCallProbability(cnSet)))
632 571
 	below.thr <- gt.conf < GT.CONF.THR
633 572
 	F <- which(cnSet$gender[sample.index] == 2)
634 573
 	M <- which(cnSet$gender[sample.index] == 1)
... ...
@@ -671,9 +610,9 @@ included as a parameter for the copy number estimation as an approach
671 610
 to reduce the sensitivity of genotype-specific summary statistics,
672 611
 such as the within-genotype median, to intensities from samples that
673 612
 do not clearly fall into one of the biallelic genotype clusters. There
674
-are drawbacks to this approach, including variance estimates that are
675
-overly optimistic.  More direct approaches for outlier detection and
676
-removal may be explored in the future.
613
+are drawbacks to this approach, including variance estimates that can
614
+be a bit optimistic at some loci.  More direct approaches for outlier
615
+detection and removal may be explored in the future.
677 616
 
678 617
 Copy number estimates for other chromosomes, such as mitochondrial and
679 618
 chromosome Y, are not currently available in \crlmm{}.
680 619
Binary files a/inst/scripts/copynumber.pdf and b/inst/scripts/copynumber.pdf differ