Browse code

added a bias correction for common copy number variants

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37748 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 04/03/2009 19:24:13
Showing 5 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling via CRLMM Algorithm
4
-Version: 1.0.49
4
+Version: 1.0.50
5 5
 Date: 2008-12-28
6 6
 Author: Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
... ...
@@ -8,4 +8,5 @@ importFrom(preprocessCore, normalize.quantiles.use.target)
8 8
 importFrom(utils, data, packageDescription, setTxtProgressBar, txtProgressBar)
9 9
 importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd)
10 10
 importFrom(genefilter, rowSds)
11
+importFrom(mvtnorm, dmvnorm)
11 12
 
... ...
@@ -75,8 +75,6 @@ nuphiAllele <- function(allele, Ystar, W, envir, p){
75 75
 	assign("phi.se", phi.se, envir=parent.frame(1))	
76 76
 }
77 77
 
78
-
79
-
80 78
 celDatesFrom <- function(celfiles, path=""){
81 79
 	celdates <- vector("character", length(celfiles))
82 80
 	celtimes <- vector("character", length(celfiles))
... ...
@@ -92,7 +90,6 @@ celDatesFrom <- function(celfiles, path=""){
92 90
 	return(celdts)
93 91
 }
94 92
 
95
-
96 93
 cnrma <- function(filenames, sns, cdfName, seed=1, verbose=FALSE){
97 94
 ##	require(preprocessCore) || stop("Package preprocessCore not available")
98 95
 	if (missing(sns)) sns <- basename(filenames)
... ...
@@ -236,6 +233,8 @@ instantiateObjects <- function(calls, NP, plate, envir, chrom){
236 233
 	assign("CB", CB, envir=envir)
237 234
 }
238 235
 
236
+
237
+
239 238
 computeCopynumber <- function(A,
240 239
 			      B,
241 240
 			      calls,
... ...
@@ -243,10 +242,17 @@ computeCopynumber <- function(A,
243 242
 			      NP,
244 243
 			      plate,
245 244
 			      fit.variance=NULL,
246
-			      MIN.OBS=3,
245
+			      MIN.OBS=1,
247 246
 			      envir,
248
-			      chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim=0, upperTail=TRUE, ...){
247
+			      chrom, P, DF.PRIOR=50, CONF.THR=0.99,
248
+			      trim=0, upperTail=TRUE,
249
+			      bias.adj=FALSE,
250
+			      priorProb, ...){
249 251
 	if(length(ls(envir)) == 0) instantiateObjects(calls, NP, plate, envir, chrom)
252
+	if(bias.adj){
253
+		##assign uniform priors for total copy number states
254
+		if(missing(priorProb)) priorProb <- rep(1/4, 4)
255
+	}
250 256
 	##will be updating these objects
251 257
 	uplate <- get("uplate", envir)
252 258
 	message("Sufficient statistics")
... ...
@@ -261,7 +267,8 @@ computeCopynumber <- function(A,
261 267
 			 envir=envir,
262 268
 			 MIN.OBS=MIN.OBS,
263 269
 			 DF.PRIOR=DF.PRIOR,
264
-			 trim=trim, upperTail=upperTail)
270
+			 trim=trim, upperTail=upperTail,
271
+			 bias.adj=bias.adj)
265 272
 	}
266 273
 	message("\nEstimating coefficients")	
267 274
 	for(p in P){
... ...
@@ -395,7 +402,7 @@ nonpolymorphic <- function(plateIndex, NP, envir){
395 402
 ##	firstPass.NP
396 403
 }
397 404
 
398
-oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, upperTail, ...){
405
+oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, upperTail, bias.adj=FALSE, priorProb, ...){
399 406
 	p <- plateIndex
400 407
 	plate <- get("plate", envir)
401 408
 	AA <- G == 1
... ...
@@ -405,27 +412,30 @@ oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, uppe
405 412
 	Ns[, p, "AA"] <- rowSums(AA)
406 413
 	Ns[, p, "AB"] <- rowSums(AB)
407 414
 	Ns[, p, "BB"] <- rowSums(BB)
415
+	assign("Ns", Ns, envir)
408 416
 	AA[AA == FALSE] <- NA
409 417
 	AB[AB == FALSE] <- NA
410 418
 	BB[BB == FALSE] <- NA
411
-	Ip <- array(NA, dim=c(nrow(G), ncol(G), 3, 2))
412
-	dimnames(Ip) <- list(rownames(G), colnames(G), c("AA", "AB", "BB"), c("A", "B"))
413
-	Ip[, , "AA", "A"] <- AA*A
414
-	Ip[, , "AB", "A"] <- AB*A
415
-	Ip[, , "BB", "A"] <- BB*A
416
-	Ip[, , "AA", "B"] <- AA*B
417
-	Ip[, , "AB", "B"] <- AB*B
418
-	Ip[, , "BB", "B"] <- BB*B
419 419
 	##---------------------------------------------------------------------------
420 420
 	## Sufficient statistics (plate-specific)
421 421
 	##---------------------------------------------------------------------------
422
-	##These matrices instantiated in the calling function are updated
422
+	AA.A <- AA*A
423
+	AB.A <- AB*A
424
+	BB.A <- BB*A
425
+	AA.B <- AA*B
426
+	AB.B <- AB*B
427
+	BB.B <- BB*B
428
+	locationAndScale(p=p, AA.A=AA.A, AB.A=AB.A, BB.A=BB.A,
429
+			 AA.B=AA.B, AB.B=AB.B, BB.B=BB.B,
430
+			 envir=envir, DF.PRIOR=DF.PRIOR)
423 431
 	muA.AA <- get("muA.AA", envir)
424 432
 	muA.AB <- get("muA.AB", envir)
425 433
 	muA.BB <- get("muA.BB", envir)
426 434
 	muB.AA <- get("muB.AA", envir)
427 435
 	muB.AB <- get("muB.AB", envir)
428 436
 	muB.BB <- get("muB.BB", envir)
437
+	sigmaA <- get("sigmaA", envir)
438
+	sigmaB <- get("sigmaB", envir)
429 439
 	tau2A <- get("tau2A", envir)
430 440
 	tau2B <- get("tau2B", envir)
431 441
 	sig2A <- get("sig2A", envir)
... ...
@@ -433,13 +443,48 @@ oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, uppe
433 443
 	corr <- get("corr", envir)
434 444
 	corrA.BB <- get("corrA.BB", envir)  ## B allele
435 445
 	corrB.AA <- get("corrB.AA", envir)
436
-
437
-	AA.A <- AA*A
438
-	AB.A <- AB*A
439
-	BB.A <- BB*A
440
-	AA.B <- AA*B
441
-	AB.B <- AB*B
442
-	BB.B <- BB*B
446
+	if(bias.adj){
447
+		##First check that nu and phi are available
448
+		nuA <- get("nuA", envir)
449
+		if(all(is.na(nuA))) {
450
+			message("Background and signal coefficients have not yet been estimated -- can not do bias correction yet")
451
+			message("Must run computeCopynumber a second time with bias.adj=TRUE to do the adjustment")
452
+		} else {
453
+			message("running bias adjustment")			
454
+			normal <- biasAdj(A=A, B=B, plateIndex=p, envir=envir, priorProb=priorProb)
455
+			normal[normal == FALSE] <- NA
456
+			AA.A <- AA.A*normal
457
+			AB.A <- AB.A*normal
458
+			BB.A <- BB.A*normal
459
+			AA.B <- AA.B*normal
460
+			AB.B <- AB.B*normal
461
+			BB.B <- BB.B*normal
462
+			message("Recomputing location and scale parameters")						
463
+			locationAndScale(p=p, AA.A=AA.A, AB.A=AB.A, BB.A=BB.A,
464
+					 AA.B=AA.B, AB.B=AB.B, BB.B=BB.B,
465
+					 envir=envir, DF.PRIOR=DF.PRIOR)
466
+			muA.AA <- get("muA.AA", envir)
467
+			muA.AB <- get("muA.AB", envir)
468
+			muA.BB <- get("muA.BB", envir)
469
+			muB.AA <- get("muB.AA", envir)
470
+			muB.AB <- get("muB.AB", envir)
471
+			muB.BB <- get("muB.BB", envir)
472
+			tau2A <- get("tau2A", envir)
473
+			tau2B <- get("tau2B", envir)
474
+			sig2A <- get("sig2A", envir)
475
+			sig2B <- get("sig2B", envir)
476
+			sigmaA <- get("sigmaA", envir)
477
+			sigmaB <- get("sigmaB", envir)
478
+			corr <- get("corr", envir)
479
+			corrA.BB <- get("corrA.BB", envir)  ## B allele
480
+			corrB.AA <- get("corrB.AA", envir)
481
+			Ns.unadj <- Ns
482
+			assign("Ns.unadj", Ns.unadj, envir)
483
+			Ns[, p, "AA"] <- rowSums(!is.na(AA.A))
484
+			Ns[, p, "AB"] <- rowSums(!is.na(AB.A))
485
+			Ns[, p, "BB"] <- rowSums(!is.na(BB.A))			
486
+		}
487
+	}
443 488
 	if(trim > 0){
444 489
 		##rowMedians is not robust enough when a variant is common
445 490
 		##Try a trimmed rowMedian, trimming only one tail (must specify which)
... ...
@@ -474,114 +519,13 @@ oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, uppe
474 519
 		Ns[, p, "AB"] <- rowSums(!is.na(AB.A))
475 520
 		Ns[, p, "BB"] <- rowSums(!is.na(BB.A))		
476 521
 	}
477
-	muA.AA[, p] <- rowMedians(AA.A, na.rm=TRUE)
478
-	muA.AB[, p] <- rowMedians(AB.A, na.rm=TRUE)
479
-	muA.BB[, p] <- rowMedians(BB.A, na.rm=TRUE)
480
-	muB.AA[, p] <- rowMedians(AA.B, na.rm=TRUE)
481
-	muB.AB[, p] <- rowMedians(AB.B, na.rm=TRUE)
482
-	muB.BB[, p] <- rowMedians(BB.B, na.rm=TRUE)
483
-	sigmaA <- matrix(NA, nrow(AA), 3)
484
-	sigmaB <- matrix(NA, nrow(AA), 3)	
485
-	colnames(sigmaB) <- colnames(sigmaA) <- c("AA", "AB", "BB")
486
-
487
-
488
-	sigmaA[, "AA"] <- rowMAD(AA.A, na.rm=TRUE)
489
-	sigmaA[, "AB"] <- rowMAD(AB.A, na.rm=TRUE)
490
-	sigmaA[, "BB"] <- rowMAD(BB.A, na.rm=TRUE)
491
-	sigmaB[, "AA"] <- rowMAD(AA.B, na.rm=TRUE)
492
-	sigmaB[, "AB"] <- rowMAD(AB.B, na.rm=TRUE)
493
-	sigmaB[, "BB"] <- rowMAD(BB.B, na.rm=TRUE)
494
-
495
-##	sigmaA[, "AA"] <- rowSds(AA.A, na.rm=TRUE)
496
-##	sigmaA[, "AB"] <- rowSds(AB.A, na.rm=TRUE)
497
-##	sigmaA[, "BB"] <- rowSds(BB*A, na.rm=TRUE)
498
-##	sigmaB[, "AA"] <- rowSds(AA*B, na.rm=TRUE)
499
-##	sigmaB[, "AB"] <- rowSds(AB*B, na.rm=TRUE)
500
-##	sigmaB[, "BB"] <- rowSds(BB*B, na.rm=TRUE)
501
-	##shrink
502
-	DF <- Ns[, p, ]-1
503
-	DF[DF < 1] <- 1
504
-	medsA <- apply(sigmaA, 2, "median", na.rm=TRUE)
505
-	medsB <- apply(sigmaB, 2, "median", na.rm=TRUE)			
506
-	for(m in 1:3){
507
-		sigmaA[, m] <- (sigmaA[, m]*DF[, m] + medsA[m]*DF.PRIOR)/(DF.PRIOR+DF[, m])
508
-		sigmaA[is.na(sigmaA[, m]), m] <- medsA[m]
509
-		sigmaB[, m] <- (sigmaB[, m]*DF[, m] + medsB[m]*DF.PRIOR)/(DF.PRIOR+DF[, m])
510
-		sigmaB[is.na(sigmaB[, m]), m] <- medsB[m]		
511
-	}
512
-	index.AA <- which(Ns[, p, "AA"] >= 2)
513
-	index.AB <- which(Ns[, p, "AB"] >= 2)
514
-	index.BB <- which(Ns[, p, "BB"] >= 2)
515
-	
516
-	x <- Ip[index.BB, , "BB", "A"]
517
-	##tau2A[index.BB, p] <- rowVars(x, na.rm=TRUE)##var(log(IA)| BB)
518
-	tau2A[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2
519
-	DF <- Ns[, p, "BB"]-1
520
-	DF[DF < 1] <- 1
521
-	med <- median(tau2A[, p], na.rm=TRUE)
522
-	tau2A[, p] <- (tau2A[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
523
-	tau2A[is.na(tau2A[, p]), p] <- med
524
-	
525
-	x <- Ip[index.BB, , "BB", "B"]
526
-##	sig2B[index.BB, p] <- rowVars(log2(x), na.rm=TRUE) ##var(log(IB)|BB)
527
-	sig2B[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2	
528
-	##system.time(tmp <- rowCovs(x, x, na.rm=TRUE)) ##var(log(IB)|BB)
529
-	##system.time(tmp2 <- rowVars(log2(x), na.rm=TRUE))
530
-	med <- median(sig2B[, p], na.rm=TRUE)
531
-	sig2B[, p] <- (sig2B[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
532
-	sig2B[is.na(sig2B[, p]), p] <- med
533
-	
534
-	x <- Ip[index.AA, , "AA", "B"]
535
-	##tau2B[index.AA, p] <- rowCovs(x, x, na.rm=TRUE)##var(log(IB)| AA)
536
-	tau2B[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2		
537
-	DF <- Ns[, p, "AA"]-1
538
-	DF[DF < 1] <- 1
539
-	med <- median(tau2B[, p], na.rm=TRUE)
540
-	tau2B[, p] <- (tau2B[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
541
-	tau2B[is.na(tau2B[, p]), p] <- med
542
-	
543
-	x <- Ip[index.AA, , "AA", "A"]
544
-	##sig2A[index.AA, p] <- rowVars(log2(x), na.rm=TRUE)##var(log(IA)|AA)
545
-	sig2A[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2##var(log(IA)|AA)	
546
-	##sig2A[index.AA, p] <- rowCovs(x, x, na.rm=TRUE) ##var(log(IA)|AA)
547
-	med <- median(sig2A[, p], na.rm=TRUE)
548
-	sig2A[, p] <- (sig2A[, p]*DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
549
-	sig2A[is.na(sig2A[, p]), p] <- med	
550
-
551
-	##estimate the correlation where there is at least 1 or more copies (AB genotypes)
552
-	if(length(index.AB) > 0){ ##all homozygous is possible	
553
-		x <- Ip[index.AB, , "AB", "A"]
554
-		y <- Ip[index.AB, , "AB", "B"]
555
-		corr[index.AB, p] <- rowCors(x, y, na.rm=TRUE)
556
-		corr[corr < 0] <- 0
557
-		DF <- Ns[, p, "AB"]-1
558
-		DF[DF<1] <- 1
559
-		med <- median(corr[, p], na.rm=TRUE)
560
-		corr[, p] <- (corr[, p]*DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
561
-		corr[is.na(corr[, p]), p] <- med
562
-	}
563
-	##estimate the correlation of the background errors and AA, BB genotypes
564
-	backgroundB <- Ip[index.AA, , "AA", "B"]
565
-	signalA <- Ip[index.AA, , "AA", "A"]
566
-	corrB.AA[index.AA, p] <- rowCors(backgroundB, signalA, na.rm=TRUE)
567
-	DF <- Ns[, p, "AA"]-1
568
-	DF[DF < 1] <- 1
569
-	med <- median(corrB.AA[, p], na.rm=TRUE)
570
-	corrB.AA[, p] <- (corrB.AA[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF)
571
-	corrB.AA[is.na(corrB.AA[, p]), p] <- med
572
-
573
-	backgroundA <- Ip[index.BB, , "BB", "A"]
574
-	signalB <- Ip[index.BB, , "BB", "B"]
575
-	corrA.BB[index.BB, p] <- rowCors(backgroundA, signalB, na.rm=TRUE)
576
-	DF <- Ns[, p, "BB"]-1
577
-	DF[DF < 1] <- 1
578
-	med <- median(corrA.BB[, p], na.rm=TRUE)
579
-	corrA.BB[, p] <- (corrA.BB[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF)
580
-	corrA.BB[is.na(corrA.BB[, p]), p] <- med	
581
-	
582 522
 	##---------------------------------------------------------------------------
583 523
 	## Predict sufficient statistics for unobserved genotypes (plate-specific)
584 524
 	##---------------------------------------------------------------------------
525
+	index.AA <- which(Ns[, p, "AA"] >= 3)
526
+	index.AB <- which(Ns[, p, "AB"] >= 3)
527
+	index.BB <- which(Ns[, p, "BB"] >= 3)
528
+	
585 529
 	correct.orderA <- muA.AA[, p] > muA.BB[, p]
586 530
 	correct.orderB <- muB.BB[, p] > muB.AA[, p]
587 531
 	if(length(index.AB) > 0){
... ...
@@ -654,9 +598,9 @@ oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, uppe
654 598
 		dev.off()
655 599
 	}
656 600
 	##missing two genotypes
657
-	noAA <- Ns[, p, "AA"] <= 2
658
-	noAB <- Ns[, p, "AB"] <= 2
659
-	noBB <- Ns[, p, "BB"] <= 2
601
+	noAA <- Ns[, p, "AA"] < MIN.OBS
602
+	noAB <- Ns[, p, "AB"] < MIN.OBS
603
+	noBB <- Ns[, p, "BB"] < MIN.OBS
660 604
 	##---------------------------------------------------------------------------
661 605
 	## Two genotype clusters not observed -- would sequence help? (didn't seem that helpful)
662 606
 	## 1 extract index of complete data
... ...
@@ -717,8 +661,6 @@ oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, uppe
717 661
 	dn.Ns <- dimnames(Ns)
718 662
 	Ns <- array(as.integer(Ns), dim=dim(Ns))
719 663
 	dimnames(Ns)[[3]] <- dn.Ns[[3]]
720
-
721
-	assign("Ns", Ns, envir)	
722 664
 	assign("muA.AA", muA.AA, envir)
723 665
 	assign("muA.AB", muA.AB, envir)
724 666
 	assign("muA.BB", muA.BB, envir)
... ...
@@ -733,12 +675,148 @@ oneBatch <- function(plateIndex, G, A, B, MIN.OBS=3, DF.PRIOR, envir, trim, uppe
733 675
 	assign("sig2B", sig2B, envir)
734 676
 	assign("corr", corr, envir)
735 677
 	assign("corrB.AA", corrB.AA, envir)
736
-	assign("corrA.BB", corrA.BB, envir)		
678
+	assign("corrA.BB", corrA.BB, envir)				
679
+	assign("Ns", Ns, envir)	
737 680
 	plates.completed <- get("plates.completed", envir)
738 681
 	plates.completed[p] <- TRUE
739 682
 	assign("plates.completed", plates.completed, envir)
740 683
 }
741 684
 
685
+locationAndScale <- function(p, AA.A, AB.A, BB.A,
686
+			     AA.B, AB.B, BB.B, envir,
687
+			     DF.PRIOR){
688
+	muA.AA <- get("muA.AA", envir)
689
+	muA.AB <- get("muA.AB", envir)
690
+	muA.BB <- get("muA.BB", envir)
691
+	muB.AA <- get("muB.AA", envir)
692
+	muB.AB <- get("muB.AB", envir)
693
+	muB.BB <- get("muB.BB", envir)
694
+	tau2A <- get("tau2A", envir)
695
+	tau2B <- get("tau2B", envir)
696
+	sig2A <- get("sig2A", envir)
697
+	sig2B <- get("sig2B", envir)
698
+	corr <- get("corr", envir)
699
+	corrA.BB <- get("corrA.BB", envir)  ## B allele
700
+	corrB.AA <- get("corrB.AA", envir)
701
+	Ns <- get("Ns", envir)	
702
+	muA.AA[, p] <- rowMedians(AA.A, na.rm=TRUE)
703
+	muA.AB[, p] <- rowMedians(AB.A, na.rm=TRUE)
704
+	muA.BB[, p] <- rowMedians(BB.A, na.rm=TRUE)
705
+	muB.AA[, p] <- rowMedians(AA.B, na.rm=TRUE)
706
+	muB.AB[, p] <- rowMedians(AB.B, na.rm=TRUE)
707
+	muB.BB[, p] <- rowMedians(BB.B, na.rm=TRUE)
708
+	sigmaA <- matrix(NA, nrow(AA.A), 3)
709
+	sigmaB <- matrix(NA, nrow(AA.A), 3)	
710
+	colnames(sigmaB) <- colnames(sigmaA) <- c("AA", "AB", "BB")
711
+	sigmaA[, "AA"] <- rowMAD(AA.A, na.rm=TRUE)
712
+	sigmaA[, "AB"] <- rowMAD(AB.A, na.rm=TRUE)
713
+	sigmaA[, "BB"] <- rowMAD(BB.A, na.rm=TRUE)
714
+	sigmaB[, "AA"] <- rowMAD(AA.B, na.rm=TRUE)
715
+	sigmaB[, "AB"] <- rowMAD(AB.B, na.rm=TRUE)
716
+	sigmaB[, "BB"] <- rowMAD(BB.B, na.rm=TRUE)
717
+
718
+	##shrink
719
+	DF <- Ns[, p, ]-1
720
+	DF[DF < 1] <- 1
721
+	medsA <- apply(sigmaA, 2, "median", na.rm=TRUE)
722
+	medsB <- apply(sigmaB, 2, "median", na.rm=TRUE)			
723
+	for(m in 1:3){
724
+		sigmaA[, m] <- (sigmaA[, m]*DF[, m] + medsA[m]*DF.PRIOR)/(DF.PRIOR+DF[, m])
725
+		sigmaA[is.na(sigmaA[, m]), m] <- medsA[m]
726
+		sigmaB[, m] <- (sigmaB[, m]*DF[, m] + medsB[m]*DF.PRIOR)/(DF.PRIOR+DF[, m])
727
+		sigmaB[is.na(sigmaB[, m]), m] <- medsB[m]		
728
+	}
729
+	index.AA <- which(Ns[, p, "AA"] >= 2)
730
+	index.AB <- which(Ns[, p, "AB"] >= 2)
731
+	index.BB <- which(Ns[, p, "BB"] >= 2)
732
+
733
+	x <- BB.A[index.BB, ]
734
+	##x <- Ip[index.BB, , "BB", "A"]
735
+	##tau2A[index.BB, p] <- rowVars(x, na.rm=TRUE)##var(log(IA)| BB)
736
+	tau2A[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2
737
+	DF <- Ns[, p, "BB"]-1
738
+	DF[DF < 1] <- 1
739
+	med <- median(tau2A[, p], na.rm=TRUE)
740
+	tau2A[, p] <- (tau2A[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
741
+	tau2A[is.na(tau2A[, p]), p] <- med
742
+	
743
+	##x <- Ip[index.BB, , "BB", "B"]
744
+	x <- BB.B[index.BB, ]
745
+	sig2B[index.BB, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2	
746
+	med <- median(sig2B[, p], na.rm=TRUE)
747
+	sig2B[, p] <- (sig2B[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
748
+	sig2B[is.na(sig2B[, p]), p] <- med
749
+	
750
+	##x <- Ip[index.AA, , "AA", "B"]
751
+	x <- AA.B[index.AA, ]
752
+	tau2B[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2		
753
+	DF <- Ns[, p, "AA"]-1
754
+	DF[DF < 1] <- 1
755
+	med <- median(tau2B[, p], na.rm=TRUE)
756
+	tau2B[, p] <- (tau2B[, p] * DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
757
+	tau2B[is.na(tau2B[, p]), p] <- med
758
+	
759
+	##x <- Ip[index.AA, , "AA", "A"]
760
+	x <- AA.A[index.AA, ]
761
+	sig2A[index.AA, p] <- rowMAD(log2(x), log2(x), na.rm=TRUE)^2##var(log(IA)|AA)	
762
+	med <- median(sig2A[, p], na.rm=TRUE)
763
+	sig2A[, p] <- (sig2A[, p]*DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
764
+	sig2A[is.na(sig2A[, p]), p] <- med	
765
+
766
+	##estimate the correlation where there is at least 1 or more copies (AB genotypes)
767
+	if(length(index.AB) > 0){ ##all homozygous is possible
768
+		x <- AB.A[index.AB, ]
769
+		y <- AB.B[index.AB, ]
770
+		##x <- Ip[index.AB, , "AB", "A"]
771
+		##y <- Ip[index.AB, , "AB", "B"]
772
+		corr[index.AB, p] <- rowCors(x, y, na.rm=TRUE)
773
+		corr[corr < 0] <- 0
774
+		DF <- Ns[, p, "AB"]-1
775
+		DF[DF<1] <- 1
776
+		med <- median(corr[, p], na.rm=TRUE)
777
+		corr[, p] <- (corr[, p]*DF  +  med * DF.PRIOR)/(DF.PRIOR + DF)
778
+		corr[is.na(corr[, p]), p] <- med
779
+	}
780
+	##estimate the correlation of the background errors and AA, BB genotypes
781
+	##backgroundB <- Ip[index.AA, , "AA", "B"]
782
+	backgroundB <- AA.B[index.AA, ]
783
+	signalA <- AA.A[index.AA, ]
784
+	##signalA <- Ip[index.AA, , "AA", "A"]
785
+	corrB.AA[index.AA, p] <- rowCors(backgroundB, signalA, na.rm=TRUE)
786
+	DF <- Ns[, p, "AA"]-1
787
+	DF[DF < 1] <- 1
788
+	med <- median(corrB.AA[, p], na.rm=TRUE)
789
+	corrB.AA[, p] <- (corrB.AA[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF)
790
+	corrB.AA[is.na(corrB.AA[, p]), p] <- med
791
+
792
+	##backgroundA <- Ip[index.BB, , "BB", "A"]
793
+	backgroundA <- BB.A[index.BB, ]
794
+	signalB <- BB.B[index.BB, ]
795
+	##signalB <- Ip[index.BB, , "BB", "B"]
796
+	corrA.BB[index.BB, p] <- rowCors(backgroundA, signalB, na.rm=TRUE)
797
+	DF <- Ns[, p, "BB"]-1
798
+	DF[DF < 1] <- 1
799
+	med <- median(corrA.BB[, p], na.rm=TRUE)
800
+	corrA.BB[, p] <- (corrA.BB[, p]*DF + med*DF.PRIOR)/(DF.PRIOR + DF)
801
+	corrA.BB[is.na(corrA.BB[, p]), p] <- med
802
+
803
+	assign("muA.AA", muA.AA, envir)
804
+	assign("muA.AB", muA.AB, envir)
805
+	assign("muA.BB", muA.BB, envir)
806
+	assign("muB.AA", muB.AA, envir)
807
+	assign("muB.AB", muB.AB, envir)
808
+	assign("muB.BB", muB.BB, envir)
809
+	assign("sigmaA", sigmaA, envir)
810
+	assign("sigmaB", sigmaB, envir)	
811
+	assign("tau2A", tau2A, envir)
812
+	assign("tau2B", tau2B, envir)
813
+	assign("sig2A", sig2A, envir)
814
+	assign("sig2B", sig2B, envir)
815
+	assign("corr", corr, envir)
816
+	assign("corrB.AA", corrB.AA, envir)
817
+	assign("corrA.BB", corrA.BB, envir)			
818
+}
819
+
742 820
 coefs <- function(plateIndex, conf, MIN.OBS=3, envir, CONF.THR=0.99){
743 821
 	p <- plateIndex
744 822
 	##if(p == 5) browser()
... ...
@@ -967,3 +1045,86 @@ polymorphic <- function(plateIndex, A, B, envir){
967 1045
 	## nonpolymorphic probes
968 1046
 	##---------------------------------------------------------------------------
969 1047
 }
1048
+
1049
+
1050
+biasAdj <- function(A, B, plateIndex, envir, priorProb){
1051
+	sig2A <- get("sig2A", envir)
1052
+	sig2B <- get("sig2B", envir)
1053
+	tau2A <- get("tau2A", envir)
1054
+	tau2B <- get("tau2B", envir)
1055
+	corrA.BB <- get("corrA.BB", envir)
1056
+	corrB.AA <- get("corrB.AA", envir)
1057
+	corr <- get("corr", envir)
1058
+	nuA <- get("nuA", envir)
1059
+	nuB <- get("nuB", envir)
1060
+	phiA <- get("phiA", envir)
1061
+	phiB <- get("phiB", envir)
1062
+	p <- plateIndex
1063
+	plate <- get("plate", envir)
1064
+	if(missing(priorProb)) priorProb <- rep(1/4, 4) ##uniform
1065
+	emit <- array(NA, dim=c(nrow(A), ncol(A), 10))##SNPs x sample x 'truth'	
1066
+	m <- 1##snp counter	
1067
+	for(i in 1:nrow(A)){
1068
+		if(i %% 100 == 0) cat(".")
1069
+		counter <- 1##state counter
1070
+		for(CT in 0:3){
1071
+			for(CA in 0:CT){
1072
+				CB <- CT-CA
1073
+				A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0))
1074
+				B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0))
1075
+				scale <- c(A.scale, B.scale)
1076
+				if(CA == 0 & CB == 0) rho <- 0
1077
+				if(CA == 0 & CB > 0) rho <- corrA.BB[i, p]
1078
+				if(CA > 0 & CB == 0) rho <- corrB.AA[i, p]
1079
+				if(CA > 0 & CB > 0) rho <- corr[i, p]
1080
+				means <- c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p]))
1081
+				covs <- rho*A.scale*B.scale
1082
+				##ensure positive definite			
1083
+				##Sigma <- as.matrix(nearPD(matrix(c(A.scale^2, covs,
1084
+				##covs, B.scale^2), 2, 2))[[1]])
1085
+				Sigma <- matrix(c(A.scale^2, covs, covs, B.scale^2), 2,2)
1086
+				X <- log2(cbind(A[i, ], B[i, ]))
1087
+				tmp <- dmvnorm(X, mean=means, sigma=Sigma) 
1088
+				emit[m, , counter] <- tmp
1089
+				counter <- counter+1
1090
+			}
1091
+		}
1092
+		m <- m+1
1093
+	}
1094
+	homDel <- priorProb[1]*emit[, , 1]
1095
+	hemDel <- priorProb[2]*emit[, , c(2, 3)] # + priorProb[3]*emit[, c(4, 5, 6)] + priorProb[4]*emit[, c(7:10)]
1096
+	norm <- priorProb[3]*emit[, , 4:6]
1097
+	amp <- priorProb[4]*emit[, , 7:10]
1098
+	##sum over the different combinations within each copy number state
1099
+	hemDel <- apply(hemDel, c(1,2), sum)
1100
+	norm <- apply(norm, c(1, 2), sum)
1101
+	amp <- apply(amp, c(1,2), sum)
1102
+
1103
+	tmp <- array(NA, dim=c(nrow(A), ncol(A), 4))
1104
+	tmp[, , 1] <- homDel
1105
+	tmp[, , 2] <- hemDel
1106
+	tmp[, , 3] <- norm
1107
+	tmp[, , 4] <- amp
1108
+	tmp2 <- apply(tmp, c(1, 2), function(x) order(x, decreasing=TRUE)[1])
1109
+	##Adjust for SNPs that have less than 80% in an altered state
1110
+	##flag the remainder?
1111
+	tmp3 <- tmp2 != 3
1112
+	propAlt <- rowMeans(tmp3)##prop normal
1113
+	ii <- propAlt < 0.75
1114
+	##only exclude observations from one tail, depending on
1115
+	##whether more are up or down
1116
+	##(should probably iterate)
1117
+	moreup <- rowSums(tmp2 > 3) > rowSums(tmp2 < 3)
1118
+	notUp <-  tmp2[ii & moreup, ] <= 3
1119
+	notDown <- tmp2[ii & !moreup, ] >= 3
1120
+
1121
+	normal <- matrix(TRUE, nrow(A), ncol(A))
1122
+	normal[ii & moreup, ] <- notUp
1123
+	normal[ii & !moreup, ] <- notDown
1124
+	flagAltered <- which(propAlt > 0.5)
1125
+	assign("flagAltered", flagAltered, envir) 
1126
+##	tmp3 <- tmp2[ii, ] == 3
1127
+##	tmp4 <- matrix(TRUE, nrow(A), ncol(A))
1128
+##	tmp4[ii, ] <- tmp3
1129
+	return(normal)
1130
+}
... ...
@@ -78,24 +78,23 @@ if(!exists("cnrmaResult")){
78 78
 @ 
79 79
 
80 80
 \subsection{Copy number}
81
+<<chr>>=
82
+CHR <- 21
83
+@ 
81 84
 
82 85
 Copy number can be assessed one chromosome at a time.  Here we specify
83
-chromosome 15 and and load a list of indices to subset the data. The
86
+chromosome \Sexpr{CHR} and and load a list of indices to subset the data. The
84 87
 first element in the list correspond to indices of polymorphic probes
85
-on chromosome 15; the second element corresponds to indices of
86
-nonpolymorphic probes on chromosome 15.
88
+on chromosome \Sexpr{CHR}; the second element corresponds to indices of
89
+nonpolymorphic probes on chromosome \Sexpr{21}.
87 90
 
88 91
 <<chromosomeIndex>>=
89
-CHR <- 21
90 92
 CHR_INDEX <- paste(CHR, "index", sep="")
91 93
 data(list=CHR_INDEX, package="genomewidesnp6Crlmm")
92 94
 str(index)
93 95
 @ 
94 96
 
95
-Next we load 3 files that were saved from the preprocessing step and
96
-then subset these lists using the above indices to extract the
97
-preprocessed intensities and genotypes needed for estimating copy
98
-number.  Specifically, we require 6 items:
97
+We require 6 items for copy number estimation:
99 98
 
100 99
 \begin{itemize}
101 100
   \item quantile-normalized A intensities (I1 x J)
... ...
@@ -9,7 +9,9 @@
9 9
   
10 10
 }
11 11
 \usage{
12
-computeCopynumber(A, B, calls, conf, NP, plate, fit.variance = NULL, MIN.OBS = 3, envir, chrom, P, DF.PRIOR = 50, CONF.THR = 0.99, trim = 0, upperTail = TRUE, ...)
12
+computeCopynumber(A, B, calls, conf, NP, plate, fit.variance = NULL,
13
+MIN.OBS = 1, envir, chrom, P, DF.PRIOR = 50, CONF.THR = 0.99, trim = 0,
14
+upperTail = TRUE, bias.adj=FALSE, priorProb, ...)
13 15
 }
14 16
 %- maybe also 'usage' for other objects documented here.
15 17
 \arguments{
... ...
@@ -34,7 +36,14 @@ computeCopynumber(A, B, calls, conf, NP, plate, fit.variance = NULL, MIN.OBS = 3
34 36
   \item{trim}{Temporary option: a number between 0 and 1.  Lops off a percentage of one tail of the
35 37
     intensities for the A and B alleles}
36 38
   \item{upperTail}{Logical: if TRUE and trim = p, quantiles greater than
37
-  1-p are replaced with NAs}
39
+    1-p are replaced with NAs}
40
+  \item{bias.adj}{Logical: whether to adjust the location and scale
41
+    parameters to account for biases due to common copy number
42
+    variants.  This is a SNP-specific adjustment.  Parameters for
43
+    background and slope must have already been estimated.}
44
+  \item{priorProb}{Numerical vector of length 4.  The prior probability
45
+    of each copy number state (0, 1, 2, 3, and 4). The default is a
46
+    uniform prior.  Ignored if bias.adj=FALSE}
38 47
   \item{\dots}{Currently ignored}
39 48
 }
40 49