Browse code

require high confidence score of genotypes for computing sufficient statistics

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37810 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 07/03/2009 19:03:38
Showing 3 changed files

... ...
@@ -1,13 +1,13 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling via CRLMM Algorithm
4
-Version: 1.0.52
4
+Version: 1.0.53
5 5
 Date: 2008-12-28
6 6
 Author: Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>
8 8
 Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 6.0.
9 9
 License: Artistic-2.0
10
-Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase
10
+Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase, mvtnorm
11 11
 Suggests: hapmapsnp5, genomewidesnp5Crlmm, genomewidesnp6Crlmm, methods, GGdata, snpMatrix
12 12
 Collate: AllClasses.R
13 13
          crlmm-methods.R
... ...
@@ -8,5 +8,5 @@ importFrom(preprocessCore, normalize.quantiles.use.target)
8 8
 importFrom(utils, data, packageDescription, setTxtProgressBar, txtProgressBar)
9 9
 importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd)
10 10
 importFrom(genefilter, rowSds)
11
-##importFrom(mvtnorm, dmvnorm)
11
+importFrom(mvtnorm, dmvnorm)
12 12
 
... ...
@@ -417,10 +417,12 @@ nonpolymorphic <- function(plateIndex, NP, envir){
417 417
 oneBatch <- function(plateIndex, G, A, B, conf, CONF.THR=0.99, MIN.OBS=3, DF.PRIOR, envir, trim, upperTail, bias.adj=FALSE, priorProb, ...){
418 418
 	p <- plateIndex
419 419
 	plate <- get("plate", envir)
420
-	AA <- G == 1
421
-	AB <- G == 2
422
-	BB <- G == 3
423
-	Ns <- get("Ns", envir)
420
+	Ns <- get("Ns", envir)	
421
+	highConf <- 1-exp(-conf/1000)
422
+	highConf <- highConf > CONF.THR
423
+	AA <- G == 1 & highConf
424
+	AB <- G == 2 & highConf
425
+	BB <- G == 3 & highConf
424 426
 	Ns[, p, "AA"] <- rowSums(AA)
425 427
 	Ns[, p, "AB"] <- rowSums(AB)
426 428
 	Ns[, p, "BB"] <- rowSums(BB)
... ...
@@ -534,21 +536,18 @@ oneBatch <- function(plateIndex, G, A, B, conf, CONF.THR=0.99, MIN.OBS=3, DF.PRI
534 536
 	##---------------------------------------------------------------------------
535 537
 	## Predict sufficient statistics for unobserved genotypes (plate-specific)
536 538
 	##---------------------------------------------------------------------------
537
-	highConf <- 1-exp(-conf/1000)
538
-	highConf <- highConf > CONF.THR
539
-	confInd <- rowMeans(highConf) > CONF.THR
540
-	NN <- Ns
541
-	NN[, p, "AA"] <- rowSums(AA & highConf, na.rm=TRUE) ##how many AA were called with high confidence
542
-	NN[, p, "AB"] <- rowSums(AB & highConf, na.rm=TRUE)
543
-	NN[, p, "BB"] <- rowSums(BB & highConf, na.rm=TRUE)
544
-	index.AA <- which(NN[, p, "AA"] >= 3)
545
-	index.AB <- which(NN[, p, "AB"] >= 3)
546
-	index.BB <- which(NN[, p, "BB"] >= 3)
539
+##	NN <- Ns
540
+##	NN[, p, "AA"] <- rowSums(AA & highConf, na.rm=TRUE) ##how many AA were called with high confidence
541
+##	NN[, p, "AB"] <- rowSums(AB & highConf, na.rm=TRUE)
542
+##	NN[, p, "BB"] <- rowSums(BB & highConf, na.rm=TRUE)
543
+	index.AA <- which(Ns[, p, "AA"] >= 3)
544
+	index.AB <- which(Ns[, p, "AB"] >= 3)
545
+	index.BB <- which(Ns[, p, "BB"] >= 3)
547 546
 	correct.orderA <- muA.AA[, p] > muA.BB[, p]
548 547
 	correct.orderB <- muB.BB[, p] > muB.AA[, p]
549 548
 	if(length(index.AB) > 0){
550
-		nobs <- rowSums(NN[, p, ] >= MIN.OBS) == 3
551
-	} else nobs <- rowSums(NN[, p, c(1,3)] >= MIN.OBS) == 2
549
+		nobs <- rowSums(Ns[, p, ] >= MIN.OBS) == 3
550
+	} else nobs <- rowSums(Ns[, p, c(1,3)] >= MIN.OBS) == 2
552 551
 	index.complete <- which(correct.orderA & correct.orderB & nobs) ##be selective here
553 552
 	size <- min(5000, length(index.complete))
554 553
 	if(size == 5000) index.complete <- sample(index.complete, 5000)
... ...
@@ -556,9 +555,9 @@ oneBatch <- function(plateIndex, G, A, B, conf, CONF.THR=0.99, MIN.OBS=3, DF.PRI
556 555
 		warning("fewer than 200 snps pass criteria for predicting the sufficient statistics")
557 556
 		stop()
558 557
 	}
559
-	index.AA <- which(NN[, p, "AA"] == 0 & (NN[, p, "AB"] >= MIN.OBS & NN[, p, "BB"] >= MIN.OBS))
560
-	index.AB <- which(NN[, p, "AB"] == 0 & (NN[, p, "AA"] >= MIN.OBS & NN[, p, "BB"] >= MIN.OBS))
561
-	index.BB <- which(NN[, p, "BB"] == 0 & (NN[, p, "AB"] >= MIN.OBS & NN[, p, "AA"] >= MIN.OBS))
558
+	index.AA <- which(Ns[, p, "AA"] == 0 & (Ns[, p, "AB"] >= MIN.OBS & Ns[, p, "BB"] >= MIN.OBS))
559
+	index.AB <- which(Ns[, p, "AB"] == 0 & (Ns[, p, "AA"] >= MIN.OBS & Ns[, p, "BB"] >= MIN.OBS))
560
+	index.BB <- which(Ns[, p, "BB"] == 0 & (Ns[, p, "AB"] >= MIN.OBS & Ns[, p, "AA"] >= MIN.OBS))
562 561
 	if(length(index.AA) > 0){
563 562
 		##Predict mean for AA genotypes
564 563
 		X.AA <- cbind(1, muA.AB[index.complete, p], muA.BB[index.complete, p],
... ...
@@ -605,9 +604,9 @@ oneBatch <- function(plateIndex, G, A, B, conf, CONF.THR=0.99, MIN.OBS=3, DF.PRI
605 604
 #	index.AA <- which(NN[, p, "AA"] == 0 & (NN[, p, "AB"] < MIN.OBS | NN[, p, "BB"] < MIN.OBS))
606 605
 #	index.AB <- which(NN[, p, "AB"] == 0 & (NN[, p, "AA"] < MIN.OBS | NN[, p, "BB"] < MIN.OBS))
607 606
 #	index.BB <- which(NN[, p, "BB"] == 0 & (NN[, p, "AB"] >= MIN.OBS | NN[, p, "AA"] >= MIN.OBS))	
608
-	noAA <- NN[, p, "AA"] < MIN.OBS
609
-	noAB <- NN[, p, "AB"] < MIN.OBS
610
-	noBB <- NN[, p, "BB"] < MIN.OBS
607
+	noAA <- Ns[, p, "AA"] < MIN.OBS
608
+	noAB <- Ns[, p, "AB"] < MIN.OBS
609
+	noBB <- Ns[, p, "BB"] < MIN.OBS
611 610
 	##---------------------------------------------------------------------------
612 611
 	## Two genotype clusters not observed -- would sequence help? (didn't seem that helpful)
613 612
 	## 1 extract index of complete data