Browse code

Bug fix for estimation of copy number at nonpolymorphic loci on chromosome X.

Other changes:

- the default threshold for the confidence threshold (GT.CONF.THR) is
now 0.8

- fixed a bug in fit.lm4 that resulted in nans and Inf for nuA and
phiA at nonpolymorphic markers on chromosome X.

- uncommented code in the ACN function to calculate copy number at
nonpolymorphic loci on chromosome X. Seems to be working for both
men and women.

- Added suggestions to the vignette for exploring the missing value
pattern. Most often this will likely be due to the genotype
confidence threshold

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@50805 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 11/11/2010 04:13:08
Showing 6 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.9.4
4
+Version: 1.9.5
5 5
 Date: 2010-10-17
6 6
 Author: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -1,6 +1,6 @@
1 1
 useDynLib("crlmm", .registration=TRUE)
2 2
 ## this is temporary
3
-exportPattern("^[^\\.]")
3
+## exportPattern("^[^\\.]")
4 4
 
5 5
 ##---------------------------------------------------------------------------
6 6
 ## Biobase
... ...
@@ -906,21 +906,25 @@ fit.lm4 <- function(strata,
906 906
 		}
907 907
 		tmp <- cbind(medianA.AA.snp[, k], medianB.BB.snp[,k], phiA.snp[, k], phiB.snp[, k])
908 908
 		tmp <- tmp[rowSums(is.na(tmp) == 0) & rowSums(tmp < 20) == 0, ]
909
-		stopifnot(nrow(tmp) > 100)
909
+		if(nrow(tmp) < 100){
910
+			stop("too few markers for estimating nonpolymorphic CN on chromosome X")
911
+		}
910 912
 		X <- cbind(1, log2(c(tmp[, 1], tmp[, 2])))
911 913
 		Y <- log2(c(tmp[, 3], tmp[, 4]))
912 914
 		betahat <- solve(crossprod(X), crossprod(X, Y))
913 915
 		if(enough.men){
914
-			X.men <- cbind(1, medianA.AA.M[, k])
916
+			##X.men <- cbind(1, medianA.AA.M[, k])
917
+			X.men <- cbind(1, log2(medianA.AA.M[, k]))
915 918
 			Yhat1 <- as.numeric(X.men %*% betahat)
916 919
 			## put intercept and slope for men in nuB and phiB
917 920
 			phiB[, k] <- 2^(Yhat1)
918
-			nuB[, k] <- 2^(medianA.AA.M[, k]) - phiB[, k]
921
+			##nuB[, k] <- 2^(medianA.AA.M[, k]) - phiB[, k]
922
+			nuB[, k] <- medianA.AA.M[, k] - 1*phiB[, k] ## male has 1 copy
919 923
 		}
920
-		X.fem <- cbind(1, medianA.AA.F[, k])
924
+		X.fem <- cbind(1, log2(medianA.AA.F[, k]))
921 925
 		Yhat2 <- as.numeric(X.fem %*% betahat)
922 926
 		phiA[, k] <- 2^(Yhat2)
923
-		nuA[, k] <- 2^(medianA.AA.F[, k]) - 2*phiA[, k]
927
+		nuA[, k] <- medianA.AA.F[, k] - 2*phiA[, k]
924 928
 	}
925 929
 	if(THR.NU.PHI){
926 930
 		nuA[nuA < MIN.NU] <- MIN.NU
... ...
@@ -1685,6 +1689,7 @@ summarizeSnps <- function(strata,
1685 1689
 	CP <- as.matrix(snpCallProbability(object)[index, ])
1686 1690
 	AA <- as.matrix(A(object)[index, ])
1687 1691
 	BB <- as.matrix(B(object)[index, ])
1692
+	FL <- as.matrix(flags(object)[index, ])
1688 1693
 	if(verbose) message("        Computing summaries...")
1689 1694
 	for(k in seq_along(batches)){
1690 1695
 		B <- batches[[k]]
... ...
@@ -1695,6 +1700,12 @@ summarizeSnps <- function(strata,
1695 1700
 		xx <- CP[, B]
1696 1701
 		highConf <- (1-exp(-xx/1000)) > GT.CONF.THR
1697 1702
 		G <- G*highConf
1703
+		## Some markers may have genotype confidences scores that are ALL below the threshold
1704
+		## For these markers, provide statistical summaries based on all the samples and flag
1705
+		## Provide summaries for these markers and flag to indicate the problem
1706
+		ii <- which(rowSums(G) == 0)
1707
+		G[ii, ] <- GG[ii, B]
1708
+		## table(rowSums(G==0))
1698 1709
 		##G <- G*highConf*NORM
1699 1710
 		A <- AA[, B]
1700 1711
 		B <- BB[, B]
... ...
@@ -1771,7 +1782,7 @@ crlmmCopynumber <- function(object,
1771 1782
 			    prior.prob=rep(1/4,4),
1772 1783
 			    seed=1,
1773 1784
 			    verbose=TRUE,
1774
-			    GT.CONF.THR=0.95,
1785
+			    GT.CONF.THR=0.80,
1775 1786
 			    MIN.NU=2^3,
1776 1787
 			    MIN.PHI=2^3,
1777 1788
 			    THR.NU.PHI=TRUE,
... ...
@@ -1797,14 +1808,20 @@ crlmmCopynumber <- function(object,
1797 1808
 		       X.SNP="chromosome X SNPs",
1798 1809
 		       X.NP="chromosome X nonpolymorphic markers")
1799 1810
 	}
1800
-	if(verbose) message("Computing summary statistics of the genotype clusters for each batch")
1811
+	if(verbose & is.lds) {
1812
+		message("Large data support with ff package is enabled.")
1813
+		message("To reduce RAM, the markers are divided into several strata (see ?ocProbesets for details) and summary statistics are computed on each stratum")
1814
+	}
1801 1815
 	for(i in seq_along(type)){
1802 1816
 		## do all types
1803 1817
 		marker.type <- type[i]
1804 1818
 		if(verbose) message(paste("...", mylabel(marker.type)))
1805 1819
 		##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch")
1806
-		marker.index <- whichMarkers(marker.type, is.snp,
1807
-					     is.autosome, is.annotated, is.X)
1820
+		marker.index <- whichMarkers(marker.type,
1821
+					     is.snp,
1822
+					     is.autosome,
1823
+					     is.annotated,
1824
+					     is.X)
1808 1825
 		object <- genotypeSummary(object=object,
1809 1826
 					  GT.CONF.THR=GT.CONF.THR,
1810 1827
 					  type=marker.type,
... ...
@@ -456,28 +456,28 @@ ACN <- function(object, allele, i , j){
456 456
 				marker.index <- i[is.X & !is.snp]
457 457
 				acn.index <- which(is.X & !is.snp)
458 458
 				acn[acn.index, ] <- NA
459
-##				female.index <- j[object$gender[j] == 2]
460
-##				## 3. CHR X NPs: women
461
-##				if(length(female.index) > 0){
462
-##					female.batch.index <- match(unique(as.character(batch(object))[female.index]), batchNames(object))
463
-##					jj <- which(object$gender[j] == 2)
464
-##					acn[acn.index, jj] <- C1(object, marker.index, female.batch.index, female.index)
465
-##				}
459
+				female.index <- j[object$gender[j] == 2]
460
+				## 3. CHR X NPs: women
461
+				if(length(female.index) > 0){
462
+					female.batch.index <- match(unique(as.character(batch(object))[female.index]), batchNames(object))
463
+					jj <- which(object$gender[j] == 2)
464
+					acn[acn.index, jj] <- C1(object, marker.index, female.batch.index, female.index)
465
+				}
466 466
 ##				## 4. CHR X NPs: men
467
-##				male.index <- j[object$gender[j] == 1]
468
-##				if(length(male.index) > 0){
469
-##					if(is.ff){
470
-##						open(nuB(object))
471
-##						open(phiB(object))
472
-##					}
473
-##					male.batch.index <- match(unique(as.character(batch(object))[male.index]), batchNames(object))
474
-##					jj <- which(object$gender[j] == 1)
475
-##					acn[acn.index, jj] <- C2(object, marker.index, male.batch.index, male.index, NP.X=TRUE)
476
-##					if(is.ff){
477
-##						close(nuB(object))
478
-##						close(phiB(object))
479
-##					}
480
-##				}
467
+				male.index <- j[object$gender[j] == 1]
468
+				if(length(male.index) > 0){
469
+					if(is.ff){
470
+						open(nuB(object))
471
+						open(phiB(object))
472
+					}
473
+					male.batch.index <- match(unique(as.character(batch(object))[male.index]), batchNames(object))
474
+					jj <- which(object$gender[j] == 1)
475
+					acn[acn.index, jj] <- C2(object, marker.index, male.batch.index, male.index, NP.X=TRUE)
476
+					if(is.ff){
477
+						close(nuB(object))
478
+						close(phiB(object))
479
+					}
480
+				}
481 481
 			}
482 482
 		}
483 483
 		if(is.ff){
... ...
@@ -51,19 +51,19 @@ platforms must have a corresponding annotation package (installed
51 51
 separately) that are listed below.
52 52
 
53 53
 <<annotationPackages>>=
54
-annotationPackages()
54
+pkgs <- annotationPackages()
55
+pkgs <- pgks[grep("Crlmm", pkgs)]
56
+pkgs
55 57
 @
56 58
 
57
-Only the annotation package that end with the 'Crlmm' postfix are
58
-supported for copy number estimation. For instance,
59
-'pd.genomewidesnp6' and 'genomewidesnp6Crlmm' are both annotation
60
-packages for the Affymetrix 6.0 platform, but the
61
-'genomewidesnp6Crlmm' annotation package must be used for copy number
62
-estimation.  The annotation package is specified through the 'cdfName'
63
-we specify the cdf name for Affymetrix 6.0, provide the complete path
64
-to the CEL files, and indicate where intermediate files from the copy
65
-number estimation are to be saved.
59
+Note that 'pd.genomewidesnp6' and 'genomewidesnp6Crlmm' are both
60
+annotation packages for the Affymetrix 6.0 platform available on
61
+Bioconductor, but the 'genomewidesnp6Crlmm' annotation package must be
62
+used for copy number estimation.  The annotation package is specified
63
+through the 'cdfName' -- the identifier without the 'Crlmm' postfix.
64
+In the following code, we specify the cdf name for Affymetrix 6.0,
65
+provide the complete path to the CEL files, and indicate where
66
+intermediate files from the copy number estimation are to be saved.
66 67
 
67 68
 <<setup>>=
68 69
 cdfName <- "genomewidesnp6"
... ...
@@ -188,10 +188,11 @@ The \Rfunction{crlmmCopynumber} performs the following steps:
188 188
   displayed during the processing.
189 189
 
190 190
 <<LDS_copynumber>>=
191
-cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=gtSet)
191
+GT.CONF.THR <- 0.90
192
+cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=gtSet, GT.CONF.THR=GT.CONF.THR)
192 193
 @
193 194
 
194
-In an effort to reduct I/O, the \Rpackage{crlmmCopynumber} function no
195
+In an effort to reduce I/O, the \Rpackage{crlmmCopynumber} function no
195 196
 longer stores the allele-specific estimates of copy number as part of
196 197
 the object.  Rather, several functions are available that will compute
197 198
 relatively quickly the allele-specific estimates from the stored
... ...
@@ -476,6 +477,118 @@ Scatterplots of the A and B allele intensities (log-scale) can be
476 477
 useful for assessing the biallelic genotype calls.  This section of
477 478
 the vignette is currently under development.
478 479
 
480
+\section{Reasons for missing values}
481
+
482
+There are several reasons for estimates of the allele-specific copy
483
+number to have missing values (\texttt{NA}'s). This section briefly
484
+elaborates on the source of missing values in the HapMap analysis and
485
+discusses possible alternatives to reduce the number of missing
486
+values.  Note that allele-specific copy number, 'CA' and 'CB', is not
487
+saved in the \Robject{cnSet} object.  Rather, the respective accessors
488
+calculate 'CA' and 'CB' on the fly from the normalized intensities and
489
+from the marker-specific parameter estimates in the linear model.  In
490
+general, a missing value arises when the background or slope parameter
491
+was not estimated in the linear model. Most often, missing values
492
+occur when the genotype confidence scores for a SNP were below the
493
+threshold used by the \Robject{crlmmCopynumber} function. For the
494
+HapMap analysis, we used a confidence threshold of
495
+\Sexpr{GT.CONF.THR}.
496
+
497
+<<NA>>=
498
+autosome.index <- which(isSnp(cnSet) & chromosome(cnSet) < 23)
499
+sample.index <- which(batch(cnSet) == "C")
500
+ca <- CA(cnSet, i=autosome.index, j=sample.index)
501
+missing.ca <- is.na(ca)
502
+(nmissing.ca <- sum(missing.ca))
503
+@
504
+
505
+If \Robject{nmissing.ca} is nonzero, then one could check the genotype
506
+confidence scores provided by the crlmm genotyping algorithm against
507
+the threshold specified in \Robject{crlmmCopynumber}.
508
+
509
+<<NAconfidenceScores>>=
510
+if(nmissing.ca > 0){
511
+	gt.conf <- i2p(snpCallProbability(cnSet)[autosome.index, sample.index])
512
+	below.thr <- gt.conf < GT.CONF.THR
513
+	index.allbelow <- as.integer(which(rowSums(below.thr) == length(sample.index)))
514
+	all.equal(index, index.allbelow)
515
+	## or calculate the proportion of missing effected by low crlmm confidence
516
+	sum(index.allbelow %in% index)/length(index)
517
+}
518
+@
519
+
520
+We repeat the above check for missing values at polymorphic loci on
521
+chromosome X.  In this case, we compare the \Robject{rowSums} of the
522
+missing values to the number of samples to check whether all of the
523
+estimates are missing for a given SNP.
524
+
525
+<<NAchromosomeX>>=
526
+X.index <- which(isSnp(cnSet) & chromosome(cnSet) == 23)
527
+ca.X <- CA(cnSet, i=X.index, j=sample.index)
528
+missing.caX <- is.na(ca.X)
529
+nmissing.caX <- sum(missing.caX)
530
+missing.snp.index <- which(rowSums(missing.caX) == length(sample.index))
531
+index <- which(rowSums(missing.caX) == length(sample.index))
532
+length(index)*length(sample.index)/nmissing.caX
533
+@
534
+
535
+From the above codechunk, we see that \Sexpr{length(index)} SNPs have
536
+NAs for all the samples. Next, we tally the number of NAs for
537
+polymorphic markers on chromosome X that are below the confidence
538
+threshold.  For the HapMap analysis, all of the missing values arose
539
+from SNPs in which either the men or the women had confidence scores
540
+that were all below the threshold.
541
+
542
+<<NAcrlmmConfidenceScore>>=
543
+if(nmissing.caX > 0){
544
+	gt.conf <- i2p(snpCallProbability(cnSet)[X.index, sample.index])
545
+	below.thr <- gt.conf < GT.CONF.THR
546
+	F <- which(cnSet$gender[sample.index] == 2)
547
+	M <- which(cnSet$gender[sample.index] == 1)
548
+	index.allbelowF <- as.integer(which(rowSums(below.thr[, F]) == length(F)))
549
+	index.allbelowM <- as.integer(which(rowSums(below.thr[, M]) == length(M)))
550
+	index.allbelow <- as.integer(unique(c(index.allbelowF, index.allbelowM)))
551
+	all.equal(index, index.allbelow)
552
+	## or calculate the proportion of missing effected by low crlmm confidence
553
+	sum(index.allbelow %in% index)/length(index)
554
+}
555
+@
556
+
557
+Next, we verify that the number of missing values is the same for the
558
+'B' allele at autosomal polymorphic loci
559
+
560
+<<NAmissingForBallele>>=
561
+cb <- CB(cnSet, i=autosome.index, j=sample.index)
562
+missing.cb <- is.na(cb)
563
+nmissing.cb <- sum(missing.cb)
564
+@
565
+
566
+For nonpolymorphic loci, the genotype confidence score is irrelevant
567
+and estimates should be available at all markers.
568
+
569
+<<NAnonpolymorphic.autosome>>=
570
+np.index <- which(!isSnp(cnSet) & chromosome(cnSet)==23)
571
+cb <- CB(cnSet, i=np.index, j=sample.index)
572
+stopifnot(sum(is.na(cb)) == 0)
573
+ca.F <- CA(cnSet, i=np.index, j=F)
574
+ca.M <- CA(cnSet, i=np.index, j=M)
575
+sum(is.na(ca.F))
576
+sum(is.na(ca.M))
577
+@
578
+
579
+In total, there were \Sexpr{length(missing.snp.index)} polymorphic
580
+markers on chromosome X for which copy number estimates are not
581
+available.  Lowering the confidence threshold would permit estimation
582
+of copy number at most of these loci.  A confidence threshold is
583
+included as a parameter for the copy number estimation as an approach
584
+to reduce the sensitivity of genotype-specific summary statistics,
585
+such as the within-genotype median, to intensities from samples that
586
+do not clearly fall into one of the biallelic genotype clusters. There
587
+are drawbacks to this approach, including variance estimates that are
588
+overly optimistic.  More direct approaches for outlier detection and
589
+removal may be explored in the future.
590
+
591
+
479 592
 \section{Session information}
480 593
 <<sessionInfo, results=tex>>=
481 594
 toLatex(sessionInfo())
... ...
@@ -10,7 +10,7 @@
10 10
 crlmmCopynumber(object, MIN.SAMPLES=10, SNRMin = 5, MIN.OBS = 1, 
11 11
 	        DF.PRIOR = 50, bias.adj = FALSE,
12 12
                 prior.prob = rep(1/4, 4), seed = 1, verbose = TRUE,
13
-                GT.CONF.THR = 0.95, MIN.NU = 2^3, MIN.PHI = 2^3,
13
+                GT.CONF.THR = 0.80, MIN.NU = 2^3, MIN.PHI = 2^3,
14 14
                 THR.NU.PHI = TRUE, type=c("SNP", "NP", "X.SNP", "X.NP"))
15 15
 }
16 16