Other changes:
- the default threshold for the confidence threshold (GT.CONF.THR) is
now 0.8
- fixed a bug in fit.lm4 that resulted in nans and Inf for nuA and
phiA at nonpolymorphic markers on chromosome X.
- uncommented code in the ACN function to calculate copy number at
nonpolymorphic loci on chromosome X. Seems to be working for both
men and women.
- Added suggestions to the vignette for exploring the missing value
pattern. Most often this will likely be due to the genotype
confidence threshold
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@50805 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.9.4 |
|
4 |
+Version: 1.9.5 |
|
5 | 5 |
Date: 2010-10-17 |
6 | 6 |
Author: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -906,21 +906,25 @@ fit.lm4 <- function(strata, |
906 | 906 |
} |
907 | 907 |
tmp <- cbind(medianA.AA.snp[, k], medianB.BB.snp[,k], phiA.snp[, k], phiB.snp[, k]) |
908 | 908 |
tmp <- tmp[rowSums(is.na(tmp) == 0) & rowSums(tmp < 20) == 0, ] |
909 |
- stopifnot(nrow(tmp) > 100) |
|
909 |
+ if(nrow(tmp) < 100){ |
|
910 |
+ stop("too few markers for estimating nonpolymorphic CN on chromosome X") |
|
911 |
+ } |
|
910 | 912 |
X <- cbind(1, log2(c(tmp[, 1], tmp[, 2]))) |
911 | 913 |
Y <- log2(c(tmp[, 3], tmp[, 4])) |
912 | 914 |
betahat <- solve(crossprod(X), crossprod(X, Y)) |
913 | 915 |
if(enough.men){ |
914 |
- X.men <- cbind(1, medianA.AA.M[, k]) |
|
916 |
+ ##X.men <- cbind(1, medianA.AA.M[, k]) |
|
917 |
+ X.men <- cbind(1, log2(medianA.AA.M[, k])) |
|
915 | 918 |
Yhat1 <- as.numeric(X.men %*% betahat) |
916 | 919 |
## put intercept and slope for men in nuB and phiB |
917 | 920 |
phiB[, k] <- 2^(Yhat1) |
918 |
- nuB[, k] <- 2^(medianA.AA.M[, k]) - phiB[, k] |
|
921 |
+ ##nuB[, k] <- 2^(medianA.AA.M[, k]) - phiB[, k] |
|
922 |
+ nuB[, k] <- medianA.AA.M[, k] - 1*phiB[, k] ## male has 1 copy |
|
919 | 923 |
} |
920 |
- X.fem <- cbind(1, medianA.AA.F[, k]) |
|
924 |
+ X.fem <- cbind(1, log2(medianA.AA.F[, k])) |
|
921 | 925 |
Yhat2 <- as.numeric(X.fem %*% betahat) |
922 | 926 |
phiA[, k] <- 2^(Yhat2) |
923 |
- nuA[, k] <- 2^(medianA.AA.F[, k]) - 2*phiA[, k] |
|
927 |
+ nuA[, k] <- medianA.AA.F[, k] - 2*phiA[, k] |
|
924 | 928 |
} |
925 | 929 |
if(THR.NU.PHI){ |
926 | 930 |
nuA[nuA < MIN.NU] <- MIN.NU |
... | ... |
@@ -1685,6 +1689,7 @@ summarizeSnps <- function(strata, |
1685 | 1689 |
CP <- as.matrix(snpCallProbability(object)[index, ]) |
1686 | 1690 |
AA <- as.matrix(A(object)[index, ]) |
1687 | 1691 |
BB <- as.matrix(B(object)[index, ]) |
1692 |
+ FL <- as.matrix(flags(object)[index, ]) |
|
1688 | 1693 |
if(verbose) message(" Computing summaries...") |
1689 | 1694 |
for(k in seq_along(batches)){ |
1690 | 1695 |
B <- batches[[k]] |
... | ... |
@@ -1695,6 +1700,12 @@ summarizeSnps <- function(strata, |
1695 | 1700 |
xx <- CP[, B] |
1696 | 1701 |
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
1697 | 1702 |
G <- G*highConf |
1703 |
+ ## Some markers may have genotype confidences scores that are ALL below the threshold |
|
1704 |
+ ## For these markers, provide statistical summaries based on all the samples and flag |
|
1705 |
+ ## Provide summaries for these markers and flag to indicate the problem |
|
1706 |
+ ii <- which(rowSums(G) == 0) |
|
1707 |
+ G[ii, ] <- GG[ii, B] |
|
1708 |
+ ## table(rowSums(G==0)) |
|
1698 | 1709 |
##G <- G*highConf*NORM |
1699 | 1710 |
A <- AA[, B] |
1700 | 1711 |
B <- BB[, B] |
... | ... |
@@ -1771,7 +1782,7 @@ crlmmCopynumber <- function(object, |
1771 | 1782 |
prior.prob=rep(1/4,4), |
1772 | 1783 |
seed=1, |
1773 | 1784 |
verbose=TRUE, |
1774 |
- GT.CONF.THR=0.95, |
|
1785 |
+ GT.CONF.THR=0.80, |
|
1775 | 1786 |
MIN.NU=2^3, |
1776 | 1787 |
MIN.PHI=2^3, |
1777 | 1788 |
THR.NU.PHI=TRUE, |
... | ... |
@@ -1797,14 +1808,20 @@ crlmmCopynumber <- function(object, |
1797 | 1808 |
X.SNP="chromosome X SNPs", |
1798 | 1809 |
X.NP="chromosome X nonpolymorphic markers") |
1799 | 1810 |
} |
1800 |
- if(verbose) message("Computing summary statistics of the genotype clusters for each batch") |
|
1811 |
+ if(verbose & is.lds) { |
|
1812 |
+ message("Large data support with ff package is enabled.") |
|
1813 |
+ message("To reduce RAM, the markers are divided into several strata (see ?ocProbesets for details) and summary statistics are computed on each stratum") |
|
1814 |
+ } |
|
1801 | 1815 |
for(i in seq_along(type)){ |
1802 | 1816 |
## do all types |
1803 | 1817 |
marker.type <- type[i] |
1804 | 1818 |
if(verbose) message(paste("...", mylabel(marker.type))) |
1805 | 1819 |
##if(verbose) message(paste("Computing summary statistics for ", mylabel(marker.type), " genotype clusters for each batch") |
1806 |
- marker.index <- whichMarkers(marker.type, is.snp, |
|
1807 |
- is.autosome, is.annotated, is.X) |
|
1820 |
+ marker.index <- whichMarkers(marker.type, |
|
1821 |
+ is.snp, |
|
1822 |
+ is.autosome, |
|
1823 |
+ is.annotated, |
|
1824 |
+ is.X) |
|
1808 | 1825 |
object <- genotypeSummary(object=object, |
1809 | 1826 |
GT.CONF.THR=GT.CONF.THR, |
1810 | 1827 |
type=marker.type, |
... | ... |
@@ -456,28 +456,28 @@ ACN <- function(object, allele, i , j){ |
456 | 456 |
marker.index <- i[is.X & !is.snp] |
457 | 457 |
acn.index <- which(is.X & !is.snp) |
458 | 458 |
acn[acn.index, ] <- NA |
459 |
-## female.index <- j[object$gender[j] == 2] |
|
460 |
-## ## 3. CHR X NPs: women |
|
461 |
-## if(length(female.index) > 0){ |
|
462 |
-## female.batch.index <- match(unique(as.character(batch(object))[female.index]), batchNames(object)) |
|
463 |
-## jj <- which(object$gender[j] == 2) |
|
464 |
-## acn[acn.index, jj] <- C1(object, marker.index, female.batch.index, female.index) |
|
465 |
-## } |
|
459 |
+ female.index <- j[object$gender[j] == 2] |
|
460 |
+ ## 3. CHR X NPs: women |
|
461 |
+ if(length(female.index) > 0){ |
|
462 |
+ female.batch.index <- match(unique(as.character(batch(object))[female.index]), batchNames(object)) |
|
463 |
+ jj <- which(object$gender[j] == 2) |
|
464 |
+ acn[acn.index, jj] <- C1(object, marker.index, female.batch.index, female.index) |
|
465 |
+ } |
|
466 | 466 |
## ## 4. CHR X NPs: men |
467 |
-## male.index <- j[object$gender[j] == 1] |
|
468 |
-## if(length(male.index) > 0){ |
|
469 |
-## if(is.ff){ |
|
470 |
-## open(nuB(object)) |
|
471 |
-## open(phiB(object)) |
|
472 |
-## } |
|
473 |
-## male.batch.index <- match(unique(as.character(batch(object))[male.index]), batchNames(object)) |
|
474 |
-## jj <- which(object$gender[j] == 1) |
|
475 |
-## acn[acn.index, jj] <- C2(object, marker.index, male.batch.index, male.index, NP.X=TRUE) |
|
476 |
-## if(is.ff){ |
|
477 |
-## close(nuB(object)) |
|
478 |
-## close(phiB(object)) |
|
479 |
-## } |
|
480 |
-## } |
|
467 |
+ male.index <- j[object$gender[j] == 1] |
|
468 |
+ if(length(male.index) > 0){ |
|
469 |
+ if(is.ff){ |
|
470 |
+ open(nuB(object)) |
|
471 |
+ open(phiB(object)) |
|
472 |
+ } |
|
473 |
+ male.batch.index <- match(unique(as.character(batch(object))[male.index]), batchNames(object)) |
|
474 |
+ jj <- which(object$gender[j] == 1) |
|
475 |
+ acn[acn.index, jj] <- C2(object, marker.index, male.batch.index, male.index, NP.X=TRUE) |
|
476 |
+ if(is.ff){ |
|
477 |
+ close(nuB(object)) |
|
478 |
+ close(phiB(object)) |
|
479 |
+ } |
|
480 |
+ } |
|
481 | 481 |
} |
482 | 482 |
} |
483 | 483 |
if(is.ff){ |
... | ... |
@@ -51,19 +51,19 @@ platforms must have a corresponding annotation package (installed |
51 | 51 |
separately) that are listed below. |
52 | 52 |
|
53 | 53 |
<<annotationPackages>>= |
54 |
-annotationPackages() |
|
54 |
+pkgs <- annotationPackages() |
|
55 |
+pkgs <- pgks[grep("Crlmm", pkgs)] |
|
56 |
+pkgs |
|
55 | 57 |
@ |
56 | 58 |
|
57 |
-Only the annotation package that end with the 'Crlmm' postfix are |
|
58 |
-supported for copy number estimation. For instance, |
|
59 |
-'pd.genomewidesnp6' and 'genomewidesnp6Crlmm' are both annotation |
|
60 |
-packages for the Affymetrix 6.0 platform, but the |
|
61 |
-'genomewidesnp6Crlmm' annotation package must be used for copy number |
|
62 |
-estimation. The annotation package is specified through the 'cdfName' |
|
63 |
-we specify the cdf name for Affymetrix 6.0, provide the complete path |
|
64 |
-to the CEL files, and indicate where intermediate files from the copy |
|
65 |
-number estimation are to be saved. |
|
59 |
+Note that 'pd.genomewidesnp6' and 'genomewidesnp6Crlmm' are both |
|
60 |
+annotation packages for the Affymetrix 6.0 platform available on |
|
61 |
+Bioconductor, but the 'genomewidesnp6Crlmm' annotation package must be |
|
62 |
+used for copy number estimation. The annotation package is specified |
|
63 |
+through the 'cdfName' -- the identifier without the 'Crlmm' postfix. |
|
64 |
+In the following code, we specify the cdf name for Affymetrix 6.0, |
|
65 |
+provide the complete path to the CEL files, and indicate where |
|
66 |
+intermediate files from the copy number estimation are to be saved. |
|
66 | 67 |
|
67 | 68 |
<<setup>>= |
68 | 69 |
cdfName <- "genomewidesnp6" |
... | ... |
@@ -188,10 +188,11 @@ The \Rfunction{crlmmCopynumber} performs the following steps: |
188 | 188 |
displayed during the processing. |
189 | 189 |
|
190 | 190 |
<<LDS_copynumber>>= |
191 |
-cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=gtSet) |
|
191 |
+GT.CONF.THR <- 0.90 |
|
192 |
+cnSet <- checkExists("cnSet", .path=outdir, .FUN=crlmmCopynumber, object=gtSet, GT.CONF.THR=GT.CONF.THR) |
|
192 | 193 |
@ |
193 | 194 |
|
194 |
-In an effort to reduct I/O, the \Rpackage{crlmmCopynumber} function no |
|
195 |
+In an effort to reduce I/O, the \Rpackage{crlmmCopynumber} function no |
|
195 | 196 |
longer stores the allele-specific estimates of copy number as part of |
196 | 197 |
the object. Rather, several functions are available that will compute |
197 | 198 |
relatively quickly the allele-specific estimates from the stored |
... | ... |
@@ -476,6 +477,118 @@ Scatterplots of the A and B allele intensities (log-scale) can be |
476 | 477 |
useful for assessing the biallelic genotype calls. This section of |
477 | 478 |
the vignette is currently under development. |
478 | 479 |
|
480 |
+\section{Reasons for missing values} |
|
481 |
+ |
|
482 |
+There are several reasons for estimates of the allele-specific copy |
|
483 |
+number to have missing values (\texttt{NA}'s). This section briefly |
|
484 |
+elaborates on the source of missing values in the HapMap analysis and |
|
485 |
+discusses possible alternatives to reduce the number of missing |
|
486 |
+values. Note that allele-specific copy number, 'CA' and 'CB', is not |
|
487 |
+saved in the \Robject{cnSet} object. Rather, the respective accessors |
|
488 |
+calculate 'CA' and 'CB' on the fly from the normalized intensities and |
|
489 |
+from the marker-specific parameter estimates in the linear model. In |
|
490 |
+general, a missing value arises when the background or slope parameter |
|
491 |
+was not estimated in the linear model. Most often, missing values |
|
492 |
+occur when the genotype confidence scores for a SNP were below the |
|
493 |
+threshold used by the \Robject{crlmmCopynumber} function. For the |
|
494 |
+HapMap analysis, we used a confidence threshold of |
|
495 |
+\Sexpr{GT.CONF.THR}. |
|
496 |
+ |
|
497 |
+<<NA>>= |
|
498 |
+autosome.index <- which(isSnp(cnSet) & chromosome(cnSet) < 23) |
|
499 |
+sample.index <- which(batch(cnSet) == "C") |
|
500 |
+ca <- CA(cnSet, i=autosome.index, j=sample.index) |
|
501 |
+missing.ca <- is.na(ca) |
|
502 |
+(nmissing.ca <- sum(missing.ca)) |
|
503 |
+@ |
|
504 |
+ |
|
505 |
+If \Robject{nmissing.ca} is nonzero, then one could check the genotype |
|
506 |
+confidence scores provided by the crlmm genotyping algorithm against |
|
507 |
+the threshold specified in \Robject{crlmmCopynumber}. |
|
508 |
+ |
|
509 |
+<<NAconfidenceScores>>= |
|
510 |
+if(nmissing.ca > 0){ |
|
511 |
+ gt.conf <- i2p(snpCallProbability(cnSet)[autosome.index, sample.index]) |
|
512 |
+ below.thr <- gt.conf < GT.CONF.THR |
|
513 |
+ index.allbelow <- as.integer(which(rowSums(below.thr) == length(sample.index))) |
|
514 |
+ all.equal(index, index.allbelow) |
|
515 |
+ ## or calculate the proportion of missing effected by low crlmm confidence |
|
516 |
+ sum(index.allbelow %in% index)/length(index) |
|
517 |
+} |
|
518 |
+@ |
|
519 |
+ |
|
520 |
+We repeat the above check for missing values at polymorphic loci on |
|
521 |
+chromosome X. In this case, we compare the \Robject{rowSums} of the |
|
522 |
+missing values to the number of samples to check whether all of the |
|
523 |
+estimates are missing for a given SNP. |
|
524 |
+ |
|
525 |
+<<NAchromosomeX>>= |
|
526 |
+X.index <- which(isSnp(cnSet) & chromosome(cnSet) == 23) |
|
527 |
+ca.X <- CA(cnSet, i=X.index, j=sample.index) |
|
528 |
+missing.caX <- is.na(ca.X) |
|
529 |
+nmissing.caX <- sum(missing.caX) |
|
530 |
+missing.snp.index <- which(rowSums(missing.caX) == length(sample.index)) |
|
531 |
+index <- which(rowSums(missing.caX) == length(sample.index)) |
|
532 |
+length(index)*length(sample.index)/nmissing.caX |
|
533 |
+@ |
|
534 |
+ |
|
535 |
+From the above codechunk, we see that \Sexpr{length(index)} SNPs have |
|
536 |
+NAs for all the samples. Next, we tally the number of NAs for |
|
537 |
+polymorphic markers on chromosome X that are below the confidence |
|
538 |
+threshold. For the HapMap analysis, all of the missing values arose |
|
539 |
+from SNPs in which either the men or the women had confidence scores |
|
540 |
+that were all below the threshold. |
|
541 |
+ |
|
542 |
+<<NAcrlmmConfidenceScore>>= |
|
543 |
+if(nmissing.caX > 0){ |
|
544 |
+ gt.conf <- i2p(snpCallProbability(cnSet)[X.index, sample.index]) |
|
545 |
+ below.thr <- gt.conf < GT.CONF.THR |
|
546 |
+ F <- which(cnSet$gender[sample.index] == 2) |
|
547 |
+ M <- which(cnSet$gender[sample.index] == 1) |
|
548 |
+ index.allbelowF <- as.integer(which(rowSums(below.thr[, F]) == length(F))) |
|
549 |
+ index.allbelowM <- as.integer(which(rowSums(below.thr[, M]) == length(M))) |
|
550 |
+ index.allbelow <- as.integer(unique(c(index.allbelowF, index.allbelowM))) |
|
551 |
+ all.equal(index, index.allbelow) |
|
552 |
+ ## or calculate the proportion of missing effected by low crlmm confidence |
|
553 |
+ sum(index.allbelow %in% index)/length(index) |
|
554 |
+} |
|
555 |
+@ |
|
556 |
+ |
|
557 |
+Next, we verify that the number of missing values is the same for the |
|
558 |
+'B' allele at autosomal polymorphic loci |
|
559 |
+ |
|
560 |
+<<NAmissingForBallele>>= |
|
561 |
+cb <- CB(cnSet, i=autosome.index, j=sample.index) |
|
562 |
+missing.cb <- is.na(cb) |
|
563 |
+nmissing.cb <- sum(missing.cb) |
|
564 |
+@ |
|
565 |
+ |
|
566 |
+For nonpolymorphic loci, the genotype confidence score is irrelevant |
|
567 |
+and estimates should be available at all markers. |
|
568 |
+ |
|
569 |
+<<NAnonpolymorphic.autosome>>= |
|
570 |
+np.index <- which(!isSnp(cnSet) & chromosome(cnSet)==23) |
|
571 |
+cb <- CB(cnSet, i=np.index, j=sample.index) |
|
572 |
+stopifnot(sum(is.na(cb)) == 0) |
|
573 |
+ca.F <- CA(cnSet, i=np.index, j=F) |
|
574 |
+ca.M <- CA(cnSet, i=np.index, j=M) |
|
575 |
+sum(is.na(ca.F)) |
|
576 |
+sum(is.na(ca.M)) |
|
577 |
+@ |
|
578 |
+ |
|
579 |
+In total, there were \Sexpr{length(missing.snp.index)} polymorphic |
|
580 |
+markers on chromosome X for which copy number estimates are not |
|
581 |
+available. Lowering the confidence threshold would permit estimation |
|
582 |
+of copy number at most of these loci. A confidence threshold is |
|
583 |
+included as a parameter for the copy number estimation as an approach |
|
584 |
+to reduce the sensitivity of genotype-specific summary statistics, |
|
585 |
+such as the within-genotype median, to intensities from samples that |
|
586 |
+do not clearly fall into one of the biallelic genotype clusters. There |
|
587 |
+are drawbacks to this approach, including variance estimates that are |
|
588 |
+overly optimistic. More direct approaches for outlier detection and |
|
589 |
+removal may be explored in the future. |
|
590 |
+ |
|
591 |
+ |
|
479 | 592 |
\section{Session information} |
480 | 593 |
<<sessionInfo, results=tex>>= |
481 | 594 |
toLatex(sessionInfo()) |
... | ... |
@@ -10,7 +10,7 @@ |
10 | 10 |
crlmmCopynumber(object, MIN.SAMPLES=10, SNRMin = 5, MIN.OBS = 1, |
11 | 11 |
DF.PRIOR = 50, bias.adj = FALSE, |
12 | 12 |
prior.prob = rep(1/4, 4), seed = 1, verbose = TRUE, |
13 |
- GT.CONF.THR = 0.95, MIN.NU = 2^3, MIN.PHI = 2^3, |
|
13 |
+ GT.CONF.THR = 0.80, MIN.NU = 2^3, MIN.PHI = 2^3, |
|
14 | 14 |
THR.NU.PHI = TRUE, type=c("SNP", "NP", "X.SNP", "X.NP")) |
15 | 15 |
} |
16 | 16 |
|