- updated Makefile so it won't fail on Bioconductor
- added alias for readIdatFiles2
- removed a few of the exported functions from NAMESPACE that were there for purposes of debugging
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@48622 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -60,7 +60,6 @@ exportMethods(open, "[", show, lM, lines) |
60 | 60 |
export(crlmm, |
61 | 61 |
crlmmCopynumber, |
62 | 62 |
crlmmIllumina, |
63 |
-## ellipse, |
|
64 | 63 |
genotype, |
65 | 64 |
readIdatFiles, |
66 | 65 |
readIdatFiles2, |
... | ... |
@@ -70,8 +69,7 @@ export(crlmm, |
70 | 69 |
genotype2, genotypeLD, |
71 | 70 |
batch, |
72 | 71 |
crlmmCopynumber2, crlmmCopynumberLD) |
73 |
-##export(initializeParamObject, biasAdjNP2) |
|
74 |
-export(construct) |
|
72 |
+ |
|
75 | 73 |
|
76 | 74 |
|
77 | 75 |
|
... | ... |
@@ -1,9 +1,9 @@ |
1 | 1 |
All: copynumber crlmmDownstream clean |
2 | 2 |
|
3 |
-copynumber: copynumber.tex |
|
3 |
+copynumber: copynumber.pdf |
|
4 | 4 |
cp -p ../scripts/copynumber.pdf . |
5 | 5 |
|
6 |
-crlmmDownstream: crlmmDownstream.tex |
|
6 |
+crlmmDownstream: crlmmDownstream.pdf |
|
7 | 7 |
cp -p ../scripts/crlmmDownstream.pdf . |
8 | 8 |
|
9 | 9 |
clean: |
10 | 10 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,88 @@ |
1 |
+setClassUnion("integerOrMissing", c("integer", "missing", "numeric")) |
|
2 |
+setGeneric("totalCopyNumber", function(object, i, j, ...) standardGeneric("totalCopyNumber")) |
|
3 |
+setMethod("totalCopyNumber", |
|
4 |
+ signature=signature(object="CNSet", i="integerOrMissing", j="integerOrMissing"), |
|
5 |
+ function(object, i, j, ...){ |
|
6 |
+ if(missing(i) & missing(j)){ |
|
7 |
+ if(inherits(CA(object), "ff") | inherits(CA(object), "ffdf")) stop("Must specify i and/or j for ff objects") |
|
8 |
+ } |
|
9 |
+ if(missing(i) & !missing(j)){ |
|
10 |
+ snp.index <- which(isSnp(object)) |
|
11 |
+ cn.total <- as.matrix(CA(object)[, j]) |
|
12 |
+ if(length(snp.index) > 0){ |
|
13 |
+ cb <- as.matrix(CB(object)[snp.index, j]) |
|
14 |
+ snps <- (1:nrow(cn.total))[i %in% snp.index] |
|
15 |
+ cn.total[snps, ] <- cn.total[snps, j] + cb |
|
16 |
+ } |
|
17 |
+ } |
|
18 |
+ if(!missing(i) & missing(j)){ |
|
19 |
+ snp.index <- intersect(which(isSnp(object)), i) |
|
20 |
+ cn.total <- as.matrix(CA(object)[i, ]) |
|
21 |
+ if(length(snp.index) > 0){ |
|
22 |
+ cb <- as.matrix(CB(object)[snp.index, ]) |
|
23 |
+ snps <- (1:nrow(cn.total))[i %in% snp.index] |
|
24 |
+ cn.total[snps, ] <- cn.total[snps, ] + cb |
|
25 |
+ } |
|
26 |
+ } |
|
27 |
+ if(!missing(i) & !missing(j)){ |
|
28 |
+ snp.index <- intersect(which(isSnp(object)), i) |
|
29 |
+ cn.total <- as.matrix(CA(object)[i, j]) |
|
30 |
+ if(length(snp.index) > 0){ |
|
31 |
+ cb <- as.matrix(CB(object)[snp.index, j]) |
|
32 |
+ snps <- (1:nrow(cn.total))[i %in% snp.index] |
|
33 |
+ cn.total[snps, ] <- cn.total[snps, ] + cb |
|
34 |
+ } |
|
35 |
+ } |
|
36 |
+ cn.total <- cn.total/100 |
|
37 |
+ dimnames(cn.total) <- NULL |
|
38 |
+ return(cn.total) |
|
39 |
+}) |
|
40 |
+ |
|
41 |
+ |
|
42 |
+constructFeatureData <- function(gns, cdfName){ |
|
43 |
+ pkgname <- paste(cdfName, "Crlmm", sep="") |
|
44 |
+ path <- system.file("extdata", package=pkgname) |
|
45 |
+ load(file.path(path, "cnProbes.rda")) |
|
46 |
+ load(file.path(path, "snpProbes.rda")) |
|
47 |
+ cnProbes$chr <- chromosome2integer(cnProbes$chr) |
|
48 |
+ cnProbes <- as.matrix(cnProbes) |
|
49 |
+ snpProbes$chr <- chromosome2integer(snpProbes$chr) |
|
50 |
+ snpProbes <- as.matrix(snpProbes) |
|
51 |
+ mapping <- rbind(snpProbes, cnProbes, deparse.level=0) |
|
52 |
+ mapping <- mapping[match(gns, rownames(mapping)), ] |
|
53 |
+ isSnp <- 1L-as.integer(gns %in% rownames(cnProbes)) |
|
54 |
+ mapping <- cbind(mapping, isSnp, deparse.level=0) |
|
55 |
+ stopifnot(identical(rownames(mapping), gns)) |
|
56 |
+ colnames(mapping) <- c("chromosome", "position", "isSnp") |
|
57 |
+ new("AnnotatedDataFrame", |
|
58 |
+ data=data.frame(mapping), |
|
59 |
+ varMetadata=data.frame(labelDescription=colnames(mapping))) |
|
60 |
+} |
|
61 |
+constructAssayData <- function(np, snp, object, storage.mode="environment", order.index){ |
|
62 |
+ stopifnot(identical(snp$gns, featureNames(object))) |
|
63 |
+ gns <- c(featureNames(object), np$gns) |
|
64 |
+ sns <- np$sns |
|
65 |
+ np <- np[1:2] |
|
66 |
+ snp <- snp[1:2] |
|
67 |
+ stripnames <- function(x) { |
|
68 |
+ dimnames(x) <- NULL |
|
69 |
+ x |
|
70 |
+ } |
|
71 |
+ np <- lapply(np, stripnames) |
|
72 |
+ snp <- lapply(snp, stripnames) |
|
73 |
+ A <- rbind(snp[[1]], np[[1]], deparse.level=0)[order.index, ] |
|
74 |
+ B <- rbind(snp[[2]], np[[2]], deparse.level=0)[order.index, ] |
|
75 |
+ gt <- stripnames(calls(object)) |
|
76 |
+ emptyMatrix <- matrix(integer(), nrow(np[[1]]), ncol(A)) |
|
77 |
+ gt <- rbind(gt, emptyMatrix, deparse.level=0)[order.index,] |
|
78 |
+ pr <- stripnames(snpCallProbability(object)) |
|
79 |
+ pr <- rbind(pr, emptyMatrix, deparse.level=0)[order.index, ] |
|
80 |
+ emptyMatrix <- matrix(integer(), nrow(A), ncol(A)) |
|
81 |
+ aD <- assayDataNew(storage.mode, |
|
82 |
+ alleleA=A, |
|
83 |
+ alleleB=B, |
|
84 |
+ call=gt, |
|
85 |
+ callProbability=pr, |
|
86 |
+ CA=emptyMatrix, |
|
87 |
+ CB=emptyMatrix) |
|
88 |
+} |
... | ... |
@@ -17,37 +17,40 @@ |
17 | 17 |
\newcommand{\R}{\textsf{R}} |
18 | 18 |
|
19 | 19 |
\begin{document} |
20 |
-\title{Copy number estimation and genotype calling with \Rpackage{crlmm}} |
|
21 |
-\date{\today} |
|
20 |
+\title{Using \crlmm{} for copy number estimation and genotype calling |
|
21 |
+ with Illumina platforms} |
|
22 |
+ |
|
23 |
+\date{\today} |
|
24 |
+ |
|
22 | 25 |
\author{Rob Scharpf} |
23 | 26 |
\maketitle |
24 | 27 |
|
25 | 28 |
Allele-specific copy number estimation in the crlmm package is |
26 | 29 |
available for several Illumina platforms. As described in the |
27 |
-\texttt{copynumber.pdf} vignette, this algorithm works best when there |
|
28 |
-are a sufficient number of samples such that AA, AB, and BB genotypes |
|
29 |
-are observed at most loci. For small studies (e.g., fewer than 50 |
|
30 |
-samples), there will be a large number of SNPs that are |
|
30 |
+\texttt{copynumber} vignette, copy number estimation in \crlmm{} works |
|
31 |
+best when there are a sufficient number of samples such that AA, AB, |
|
32 |
+and BB genotypes are observed at most loci. For small studies (e.g., |
|
33 |
+fewer than 50 samples), there will be a large number of SNPs that are |
|
31 | 34 |
monomorphic. For monomorphic SNPs, the estimation problem becomes more |
32 | 35 |
difficult and alternative strategies that estimate the relative total |
33 |
-copy number (as to absolute allele-specific copy number) may be |
|
34 |
-preferable. In the following code, we list the platforms for which |
|
35 |
-annotation packages are available for computations performed by |
|
36 |
-\crlmm{}. Next we create a directory where output files will be stored |
|
37 |
-and indicate the directory that contains the IDAT files that will be |
|
38 |
-used in our analysis. |
|
36 |
+copy number may be preferable. In addition to installing \crlmm{}, |
|
37 |
+one must also install the appropriate annotation package for the |
|
38 |
+Illumina platform. In the following code, we list the platforms for |
|
39 |
+which annotation packages are currently available. Next we create a |
|
40 |
+directory where output files will be stored and indicate the directory |
|
41 |
+that contains the IDAT files that will be used in our analysis. |
|
39 | 42 |
|
40 | 43 |
<<setup, echo=FALSE, results=hide>>= |
41 | 44 |
options(width=70) |
42 | 45 |
options(continue=" ") |
43 | 46 |
@ |
44 | 47 |
|
45 |
-<<>>= |
|
48 |
+<<libraries>>= |
|
46 | 49 |
library(crlmm) |
47 | 50 |
crlmm:::validCdfNames() |
48 |
-outdir <- "/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/trunk/illumina_vignette" |
|
51 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/crlmmVignette/release/illumina" |
|
52 |
+dir.create(outdir, showWarnings=FALSE, recursive=TRUE) |
|
49 | 53 |
datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
50 |
-cdfName <- "human370v1c" |
|
51 | 54 |
@ |
52 | 55 |
|
53 | 56 |
To perform copy number analysis on the Illumina platform, several |
... | ... |
@@ -69,106 +72,200 @@ scan date of the arrays for later use. The scan dates can be pulled |
69 | 72 |
from the RG object and added to the \Robject{SnpSet} returned by |
70 | 73 |
\Rfunction{crlmmIllumina} as illustrated below. |
71 | 74 |
|
75 |
+%\texttt{copynumber.pdf} vignette, we provide an option to preprocess |
|
76 |
+%and genotype using ordinary matrices (large RAM, but quicker access) |
|
77 |
+%or \Robject{ff} objects (low RAM, but more I/O overhead). In the |
|
78 |
+%analysis below, the sample size is small (n=43) and we use matrices. |
|
79 |
+%(Users wanting the \Rpackage{ff} implementation should try |
|
80 |
+%\Rfunction{crlmm:::crlmmIllumina2} with the same arguments.) |
|
81 |
+ |
|
72 | 82 |
<<samplesToProcess>>= |
73 | 83 |
samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE) |
74 | 84 |
samplesheet <- samplesheet[-c(28:46,61:75,78:79), ] |
75 | 85 |
arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"])) |
76 |
-##Check that files exist |
|
77 | 86 |
grnfiles = all(file.exists(paste(arrayNames, "_Grn.idat", sep=""))) |
78 | 87 |
redfiles = all(file.exists(paste(arrayNames, "_Red.idat", sep=""))) |
79 |
-myfun <- function(outdir, filenames, samplesheet, arrayInfoColNames, cdfName, cnFile, snpFile){ |
|
80 |
- if(missing(cnFile)) cnFile <- file.path(outdir, "cnFile.rda") |
|
81 |
- if(missing(snpFile)) snpFile <- file.path(outdir, "snpFile.rda") |
|
82 |
- RG <- readIdatFiles(sampleSheet=samplesheet, |
|
83 |
- path=dirname(filenames), |
|
84 |
- arrayInfoColNames=arrayInfoColNames, |
|
85 |
- saveDate=TRUE) |
|
86 |
- crlmmResult <- crlmmIllumina(RG=RG, |
|
87 |
- cdfName="human370v1c", |
|
88 |
- sns=pData(RG)$ID, |
|
89 |
- returnParams=TRUE, |
|
90 |
- cnFile=cnFile, |
|
91 |
- snpFile=snpFile, |
|
92 |
- save.it=TRUE) |
|
93 |
- protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate |
|
94 |
- range(protocolData(crlmmResult)$ScanDate) |
|
95 |
- return(crlmmResult) |
|
88 |
+@ |
|
89 |
+ |
|
90 |
+<<samplesToProcess2, echo=FALSE>>= |
|
91 |
+if(!exists("crlmmResult")){ |
|
92 |
+ if(!file.exists(file.path(outdir, "crlmmResult.rda"))){ |
|
93 |
+ RG <- readIdatFiles(samplesheet, |
|
94 |
+ path=dirname(arrayNames[1]), |
|
95 |
+ arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
96 |
+ saveDate=TRUE) |
|
97 |
+ crlmmResult <- crlmmIllumina(RG=RG, |
|
98 |
+ cdfName="human370v1c", |
|
99 |
+ sns=pData(RG)$ID, |
|
100 |
+ returnParams=TRUE, |
|
101 |
+ cnFile=file.path(outdir, "cnFile.rda"), |
|
102 |
+ snpFile=file.path(outdir, "snpFile.rda"), |
|
103 |
+ save.it=TRUE) |
|
104 |
+ protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate |
|
105 |
+ range(protocolData(crlmmResult)$ScanDate) |
|
106 |
+ save(crlmmResult, file=file.path(outdir, "crlmmResult.rda")) |
|
107 |
+ rm(RG); gc() |
|
108 |
+ } |
|
96 | 109 |
} |
97 |
-trace(myfun, browser) |
|
98 |
-crlmmResult <- checkExists("crlmmResult", path=outdir, FUN=myfun, |
|
99 |
- outdir=outdir, |
|
100 |
- filenames=arrayNames, |
|
101 |
- samplesheet=samplesheet, |
|
102 |
- arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
103 |
- cdfName=cdfName) |
|
104 |
-##if(!exists("crlmmResult")){ |
|
105 |
-## if(!file.exists(file.path(outdir, "crlmmResult.rda"))){ |
|
106 |
-## RG <- readIdatFiles(samplesheet, |
|
107 |
-## path=dirname(arrayNames[1]), |
|
108 |
-## arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
109 |
-## saveDate=TRUE) |
|
110 |
-## crlmmResult <- crlmmIllumina(RG=RG, |
|
111 |
-## cdfName="human370v1c", |
|
112 |
-## sns=pData(RG)$ID, |
|
113 |
-## returnParams=TRUE, |
|
114 |
-## cnFile=file.path(outdir, "cnFile.rda"), |
|
115 |
-## snpFile=file.path(outdir, "snpFile.rda"), |
|
116 |
-## save.it=TRUE) |
|
117 |
-## protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate |
|
118 |
-## range(protocolData(crlmmResult)$ScanDate) |
|
119 |
-## save(crlmmResult, file=file.path(outdir, "crlmmResult.rda")) |
|
120 |
-## rm(RG); gc() |
|
121 |
-## } else { |
|
122 |
-## message("Loading previously saved crlmm results") |
|
123 |
-## load(file.path(outdir, "snpFile.rda")) |
|
124 |
-## res <- get("res") |
|
125 |
-## load(file.path(outdir, "cnFile.rda")) |
|
126 |
-## cnAB <- get("cnAB") |
|
127 |
-## load(file.path(outdir, "crlmmResult.rda")) |
|
128 |
-## } |
|
129 |
-##} |
|
130 | 110 |
@ |
131 | 111 |
|
132 |
-This creates a \Robject{crlmmSetList} object in the \Robject{outdir} |
|
133 |
-directory. The first element of this object contains the |
|
134 |
-quantile-normalized A and B intensities. The second element in the list |
|
135 |
-contains the crlmm genotype calls. |
|
112 |
+<<samplesToProcess3, eval=FALSE>>= |
|
113 |
+RG <- readIdatFiles(samplesheet, |
|
114 |
+ path=dirname(arrayNames[1]), |
|
115 |
+ arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
116 |
+ saveDate=TRUE) |
|
117 |
+crlmmResult <- crlmmIllumina(RG=RG, |
|
118 |
+ cdfName="human370v1c", |
|
119 |
+ sns=pData(RG)$ID, |
|
120 |
+ returnParams=TRUE, |
|
121 |
+ cnFile=file.path(outdir, "cnFile.rda"), |
|
122 |
+ snpFile=file.path(outdir, "snpFile.rda"), |
|
123 |
+ save.it=TRUE) |
|
124 |
+protocolData(crlmmResult)$ScanDate <- protocolData(RG)$ScanDate |
|
125 |
+range(protocolData(crlmmResult)$ScanDate) |
|
126 |
+rm(RG); gc() |
|
127 |
+@ |
|
136 | 128 |
|
129 |
+\noindent Finally, we load a few of the intermediate files that were |
|
130 |
+created during the preprocessing and genotyping. |
|
131 |
+<<loadIntermediate, eval=FALSE>>= |
|
132 |
+load(file.path(outdir, "snpFile.rda")) |
|
133 |
+res <- get("res") |
|
134 |
+load(file.path(outdir, "cnFile.rda")) |
|
135 |
+cnAB <- get("cnAB") |
|
136 |
+load(file.path(outdir, "crlmmResult.rda")) |
|
137 |
+@ |
|
137 | 138 |
|
138 |
-<<SNR>>= |
|
139 |
-CHR <- 1 |
|
140 |
-filename <- paste(outdir, "/crlmmSetList_", CHR, ".rda", sep="") |
|
141 |
-load(filename) |
|
142 |
-hist(crlmmSetList[[1]]$SNR) |
|
139 |
+<<loadIntermediate, eval=TRUE, echo=FALSE>>= |
|
140 |
+if(!exists("res")){ |
|
141 |
+ load(file.path(outdir, "snpFile.rda")) |
|
142 |
+ res <- get("res") |
|
143 |
+ load(file.path(outdir, "cnFile.rda")) |
|
144 |
+ cnAB <- get("cnAB") |
|
145 |
+ load(file.path(outdir, "crlmmResult.rda")) |
|
146 |
+} |
|
143 | 147 |
@ |
144 | 148 |
|
145 |
-Run update on the CrlmmSetList object to obtain copy number estimates. Estimate |
|
146 |
-copy number for chromosome 1. |
|
149 |
+After running the crlmm algorithm, we construct a container for |
|
150 |
+storing the quantile normalized intensities, genotype calls, and |
|
151 |
+allele-specific copy number estimates. A few helper functions for |
|
152 |
+facilitating the construction of this container have been added to the |
|
153 |
+inst/scripts directory of this package and can be sourced as follows. |
|
154 |
+Documentation for these helper functions will be available in the |
|
155 |
+devel version of this package. |
|
156 |
+ |
|
157 |
+<<constructContainer, eval=FALSE>>= |
|
158 |
+path <- system.file("scripts", package="crlmm") |
|
159 |
+source(file.path(path, "helperFunctions.R")) |
|
160 |
+fD <- constructFeatureData(c(res$gns, cnAB$gns), cdfName="human370v1c") |
|
161 |
+new.order <- order(fD$chromosome, fD$position) |
|
162 |
+fD <- fD[new.order, ] |
|
163 |
+aD <- constructAssayData(cnAB, res, crlmmResult, order.index=new.order) |
|
164 |
+protocolData(crlmmResult)$batch <- vector("integer", ncol(crlmmResult)) |
|
165 |
+container <- new("CNSetLM", |
|
166 |
+ assayData=aD, |
|
167 |
+ phenoData=phenoData(crlmmResult), |
|
168 |
+ protocolData=protocolData(crlmmResult), |
|
169 |
+ featureData=fD, |
|
170 |
+ annotation="human370v1c") |
|
171 |
+@ |
|
147 | 172 |
|
148 |
-<<estimateCn>>= |
|
149 |
-update(filename) |
|
150 |
-##Or equivalently, |
|
151 |
-##crlmmSetList <- computeCopynumber(crlmmSetList) |
|
152 |
-##save(crlmmSetList, file=file.path(outdir, paste("crlmmSetList_", CHR, ".rda", sep=""))) |
|
173 |
+<<container2,echo=FALSE>>= |
|
174 |
+if(!file.exists(file.path(outdir, "cnSet"))){ |
|
175 |
+ path <- system.file("scripts", package="crlmm") |
|
176 |
+ source(file.path(path, "helperFunctions.R")) |
|
177 |
+ fD <- constructFeatureData(c(res$gns, cnAB$gns), cdfName="human370v1c") |
|
178 |
+ new.order <- order(fD$chromosome, fD$position) |
|
179 |
+ fD <- fD[new.order, ] |
|
180 |
+ aD <- constructAssayData(cnAB, res, crlmmResult, order.index=new.order) |
|
181 |
+ protocolData(crlmmResult)$batch <- vector("integer", ncol(crlmmResult)) |
|
182 |
+ container <- new("CNSetLM", |
|
183 |
+ assayData=aD, |
|
184 |
+ phenoData=phenoData(crlmmResult), |
|
185 |
+ protocolData=protocolData(crlmmResult), |
|
186 |
+ featureData=fD, |
|
187 |
+ annotation="human370v1c") |
|
188 |
+} |
|
153 | 189 |
@ |
154 | 190 |
|
155 |
-Samples with low signal to noise ratios tend to have a lot of variation |
|
156 |
-in the point estimates of copy number. One may want to exclude these |
|
157 |
-samples, or smooth after filtering outliers. Here we load the |
|
158 |
-crlmmSetList object. See the copynumber.Rnw vignette for example plots. |
|
159 | 191 |
|
160 |
-<<crlmmSetList>>= |
|
161 |
-##reload the updated object |
|
162 |
-load(filename) |
|
163 |
-cn <- copyNumber(crlmmSetList) |
|
192 |
+ |
|
193 |
+As described in the \texttt{copynumber} vignette, two \R{} functions |
|
194 |
+for copy number estimation are available: \Rfunction{crlmmCopynumber} |
|
195 |
+and \Rfunction{crlmmCopynumber2}. The latter requires that the assay |
|
196 |
+data elements are represented using \Robject{ff} objects. As the |
|
197 |
+dataset for this vignette is small (43 arrays) and the above steps did |
|
198 |
+not make use of the \Robject{ff} features, constructing \Robject{ff} |
|
199 |
+objects at this point in the analysis would serve little purpose. The |
|
200 |
+decision to use ordinary matrices or \Robject{ff} objects should be |
|
201 |
+decided at the beginning of the analysis and then propogated to both |
|
202 |
+the genotyping and copy number estimation steps. Here, we use the |
|
203 |
+\Rfunction{crlmmCopynumber} to estimate copy number. |
|
204 |
+ |
|
205 |
+<<estimateCopynumber, echo=FALSE>>= |
|
206 |
+if(!exists("cnSet")){ |
|
207 |
+ if(!file.exists(file.path(outdir, "cnSet.rda"))){ |
|
208 |
+ cnSet <- crlmmCopynumber(container, verbose=FALSE) |
|
209 |
+ save(cnSet, file=file.path(outdir, "cnSet.rda")) |
|
210 |
+ } else load(file.path(outdir, "cnSet.rda")) |
|
211 |
+} |
|
212 |
+@ |
|
213 |
+ |
|
214 |
+<<estimateCopynumber, eval=FALSE>>= |
|
215 |
+cnSet <- crlmmCopynumber(container, verbose=TRUE) |
|
216 |
+@ |
|
217 |
+ |
|
218 |
+%In the following code, we define helper functions to construct a |
|
219 |
+%\Robject{featureData} object that chromosome and physical position |
|
220 |
+%annotation for each marker. We then define a constructor for |
|
221 |
+%initializing the assay data -- the copy number estimation algorithm |
|
222 |
+%requires normalized intensities, genotype calls, and genotype |
|
223 |
+%confidence scores. Note that the order of the construction is |
|
224 |
+%important. In particular, the validity method for the |
|
225 |
+%\Robject{CNSetLM} object requires a 'batch' label in the |
|
226 |
+%\Robject{protocolData} slot and that 'chromosome', 'position', and |
|
227 |
+%'isSnp' labels are present in the \Robject{featureData} slot. With |
|
228 |
+%the object \Robject{cnSet} in hand, one can proceed with the copy |
|
229 |
+%number analysis as outlined in the \texttt{copynumber.pdf} vignette. |
|
230 |
+%Useful accessors and visualizations of the locus-level estimates are |
|
231 |
+%also discussed in the \texttt{copynumber.pdf} vignette. |
|
232 |
+ |
|
233 |
+ |
|
234 |
+\paragraph{Accessors for extracting the locus-level copy number |
|
235 |
+ estimates.} |
|
236 |
+ |
|
237 |
+As an example of how to use accessors to obtain the allele-specific CN |
|
238 |
+estimates, the following code chunk extracts the allele-specific copy |
|
239 |
+number for polymorphic markers on chromosome 21. |
|
240 |
+ |
|
241 |
+<<acn-accessor>>= |
|
242 |
+marker.index <- which(chromosome(cnSet) == 21 & isSnp(cnSet)) |
|
243 |
+ca <- CA(cnSet)[marker.index, ]/100 |
|
244 |
+cb <- CB(cnSet)[marker.index, ]/100 |
|
245 |
+missing.index <- which(rowSums(is.na(ca))==ncol(cnSet)) |
|
246 |
+ca <- ca[-missing.index, ] |
|
247 |
+cb <- cb[-missing.index, ] |
|
248 |
+@ |
|
249 |
+\noindent Negating the \Robject{isSnp} function could be used to |
|
250 |
+extract the estimates at nonpolymorphic markers. For instance, |
|
251 |
+<<monomorphic-accessor>>= |
|
252 |
+cn.monomorphic <- CA(cnSet)[which(chromosome(cnSet) == 21 & !isSnp(cnSet)), ]/100 |
|
164 | 253 |
@ |
165 | 254 |
|
166 |
-The following helper function can facilitate access to the total copy |
|
167 |
-number. |
|
255 |
+At polymorphic loci, the total copy number is the sum of the number of |
|
256 |
+copies of the A allele and the number of copies for the B allele. At |
|
257 |
+nonpolymorphic loci, the total copy number is the number of copies for |
|
258 |
+the A allele. The helper function totalCopyNumber can be used to |
|
259 |
+extract the total copy number for all polymorphic and nonpolymorphic |
|
260 |
+markers. Documentation of the \Rfunction{totalCopyNumber} will be |
|
261 |
+available in the devel version of the \Rpackage{oligoClasses}. |
|
168 | 262 |
|
169 | 263 |
<<copyNumberHelper>>= |
170 |
-trace(totalCopyNumber, browser, signature="CNSet") |
|
171 |
-cn.total2 <- totalCopyNumber(cnSet, i=which(chromosome(cnSet)==1), j=1:20) |
|
264 |
+cn.total <- ca+cb |
|
265 |
+cn.total2 <- totalCopyNumber(cnSet, i=marker.index) |
|
266 |
+cn.total2 <- cn.total2[-missing.index, ] |
|
267 |
+dimnames(cn.total) <- NULL |
|
268 |
+all.equal(cn.total2, cn.total) |
|
172 | 269 |
@ |
173 | 270 |
|
174 | 271 |
A few simple visualizations may be helpful at this point. The first |
... | ... |
@@ -187,14 +284,15 @@ hist(cnSet$SNR, breaks=15) |
187 | 284 |
<<firstSample, fig=TRUE>>= |
188 | 285 |
low.snr <- which(cnSet$SNR == min(cnSet$SNR)) |
189 | 286 |
high.snr <- which(cnSet$SNR == max(cnSet$SNR)) |
190 |
-x <- position(cnSet)[chromosome(cnSet) == 1] |
|
287 |
+x <- position(cnSet)[marker.index] |
|
288 |
+x <- x[-missing.index] |
|
191 | 289 |
par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1)) |
192 | 290 |
for(j in c(low.snr, high.snr)){ |
193 |
- cn <- copyNumber(cnSet)[, j]/100 |
|
291 |
+ cn <- cn.total[, j] |
|
194 | 292 |
cn[cn < 0.05] <- 0.05 |
195 | 293 |
plot(x, |
196 |
- cn[chromosome(cnSet) == 1], |
|
197 |
- pch=".", ylab="copy number", xaxt="n", log="y") |
|
294 |
+ cn, |
|
295 |
+ pch=".", ylab="copy number", xaxt="n", ylim=c(0, 6)) |
|
198 | 296 |
} |
199 | 297 |
axis(1, at=pretty(x), labels=pretty(x/1e6)) |
200 | 298 |
mtext("Mb", 1, outer=TRUE, line=2) |
... | ... |
@@ -208,10 +306,7 @@ followed by a segmentation or hidden markov model. |
208 | 306 |
<<removeOutliers, fig=TRUE>>= |
209 | 307 |
par(mfrow=c(2,1), las=1, mar=c(0.5, 4, 0.5, 0.5), oma=c(4, 1, 1,1)) |
210 | 308 |
for(j in c(low.snr, high.snr)){ |
211 |
- x <- position(cnSet)[chromosome(cnSet)==1] |
|
212 |
- cn <- copyNumber(cnSet)[, j]/100 |
|
213 |
- cn[cn < 0.05] <- 0.05 |
|
214 |
- ##cn <- log(cn) |
|
309 |
+ cn <- cn.total[, j] |
|
215 | 310 |
x <- x[!is.na(cn)] |
216 | 311 |
cn <- cn[!is.na(cn)] |
217 | 312 |
y <- as.numeric(runmed(cn, k=3)) |
... | ... |
@@ -259,6 +354,10 @@ mtext("Mb", 1, outer=TRUE, line=2) |
259 | 354 |
% } |
260 | 355 |
%} |
261 | 356 |
%@ |
357 |
+\section{Session information} |
|
358 |
+<<sessionInfo, results=tex>>= |
|
359 |
+toLatex(sessionInfo()) |
|
360 |
+@ |
|
262 | 361 |
|
263 | 362 |
|
264 | 363 |
\end{document} |
... | ... |
@@ -4,6 +4,7 @@ |
4 | 4 |
\alias{CNSetLM} |
5 | 5 |
\alias{CNSetLM-class} |
6 | 6 |
\alias{[,CNSetLM-method} |
7 |
+\alias{lines,CNSetLM-method} |
|
7 | 8 |
\alias{lM,CNSetLM-method} |
8 | 9 |
\alias{lM<-,CNSetLM,list_or_ffdf-method} |
9 | 10 |
\alias{open,CNSetLM-method} |
... | ... |
@@ -41,10 +42,10 @@ Class \code{"\linkS4class{Versioned}"}, by class "CNSet", distance 6. |
41 | 42 |
\section{Methods}{ |
42 | 43 |
\describe{ |
43 | 44 |
\item{[}{\code{signature(x = "CNSetLM")}: subset \code{CNSetLM} objects} |
45 |
+ \item{lines}{\code{signature(x="CNSetLM")}: for drawing prediction regions on A vs B scatterplots} |
|
44 | 46 |
\item{lM}{\code{signature(object = "CNSetLM")}: Extract list or |
45 | 47 |
ffdf object containing linear model parameters} |
46 |
-## \item{lM<-}{\code{signature(object = "CNSetLM", value = "list_or_ffdf")}: ... } |
|
47 |
- \item{open}{\code{signature(con = "CNSetLM")}: opens file connects |
|
48 |
+ \item{open}{\code{signature(con = "CNSetLM")}: opens file connects |
|
48 | 49 |
to ff objects for assayData elements and linear model parameters} |
49 | 50 |
\item{show}{\code{signature(object = "CNSetLM")}: print method |
50 | 51 |
for the class } |
... | ... |
@@ -54,9 +55,7 @@ Class \code{"\linkS4class{Versioned}"}, by class "CNSet", distance 6. |
54 | 55 |
\seealso{ |
55 | 56 |
\code{\linkS4class{SnpSuperSet}}, \code{\linkS4class{CNSet}} |
56 | 57 |
} |
57 |
-% ~~objects to See Also as \code{\link{~~fun~~}}, ~~~ |
|
58 |
-% or \code{\linkS4class{CLASSNAME}} for links to other classes |
|
59 |
-%} |
|
58 |
+ |
|
60 | 59 |
\examples{ |
61 | 60 |
showClass("CNSetLM") |
62 | 61 |
} |
... | ... |
@@ -14,7 +14,7 @@ genotype(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps = |
14 | 14 |
rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
15 | 15 |
gender = NULL, returnParams = TRUE, badSNP = 0.7) |
16 | 16 |
|
17 |
-genotypeLD(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps = 0.1, verbose = TRUE, seed = 1, sns, copynumber = FALSE, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
17 |
+genotypeLD(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps = 0.1, verbose = TRUE, seed = 1, sns, copynumber = TRUE, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
18 | 18 |
} |
19 | 19 |
\arguments{ |
20 | 20 |
\item{filenames}{ complete path to CEL files} |