git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54327 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -284,7 +284,7 @@ genotype <- function(filenames, |
284 | 284 |
eps=eps, |
285 | 285 |
seed=seed, |
286 | 286 |
verbose=verbose) |
287 |
- ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames, seed=seed, |
|
287 |
+ ok <- cnrmaAffy(cnSet=cnSet, filenames=filenames, cdfName=cdfName, seed=seed, |
|
288 | 288 |
verbose=verbose) |
289 | 289 |
stopifnot(ok) |
290 | 290 |
ok <- genotypeAffy(cnSet=cnSet, mixtureParams=mixtureParams, |
... | ... |
@@ -324,7 +324,7 @@ genotypeAffy <- function(cnSet, mixtureParams, SNRMin=5, recallMin=10, |
324 | 324 |
return(TRUE) |
325 | 325 |
} |
326 | 326 |
|
327 |
-cnrmaAffy <- function(cnSet, filenames, seed=1, verbose=TRUE){ |
|
327 |
+cnrmaAffy <- function(cnSet, filenames, cdfName, seed=1, verbose=TRUE){ |
|
328 | 328 |
snp.I <- isSnp(cnSet) |
329 | 329 |
snp.index <- which(snp.I) |
330 | 330 |
np.index <- which(!snp.I) |
... | ... |
@@ -24,18 +24,19 @@ |
24 | 24 |
\author{Rob Scharpf} |
25 | 25 |
\maketitle |
26 | 26 |
|
27 |
-The workflow for copy number analyses in the \crlmm{} package requires |
|
28 |
-preprocessing and genotyping, followed by estimation of parameters for |
|
29 |
-copy number estimation. Supported platforms are those for which a |
|
30 |
-corresponding annotation package is available (see Tables |
|
31 |
-\ref{overview} and \ref{illumina}). Table \ref{overview} provides an |
|
32 |
-overview of the available vignettes pertaining to copy number |
|
33 |
-estimation. These vignettes are located in the \verb+inst/scripts+ |
|
34 |
-subdirectory of the \crlmm{} package. HapMap datasets are used to |
|
35 |
-illustrate the workflow and are not provided as part of the \crlmm{} |
|
36 |
-package. Users wishing to reproduce the analysis should download the |
|
37 |
-HapMap CEL files (Affymetrix) or the \verb+idat+ files (Illumina) and |
|
38 |
-modify the paths to the raw data files as appropriate. |
|
27 |
+The workflow for copy number analyses in the \crlmm{} package includes |
|
28 |
+preprocessing and genotyping of the raw intensities followed by |
|
29 |
+estimation of parameters for copy number estimation using |
|
30 |
+\Rfunction{crlmmCopynumber}. Supported platforms are those for which |
|
31 |
+a corresponding annotation package is available. Table \ref{overview} |
|
32 |
+provides an overview of the available vignettes pertaining to copy |
|
33 |
+number estimation. These vignettes are located in the |
|
34 |
+\verb+inst/scripts+ subdirectory of the \crlmm{} package. HapMap |
|
35 |
+datasets are used to illustrate the workflow and are not provided as |
|
36 |
+part of the \crlmm{} package. Users wishing to reproduce the analysis |
|
37 |
+should download the HapMap CEL files (Affymetrix) or the \verb+idat+ |
|
38 |
+files (Illumina) and modify the paths to the raw data files as |
|
39 |
+appropriate. |
|
39 | 40 |
|
40 | 41 |
\begin{table}[h!] |
41 | 42 |
\begin{center} |
... | ... |
@@ -43,35 +44,20 @@ modify the paths to the raw data files as appropriate. |
43 | 44 |
\hline |
44 | 45 |
Vignette & Platform & Annotation package & Scope \\ |
45 | 46 |
\hline |
46 |
- Infrastructure & & & The CNSet container / large data support \\ |
|
47 |
+Infrastructure & Affy/Illumina & |
|
48 |
+& The CNSet container / large data support using the \Rpackage{ff} package \\ |
|
47 | 49 |
AffymetrixPreprocessCN & Affy 5.0, 6.0 & genomewidesnp5Crlmm, genomewidesnp6Crlmm & Preprocessing and genotyping \\ |
48 |
- IlluminaPreprocessCN & Illumina platforms & several$^\dagger$ & Preprocessing and genotyping \\ |
|
50 |
+ IlluminaPreprocessCN & Illumina & several$^\dagger$ & Preprocessing and genotyping \\ |
|
49 | 51 |
copynumber & Affy/Illumina & N/A & raw copy number estimates \\ |
50 | 52 |
% SmoothingRawCN & Affy/Illumina & N/A & smoothing via segmentation or hidden Markov models \\ |
51 | 53 |
\hline |
52 | 54 |
\end{tabular} |
53 | 55 |
\end{center} |
54 | 56 |
\caption{\label{overview} Vignettes for copy number |
55 |
- estimation. $^\dagger$ See table \ref{illumina} for the annotation |
|
56 |
- packages available for the Illumina platform.} |
|
57 |
-\end{table} |
|
58 |
- |
|
59 |
-\begin{table}[h!] |
|
60 |
-\begin{center} |
|
61 |
-\begin{tabular}{|l|} |
|
62 |
-\hline |
|
63 |
- human370v1cCrlmm \\ |
|
64 |
- human370quadv3cCrlmm \\ |
|
65 |
- human550v3bCrlmm \\ |
|
66 |
- human650v3aCrlmm \\ |
|
67 |
- human610quadv1bCrlmm \\ |
|
68 |
- human660quadv1aCrlmm \\ |
|
69 |
- human1mduov3bCrlmm \\ |
|
70 |
- humanomni1quadv1bCrlmm \\ |
|
71 |
-\hline |
|
72 |
-\end{tabular} |
|
73 |
-\end{center} |
|
74 |
-\caption{\label{illumina} Annotation packages for the Illumina platform.} |
|
57 |
+ estimation. $^\dagger$ Annotation packages available for the |
|
58 |
+ Illumina platform include \Rpackage{human370v1cCrlmm}, \Rpackage{human370quadv3cCrlmm}, |
|
59 |
+ \Rpackage{human550v3bCrlmm}, \Rpackage{human650v3aCrlmm}, \Rpackage{human610quadv1bCrlmm}, |
|
60 |
+ \Rpackage{human660quadv1aCrlmm}, \Rpackage{human1mduov3bCrlmm}, and \Rpackage{humanomni1quadv1bCrlmm}} |
|
75 | 61 |
\end{table} |
76 | 62 |
|
77 | 63 |
%We make use of the \R{} package \Rpackage{cacheSweave} for cacheing |
... | ... |
@@ -81,7 +67,9 @@ modify the paths to the raw data files as appropriate. |
81 | 67 |
|
82 | 68 |
In general, the workflow is |
83 | 69 |
\begin{enumerate} |
84 |
-\item preprocessing and genotyping (\verb+AffymetrixPreprocessCN+ or \verb+IlluminaPreprocessCN+ vignettes) |
|
70 |
+\item preprocess and genotype the arrays |
|
71 |
+ (\verb+AffymetrixPreprocessCN+ for Affymetrix and |
|
72 |
+ \verb+IlluminaPreprocessCN+ vignettes for Illumina) |
|
85 | 73 |
\item copy number estimation (\verb+copynumber+ vignette) |
86 | 74 |
%\item inferring regions of copy number gain and loss |
87 | 75 |
% (\verb+SmoothingRawCN+ vignette) |
... | ... |
@@ -55,12 +55,15 @@ This vignette analyzes HapMap samples assayed on the Affymetrix 6.0 |
55 | 55 |
platform. The annotation package for this platform is |
56 | 56 |
\Rpackage{genomewidesnp6Crlmm}. We assign the name of the annotation |
57 | 57 |
package without the \verb+Crlmm+ postfix to the name \verb+cdfName+. |
58 |
+We use the \R{} package \Rpackage{cacheSweave} to cache long |
|
59 |
+computations in this vignette. Users should refer to the |
|
60 |
+\Rpackage{cacheSweave} package for additional details regarding |
|
61 |
+cacheing. |
|
58 | 62 |
|
59 | 63 |
<<cdfname>>= |
60 | 64 |
cdfName <- "genomewidesnp6" |
61 | 65 |
@ |
62 | 66 |
|
63 |
- |
|
64 | 67 |
The HapMap CEL files are stored in a local directory assigned to |
65 | 68 |
\verb+pathToCels+ in the following code. The genotyping step will |
66 | 69 |
create several files with \verb+ff+ extensions. We will store these |
... | ... |
@@ -83,14 +86,8 @@ the genotyping step will be stored in \verb+outdir+. |
83 | 86 |
ldPath(outdir) |
84 | 87 |
@ |
85 | 88 |
|
86 |
-This vignette uses the \R{} package \Rpackage{cacheSweave} to cache |
|
87 |
-long computations. The following step is only necessarily if one |
|
88 |
-wishes to cache some of the computations. In particular, we specify |
|
89 |
-that the cached computations will be saved in the \verb+outdir+ |
|
90 |
-through the function \Rfunction{setCacheDir}. Users should refer to |
|
91 |
-the \Rpackage{cacheSweave} package for additional details regarding |
|
92 |
-cacheing. |
|
93 | 89 |
|
90 |
+% only needed if cacheing |
|
94 | 91 |
<<cachedir, echo=FALSE>>= |
95 | 92 |
setCacheDir(outdir) |
96 | 93 |
@ |
... | ... |
@@ -159,11 +156,11 @@ obtained throught the \Rfunction{print} or \Rfunction{show} methods. |
159 | 156 |
print(cnSet) |
160 | 157 |
@ |
161 | 158 |
|
162 |
-Note that the object is fairly small as the intensities and genotype |
|
163 |
-calls are stored on disk rather than in active memory. |
|
159 |
+Note that the object is relatively small as the intensities and |
|
160 |
+genotype calls are stored on disk rather than in active memory. |
|
164 | 161 |
|
165 | 162 |
<<objectsize>>= |
166 |
-object.size(cnSet) |
|
163 |
+print(object.size(cnSet), units="Mb") |
|
167 | 164 |
@ |
168 | 165 |
|
169 | 166 |
We save the \Robject{cnSet} object in a local directory for subsequent |
... | ... |
@@ -178,8 +175,9 @@ saveObject <- function(outdir, cnSet) { |
178 | 175 |
@ |
179 | 176 |
|
180 | 177 |
Users can proceed to the \verb+copynumber+ vignette for copy number |
181 |
-analyses. |
|
182 |
- |
|
178 |
+analyses. See the \verb+Infrastructure+ vignette for additional |
|
179 |
+details on the \Robject{CNSet} class, including an overview of the |
|
180 |
+available accessors. |
|
183 | 181 |
|
184 | 182 |
|
185 | 183 |
\section{Session information} |
... | ... |
@@ -184,8 +184,11 @@ normalized using the function \Rfunction{preprocessInf}. The function |
184 | 184 |
parameters for the mixture model used by the CRLMM genotyping |
185 | 185 |
algorithm. |
186 | 186 |
|
187 |
-<<preprocess,cache=TRUE>>= |
|
187 |
+<<preprocess,cache=TRUE, results=hide>>= |
|
188 | 188 |
mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo) |
189 |
+@ |
|
190 |
+ |
|
191 |
+<<showMixtureParams>>= |
|
189 | 192 |
invisible(open(mixtureParams)) |
190 | 193 |
str(mixtureParams[]) |
191 | 194 |
invisible(close(mixtureParams)) |
... | ... |
@@ -211,20 +214,10 @@ function \Rfunction{genotypeInf}. |
211 | 214 |
|
212 | 215 |
<<genotype,cache=TRUE>>= |
213 | 216 |
updated <- genotypeInf(cnSet, mixtureParams=mixtureParams) |
214 |
-print(updated) |
|
215 | 217 |
@ |
216 |
- |
|
217 |
-The posterior probabilities for the genotype calls in the |
|
218 |
-\verb+callProbability+ element of the \verb+assayData+ are stored as |
|
219 |
-integers to reduce the file size on disk. The scores can be |
|
220 |
-transformed to the probability scale using the \Rfunction{i2p} |
|
221 |
-function as illustrated in the following code chunk. |
|
222 |
- |
|
223 |
-<<callprobs>>= |
|
224 |
-invisible(open(snpCallProbability(cnSet))) |
|
225 |
-callProbs <- as.matrix(snpCallProbability(cnSet)[1:5, 1:5]) |
|
226 |
-i2p(callProbs) |
|
227 |
-invisible(close(snpCallProbability(cnSet))) |
|
218 |
+\vspace{-0.5em} |
|
219 |
+<<>>= |
|
220 |
+updated |
|
228 | 221 |
@ |
229 | 222 |
|
230 | 223 |
\textbf{Wrapper:} As an alternative to calling the functions |
... | ... |
@@ -265,27 +258,14 @@ delete(cnSet2$SKW) |
265 | 258 |
rm(cnSet2) |
266 | 259 |
@ |
267 | 260 |
|
268 |
-Users may now proceed to the CopyNumber vignette for copy number |
|
269 |
-analyses. |
|
270 |
- |
|
271 |
- |
|
261 |
+Users can proceed to the \verb+copynumber+ vignette for copy number |
|
262 |
+analyses. See the \verb+Infrastructure+ vignette for additional |
|
263 |
+details on the \Robject{CNSet} class, including an overview of the |
|
264 |
+available accessors. |
|
272 | 265 |
|
273 | 266 |
\section{Session information} |
274 | 267 |
<<sessionInfo, results=tex>>= |
275 | 268 |
toLatex(sessionInfo()) |
276 | 269 |
@ |
277 | 270 |
|
278 |
- |
|
279 |
-<<>>= |
|
280 |
-cnset.updated <- crlmmCopynumber(cnSet) |
|
281 |
-@ |
|
282 |
- |
|
283 |
-<<>>= |
|
284 |
-snp.index <- which(chromosome(cnSet)==1) |
|
285 |
-cnset2 <- cnSet[snp.index, ] |
|
286 |
-trace(crlmmCopynumber, browser) |
|
287 |
-object <- crlmmCopynumber(cnset2) |
|
288 |
-## elements no longer ff |
|
289 |
-@ |
|
290 |
- |
|
291 | 271 |
\end{document} |
... | ... |
@@ -22,36 +22,27 @@ |
22 | 22 |
\title{Infrastructure for copy number analysis in \crlmm{}} |
23 | 23 |
\date{\today} |
24 | 24 |
\author{Rob Scharpf} |
25 |
+ |
|
25 | 26 |
\maketitle |
26 | 27 |
|
28 |
+\begin{abstract} |
|
27 | 29 |
|
28 |
-\section{Set up} |
|
30 |
+ This vignette provides an overview of the \Rclass{CNSet} class and a |
|
31 |
+ brief discussion of the underlying infrastructure for large data |
|
32 |
+ support with the \Rpackage{ff} package. This package instantiates |
|
33 |
+ an object of class \Rclass{CNSet} using a trivial dataset with 3 |
|
34 |
+ files. As this sample size is too small for estimating copy number |
|
35 |
+ with the \crlmm{} package, the final section of this vignette loads |
|
36 |
+ an object created by the analysis of 180 HapMap CEL files |
|
37 |
+ (Affymetrix 6.0 platform). In particular, this object was |
|
38 |
+ instantiated by running (1) the \verb+AffymetrixPreprocessCN+ |
|
39 |
+ vignette and (2) the \verb+copynumber+ vignette. |
|
29 | 40 |
|
30 |
-As in the previous vignettes, we load the required libraries and |
|
31 |
-specify a path for storing output files. |
|
41 |
+\end{abstract} |
|
32 | 42 |
|
33 | 43 |
<<libraries,results=hide>>= |
34 | 44 |
library(ff) |
35 | 45 |
library(crlmm) |
36 |
-library(cacheSweave) |
|
37 |
-require(DNAcopy) |
|
38 |
-require(VanillaICE) |
|
39 |
-if(getRversion() < "2.13.0"){ |
|
40 |
- rpath <- getRversion() |
|
41 |
-} else rpath <- "trunk" |
|
42 |
-outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="") |
|
43 |
-@ |
|
44 |
- |
|
45 |
-<<ram>>= |
|
46 |
-ldPath(outdir) |
|
47 |
-setCacheDir(outdir) |
|
48 |
-@ |
|
49 |
- |
|
50 |
-We begin by loading the \Robject{cnSet} object created by the |
|
51 |
-\verb+AffymetrixPreprocessCN+ vignette. |
|
52 |
- |
|
53 |
-<<loadcnset>>= |
|
54 |
-if(!exists("cnSet")) load(file.path(outdir, "cnSet.rda")) |
|
55 | 46 |
@ |
56 | 47 |
|
57 | 48 |
\section{Supported platforms} |
... | ... |
@@ -98,31 +89,25 @@ ocSamples(200) |
98 | 89 |
|
99 | 90 |
\section{The \Rclass{CNSet} container} |
100 | 91 |
|
101 |
-The \Rfunction{show} method provides a concise summary of the |
|
102 |
-\Robject{cnSet} object. |
|
92 |
+\subsection{Instantiating an object of class \Rclass{CNSet}} |
|
103 | 93 |
|
104 |
-<<show>>= |
|
105 |
-cnSet |
|
106 |
-@ |
|
94 |
+An object of class \Rclass{CNSet} can be instantiated by one of two |
|
95 |
+methods: |
|
107 | 96 |
|
108 |
-We briefly outline some of the unique aspects of the |
|
109 |
-\Rclass{CNSet}-class using in \crlmm{} that may differ from the more |
|
110 |
-standard extensions of the virtual class \Rclass{eSet} defined in the |
|
111 |
-\Rpackage{Biobase} package. |
|
97 |
+\begin{itemize} |
|
112 | 98 |
|
113 |
-\paragraph{Instantiating a \Rclass{CNSet}:} An object of class |
|
114 |
-\Rclass{CNSet} can be instantiated by one of two methods: |
|
115 |
-\begin{enumerate} |
|
116 |
-\item during the preprocessing of the raw intensities for Illumina and |
|
99 |
+\item[Approach 1:] during the preprocessing of the raw intensities for Illumina and |
|
117 | 100 |
Affymetrix arrays by the the functions \Rfunction{constructInf} and |
118 |
- \Rfunction{genotype}, respectively. |
|
101 |
+ \Rfunction{genotype}, respectively. (The \Rfunction{genotype} calls |
|
102 |
+ the non-exported function \Rfunction{constructAffy} to initialize a |
|
103 |
+ \Rclass{CNSet} object for Affymetrix platforms.) |
|
119 | 104 |
|
120 |
-\item by subsetting an existing \Robject{CNSet} object. As per usual, |
|
121 |
- the `[' method can be used to extract a subset of markers $i$ as in |
|
122 |
- `[i, ]', a subset of samples $j$ as in `[, j]', or a subset of |
|
123 |
- markers $i$ and samples $j$ as in `[i, j]'. |
|
105 |
+\item[Approach 2:] by subsetting an existing \Robject{CNSet} object. |
|
106 |
+ As per usual, the `[' method can be used to extract a subset of |
|
107 |
+ markers $i$ as in `[i, ]', a subset of samples $j$ as in `[, j]', or |
|
108 |
+ a subset of markers $i$ and samples $j$ as in `[i, j]'. |
|
124 | 109 |
|
125 |
-\end{enumerate} |
|
110 |
+\end{itemize} |
|
126 | 111 |
|
127 | 112 |
There are important differences in the underlying data representation |
128 | 113 |
depending on how the object was instantiated. In particular, objects |
... | ... |
@@ -131,36 +116,106 @@ generated by the functions \Rfunction{constructInf} and |
131 | 116 |
in memory through protocols defined in the \R{} package \ff{}. For |
132 | 117 |
instance, the normalized intensities and genotype calls in a |
133 | 118 |
\Rclass{CNSet}-instance from approach (1) are \ff{}-derived objects. |
134 |
-However, when such an object is subset by the `[' method, the data is |
|
135 |
-pulled from disk to memory and stored as an ordinary matrix in \R{}. |
|
136 |
-To illustrate, the \Robject{cnSet} loaded at the start of this session |
|
137 |
-was created by the function \Rfunction{genotype}. As the data is |
|
138 |
-stored on disk rather than in memory, we inspect attributes of the |
|
139 |
-object representing the normalized intensities for the A allele by |
|
140 |
-first opening the file connection and then extracting the |
|
141 |
-\ff{}-derived class of the object as well as where the data is stored |
|
142 |
-on disk. |
|
119 |
+By contrast, when such an objected generated by approach (1) is subset |
|
120 |
+by the `[' method, an object of the same class is returned but the |
|
121 |
+\Rpackage{ff}-derived objects are coerced to ordinary matrices. Note, |
|
122 |
+therefore, that both approaches (1) and (2) may involve substantial |
|
123 |
+I/O. |
|
143 | 124 |
|
144 |
-<<approach1>>= |
|
145 |
-invisible(open(cnSet)) |
|
146 |
-class(A(cnSet)) |
|
147 |
-filename(A(cnSet)) |
|
125 |
+\subsubsection{Approach 1} |
|
126 |
+ |
|
127 |
+To illustrate the first approach, we begin by specifying a local |
|
128 |
+directory to store output files and setting the \Rfunction{ldPath} |
|
129 |
+function with this path. |
|
130 |
+ |
|
131 |
+<<path>>= |
|
132 |
+if(getRversion() < "2.13.0"){ |
|
133 |
+ rpath <- getRversion() |
|
134 |
+} else rpath <- "trunk" |
|
135 |
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="") |
|
136 |
+ldPath(outdir) |
|
148 | 137 |
@ |
149 | 138 |
|
150 |
-Using approach (2), we construct a new \Robject{CNSet} object. |
|
151 | 139 |
|
152 |
-<<approach2>>= |
|
153 |
-cnset.subset <- cnSet[1:5, 1:10] |
|
154 |
-invisible(close(cnSet)) |
|
155 |
-class(A(cnset.subset)) |
|
140 |
+Next we load the annotation package required, as well as the \R{} |
|
141 |
+package \Rpackage{hapmapsnp6} that contains 3 example CEL files. We |
|
142 |
+use the \Rfunction{system.file} function to find the path to the CEL |
|
143 |
+files. |
|
144 |
+ |
|
145 |
+<<annotation_metadata>>= |
|
146 |
+require(genomewidesnp6Crlmm) & require(hapmapsnp6) |
|
147 |
+path <- system.file("celFiles", package="hapmapsnp6") |
|
148 |
+celfiles <- list.celfiles(path, full.names=TRUE) |
|
149 |
+@ |
|
150 |
+ |
|
151 |
+Typically, an object of class \Rclass{CNSet} is instantiated as part |
|
152 |
+of the preprocessing and genotyping by calling the function |
|
153 |
+\Rfunction{genotype}, as illustrated in the |
|
154 |
+\verb+AffymtrixPreprocessCN+ vignette. |
|
155 |
+ |
|
156 |
+<<instantiateToyExample,cache=TRUE>>= |
|
157 |
+exampleSet <- genotype(celfiles, batch=rep("1", 3), cdfName="genomewidesnp6") |
|
156 | 158 |
@ |
157 | 159 |
|
158 |
-The files with \verb+.ff+ extension should not be removed or |
|
159 |
-relocated. The safest way to move these files if necessary is to |
|
160 |
-clone all of the \ff{} objects using the \Rfunction{clone}, followed |
|
161 |
-by the \Rfunction{delete} function to remove the original files on |
|
162 |
-disk. See the documentation in the \ff{} package for additional |
|
163 |
-details. |
|
160 |
+Several files with \verb+.ff+ extensions now appear in the directory |
|
161 |
+indicated by the \Rfunction{ldPath} function. |
|
162 |
+ |
|
163 |
+<<ldpath>>= |
|
164 |
+ldPath() |
|
165 |
+@ |
|
166 |
+ |
|
167 |
+One could also instantiate an object of class \Rclass{CNSet} without |
|
168 |
+preprocessing/genotyping by calling the non-exported function |
|
169 |
+\Rfunction{constructAffy} directly using the \Rfunction{:::} operator. |
|
170 |
+ |
|
171 |
+<<constructAffy,eval=FALSE>>= |
|
172 |
+tmp <- crlmm:::constructAffy(celfiles, batch=rep("1", 3), cdfName="genomewidesnp6") |
|
173 |
+@ |
|
174 |
+ |
|
175 |
+The \Rfunction{show} method provides a concise summary of the |
|
176 |
+\Robject{exampleSet} object. Note the class of the elements in the |
|
177 |
+\verb+batchStatistics+ and \verb+assayData+ slots is indicated in the |
|
178 |
+first line of the summary. |
|
179 |
+ |
|
180 |
+<<show>>= |
|
181 |
+invisible(open(exampleSet)) |
|
182 |
+exampleSet |
|
183 |
+@ |
|
184 |
+ |
|
185 |
+% We briefly outline some of the unique aspects of the |
|
186 |
+% \Rclass{CNSet}-class using in \crlmm{} that may differ from the more |
|
187 |
+% standard extensions of the virtual class \Rclass{eSet} defined in |
|
188 |
+% the \Rpackage{Biobase} package. |
|
189 |
+ |
|
190 |
+As the \verb+assayData+ elements of the \Robject{exampleSet} object |
|
191 |
+are stored on disk rather than in memory, we inspect attributes of the |
|
192 |
+elements by first opening the file connection using the |
|
193 |
+\Rfunction{open}. For example, in the following code we extract the |
|
194 |
+normalized intensities for the A allele by first opening the object |
|
195 |
+returned by the \Rfunction{A} function. The name of the file where |
|
196 |
+the data is stored on disk is provided by the |
|
197 |
+\Rfunction{filename}. Finally, the normalized intensities can be |
|
198 |
+pulled from disk to memory by the `[' method. It can be useful to |
|
199 |
+wrap the `[' method by the \Rfunction{as.matrix} to ensure that the |
|
200 |
+output is the desired class. |
|
201 |
+ |
|
202 |
+<<approach1>>= |
|
203 |
+invisible(open(exampleSet)) |
|
204 |
+class(A(exampleSet)) |
|
205 |
+filename(A(exampleSet)) |
|
206 |
+as.matrix(A(exampleSet)[1:5, ]) |
|
207 |
+@ |
|
208 |
+ |
|
209 |
+\paragraph{Moving \texttt{*.ff} files} Ideally, the files with |
|
210 |
+\verb+.ff+ extension should not be moved. However, if this is not |
|
211 |
+possible, the safest way to move these files is to clone all of the |
|
212 |
+\ff{} objects using the \Rfunction{clone}, followed by the |
|
213 |
+\Rfunction{delete} function to remove the original files on disk. An |
|
214 |
+example of the \Rfunction{delete} function is included at the end of |
|
215 |
+the \verb+IlluminaPreprocessCN+ vignette. See the documentation for |
|
216 |
+the \Rfunction{clone} and \Rfunction{delete} functions in the \ff{} |
|
217 |
+package for additional details. |
|
218 |
+ |
|
164 | 219 |
|
165 | 220 |
\paragraph{Order of operations:} For \Rclass{CNSet}-instances derived |
166 | 221 |
by approach (1), users should be mindful of the substantial I/O when |
... | ... |
@@ -171,30 +226,49 @@ operation to emphasize the order of operations): |
171 | 226 |
|
172 | 227 |
<<orderOfOperations>>= |
173 | 228 |
##inefficient |
174 |
-A(cnSet[1:5, 1:5]) |
|
229 |
+##invisible(open(cnSet)) |
|
230 |
+A(exampleSet[1:5, ]) |
|
175 | 231 |
## preferred |
176 |
-(A(cnSet))[1:5, 1:5] |
|
232 |
+(A(exampleSet))[1:5, ] |
|
177 | 233 |
@ |
178 | 234 |
|
179 | 235 |
|
180 |
-\textbf{\texttt{featureData}} |
|
236 |
+\subsubsection{Approach 2: using `['} |
|
237 |
+ |
|
238 |
+Here we instantiate a new object of class \Rclass{CNSet} by applying the |
|
239 |
+`[' method to an existing object of class \Rclass{CNSet}. |
|
240 |
+ |
|
241 |
+<<approach2>>= |
|
242 |
+cnset.subset <- exampleSet[1:5, ] |
|
243 |
+@ |
|
244 |
+ |
|
245 |
+Note the class of the \verb+batchStatistics+ and \verb+assayData+ |
|
246 |
+elements of the \Robject{cnset.subset} object printed in the first |
|
247 |
+line of the summary. |
|
248 |
+ |
|
249 |
+<<show.subset>>= |
|
250 |
+show(cnset.subset) |
|
251 |
+@ |
|
252 |
+ |
|
253 |
+\subsection{Slots of class \Rclass{CNSet}} |
|
254 |
+\subsubsection{\texttt{featureData}} |
|
181 | 255 |
|
182 | 256 |
Information on physical position, chromosome, and whether the marker |
183 | 257 |
is a SNP can be accessed through accessors defined for the |
184 | 258 |
\verb+featureData+. |
185 | 259 |
|
186 | 260 |
<<featuredataAccessors>>= |
187 |
-fvarLabels(cnSet) |
|
188 |
-position(cnSet)[1:10] |
|
189 |
-chromosome(cnSet)[1:10] |
|
190 |
-is.snp <- isSnp(cnSet) |
|
261 |
+fvarLabels(exampleSet) |
|
262 |
+position(exampleSet)[1:10] |
|
263 |
+chromosome(exampleSet)[1:10] |
|
264 |
+is.snp <- isSnp(exampleSet) |
|
191 | 265 |
table(is.snp) |
192 | 266 |
snp.index <- which(is.snp) |
193 | 267 |
np.index <- which(!is.snp) |
194 |
-chr1.index <- which(chromosome(cnSet) == 1) |
|
268 |
+chr1.index <- which(chromosome(exampleSet) == 1) |
|
195 | 269 |
@ |
196 | 270 |
|
197 |
-\textbf{\texttt{assayData}} |
|
271 |
+\subsubsection{\texttt{assayData}} |
|
198 | 272 |
|
199 | 273 |
The \verb+assayData+ elements are of the class |
200 | 274 |
\Rclass{ff\_matrix}/\Rclass{ffdf} or \Rclass{matrix}, depending on how |
... | ... |
@@ -203,7 +277,7 @@ the \Rclass{CNSet} object was instantiated. Elements in the |
203 | 277 |
function. |
204 | 278 |
|
205 | 279 |
<<assayData>>= |
206 |
-ls(assayData(cnSet)) |
|
280 |
+ls(assayData(exampleSet)) |
|
207 | 281 |
@ |
208 | 282 |
|
209 | 283 |
The normalized intensities for the A and B alleles have names |
... | ... |
@@ -216,7 +290,7 @@ but can be tranlated to the probability scale using the \R{} function |
216 | 290 |
\Rfunction{i2p}. For example, |
217 | 291 |
|
218 | 292 |
<<i2p>>= |
219 |
-scores <- as.matrix(snpCallProbability(cnSet)[1:5, 1:2]) |
|
293 |
+scores <- as.matrix(snpCallProbability(exampleSet)[1:5, 1:2]) |
|
220 | 294 |
i2p(scores) |
221 | 295 |
@ |
222 | 296 |
|
... | ... |
@@ -227,15 +301,16 @@ platform. A consequence of keeping the rows of the assay data |
227 | 301 |
elements the same for all of the statistical summaries is that the |
228 | 302 |
matrix used to store genotype calls is larger than necessary. |
229 | 303 |
|
230 |
-Note that NA's are stored in the slot for normalized 'B' allele |
|
231 |
-intensities: |
|
304 |
+% only true for affy. for illumina, we have intensities stored in 'B' |
|
305 |
+%Note that NA's are stored in the slot for normalized 'B' allele |
|
306 |
+%intensities: |
|
232 | 307 |
|
233 |
-<<B.NAs>>= |
|
234 |
-np.index <- which(!is.snp) |
|
235 |
-stopifnot(all(is.na(B(cnSet)[np.index, ]))) |
|
236 |
-@ |
|
308 |
+%<<B.NAs>>= |
|
309 |
+%np.index <- which(!is.snp) |
|
310 |
+%stopifnot(all(is.na(B(cnSet)[np.index, ]))) |
|
311 |
+%@ |
|
237 | 312 |
|
238 |
-\textbf{\texttt{batch} and \texttt{batchStatistics}} |
|
313 |
+\subsubsection{\texttt{batch} and \texttt{batchStatistics}} |
|
239 | 314 |
|
240 | 315 |
As defined in Leek \textit{et al.} 2010, \textit{Batch effects are |
241 | 316 |
sub-groups of measurements that have qualitatively different |
... | ... |
@@ -260,7 +335,7 @@ to access the \verb+batch+ information on the samples as in the |
260 | 335 |
following example. |
261 | 336 |
|
262 | 337 |
<<batchAccessor>>= |
263 |
-table(batch(cnSet)) |
|
338 |
+batch(exampleSet) |
|
264 | 339 |
@ |
265 | 340 |
|
266 | 341 |
For the \verb+batchStatistics+ slot, the elements in the environment |
... | ... |
@@ -272,7 +347,7 @@ the elements in the environment can be list using the \R{} function |
272 | 347 |
\Rfunction{ls}. |
273 | 348 |
|
274 | 349 |
<<batchStatistics>>= |
275 |
-ls(batchStatistics(cnSet)) |
|
350 |
+ls(batchStatistics(exampleSet)) |
|
276 | 351 |
@ |
277 | 352 |
|
278 | 353 |
Currently, the batch-specific summaries are stored to allow some |
... | ... |
@@ -292,7 +367,7 @@ intended to be accessed directly by the user. |
292 | 367 |
%median variance (across markers) within a batch. ) The elements of the |
293 | 368 |
%slot can be listed as follows. |
294 | 369 |
|
295 |
-\textbf{\texttt{phenoData}} |
|
370 |
+\subsubsection{\texttt{phenoData}} |
|
296 | 371 |
|
297 | 372 |
Sample-level summaries obtained during the preprocessing/genotyping |
298 | 373 |
steps include skew (SKW), the signal to noise ratio (SNR), and gender |
... | ... |
@@ -303,26 +378,29 @@ approach (1), these elements are of the class |
303 | 378 |
\Rclass{ff\_vector}. |
304 | 379 |
|
305 | 380 |
<<phenodataAccessors>>= |
306 |
-varLabels(cnSet) |
|
307 |
-class(cnSet$gender) |
|
308 |
-invisible(open(cnSet$gender)) |
|
309 |
-cnSet$gender |
|
381 |
+varLabels(exampleSet) |
|
382 |
+class(exampleSet$gender) |
|
383 |
+invisible(open(exampleSet$gender)) |
|
384 |
+exampleSet$gender |
|
310 | 385 |
@ |
311 | 386 |
|
312 | 387 |
The `[' methods without arguments can be used to coerce to a vector. |
313 | 388 |
|
314 | 389 |
<<vector>>= |
315 |
-cnSet$gender[] |
|
316 |
-invisible(close(cnSet$gender)) |
|
390 |
+c("male", "female")[exampleSet$gender[]] |
|
391 |
+invisible(close(exampleSet$gender)) |
|
317 | 392 |
@ |
318 | 393 |
|
319 |
-\textbf{\texttt{protocolData}} |
|
394 |
+ |
|
395 |
+ |
|
396 |
+ |
|
397 |
+\subsubsection{\texttt{protocolData}} |
|
320 | 398 |
|
321 | 399 |
The scan date of the arrays are stored in the \verb+protocolData+. |
322 | 400 |
|
323 | 401 |
<<protocolData>>= |
324 |
-varLabels(protocolData(cnSet)) |
|
325 |
-protocolData(cnSet)$ScanDate[1:10] |
|
402 |
+varLabels(protocolData(exampleSet)) |
|
403 |
+protocolData(exampleSet)$ScanDate |
|
326 | 404 |
@ |
327 | 405 |
|
328 | 406 |
|
... | ... |
@@ -388,7 +466,28 @@ protocolData(cnSet)$ScanDate[1:10] |
388 | 466 |
%useful for assessing the biallelic genotype calls. This section of |
389 | 467 |
%the vignette is currently under development. |
390 | 468 |
|
391 |
-\section{Trouble shooting} |
|
469 |
+\section{Trouble shooting with a HapMap example} |
|
470 |
+ |
|
471 |
+This section uses an object of class \Rclass{CNSet} instantiated by |
|
472 |
+the \verb+AffymetrixPreprocessCN+ vignette and saved to a local path |
|
473 |
+on our computing cluster indicated by the object \Robject{outdir} |
|
474 |
+below. The \verb+copynumber+ vignette was used to fill out the |
|
475 |
+\verb+batchStatistics+ slot of the \Robject{cnSet} object. |
|
476 |
+ |
|
477 |
+<<path>>= |
|
478 |
+if(getRversion() < "2.13.0"){ |
|
479 |
+ rpath <- getRversion() |
|
480 |
+} else rpath <- "trunk" |
|
481 |
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/copynumber_vignette", sep="") |
|
482 |
+@ |
|
483 |
+ |
|
484 |
+Next, we load the \Robject{cnSet} object. |
|
485 |
+ |
|
486 |
+<<loadcnset>>= |
|
487 |
+if(!exists("cnSet")) load(file.path(outdir, "cnSet.rda")) |
|
488 |
+invisible(open(cnSet)) |
|
489 |
+@ |
|
490 |
+ |
|
392 | 491 |
|
393 | 492 |
\subsection{Missing values} |
394 | 493 |
|
... | ... |
@@ -406,12 +505,15 @@ protocolData(cnSet)$ScanDate[1:10] |
406 | 505 |
Most often, missing values occur when the genotype confidence scores |
407 | 506 |
for a SNP were below the threshold used by the |
408 | 507 |
\Robject{crlmmCopynumber} function. For the HapMap analysis, we used a |
409 |
-confidence threshold of 0.80 (the default). |
|
508 |
+confidence threshold of 0.80 (the default). In the following code, we |
|
509 |
+assess NA's appearing for the raw copy number estimates for the first |
|
510 |
+10 samples. |
|
410 | 511 |
|
411 | 512 |
<<NA>>= |
412 | 513 |
GT.CONF.THR <- 0.80 |
413 | 514 |
autosome.index <- which(isSnp(cnSet) & chromosome(cnSet) < 23) |
414 |
-sample.index <- which(batch(cnSet) == "C") |
|
515 |
+sample.index <- 1:10 |
|
516 |
+ct <- totalCopynumber(cnSet, i=autosome.index, j=sample.index) |
|
415 | 517 |
ca <- CA(cnSet, i=autosome.index, j=sample.index) |
416 | 518 |
cb <- CB(cnSet, i=autosome.index, j=sample.index) |
417 | 519 |
missing.ca <- is.na(ca) |
... | ... |
@@ -427,9 +529,9 @@ threshold specified in \Robject{crlmmCopynumber}. |
427 | 529 |
|
428 | 530 |
<<NAconfidenceScores>>= |
429 | 531 |
if(nmissing.ca > 0){ |
430 |
- invisible(open(snpCallProbability(cnSet))) |
|
532 |
+ ##invisible(open(snpCallProbability(cnSet))) |
|
431 | 533 |
gt.conf <- i2p(snpCallProbability(cnSet)[autosome.index, sample.index]) |
432 |
- invisible(close(snpCallProbability(cnSet))) |
|
534 |
+ ##invisible(close(snpCallProbability(cnSet))) |
|
433 | 535 |
below.thr <- gt.conf < GT.CONF.THR |
434 | 536 |
index.allbelow <- as.integer(which(rowSums(below.thr) == length(sample.index))) |
435 | 537 |
nmissingBecauseOfGtThr <- length(index.allbelow) * length(sample.index) |
... | ... |
@@ -451,13 +553,15 @@ missing values to the number of samples to check whether all of the |
451 | 553 |
estimates are missing for a given SNP. |
452 | 554 |
|
453 | 555 |
<<NAchromosomeX>>= |
556 |
+## start with first batch |
|
557 |
+sample.index <- 1:90 |
|
454 | 558 |
X.index <- which(isSnp(cnSet) & chromosome(cnSet) == 23) |
455 | 559 |
ca.X <- CA(cnSet, i=X.index, j=sample.index) |
456 | 560 |
missing.caX <- is.na(ca.X) |
457 | 561 |
(nmissing.caX <- sum(missing.caX)) |
458 | 562 |
missing.snp.index <- which(rowSums(missing.caX) == length(sample.index)) |
459 | 563 |
index <- which(rowSums(missing.caX) == length(sample.index)) |
460 |
-length(index)*length(sample.index)/nmissing.caX |
|
564 |
+##length(index)*length(sample.index)/nmissing.caX |
|
461 | 565 |
@ |
462 | 566 |
|
463 | 567 |
From the above codechunk, we see that \Sexpr{length(index)} SNPs have |
... | ... |
@@ -468,21 +572,15 @@ from SNPs in which either the men or the women had confidence scores |
468 | 572 |
that were all below the threshold. |
469 | 573 |
|
470 | 574 |
<<NAcrlmmConfidenceScore>>= |
471 |
-if(nmissing.caX > 0){ |
|
472 |
- invisible(open(snpCallProbability(cnSet))) |
|
473 |
- gt.conf <- i2p(snpCallProbability(cnSet)[X.index, sample.index]) |
|
474 |
- invisible(close(snpCallProbability(cnSet))) |
|
475 |
- below.thr <- gt.conf < GT.CONF.THR |
|
476 |
- F <- which(cnSet$gender[sample.index] == 2) |
|
477 |
- M <- which(cnSet$gender[sample.index] == 1) |
|
478 |
- ##nus <- nuA(cnSet)[X.index, "C"] |
|
479 |
- index.allbelowF <- as.integer(which(rowSums(below.thr[, F]) == length(F))) |
|
480 |
- index.allbelowM <- as.integer(which(rowSums(below.thr[, M]) == length(M))) |
|
481 |
- index.allbelow <- as.integer(unique(c(index.allbelowF, index.allbelowM))) |
|
482 |
- ##all.equal(index, index.allbelow) |
|
483 |
- ## or calculate the proportion of missing effected by low crlmm confidence |
|
484 |
- sum(index.allbelow %in% index)/length(index) |
|
485 |
-} |
|
575 |
+invisible(open(cnSet$gender)) |
|
576 |
+F <- which(cnSet$gender[sample.index] == 2) |
|
577 |
+M <- which(cnSet$gender[sample.index] == 1) |
|
578 |
+gt.conf <- i2p(snpCallProbability(cnSet)[X.index, sample.index]) |
|
579 |
+below.thr <- gt.conf < GT.CONF.THR |
|
580 |
+index.allbelowF <- as.integer(which(rowSums(below.thr[, F]) == length(F))) |
|
581 |
+index.allbelowM <- as.integer(which(rowSums(below.thr[, M]) == length(M))) |
|
582 |
+all.equal(index.allbelowF, index.allbelowM) |
|
583 |
+all.equal(as.integer(index.allbelowF), as.integer(index)) |
|
486 | 584 |
@ |
487 | 585 |
|
488 | 586 |
For nonpolymorphic loci, the genotype confidence scores are irrelevant |
... | ... |
@@ -519,8 +617,9 @@ detection and removal may be explored in the future. |
519 | 617 |
Copy number estimates for other chromosomes, such as mitochondrial and |
520 | 618 |
chromosome Y, are not currently available in \crlmm{}. |
521 | 619 |
|
522 |
-<<echo=FALSE>>= |
|
620 |
+<<close>>= |
|
523 | 621 |
invisible(close(cnSet)) |
622 |
+invisible(close(cnSet$gender)) |
|
524 | 623 |
@ |
525 | 624 |
|
526 | 625 |
|
... | ... |
@@ -38,7 +38,7 @@ options(continue=" ", width=70) |
38 | 38 |
vignettes for the Affymetrix and Illumina platforms, |
39 | 39 |
respectively. While this vignette uses Affymetrix 6.0 arrays for |
40 | 40 |
illustration, the steps at this point are identical for both |
41 |
- platforms. See \citep{Scharpf2010} for details regarding the |
|
41 |
+ platforms. See \citep{Scharpf2011} for details regarding the |
|
42 | 42 |
methodology implemented in \crlmm{} for copy number analysis. In |
43 | 43 |
addition, a compendium describing copy number analysis using the |
44 | 44 |
\crlmm{} package is available from the author's website: |
... | ... |
@@ -80,7 +80,9 @@ the \crlmm{} package currently requires at least 10 samples per batch. |
80 | 80 |
The parameter estimates for copy number and the corresponding |
81 | 81 |
estimates of raw copy number will tend to be more noisy for batches |
82 | 82 |
with small sample sizes (e.g., $<$ 50). Chemistry plate or scan date |
83 |
-are often useful surrogates for batch. |
|
83 |
+are often useful surrogates for batch. Samples that were processed at |
|
84 |
+similar times (e.g., in the same month) can be grouped together in the |
|
85 |
+same batch. |
|
84 | 86 |
|
85 | 87 |
\section{Quality control} |
86 | 88 |
|
... | ... |
@@ -98,7 +100,7 @@ quality. The following code chunk makes a histogram of the SNR values |
98 | 100 |
for the HapMap samples. |
99 | 101 |
|
100 | 102 |
<<snr,fig=TRUE,include=FALSE,width=6, height=4>>= |
101 |
-open(cnSet$SNR) |
|
103 |
+invisible(open(cnSet$SNR)) |
|
102 | 104 |
snr <- cnSet$SNR[] |
103 | 105 |
close(cnSet$SNR) |
104 | 106 |
print(histogram(~snr, panel=function(...){ |
... | ... |
@@ -120,13 +122,13 @@ print(histogram(~snr, panel=function(...){ |
120 | 122 |
|
121 | 123 |
\section{Copy number estimation} |
122 | 124 |
|
123 |
-As described in \cite{Scharpf2010}, the CRLMM-CopyNumber algorithm |
|
125 |
+As described in \cite{Scharpf2011}, the CRLMM-CopyNumber algorithm |
|
124 | 126 |
fits a linear model to the normalized intensities stratified by the |
125 | 127 |
diallic genotype call. The intercept and slope from the linear model |
126 | 128 |
are both SNP- and batch-specific. The implementation in the \crlmm{} |
127 | 129 |
package is encapsulated by the function \Rfunction{crlmmCopynumber} |
128 | 130 |
that, using the default settings, can be called by passing a single |
129 |
-object of class \code{CNSet}. See the appropriate |
|
131 |
+object of class \Rclass{CNSet}. See the appropriate |
|
130 | 132 |
preprocessing/genotyping vignette for the construction of an object of |
131 | 133 |
class \Rclass{CNSet}. |
132 | 134 |
<<LDS_copynumber,cache=TRUE>>= |
... | ... |
@@ -147,7 +149,7 @@ computed for subsets of the markers to reduce the required RAM. Note |
147 | 149 |
that the value returned by the \Rfunction{crlmmCopynumber} function in |
148 | 150 |
the above example is \verb+TRUE+. The reason the function returns |
149 | 151 |
\verb+TRUE+ in the above example is that the elements of the |
150 |
-\verb+batchStatistics+ slot have the class \Rclass{ff_matrix}. Rather |
|
152 |
+\verb+batchStatistics+ slot have the class \Rclass{ff\_matrix}. Rather |
|
151 | 153 |
than keep the statistical summaries in memory, the summaries are |
152 | 154 |
written to files on disk using protocols described in the |
153 | 155 |
\Rpackage{ff} package. Hence, while the \Robject{cnSet} object itself |
... | ... |
@@ -169,9 +171,9 @@ slot of the class \Rclass{CNSet}. Using the default settings for the |
169 | 171 |
Note that depends on whether the elements of the \verb+batchStatistics+ |
170 | 172 |
slot are \Robject{ff} objects or ordinary matrices. In this example, |
171 | 173 |
the elements of \verb+batchStatistics+ have the class |
172 |
-\Rclass{ff_matrix}. |
|
174 |
+\Rclass{ff\_matrix}. |
|
173 | 175 |
|
174 |
-<<LDS_copynumber,cache=TRUE>>= |
|
176 |
+<<classes>>= |
|
175 | 177 |
nms <- ls(batchStatistics(cnSet)) |
176 | 178 |
cls <- rep(NA, length(nms)) |
177 | 179 |
for(i in seq_along(nms)) cls[i] <- class(batchStatistics(cnSet)[[nms[i]]])[1] |
... | ... |
@@ -284,6 +286,7 @@ In the following code chunk, we extract estimates of the total copy |
284 | 286 |
number at nonpolymorphic markers on chromosome X. |
285 | 287 |
|
286 | 288 |
<<npx>>= |
289 |
+set.seed(123) |
|
287 | 290 |
npx.index <- which(chromosome(cnSet)==23 & !isSnp(cnSet)) |
288 | 291 |
M <- sample(which(cnSet$gender[]==1), 5) |
289 | 292 |
F <- sample(which(cnSet$gender[]==2), 5) |
... | ... |
@@ -341,9 +344,9 @@ print(bwplot(cn~id,df, panel=function(x,y,...){ |
341 | 344 |
\begin{figure}[t!] |
342 | 345 |
\centering |
343 | 346 |
\includegraphics[width=0.5\textwidth]{copynumber-polymorphicX} |
344 |
-\caption{Copy number estimates for polymorphic markers on chromosome |
|
345 |
- X. crlmm assumes that the median copy number across samples at a |
|
346 |
- given marker on X is 1 for men and 2 for women.} |
|
347 |
+ \caption{Copy number estimates for polymorphic markers on chromosome |
|
348 |
+ X. crlmm assumes that the median copy number across samples at a |
|
349 |
+ given marker on X is 1 for men and 2 for women.} |
|
347 | 350 |
\end{figure} |
348 | 351 |
|
349 | 352 |
\subsection{A container for raw copy number} |
... | ... |
@@ -355,29 +358,31 @@ confidence scores, and the total copy number at each marker is the |
355 | 358 |
\Rfunction{as} (as illustrated below). Users should note that if the |
356 | 359 |
\verb+assayData+ elements in the \Rclass{CNSet} instance are |
357 | 360 |
\Rpackage{ff} objects, the \verb+assayData+ elements of the |
358 |
-instantiated \Rclass{oligoSnpSet} will also be \Rpackage{ff} object (a |
|
359 |
-new \verb+total_cn*.ff+ file will be created in the \verb+ldPath()+ |
|
360 |
-directory). The advantage of this approach is that the raw copy number |
|
361 |
-estimates can be computed for all markers and all samples |
|
362 |
-simultaneously without excessive RAM. The disadvantage is that such a |
|
363 |
-step creates additional I/O for reading and writing the raw copy |
|
364 |
-number estimates from disk. For very large data sets (1000+ samples), |
|
365 |
-the time required for I/O may outweigh the benefits. The alternative |
|
366 |
-is to perform downstream processing (such as smoothing) on the total |
|
367 |
-copy number for subsets of markers and/or samples. As indicated |
|
368 |
-previously, the subset `[' method applied to an object of class |
|
369 |
-\Robject{CNSet} will return an object of the same class with simple |
|
370 |
-matrices instead of \Rpackage{ff} objects. Therefore, one can |
|
371 |
-instantiate a \Rpackage{oligoSnpSet} object from a \Robject{CNSet} |
|
372 |
-object without the I/O overhead for storing the total copy number |
|
373 |
-estimates by first subsetting the \Rclass{CNSet} object. One could |
|
374 |
-then smooth the raw copy number estimates and store a summary copy |
|
375 |
-number for the interval, thereby substantially reducing the |
|
376 |
-dimensionality of the copy number estimates. Examples of smoothing |
|
377 |
-applications such as circular binary segmentation implemented in the |
|
378 |
-\Rpackage{DNAcopy} package or a hidden Markov model implemented in the |
|
379 |
-\Rpackage{VanillaICE} package are provided in the \verb+SmoothingCN+ |
|
380 |
-vignette. |
|
361 |
+instantiated \Rclass{oligoSnpSet} will also be \Rpackage{ff}-dervied |
|
362 |
+objects (a new \verb+total_cn*.ff+ file will be created in the |
|
363 |
+\verb+ldPath()+ directory). |
|
364 |
+ |
|
365 |
+%The advantage of this approach is that the raw copy number estimates |
|
366 |
+%can be computed for all markers and all samples simultaneously without |
|
367 |
+%excessive RAM. The disadvantage is that such a step creates |
|
368 |
+%additional I/O for reading and writing the raw copy number estimates |
|
369 |
+%from disk. For very large data sets (1000+ samples), the time |
|
370 |
+%required for I/O may outweigh the benefits. The alternative is to |
|
371 |
+%perform downstream processing (such as smoothing) on the total copy |
|
372 |
+%number for subsets of markers and/or samples. As indicated |
|
373 |
+%previously, the subset `[' method applied to an object of class |
|
374 |
+%\Robject{CNSet} will return an object of the same class with simple |
|
375 |
+%matrices instead of \Rpackage{ff} objects. Therefore, one can |
|
376 |
+%instantiate a \Rpackage{oligoSnpSet} object from a \Robject{CNSet} |
|
377 |
+%object without the I/O overhead for storing the total copy number |
|
378 |
+%estimates by first subsetting the \Rclass{CNSet} object. One could |
|
379 |
+%then smooth the raw copy number estimates and store a summary copy |
|
380 |
+%number for the interval, thereby substantially reducing the |
|
381 |
+%dimensionality of the copy number estimates. Examples of smoothing |
|
382 |
+%applications such as circular binary segmentation implemented in the |
|
383 |
+%\Rpackage{DNAcopy} package or a hidden Markov model implemented in the |
|
384 |
+%\Rpackage{VanillaICE} package are provided in the \verb+SmoothingCN+ |
|
385 |
+%vignette. |
|
381 | 386 |
|
382 | 387 |
<<oligoSnpSet>>= |
383 | 388 |
open(cnSet3) |
... | ... |
@@ -388,7 +393,7 @@ class(copyNumber(oligoSet)) |
388 | 393 |
|
389 | 394 |
\noindent Note that the raw copy number estimates stored in the |
390 | 395 |
\Robject{oligoSnpSet} object can be retrieved by the |
391 |
-\Rfunction{copyNumber} accessor and is equivalanet to that returned by |
|
396 |
+\Rfunction{copyNumber} accessor and is equivalent to that returned by |
|
392 | 397 |
the \Rfunction{totalCopynumber} function defined over the same row and |
393 | 398 |
column indices. |
394 | 399 |
|
... | ... |
@@ -1,12 +1,58 @@ |
1 |
-@ARTICLE{Scharpf2010, |
|
1 |
+@ARTICLE{Scharpf2011, |
|
2 | 2 |
author = {Robert B Scharpf and Ingo Ruczinski and Benilton Carvalho and Betty |
3 |
- Doan and Aravinda Chakravarti and Rafael Irizarry}, |
|
3 |
+ Doan and Aravinda Chakravarti and Rafael A Irizarry}, |
|
4 | 4 |
title = {A multilevel model to address batch effects in copy number estimation |
5 |
- using SNP arrays}, |
|
6 |
- journal={Biostatistics}, |
|
7 |
- year = {2010}, |
|
5 |
+ using SNP arrays.}, |
|
6 |
+ journal = {Biostatistics}, |
|
7 |
+ year = {2011}, |
|
8 |
+ volume = {12}, |
|
9 |
+ pages = {33--50}, |
|
10 |
+ number = {1}, |
|
11 |
+ month = {Jan}, |
|
12 |
+ abstract = {Submicroscopic changes in chromosomal DNA copy number dosage are common |
|
13 |
+ and have been implicated in many heritable diseases and cancers. |
|
14 |
+ Recent high-throughput technologies have a resolution that permits |
|
15 |
+ the detection of segmental changes in DNA copy number that span thousands |
|
16 |
+ of base pairs in the genome. Genomewide association studies (GWAS) |
|
17 |
+ may simultaneously screen for copy number phenotype and single nucleotide |
|
18 |
+ polymorphism (SNP) phenotype associations as part of the analytic |
|
19 |
+ strategy. However, genomewide array analyses are particularly susceptible |
|
20 |
+ to batch effects as the logistics of preparing DNA and processing |
|
21 |
+ thousands of arrays often involves multiple laboratories and technicians, |
|
22 |
+ or changes over calendar time to the reagents and laboratory equipment. |
|
23 |
+ Failure to adjust for batch effects can lead to incorrect inference |
|
24 |
+ and requires inefficient post hoc quality control procedures to exclude |
|
25 |
+ regions that are associated with batch. Our work extends previous |
|
26 |
+ model-based approaches for copy number estimation by explicitly modeling |
|
27 |
+ batch and using shrinkage to improve locus-specific estimates of |
|
28 |
+ copy number uncertainty. Key features of this approach include the |
|
29 |
+ use of biallelic genotype calls from experimental data to estimate |
|
30 |
+ batch-specific and locus-specific parameters of background and signal |
|
31 |
+ without the requirement of training data. We illustrate these ideas |
|
32 |
+ using a study of bipolar disease and a study of chromosome 21 trisomy. |
|
33 |
+ The former has batch effects that dominate much of the observed variation |
|
34 |
+ in the quantile-normalized intensities, while the latter illustrates |
|
35 |
+ the robustness of our approach to a data set in which approximately |
|
36 |
+ 27\% of the samples have altered copy number. Locus-specific estimates |
|
37 |
+ of copy number can be plotted on the copy number scale to investigate |
|
38 |
+ mosaicism and guide the choice of appropriate downstream approaches |
|
39 |
+ for smoothing the copy number as a function of physical position. |
|
40 |
+ The software is open source and implemented in the R package crlmm |
|
41 |
+ at Bioconductor (http:www.bioconductor.org).}, |
|
42 |
+ doi = {10.1093/biostatistics/kxq043}, |
|
43 |
+ institution = {Department of Oncology, Johns Hopkins University School of Medicine, |
|
44 |
+ Baltimore, MD 21205, USA. rscharpf@jhsph.edu}, |
|
45 |
+ language = {eng}, |
|
46 |
+ medline-pst = {ppublish}, |
|
47 |
+ owner = {rscharpf}, |
|
48 |
+ pii = {kxq043}, |
|
49 |
+ pmcid = {PMC3006124}, |
|
50 |
+ pmid = {20625178}, |
|
51 |
+ timestamp = {2011.02.26}, |
|
52 |
+ url = {http://dx.doi.org/10.1093/biostatistics/kxq043} |
|
8 | 53 |
} |
9 | 54 |
|
55 |
+ |
|
10 | 56 |
@ARTICLE{Carvalho2007a, |
11 | 57 |
author = {Benilton Carvalho and Henrik Bengtsson and Terence P Speed and Rafael |
12 | 58 |
A Irizarry}, |