git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@86002 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -43,6 +43,7 @@ |
43 | 43 |
\section{Set up} |
44 | 44 |
|
45 | 45 |
<<crlmm, results=hide, echo=FALSE>>= |
46 |
+library(Biobase) |
|
46 | 47 |
library(crlmm) |
47 | 48 |
options(width=70) |
48 | 49 |
options(continue=" ") |
... | ... |
@@ -238,7 +239,6 @@ cnSet <- genotype.Illumina(sampleSheet=samplesheet, |
238 | 239 |
arrayInfoColNames=arrayInfo, |
239 | 240 |
cdfName="human370v1c", |
240 | 241 |
batch=batch) |
241 |
-stop() |
|
242 | 242 |
@ |
243 | 243 |
|
244 | 244 |
|
* collab:
add warning in vignette about NAs with BafLrrSetList function
Added Human Omni Express Exome 8 v1.1b as a supported chip
updated version number of pacakge and man pages to reflect these changes
skeleton for krlmm capability added. genotype.Illumina() can now take and XY object as input
update copynumber.Rnw to use BafLrrSetList
updates to vignettes
update namespace
# Please enter a commit message to explain why this merge is necessary,
# especially if it merges an updated upstream into a topic branch.
#
# Lines starting with '#' will be ignored, and an empty message aborts
# the commit.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@79138 bc3139a8-67e5-0310-9ffc-ced21a209358
* collab: (34 commits)
revert change to IlluminaPreprocessCN
fix bug in isValidCdfName
print warning when all features in a batch of probes are flagged, but allow processing to continue
add utility cleancdfnames
Add validCdfNames.Rd
export validCdfNames
imputeGender fix when chromosome Y not available
Use splitIndicesByLength(index, ocSamples/getDoParWorkers())
Can not allocate vector of size XG with genotype.Illumina. Use splitIndicesByNode() only if the length of the list is greater than the split from splitIndicesByLength(). Otherwise, split by length using ocSamples()
update .gitignore
Add make.unique for sampleSheet$Sample_ID in readIdatFiles
bug in description
ensure sample ids stored in samplesheet are unique when constructing cnSet object
update oligoClasses dependency
update unit test for genotype.Illumina
revert change in constructInf call from genotype.Illumina
Update genotype.Rd
edit ACN function
1.15.6 use make.unique(basename(arrayNames)) to allow processing of Illumina samples with duplicated barcodes
check that sample identifies are unique in crlmm function
...
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@67435 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -6,6 +6,7 @@ |
6 | 6 |
\usepackage{graphicx} |
7 | 7 |
\usepackage{natbib} |
8 | 8 |
\usepackage{amsmath} |
9 |
+\usepackage{url} |
|
9 | 10 |
\usepackage[margin=1in]{geometry} |
10 | 11 |
\newcommand{\crlmm}{\Rpackage{crlmm}} |
11 | 12 |
\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
... | ... |
@@ -233,10 +234,10 @@ wrapper to the above functions and returns an object of class |
233 | 234 |
|
234 | 235 |
<<genotype.Illumina,cache=TRUE,results=hide>>= |
235 | 236 |
cnSet <- genotype.Illumina(sampleSheet=samplesheet, |
236 |
- arrayNames=arrayNames, |
|
237 |
- arrayInfoColNames=arrayInfo, |
|
238 |
- cdfName="human370v1c", |
|
239 |
- batch=batch) |
|
237 |
+ arrayNames=arrayNames, |
|
238 |
+ arrayInfoColNames=arrayInfo, |
|
239 |
+ cdfName="human370v1c", |
|
240 |
+ batch=batch) |
|
240 | 241 |
@ |
241 | 242 |
|
242 | 243 |
|
* collab:
remove getCluster() calls and replace with parStatus()
update man pages for crlmm and genotype.Illumina with respect to the setup for parallelization
add neededPkgs argument to ocLapply calls in crlmmGT2
bump dependency on oligoClasses
Update R/crlmm-illumina.R
contructInf, preprocessInf and genotypeInf no longer exported
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@64211 bc3139a8-67e5-0310-9ffc-ced21a209358
* collab:
replace splitIndicesByLength with splitIndicesByNode throughout cnrma-functions.R (check that snow is loaded and getCluster is not null)
add an example to genotype.Illumina that indicates how parallelization would be enabled. The example requires local data and is not run.
change outdir in IlluminaPreprocessCN and AffyGW
Update R/crlmm-illumina.R
bump version for parallelization of genotype.Illumina
Update R/crlmm-illumina.R
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@64151 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -60,10 +60,7 @@ calls. |
60 | 60 |
|
61 | 61 |
<<ldpath,results=hide>>= |
62 | 62 |
library(ff) |
63 |
-if(getRversion() < "2.13.0"){ |
|
64 |
- rpath <- getRversion() |
|
65 |
-} else rpath <- "trunk" |
|
66 |
-outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="") |
|
63 |
+outdir <- paste("/local_data/r00/crlmm/", getRversion(), "/illumina_vignette", sep="") |
|
67 | 64 |
ldPath(outdir) |
68 | 65 |
dir.create(outdir, recursive=TRUE, showWarnings=FALSE) |
69 | 66 |
@ |
* mymac:
add AffyGW.pdf
update vignettes in inst/scripts
Change argument of validCEL to celfiles
Update constructInf to accommodate GenomeDataFrame class for featureData
bump version to 1.13.7
Add doRUnit.R
Add celfile-utils.Rd
Streamlne some of the Rd files
add validCEL function that checks whether all celfiles can be read
getFeatureData returns GenomeAnnotatedDataFrame
Remove imports from methods. Remove pdf of illumina_copynumber.pdf (large file) and copynumber.pdf
getFeatureDAta returns GenomeAnnotatedDataFrame
Remove separate vignette for copy number in inst/scripts. Include copynumber section in both affy and illumina pipelines.
update documentation files for genotype.Illumina, preprocessInf, and genotypeInf (cdfName added as argument. Indicate that 'batch' should be a character string)
pass cdfName to genotypeInf and preprocessInf
add unitTests and cn-functions for 'simple usage'
Combine AffyPreprocess and copynumber. Combine IlluminaPreprocess and copynumber
remove depency on ff to allow installation on my mac
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@62108 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -5,6 +5,7 @@ |
5 | 5 |
\documentclass{article} |
6 | 6 |
\usepackage{graphicx} |
7 | 7 |
\usepackage{natbib} |
8 |
+\usepackage{amsmath} |
|
8 | 9 |
\usepackage[margin=1in]{geometry} |
9 | 10 |
\newcommand{\crlmm}{\Rpackage{crlmm}} |
10 | 11 |
\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
... | ... |
@@ -84,15 +85,15 @@ ocProbesets(150e3) |
84 | 85 |
ocSamples(500) |
85 | 86 |
@ |
86 | 87 |
|
87 |
-\section{Initializing a container for storing processed data} |
|
88 |
- |
|
89 |
-This section will initialize a container for storing processed forms |
|
90 |
-of the data, including the normalized intensities for the A and B |
|
91 |
-alleles and the CRLMM genotype calls and confidence scores. In |
|
92 |
-addition, the container will store information on the markers |
|
93 |
-(physical position, chromosome, and a SNP indicator), the batch, and |
|
94 |
-the samples (e.g., gender). To construct this container for Infinium |
|
95 |
-platforms, several steps are required. |
|
88 |
+%\section{Initializing a container for storing processed data} |
|
89 |
+% |
|
90 |
+%This section will initialize a container for storing processed forms |
|
91 |
+%of the data, including the normalized intensities for the A and B |
|
92 |
+%alleles and the CRLMM genotype calls and confidence scores. In |
|
93 |
+%addition, the container will store information on the markers |
|
94 |
+%(physical position, chromosome, and a SNP indicator), the batch, and |
|
95 |
+%the samples (e.g., gender). To construct this container for Infinium |
|
96 |
+%platforms, several steps are required. |
|
96 | 97 |
|
97 | 98 |
We begin by specifying the path containing the raw IDAT files for a |
98 | 99 |
set of samples from the Infinium 370k platform. |
... | ... |
@@ -143,129 +144,140 @@ processed at similar times (e.g., within a few weeks). |
143 | 144 |
batch <- rep("1", nrow(samplesheet)) |
144 | 145 |
@ |
145 | 146 |
|
146 |
-Finally, we initialize an object of class \Robject{CNSet} using the |
|
147 |
-function \Rfunction{constructInf}. |
|
148 |
- |
|
149 |
-<<container,cache=TRUE>>= |
|
150 |
-cnSet <- constructInf(sampleSheet=samplesheet, |
|
151 |
- arrayNames=arrayNames, |
|
152 |
- batch=batch, |
|
153 |
- arrayInfoColNames=arrayInfo, |
|
154 |
- cdfName=cdfName, |
|
155 |
- verbose=TRUE, |
|
156 |
- saveDate=TRUE) |
|
157 |
-@ |
|
158 |
- |
|
159 |
-A concise summary of the object's contents can be viewed with the |
|
160 |
-\Rfunction{print} function. |
|
161 |
- |
|
162 |
-<<cnset>>= |
|
163 |
-print(cnSet) |
|
164 |
-@ |
|
165 |
- |
|
166 |
-Note that the above object does not yet contain any processed data |
|
167 |
-(only \verb+NA+'s). As the elements of the \verb+assayData+ slot are |
|
168 |
-\Robject{ff} objects (not matrices), several \verb+.ff+ files now |
|
169 |
-appear in the \verb+outdir+. The \verb+.ff+ files should not be |
|
170 |
-removed and can be listed using the \R{} function |
|
171 |
-\Rfunction{list.files}. %For the most part, the \emph{appearance} that |
|
147 |
+%Finally, we initialize an object of class \Robject{CNSet} using the |
|
148 |
+%function \Rfunction{constructInf}. |
|
149 |
+% |
|
150 |
+%<<container,cache=TRUE>>= |
|
151 |
+%cnSet <- constructInf(sampleSheet=samplesheet, |
|
152 |
+% arrayNames=arrayNames, |
|
153 |
+% batch=batch, |
|
154 |
+% arrayInfoColNames=arrayInfo, |
|
155 |
+% cdfName=cdfName, |
|
156 |
+% verbose=TRUE, |
|
157 |
+% saveDate=TRUE) |
|
158 |
+%@ |
|
159 |
+% |
|
160 |
+%A concise summary of the object's contents can be viewed with the |
|
161 |
+%\Rfunction{print} function. |
|
162 |
+% |
|
163 |
+%<<cnset>>= |
|
164 |
+%print(cnSet) |
|
165 |
+%@ |
|
166 |
+% |
|
167 |
+%Note that the above object does not yet contain any processed data |
|
168 |
+%(only \verb+NA+'s). As the elements of the \verb+assayData+ slot are |
|
169 |
+%\Robject{ff} objects (not matrices), several \verb+.ff+ files now |
|
170 |
+%appear in the \verb+outdir+. The \verb+.ff+ files should not be |
|
171 |
+%removed and can be listed using the \R{} function |
|
172 |
+%\Rfunction{list.files}. %For the most part, the \emph{appearance} that |
|
172 | 173 |
%the data is stored in memory is preserved. |
173 |
- |
|
174 |
-<<listff>>= |
|
175 |
-sapply(assayData(cnSet), function(x) class(x)[1]) |
|
176 |
-list.files(outdir, pattern=".ff")[1:5] |
|
177 |
-@ |
|
178 |
- |
|
179 |
-\section{Preprocessing} |
|
174 |
+% |
|
175 |
+%<<listff>>= |
|
176 |
+%sapply(assayData(cnSet), function(x) class(x)[1]) |
|
177 |
+%list.files(outdir, pattern=".ff")[1:5] |
|
178 |
+%@ |
|
179 |
+% |
|
180 |
+\section{Preprocessing and genotyping} |
|
180 | 181 |
|
181 | 182 |
The raw intensities from the Infinium IDAT files are read and |
182 | 183 |
normalized using the function \Rfunction{preprocessInf}. The function |
183 | 184 |
\Rfunction{preprocessInf} returns a \Robject{ff} object containing the |
184 | 185 |
parameters for the mixture model used by the CRLMM genotyping |
185 |
-algorithm. |
|
186 |
- |
|
187 |
-<<preprocess,cache=TRUE, results=hide>>= |
|
188 |
-mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo) |
|
189 |
-@ |
|
190 |
- |
|
191 |
-<<showMixtureParams>>= |
|
192 |
-invisible(open(mixtureParams)) |
|
193 |
-str(mixtureParams[]) |
|
194 |
-invisible(close(mixtureParams)) |
|
195 |
-@ |
|
196 |
- |
|
197 |
-Note that the normalized intensities for the A and B alleles are no |
|
198 |
-longer \verb+NA+s and can be inspected using the methods \Rfunction{A} |
|
199 |
-and \Rfunction{B}, respectively. |
|
200 |
- |
|
201 |
-<<intensities>>= |
|
202 |
-invisible(open(A(cnSet))) |
|
203 |
-invisible(open(B(cnSet))) |
|
204 |
-as.matrix(A(cnSet)[1:5, 1:5]) |
|
205 |
-as.matrix(B(cnSet)[1:5, 1:5]) |
|
206 |
-invisible(close(A(cnSet))) |
|
207 |
-invisible(close(B(cnSet))) |
|
208 |
-@ |
|
209 |
- |
|
210 |
-\section{Genotyping} |
|
211 |
- |
|
212 |
-CRLMM genotype calls and confidence scores are estimated using the |
|
213 |
-function \Rfunction{genotypeInf}. |
|
214 |
- |
|
215 |
-<<genotype,cache=TRUE>>= |
|
216 |
-updated <- genotypeInf(cnSet, mixtureParams=mixtureParams) |
|
217 |
-@ |
|
218 |
-\vspace{-0.5em} |
|
219 |
-<<>>= |
|
220 |
-updated |
|
221 |
-@ |
|
222 |
- |
|
223 |
-\textbf{Wrapper:} As an alternative to calling the functions |
|
224 |
-\Rfunction{constructInf}, \Rfunction{preprocessInf} and |
|
225 |
-\Rfunction{genotypeInf} in sequence, a convenience function called |
|
226 |
-\Rfunction{genotype.Illumina} is a wrapper for the above functions and |
|
227 |
-produces identical results. |
|
186 |
+algorithm. Following preprocessing, the \Rfunction{genotypeInf} |
|
187 |
+genotypes the samples. The function \Rfunction{genotype.Illumina} is a |
|
188 |
+wrapper to the above functions and returns an object of class |
|
189 |
+\Rclass{CNSet}. |
|
190 |
+ |
|
191 |
+%<<preprocess,cache=TRUE, results=hide>>= |
|
192 |
+%mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, |
|
193 |
+% arrayNames=arrayNames, |
|
194 |
+% arrayInfoColNames=arrayInfo, |
|
195 |
+% cdfName=cdfName) |
|
196 |
+%@ |
|
197 |
+% |
|
198 |
+%<<showMixtureParams>>= |
|
199 |
+%invisible(open(mixtureParams)) |
|
200 |
+%str(mixtureParams[]) |
|
201 |
+%invisible(close(mixtureParams)) |
|
202 |
+%@ |
|
203 |
+% |
|
204 |
+%Note that the normalized intensities for the A and B alleles are no |
|
205 |
+%longer \verb+NA+s and can be inspected using the methods \Rfunction{A} |
|
206 |
+%and \Rfunction{B}, respectively. |
|
207 |
+% |
|
208 |
+%<<intensities>>= |
|
209 |
+%invisible(open(A(cnSet))) |
|
210 |
+%invisible(open(B(cnSet))) |
|
211 |
+%as.matrix(A(cnSet)[1:5, 1:5]) |
|
212 |
+%as.matrix(B(cnSet)[1:5, 1:5]) |
|
213 |
+%invisible(close(A(cnSet))) |
|
214 |
+%invisible(close(B(cnSet))) |
|
215 |
+%@ |
|
216 |
+% |
|
217 |
+%\section{Genotyping} |
|
218 |
+% |
|
219 |
+%CRLMM genotype calls and confidence scores are estimated using the |
|
220 |
+%function \Rfunction{genotypeInf}. |
|
221 |
+% |
|
222 |
+%<<genotype,cache=TRUE>>= |
|
223 |
+%updated <- genotypeInf(cnSet, mixtureParams=mixtureParams, cdfName=cdfName) |
|
224 |
+%@ |
|
225 |
+%\vspace{-0.5em} |
|
226 |
+%<<>>= |
|
227 |
+%updated |
|
228 |
+%@ |
|
229 |
+% |
|
230 |
+%\textbf{Wrapper:} As an alternative to calling the functions |
|
231 |
+%\Rfunction{constructInf}, \Rfunction{preprocessInf} and |
|
232 |
+%\Rfunction{genotypeInf} in sequence, a convenience function called |
|
233 |
+%\Rfunction{genotype.Illumina} is a wrapper for the above functions and |
|
234 |
+%produces identical results. |
|
228 | 235 |
|
229 | 236 |
<<genotype.Illumina,cache=TRUE,results=hide>>= |
230 |
-cnSet2 <- genotype.Illumina(sampleSheet=samplesheet, |
|
237 |
+cnSet <- genotype.Illumina(sampleSheet=samplesheet, |
|
231 | 238 |
arrayNames=arrayNames, |
232 | 239 |
arrayInfoColNames=arrayInfo, |
233 | 240 |
cdfName="human370v1c", |
234 | 241 |
batch=batch) |
235 | 242 |
@ |
236 | 243 |
|
237 |
-<<checkIdenticalCalls>>= |
|
238 |
-invisible(open(calls(cnSet))) |
|
239 |
-invisible(open(calls(cnSet2))) |
|
240 |
-snp.index <- which(isSnp(cnSet)) |
|
241 |
-identical(calls(cnSet)[snp.index, 1:20], calls(cnSet2)[snp.index, 1:20]) |
|
242 |
-invisible(close(calls(cnSet))) |
|
243 |
-invisible(close(calls(cnSet2))) |
|
244 |
-@ |
|
245 | 244 |
|
246 |
-To fully remove the data associated with the \Robject{cnSet2} object, |
|
247 |
-one should use the \Rfunction{delete} function in the \Rpackage{ff} |
|
248 |
-package followed by the \Rfunction{rm} function. The following code |
|
249 |
-is not evaluated is it would change the results of the cached |
|
250 |
-computations in the previous code chunk. |
|
245 |
+Note, to fully remove the data associated with the \Robject{cnSet2} |
|
246 |
+object, one should use the \Rfunction{delete} function in the |
|
247 |
+\Rpackage{ff} package followed by the \Rfunction{rm} function. The |
|
248 |
+following code is not evaluated is it would change the results of the |
|
249 |
+cached computations in the previous code chunk. |
|
251 | 250 |
|
252 | 251 |
<<delete, eval=FALSE>>= |
253 |
-lapply(assayData(cnSet2), delete) |
|
254 |
-lapply(batchStatistics(cnSet2), delete) |
|
255 |
-delete(cnSet2$gender) |
|
256 |
-delete(cnSet2$SNR) |
|
257 |
-delete(cnSet2$SKW) |
|
258 |
-rm(cnSet2) |
|
252 |
+lapply(assayData(cnSet), delete) |
|
253 |
+lapply(batchStatistics(cnSet), delete) |
|
254 |
+delete(cnSet$gender) |
|
255 |
+delete(cnSet$SNR) |
|
256 |
+delete(cnSet$SKW) |
|
257 |
+rm(cnSet) |
|
259 | 258 |
@ |
260 | 259 |
|
261 |
-Users can proceed to the \verb+copynumber+ vignette for copy number |
|
262 |
-analyses. See the \verb+Infrastructure+ vignette for additional |
|
263 |
-details on the \Robject{CNSet} class, including an overview of the |
|
264 |
-available accessors. |
|
260 |
+%\section{Copy number estimation} |
|
261 |
+ |
|
262 |
+\SweaveInput{copynumber} |
|
265 | 263 |
|
266 | 264 |
\section{Session information} |
267 | 265 |
<<sessionInfo, results=tex>>= |
268 | 266 |
toLatex(sessionInfo()) |
269 | 267 |
@ |
270 | 268 |
|
269 |
+\begin{figure}[f] |
|
270 |
+ \begin{center} |
|
271 |
+ \includegraphics[width=0.6\textwidth]{IlluminaPreprocessCN-snr.pdf} |
|
272 |
+ \caption{The signal to noise ratio (SNR) for 180 HapMap samples. For |
|
273 |
+ Affymetrix platforms, SNR values below 5 can indicate possible |
|
274 |
+ problems with sample quality. In some circumstances, it may be |
|
275 |
+ more helpful to exclude samples with poor DNA quality.} |
|
276 |
+\end{center} |
|
277 |
+\end{figure} |
|
278 |
+ |
|
279 |
+ |
|
280 |
+\bibliography{refs} |
|
281 |
+\bibliographystyle{plain} |
|
282 |
+ |
|
271 | 283 |
\end{document} |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54327 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -184,8 +184,11 @@ normalized using the function \Rfunction{preprocessInf}. The function |
184 | 184 |
parameters for the mixture model used by the CRLMM genotyping |
185 | 185 |
algorithm. |
186 | 186 |
|
187 |
-<<preprocess,cache=TRUE>>= |
|
187 |
+<<preprocess,cache=TRUE, results=hide>>= |
|
188 | 188 |
mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo) |
189 |
+@ |
|
190 |
+ |
|
191 |
+<<showMixtureParams>>= |
|
189 | 192 |
invisible(open(mixtureParams)) |
190 | 193 |
str(mixtureParams[]) |
191 | 194 |
invisible(close(mixtureParams)) |
... | ... |
@@ -211,20 +214,10 @@ function \Rfunction{genotypeInf}. |
211 | 214 |
|
212 | 215 |
<<genotype,cache=TRUE>>= |
213 | 216 |
updated <- genotypeInf(cnSet, mixtureParams=mixtureParams) |
214 |
-print(updated) |
|
215 | 217 |
@ |
216 |
- |
|
217 |
-The posterior probabilities for the genotype calls in the |
|
218 |
-\verb+callProbability+ element of the \verb+assayData+ are stored as |
|
219 |
-integers to reduce the file size on disk. The scores can be |
|
220 |
-transformed to the probability scale using the \Rfunction{i2p} |
|
221 |
-function as illustrated in the following code chunk. |
|
222 |
- |
|
223 |
-<<callprobs>>= |
|
224 |
-invisible(open(snpCallProbability(cnSet))) |
|
225 |
-callProbs <- as.matrix(snpCallProbability(cnSet)[1:5, 1:5]) |
|
226 |
-i2p(callProbs) |
|
227 |
-invisible(close(snpCallProbability(cnSet))) |
|
218 |
+\vspace{-0.5em} |
|
219 |
+<<>>= |
|
220 |
+updated |
|
228 | 221 |
@ |
229 | 222 |
|
230 | 223 |
\textbf{Wrapper:} As an alternative to calling the functions |
... | ... |
@@ -265,27 +258,14 @@ delete(cnSet2$SKW) |
265 | 258 |
rm(cnSet2) |
266 | 259 |
@ |
267 | 260 |
|
268 |
-Users may now proceed to the CopyNumber vignette for copy number |
|
269 |
-analyses. |
|
270 |
- |
|
271 |
- |
|
261 |
+Users can proceed to the \verb+copynumber+ vignette for copy number |
|
262 |
+analyses. See the \verb+Infrastructure+ vignette for additional |
|
263 |
+details on the \Robject{CNSet} class, including an overview of the |
|
264 |
+available accessors. |
|
272 | 265 |
|
273 | 266 |
\section{Session information} |
274 | 267 |
<<sessionInfo, results=tex>>= |
275 | 268 |
toLatex(sessionInfo()) |
276 | 269 |
@ |
277 | 270 |
|
278 |
- |
|
279 |
-<<>>= |
|
280 |
-cnset.updated <- crlmmCopynumber(cnSet) |
|
281 |
-@ |
|
282 |
- |
|
283 |
-<<>>= |
|
284 |
-snp.index <- which(chromosome(cnSet)==1) |
|
285 |
-cnset2 <- cnSet[snp.index, ] |
|
286 |
-trace(crlmmCopynumber, browser) |
|
287 |
-object <- crlmmCopynumber(cnset2) |
|
288 |
-## elements no longer ff |
|
289 |
-@ |
|
290 |
- |
|
291 | 271 |
\end{document} |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54171 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,291 @@ |
1 |
+%\VignetteIndexEntry{IlluminaPreprocessCN Vignette for Illumina} |
|
2 |
+%\VignetteDepends{crlmm, ff, cacheSweave} |
|
3 |
+%\VignetteKeywords{crlmm, illumina, copy number, SNP} |
|
4 |
+%\VignettePackage{crlmm} |
|
5 |
+\documentclass{article} |
|
6 |
+\usepackage{graphicx} |
|
7 |
+\usepackage{natbib} |
|
8 |
+\usepackage[margin=1in]{geometry} |
|
9 |
+\newcommand{\crlmm}{\Rpackage{crlmm}} |
|
10 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
11 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
12 |
+\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
13 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
14 |
+\newcommand{\Rpackage}[1]{{\textsf{#1}}} |
|
15 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
16 |
+\newcommand{\oligo}{\Rpackage{oligo }} |
|
17 |
+\newcommand{\R}{\textsf{R}} |
|
18 |
+\newcommand{\ff}{\Rpackage{ff}} |
|
19 |
+ |
|
20 |
+\begin{document} |
|
21 |
+\title{Preprocessing and Genotyping Illumina Arrays for Copy Number Analysis} |
|
22 |
+ |
|
23 |
+\date{\today} |
|
24 |
+ |
|
25 |
+\author{Rob Scharpf} |
|
26 |
+\maketitle |
|
27 |
+ |
|
28 |
+ |
|
29 |
+\begin{abstract} |
|
30 |
+ |
|
31 |
+ This vignette illustrates the steps required prior to copy number |
|
32 |
+ analysis for Infinium platforms. Specifically, we require |
|
33 |
+ construction of a container to store processed forms of the raw |
|
34 |
+ data, preprocessing to normalize the arrays, and genotyping using |
|
35 |
+ the CRLMM algorithm. After completing these steps, users can refer |
|
36 |
+ to the \verb+copynumber+ vignette. |
|
37 |
+ |
|
38 |
+\end{abstract} |
|
39 |
+ |
|
40 |
+ |
|
41 |
+\section{Set up} |
|
42 |
+ |
|
43 |
+<<crlmm, results=hide, echo=FALSE>>= |
|
44 |
+library(crlmm) |
|
45 |
+options(width=70) |
|
46 |
+options(continue=" ") |
|
47 |
+@ |
|
48 |
+ |
|
49 |
+%\textbf{Supported platforms:} The supported Infinium platforms are |
|
50 |
+%those for which a corresponding annotation package is available. The |
|
51 |
+%annotation packages contain information on the markers, such as |
|
52 |
+%physical position and chromosome, as well as pre-computed parameters |
|
53 |
+%estimated from HapMap used during the preprocessing and genotyping |
|
54 |
+%steps. Currently supported Infinium platforms are listed in the |
|
55 |
+%following code chunk. |
|
56 |
+The following codechunk declares a directory for saving \Robject{ff} |
|
57 |
+files that will contain the normalized intensities and the genotype |
|
58 |
+calls. |
|
59 |
+ |
|
60 |
+<<ldpath,results=hide>>= |
|
61 |
+library(ff) |
|
62 |
+if(getRversion() < "2.13.0"){ |
|
63 |
+ rpath <- getRversion() |
|
64 |
+} else rpath <- "trunk" |
|
65 |
+outdir <- paste("/thumper/ctsa/snpmicroarray/rs/ProcessedData/crlmm/", rpath, "/illumina_vignette", sep="") |
|
66 |
+ldPath(outdir) |
|
67 |
+dir.create(outdir, recursive=TRUE, showWarnings=FALSE) |
|
68 |
+@ |
|
69 |
+ |
|
70 |
+We will also store cached computations in the directory \verb+outdir+. |
|
71 |
+ |
|
72 |
+<<cacheSweave, echo=FALSE, results=hide>>= |
|
73 |
+library(cacheSweave) |
|
74 |
+setCacheDir(outdir) |
|
75 |
+@ |
|
76 |
+ |
|
77 |
+We declare that \crlmm{} should process 150,000 markers at a time |
|
78 |
+and/or 500 samples at a time (when possible) to reduce the memory |
|
79 |
+footprint. As our example dataset in this vignette contains fewer |
|
80 |
+than 500 samples, all samples will be processed simultaneously. |
|
81 |
+ |
|
82 |
+<<ram>>= |
|
83 |
+ocProbesets(150e3) |
|
84 |
+ocSamples(500) |
|
85 |
+@ |
|
86 |
+ |
|
87 |
+\section{Initializing a container for storing processed data} |
|
88 |
+ |
|
89 |
+This section will initialize a container for storing processed forms |
|
90 |
+of the data, including the normalized intensities for the A and B |
|
91 |
+alleles and the CRLMM genotype calls and confidence scores. In |
|
92 |
+addition, the container will store information on the markers |
|
93 |
+(physical position, chromosome, and a SNP indicator), the batch, and |
|
94 |
+the samples (e.g., gender). To construct this container for Infinium |
|
95 |
+platforms, several steps are required. |
|
96 |
+ |
|
97 |
+We begin by specifying the path containing the raw IDAT files for a |
|
98 |
+set of samples from the Infinium 370k platform. |
|
99 |
+ |
|
100 |
+<<datadir>>= |
|
101 |
+datadir <- "/thumper/ctsa/snpmicroarray/illumina/IDATS/370k" |
|
102 |
+@ |
|
103 |
+ |
|
104 |
+For Infinium platforms, an Illumina sample sheet containing |
|
105 |
+information for reading the raw IDAT files is required. Please refer |
|
106 |
+to the BeadStudio Genotyping guide, Appendix A, for additional |
|
107 |
+information. The following code reads in the samplesheet for the IDAT |
|
108 |
+files on our local server. |
|
109 |
+ |
|
110 |
+<<samplesheet>>= |
|
111 |
+samplesheet = read.csv(file.path(datadir, "HumanHap370Duo_Sample_Map.csv"), header=TRUE, as.is=TRUE) |
|
112 |
+@ |
|
113 |
+ |
|
114 |
+For the purposes of this vignette, we indicate that we only wish to |
|
115 |
+process a subset of the arrays. For our dataset, the file extensions |
|
116 |
+are `Grn.dat' and `Red.idat'. We store the complete path to the |
|
117 |
+filename without the file extension in the \Robject{arrayNames} and |
|
118 |
+check that all of the green and red IDAT files exists. |
|
119 |
+ |
|
120 |
+<<subsetArrays>>= |
|
121 |
+samplesheet <- samplesheet[-c(28:46,61:75,78:79), ] |
|
122 |
+arrayNames <- file.path(datadir, unique(samplesheet[, "SentrixPosition"])) |
|
123 |
+all(file.exists(paste(arrayNames, "_Grn.idat", sep=""))) |
|
124 |
+all(file.exists(paste(arrayNames, "_Red.idat", sep=""))) |
|
125 |
+arrayInfo <- list(barcode=NULL, position="SentrixPosition") |
|
126 |
+@ |
|
127 |
+ |
|
128 |
+All supported platforms have a corresponding annotation package. The |
|
129 |
+appropriate annotation package is specified by the platform identifier |
|
130 |
+without the \verb+Crlmm+ postfix. |
|
131 |
+ |
|
132 |
+<<cdfname>>= |
|
133 |
+cdfName <- "human370v1c" |
|
134 |
+@ |
|
135 |
+ |
|
136 |
+Next, we construct a character vector that specifies the batch for |
|
137 |
+each of the \Sexpr{length(arrayNames)} arrays. Here, we have a small |
|
138 |
+dataset and process the samples in a single batch. Processing the |
|
139 |
+samples as a single batch is generally reasonable if the samples were |
|
140 |
+processed at similar times (e.g., within a few weeks). |
|
141 |
+ |
|
142 |
+<<batch>>= |
|
143 |
+batch <- rep("1", nrow(samplesheet)) |
|
144 |
+@ |
|
145 |
+ |
|
146 |
+Finally, we initialize an object of class \Robject{CNSet} using the |
|
147 |
+function \Rfunction{constructInf}. |
|
148 |
+ |
|
149 |
+<<container,cache=TRUE>>= |
|
150 |
+cnSet <- constructInf(sampleSheet=samplesheet, |
|
151 |
+ arrayNames=arrayNames, |
|
152 |
+ batch=batch, |
|
153 |
+ arrayInfoColNames=arrayInfo, |
|
154 |
+ cdfName=cdfName, |
|
155 |
+ verbose=TRUE, |
|
156 |
+ saveDate=TRUE) |
|
157 |
+@ |
|
158 |
+ |
|
159 |
+A concise summary of the object's contents can be viewed with the |
|
160 |
+\Rfunction{print} function. |
|
161 |
+ |
|
162 |
+<<cnset>>= |
|
163 |
+print(cnSet) |
|
164 |
+@ |
|
165 |
+ |
|
166 |
+Note that the above object does not yet contain any processed data |
|
167 |
+(only \verb+NA+'s). As the elements of the \verb+assayData+ slot are |
|
168 |
+\Robject{ff} objects (not matrices), several \verb+.ff+ files now |
|
169 |
+appear in the \verb+outdir+. The \verb+.ff+ files should not be |
|
170 |
+removed and can be listed using the \R{} function |
|
171 |
+\Rfunction{list.files}. %For the most part, the \emph{appearance} that |
|
172 |
+%the data is stored in memory is preserved. |
|
173 |
+ |
|
174 |
+<<listff>>= |
|
175 |
+sapply(assayData(cnSet), function(x) class(x)[1]) |
|
176 |
+list.files(outdir, pattern=".ff")[1:5] |
|
177 |
+@ |
|
178 |
+ |
|
179 |
+\section{Preprocessing} |
|
180 |
+ |
|
181 |
+The raw intensities from the Infinium IDAT files are read and |
|
182 |
+normalized using the function \Rfunction{preprocessInf}. The function |
|
183 |
+\Rfunction{preprocessInf} returns a \Robject{ff} object containing the |
|
184 |
+parameters for the mixture model used by the CRLMM genotyping |
|
185 |
+algorithm. |
|
186 |
+ |
|
187 |
+<<preprocess,cache=TRUE>>= |
|
188 |
+mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=samplesheet, arrayNames=arrayNames, arrayInfoColNames=arrayInfo) |
|
189 |
+invisible(open(mixtureParams)) |
|
190 |
+str(mixtureParams[]) |
|
191 |
+invisible(close(mixtureParams)) |
|
192 |
+@ |
|
193 |
+ |
|
194 |
+Note that the normalized intensities for the A and B alleles are no |
|
195 |
+longer \verb+NA+s and can be inspected using the methods \Rfunction{A} |
|
196 |
+and \Rfunction{B}, respectively. |
|
197 |
+ |
|
198 |
+<<intensities>>= |
|
199 |
+invisible(open(A(cnSet))) |
|
200 |
+invisible(open(B(cnSet))) |
|
201 |
+as.matrix(A(cnSet)[1:5, 1:5]) |
|
202 |
+as.matrix(B(cnSet)[1:5, 1:5]) |
|
203 |
+invisible(close(A(cnSet))) |
|
204 |
+invisible(close(B(cnSet))) |
|
205 |
+@ |
|
206 |
+ |
|
207 |
+\section{Genotyping} |
|
208 |
+ |
|
209 |
+CRLMM genotype calls and confidence scores are estimated using the |
|
210 |
+function \Rfunction{genotypeInf}. |
|
211 |
+ |
|
212 |
+<<genotype,cache=TRUE>>= |
|
213 |
+updated <- genotypeInf(cnSet, mixtureParams=mixtureParams) |
|
214 |
+print(updated) |
|
215 |
+@ |
|
216 |
+ |
|
217 |
+The posterior probabilities for the genotype calls in the |
|
218 |
+\verb+callProbability+ element of the \verb+assayData+ are stored as |
|
219 |
+integers to reduce the file size on disk. The scores can be |
|
220 |
+transformed to the probability scale using the \Rfunction{i2p} |
|
221 |
+function as illustrated in the following code chunk. |
|
222 |
+ |
|
223 |
+<<callprobs>>= |
|
224 |
+invisible(open(snpCallProbability(cnSet))) |
|
225 |
+callProbs <- as.matrix(snpCallProbability(cnSet)[1:5, 1:5]) |
|
226 |
+i2p(callProbs) |
|
227 |
+invisible(close(snpCallProbability(cnSet))) |
|
228 |
+@ |
|
229 |
+ |
|
230 |
+\textbf{Wrapper:} As an alternative to calling the functions |
|
231 |
+\Rfunction{constructInf}, \Rfunction{preprocessInf} and |
|
232 |
+\Rfunction{genotypeInf} in sequence, a convenience function called |
|
233 |
+\Rfunction{genotype.Illumina} is a wrapper for the above functions and |
|
234 |
+produces identical results. |
|
235 |
+ |
|
236 |
+<<genotype.Illumina,cache=TRUE,results=hide>>= |
|
237 |
+cnSet2 <- genotype.Illumina(sampleSheet=samplesheet, |
|
238 |
+ arrayNames=arrayNames, |
|
239 |
+ arrayInfoColNames=arrayInfo, |
|
240 |
+ cdfName="human370v1c", |
|
241 |
+ batch=batch) |
|
242 |
+@ |
|
243 |
+ |
|
244 |
+<<checkIdenticalCalls>>= |
|
245 |
+invisible(open(calls(cnSet))) |
|
246 |
+invisible(open(calls(cnSet2))) |
|
247 |
+snp.index <- which(isSnp(cnSet)) |
|
248 |
+identical(calls(cnSet)[snp.index, 1:20], calls(cnSet2)[snp.index, 1:20]) |
|
249 |
+invisible(close(calls(cnSet))) |
|
250 |
+invisible(close(calls(cnSet2))) |
|
251 |
+@ |
|
252 |
+ |
|
253 |
+To fully remove the data associated with the \Robject{cnSet2} object, |
|
254 |
+one should use the \Rfunction{delete} function in the \Rpackage{ff} |
|
255 |
+package followed by the \Rfunction{rm} function. The following code |
|
256 |
+is not evaluated is it would change the results of the cached |
|
257 |
+computations in the previous code chunk. |
|
258 |
+ |
|
259 |
+<<delete, eval=FALSE>>= |
|
260 |
+lapply(assayData(cnSet2), delete) |
|
261 |
+lapply(batchStatistics(cnSet2), delete) |
|
262 |
+delete(cnSet2$gender) |
|
263 |
+delete(cnSet2$SNR) |
|
264 |
+delete(cnSet2$SKW) |
|
265 |
+rm(cnSet2) |
|
266 |
+@ |
|
267 |
+ |
|
268 |
+Users may now proceed to the CopyNumber vignette for copy number |
|
269 |
+analyses. |
|
270 |
+ |
|
271 |
+ |
|
272 |
+ |
|
273 |
+\section{Session information} |
|
274 |
+<<sessionInfo, results=tex>>= |
|
275 |
+toLatex(sessionInfo()) |
|
276 |
+@ |
|
277 |
+ |
|
278 |
+ |
|
279 |
+<<>>= |
|
280 |
+cnset.updated <- crlmmCopynumber(cnSet) |
|
281 |
+@ |
|
282 |
+ |
|
283 |
+<<>>= |
|
284 |
+snp.index <- which(chromosome(cnSet)==1) |
|
285 |
+cnset2 <- cnSet[snp.index, ] |
|
286 |
+trace(crlmmCopynumber, browser) |
|
287 |
+object <- crlmmCopynumber(cnset2) |
|
288 |
+## elements no longer ff |
|
289 |
+@ |
|
290 |
+ |
|
291 |
+\end{document} |