git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37677 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-2009-03-01 Benilton Carvalho <bcarvalh@jhsph.edu> |
|
1 |
+2009-03-01 Benilton Carvalho <bcarvalh@jhsph.edu> - committed version 1.0.40 |
|
2 | 2 |
|
3 | 3 |
* Added TODO file |
4 | 4 |
|
... | ... |
@@ -9,3 +9,11 @@ |
9 | 9 |
* Files with suffix XYZ-functions.R should have *functions* for methodology XYZ |
10 | 10 |
|
11 | 11 |
* Files with suffix XYZ-methods.R should have *methods* for methodology XYZ |
12 |
+ |
|
13 |
+* Exported assayDataNew from Biobase |
|
14 |
+ |
|
15 |
+* Added inst/scripts to store vignettes that can't be built by BioC |
|
16 |
+due to the fact that they depend on external data not available as |
|
17 |
+BioC data packages |
|
18 |
+ |
|
19 |
+* Added crlmmSet class and calls/confs methods |
... | ... |
@@ -1,14 +1,14 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling via CRLMM Algorithm |
4 |
-Version: 1.0.38 |
|
4 |
+Version: 1.0.40 |
|
5 | 5 |
Date: 2008-12-28 |
6 | 6 |
Author: Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays. |
9 |
-Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase |
|
10 | 9 |
License: Artistic-2.0 |
11 |
-Suggests: hapmapsnp5, genomewidesnp5Crlmm, genomewidesnp6Crlmm, methods |
|
10 |
+Imports: affyio, preprocessCore, utils, stats, genefilter, splines, Biobase |
|
11 |
+Suggests: hapmapsnp5, genomewidesnp5Crlmm, genomewidesnp6Crlmm, methods, GGdata, snpMatrix |
|
12 | 12 |
Collate: AllClasses.R |
13 | 13 |
crlmm-methods.R |
14 | 14 |
cnrma-functions.R |
... | ... |
@@ -2,10 +2,10 @@ useDynLib("crlmm", .registration=TRUE) |
2 | 2 |
export("crlmm", "list.celfiles", "computeCnBatch") |
3 | 3 |
exportMethods("list2crlmmSet", "calls", "confs") |
4 | 4 |
exportClasses("crlmmSet") |
5 |
+import(Biobase) |
|
5 | 6 |
importFrom(affyio, read.celfile.header, read.celfile) |
6 | 7 |
importFrom(preprocessCore, normalize.quantiles.use.target) |
7 | 8 |
importFrom(utils, data, packageDescription, setTxtProgressBar, txtProgressBar) |
8 | 9 |
importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, sd) |
9 |
-importFrom(Biobase, rowMedians) |
|
10 | 10 |
importFrom(genefilter, rowSds) |
11 | 11 |
|
... | ... |
@@ -3,11 +3,27 @@ |
3 | 3 |
##################################### |
4 | 4 |
- Decide on output format (BC vote: eSet-like) |
5 | 5 |
- Helper to convert to snpMatrix |
6 |
-- Vignette with non-trivial use of crlmm (use Vince's) |
|
7 |
-- Add RS ids |
|
6 |
+- Add RS ids to annotation packages |
|
8 | 7 |
- Allele plots |
9 | 8 |
- M v S plots |
10 | 9 |
|
10 |
+ |
|
11 | 11 |
##################################### |
12 | 12 |
### FOR CNRMA |
13 | 13 |
##################################### |
14 |
+- fix the following 2 items |
|
15 |
+* checking R code for possible problems ... NOTE |
|
16 |
+cnrma: no visible global function definition for ‘getCnvFid’ |
|
17 |
+cnrma: no visible binding for global variable ‘reference’ |
|
18 |
+coefs: no visible binding for global variable ‘nu’ |
|
19 |
+coefs: no visible binding for global variable ‘phi’ |
|
20 |
+coefs: no visible binding for global variable ‘nu.se’ |
|
21 |
+coefs: no visible binding for global variable ‘phi.se’ |
|
22 |
+ |
|
23 |
+* checking for unstated dependencies in R code ... WARNING |
|
24 |
+'library' or 'require' calls not declared from: |
|
25 |
+ affyio splines Biobase genefilter |
|
26 |
+ |
|
27 |
+* checking for missing documentation entries ... WARNING |
|
28 |
+Undocumented code objects: |
|
29 |
+ computeCnBatch |
14 | 30 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,10 @@ |
1 |
+all: copynumber crlmmDownstream clean |
|
2 |
+ |
|
3 |
+copynumber: copynumber.tex |
|
4 |
+ cp -p ../scripts/copynumber.pdf . |
|
5 |
+ |
|
6 |
+crlmmDownstream: crlmmDownstream.tex |
|
7 |
+ cp -p ../scripts/crlmmDownstream.pdf . |
|
8 |
+ |
|
9 |
+clean: |
|
10 |
+ -$(RM) -f *.out *.bbl *.log *.aux *.blg *.brf *.toc *.tex |
... | ... |
@@ -1,378 +1,11 @@ |
1 |
-%\VignetteIndexEntry{crlmm copy number Vignette} |
|
1 |
+%\VignetteIndexEntry{crlmm Vignette - Copy Number} |
|
2 | 2 |
%\VignetteDepends{crlmm, genomewidesnp6Crlmm} |
3 | 3 |
%\VignetteKeywords{crlmm, SNP 6} |
4 | 4 |
%\VignettePackage{crlmm} |
5 |
-\documentclass{article} |
|
6 |
-\usepackage{graphicx} |
|
7 |
-\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
8 |
-\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
9 |
-\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
10 |
-\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
11 |
-\newcommand{\Rpackage}[1]{{\textsf{#1}}} |
|
12 |
-\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
13 |
-\newcommand{\oligo}{\Rpackage{oligo }} |
|
14 |
-\newcommand{\R}{\textsf{R}} |
|
15 | 5 |
|
6 |
+%%% The real .Rnw is under inst/scripts. |
|
7 |
+%%% It cannot be added here because it depends on external |
|
8 |
+%%% data not yet available as a BioC package |
|
9 |
+\documentclass{article} |
|
16 | 10 |
\begin{document} |
17 |
-\title{Estimating copy number for Affymetrix 6.0 with the crlmm Package} |
|
18 |
-\date{February, 2009} |
|
19 |
-\author{Rob Scharpf} |
|
20 |
-\maketitle |
|
21 |
- |
|
22 |
-<<setup, echo=FALSE, results=hide>>= |
|
23 |
-options(width=60) |
|
24 |
-options(continue=" ") |
|
25 |
-options(prompt="R> ") |
|
26 |
-@ |
|
27 |
- |
|
28 |
-\section{Estimating copy number} |
|
29 |
- |
|
30 |
-General requirements: in addition to the \R{} packages indicated |
|
31 |
-below, processing a large number of samples requires a high-end |
|
32 |
-computer with a large amount of RAM. Currently, the maximum number of |
|
33 |
-samples that can be genotyped at once is approximately 2000 and this |
|
34 |
-requires roughly 32G of RAM. |
|
35 |
- |
|
36 |
-\paragraph{Suggested work flow.} I typically submit code for |
|
37 |
-preprocessing and genotyping as a batch job to a computing cluster. |
|
38 |
-When the preprocessing is complete, I pick a chromosome and work |
|
39 |
-interactively from the cluster to calculate copy number and make some |
|
40 |
-of the suggested diagnostic plots. |
|
41 |
- |
|
42 |
-\subsection{Preprocess and genotype the samples} |
|
43 |
- |
|
44 |
-First, load the required libraries. |
|
45 |
- |
|
46 |
-<<requiredPackages>>= |
|
47 |
-library(crlmm) |
|
48 |
-library(genomewidesnp6Crlmm) |
|
49 |
-library(ellipse) |
|
50 |
-@ |
|
51 |
- |
|
52 |
-Specify an output directory and provide the complete path for the CEL |
|
53 |
-file names. |
|
54 |
- |
|
55 |
-<<>>= |
|
56 |
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/GRU-EA" |
|
57 |
-datadir <- "/thumper/ctsa/snpmicroarray/GAIN/Bipolar/GRU-EA" |
|
58 |
-fns <- list.files(datadir, pattern=".CEL", full.names=TRUE) |
|
59 |
-@ |
|
60 |
- |
|
61 |
-Genotype the samples with CRLMM. Submit this job to the cluster and save the |
|
62 |
-output to \Robject{outdir}. |
|
63 |
- |
|
64 |
-<<genotype, eval=FALSE>>= |
|
65 |
-res2 <- crlmm(filenames=fns, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda")) |
|
66 |
-save(res2, file=file.path(outdir, "res2.rda")) |
|
67 |
-@ |
|
68 |
- |
|
69 |
-Quantile normalize the nonpolymorphic probes and save the output: |
|
70 |
- |
|
71 |
-<<quantileNormalizeCnProbes, eval=FALSE>>= |
|
72 |
-res3 <- cnrma(fns) |
|
73 |
-save(res3, file=file.path(outdir, "res3.rda")) |
|
74 |
-@ |
|
75 |
- |
|
76 |
-\subsection{Copy number} |
|
77 |
- |
|
78 |
-Copy number can be assessed one chromosome at a time. Here we specify |
|
79 |
-chromosome 15 and and load a list of indices used to subset the files |
|
80 |
-saved in the previous section. The first element in the list |
|
81 |
-correspond to indices of polymorphic probes on chromosome 15; the |
|
82 |
-second element corresponds to indices of nonpolymorphic probes on |
|
83 |
-chromosome 15. |
|
84 |
- |
|
85 |
-<<chromosomeIndex>>= |
|
86 |
-CHR <- 15 |
|
87 |
-CHR_INDEX <- paste(CHR, "index", sep="") |
|
88 |
-data(list=CHR_INDEX, package="genomewidesnp6Crlmm") |
|
89 |
-str(index) |
|
90 |
-@ |
|
91 |
- |
|
92 |
-Next we load 3 files that were saved from the preprocessing step and |
|
93 |
-then subset these lists using the above indices to extract the |
|
94 |
-preprocessed intensities and genotypes needed for estimating copy |
|
95 |
-number. Specifically, we require 6 items: |
|
96 |
- |
|
97 |
-\begin{itemize} |
|
98 |
- \item quantile-normalized A intensities (I1 x J) |
|
99 |
- \item quantile-normalized B intensities (I1 x J) |
|
100 |
- \item quantile-normalized intensities from nonpolymorphic (NP) probes (I2 x J) |
|
101 |
- \item genotype calls (I1 x J) |
|
102 |
- \item confidence scores of the genotype calls (I1 x J) |
|
103 |
- \item signal to noise ratio (SNR) of the samples (J) |
|
104 |
- \end{itemize} |
|
105 |
- |
|
106 |
- These items are extracted as follows: |
|
107 |
- |
|
108 |
-<<eval=FALSE>>= |
|
109 |
-load(file.path(outdir, "intensities.rda")) |
|
110 |
-load(file.path(outdir, "res2.rda")) |
|
111 |
-load(file.path(outdir, "res3.rda")) |
|
112 |
-@ |
|
113 |
- |
|
114 |
-Depending on the number of samples, the above objects can be large and |
|
115 |
-take a while to load. For the figures in this vignette, I am loading |
|
116 |
-only 5000 SNP-level summaries from chromosome 22 (28MB) and the |
|
117 |
-quantile-normalized intensities for 5000 nonpolymorphic probes on |
|
118 |
-chromosome 22 (8 MB total). |
|
119 |
- |
|
120 |
-<<loadTmpFiles, eval=FALSE, echo=FALSE>>= |
|
121 |
-load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmp.rda") |
|
122 |
-assign("res", tmp, envir=.GlobalEnv) |
|
123 |
-rm(tmp); gc() |
|
124 |
-load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmpcn.rda") |
|
125 |
-A <- res$A |
|
126 |
-B <- res$B |
|
127 |
-calls <- res$calls |
|
128 |
-conf <- res$conf |
|
129 |
-SNR <- res$SNR |
|
130 |
-NP <- res3$NP |
|
131 |
-@ |
|
132 |
- |
|
133 |
-<<snpAndCnSummaries, eval=FALSE>>= |
|
134 |
-A <- res$A |
|
135 |
-B <- res$B |
|
136 |
-calls <- res2$calls |
|
137 |
-conf <- res2$conf |
|
138 |
-SNR <- res2$SNR |
|
139 |
-NP <- res3$NP |
|
140 |
-rm(res, res2, res3) |
|
141 |
-gc() |
|
142 |
-@ |
|
143 |
- |
|
144 |
-Make a histogram of the signal to noise ratio for these samples: |
|
145 |
- |
|
146 |
-<<plotSnr, fig=TRUE>>= |
|
147 |
-hist(SNR, xlab="SNR", main="") |
|
148 |
-@ |
|
149 |
- |
|
150 |
-We suggest excluding samples with a signal to noise ratio less than 5. |
|
151 |
-We then extract the probes (rows) that are on chromosome 15 and the |
|
152 |
-samples (columns) that have a suitable signal to noise ratio. |
|
153 |
- |
|
154 |
-<<subsetCrlmmOutput, eval=FALSE>>= |
|
155 |
-A <- A[index[[1]], SNR > 5] |
|
156 |
-B <- B[index[[1]], SNR > 5] |
|
157 |
-calls <- calls[index[[1]], SNR > 5] |
|
158 |
-conf <- conf[index[[1]], SNR > 5] |
|
159 |
-NP <- NP[index[[2]], SNR > 5] |
|
160 |
-rm(res, res2, res3); gc() |
|
161 |
-@ |
|
162 |
- |
|
163 |
-We want to adjust for batch effects. In our experience, the chemistry |
|
164 |
-plate is a good surrogate for batch, although this should be assessed |
|
165 |
-on a study by study basis. For samples processed by Broad, the plate |
|
166 |
-is indicated by a capitalized five-letter word and is part of the |
|
167 |
-sample name. It is important that the plate (batch) variable is |
|
168 |
-ordered the same as filenames in the preprocessed data. For instance, |
|
169 |
-to indicate plate in this example one can use the command |
|
170 |
- |
|
171 |
-<<specifyBatch>>= |
|
172 |
-sns <- colnames(calls) |
|
173 |
-sns[1] |
|
174 |
-plates <- sapply(sns, function(x) strsplit(x, "_")[[1]][2]) |
|
175 |
-table(plates) |
|
176 |
-@ |
|
177 |
- |
|
178 |
-We are now ready to estimate the copy number for each batch. In the |
|
179 |
-current version of this package, one specifies an environment to which |
|
180 |
-intermediate \R{} objects for copy number estimation are |
|
181 |
-stored. Allele-specific estimates of copy number are also stored in |
|
182 |
-this environment. |
|
183 |
- |
|
184 |
-<<copyNumberByBatch, eval=FALSE>>= |
|
185 |
-e1 <- new.env() |
|
186 |
-computeCnBatch(A=A, |
|
187 |
- B=B, |
|
188 |
- calls=calls, |
|
189 |
- conf=conf, |
|
190 |
- NP=NP, |
|
191 |
- plate=plates, |
|
192 |
- envir=e1, chrom=CHR, DF.PRIOR=75) |
|
193 |
-@ |
|
194 |
- |
|
195 |
-The DF.PRIOR indicates how much we will shrink SNP-specific estimates |
|
196 |
-of the variance and correlation. |
|
197 |
- |
|
198 |
-<<accessingEstimates>>= |
|
199 |
-copyA <- get("CA", e1) |
|
200 |
-copyB <- get("CB", e1) |
|
201 |
-copyT <- (copyA+copyB)/100 |
|
202 |
-copyT[copyT < 0] <- 0 |
|
203 |
-copyT[copyT > 6] <- 6 |
|
204 |
-@ |
|
205 |
- |
|
206 |
-\section{Suggested plots} |
|
207 |
- |
|
208 |
-\paragraph{One sample at a time} |
|
209 |
- |
|
210 |
-<<annotation>>= |
|
211 |
-data(snpProbes, package="genomewidesnp6Crlmm") |
|
212 |
-data(cnProbes, package="genomewidesnp6Crlmm") |
|
213 |
-position <- snpProbes[match(rownames(calls), rownames(snpProbes)), "position"] |
|
214 |
-@ |
|
215 |
- |
|
216 |
-To plot physical position versus copy number for the first sample: |
|
217 |
- |
|
218 |
-<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>= |
|
219 |
-require(SNPchip) |
|
220 |
-par(las=1) |
|
221 |
-plotCytoband(as.character(CHR), ylim=c(0,7), cytoband.ycoords=c(6.5,7), label.cytoband=FALSE, main="Chr 15") |
|
222 |
-points(position, copyT[, 1], pch=".", cex=2, , xaxt="n", col="grey70") |
|
223 |
-axis(1, at=pretty(range(position)), labels=pretty(range(position))/1e6) |
|
224 |
-axis(2, at=0:5, labels=0:5) |
|
225 |
-mtext("position (Mb)", 1, line=2) |
|
226 |
-mtext(expression(C[A] + C[B]), 2, line=2, las=3) |
|
227 |
-@ |
|
228 |
- |
|
229 |
-\begin{figure} |
|
230 |
- \includegraphics[width=0.9\textwidth]{copynumber-oneSample} |
|
231 |
- \caption{Total copy number (y-axis) for chromosome 22 plotted |
|
232 |
- against physical position (x-axis) for one sample. } |
|
233 |
-\end{figure} |
|
234 |
- |
|
235 |
-\paragraph{One SNP at a time} |
|
236 |
- |
|
237 |
-<<intermediateFiles>>= |
|
238 |
-tau2A <- get("tau2A", e1) |
|
239 |
-tau2B <- get("tau2B", e1) |
|
240 |
-sig2A <- get("sig2A", e1) |
|
241 |
-sig2B <- get("sig2B", e1) |
|
242 |
-nuA <- get("nuA", e1) |
|
243 |
-phiA <- get("phiA", e1) |
|
244 |
-nuB <- get("nuB", e1) |
|
245 |
-phiB <- get("phiB", e1) |
|
246 |
-corr <- get("corr", e1) |
|
247 |
-corrA.BB <- get("corrA.BB", e1) |
|
248 |
-corrB.AA <- get("corrA.BB", e1) |
|
249 |
-A <- get("A", e1) |
|
250 |
-B <- get("B", e1) |
|
251 |
-@ |
|
252 |
- |
|
253 |
-Here, we plot the prediction regions for total copy number 2 and 3 for |
|
254 |
-the first plate. Black plotting symbols are estimates from the first |
|
255 |
-plate; light grey are points from other plates. (You could also draw |
|
256 |
-prediction regions for 0-4 copies, but it gets crowded). Notice that |
|
257 |
-there is little evidence of a plate effect for this SNP. |
|
258 |
- |
|
259 |
-<<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE>>= |
|
260 |
-par(las=1) |
|
261 |
-p <- 1 ##indicates plate |
|
262 |
-J <- grep(unique(plates)[p], sns) ##sample indices for this plate |
|
263 |
-ylim <- c(6.5,13) |
|
264 |
-I <- which(phiA > 10 & phiB > 10) |
|
265 |
-i <- I[1] |
|
266 |
-plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B") |
|
267 |
-points(log2(A[i, J]), log2(B[i, J]), col="black", pch=as.character(calls[i, J])) |
|
268 |
-for(CT in 2:3){ |
|
269 |
- if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple" |
|
270 |
- for(CA in 0:CT){ |
|
271 |
- CB <- CT-CA |
|
272 |
- A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
273 |
- B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
274 |
- scale <- c(A.scale, B.scale) |
|
275 |
- if(CA == 0 & CB > 0) rho <- corrA.BB[i, p] |
|
276 |
- if(CA > 0 & CB == 0) rho <- corrB.AA[i, p] |
|
277 |
- if(CA > 0 & CB > 0) rho <- corr[i, p] |
|
278 |
- lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
279 |
- scale=scale), col=ellipse.col, lwd=2) |
|
280 |
- } |
|
281 |
-} |
|
282 |
-legend("topright", lwd=3, col=c("black", "purple"), legend=c("2 copies", "3 copies"), bty="n") |
|
283 |
-@ |
|
284 |
- |
|
285 |
-\begin{figure} |
|
286 |
- \includegraphics[width=0.9\textwidth]{copynumber-predictionRegion} |
|
287 |
- \caption{Prediction regions for copy number 2 and 3 for one SNP. |
|
288 |
- The plate effects are negligible for this SNP and we only plot the |
|
289 |
- prediction region for the first plate.} |
|
290 |
-\end{figure} |
|
291 |
- |
|
292 |
-Here is one way to identify a SNP with a large plate effect -- look |
|
293 |
-for shifts in the prediction regions (here I'm lucking for large |
|
294 |
-shifts in the A direction). |
|
295 |
- |
|
296 |
-<<shifts>>= |
|
297 |
-shiftA <- nuA+2*phiA |
|
298 |
-maxA <- apply(shiftA, 1, max, na.rm=T) |
|
299 |
-minA <- apply(shiftA, 1, min, na.rm=T) |
|
300 |
-d <- maxA - minA |
|
301 |
-hist(d) |
|
302 |
-index <- which(d > 3000)[2] |
|
303 |
-plate1 <- which(shiftA[index, ] == maxA[index]) |
|
304 |
-plate2 <- which(shiftA[index, ] == minA[index]) |
|
305 |
-@ |
|
306 |
- |
|
307 |
-Now plot the predictions for the two plates. All the ellipses are |
|
308 |
-prediction regions for copy number 2. Note that while I looked for |
|
309 |
-shifts in the A direction, the shift often occurs in both the B and A |
|
310 |
-dimensions. |
|
311 |
- |
|
312 |
-<<plateEffect, fig=TRUE, width=8, height=8, include=FALSE>>= |
|
313 |
-par(las=1) |
|
314 |
-i <- index |
|
315 |
-J1 <- grep(unique(plates)[plate1], sns) ##sample indices for this plate |
|
316 |
-J2 <- grep(unique(plates)[plate2], sns) ##sample indices for this plate |
|
317 |
-ylim <- c(6.5,13) |
|
318 |
-plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B") |
|
319 |
-points(log2(A[i, J1]), log2(B[i, J1]), col="brown", pch=as.character(calls[i, J1])) |
|
320 |
-points(log2(A[i, J2]), log2(B[i, J2]), col="blue", pch=as.character(calls[i, J2])) |
|
321 |
-p <- plate1 |
|
322 |
-CT <- 2 |
|
323 |
-ellipse.col <- "brown" |
|
324 |
-for(CA in 0:CT){ |
|
325 |
- CB <- CT-CA |
|
326 |
- A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
327 |
- B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
328 |
- scale <- c(A.scale, B.scale) |
|
329 |
- lines(ellipse(x=corr[i, p]*(CA>0)*(CB>0), centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
330 |
- scale=scale), col=ellipse.col, lwd=2) |
|
331 |
-} |
|
332 |
-ellipse.col <- "blue" |
|
333 |
-p <- plate2 |
|
334 |
-for(CA in 0:CT){ |
|
335 |
- CB <- CT-CA |
|
336 |
- A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
337 |
- B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
338 |
- scale <- c(A.scale, B.scale) |
|
339 |
- lines(ellipse(x=corr[i, p]*(CA>0)*(CB>0), centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
340 |
- scale=scale), col=ellipse.col, lwd=2) |
|
341 |
-} |
|
342 |
-legend("topright", lwd=3, col=c("brown", "blue"), legend=c("2 copies, plate 1", "2 copies, plate 2 "), bty="n") |
|
343 |
-@ |
|
344 |
- |
|
345 |
-\begin{figure} |
|
346 |
- \includegraphics[width=0.9\textwidth]{copynumber-plateEffect} |
|
347 |
- \caption{The prediction regions for copy number 2 shift when there |
|
348 |
- is a large plate effect.} |
|
349 |
-\end{figure} |
|
350 |
- |
|
351 |
-Alternatively, loop through a large number of SNPs to see how these |
|
352 |
-regions move |
|
353 |
-<<codeForLoop, eval=FALSE>>= |
|
354 |
-par(las=1, pty="s", ask=TRUE) |
|
355 |
-for(i in I[1:50]){ |
|
356 |
- plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col=col[DS], cex=0.9, ylim=ylim, xlim=ylim) |
|
357 |
- for(CT in 2:3){ |
|
358 |
- if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple" |
|
359 |
- for(CA in 0:CT){ |
|
360 |
- CB <- CT-CA |
|
361 |
- A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
362 |
- B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
363 |
- scale <- c(A.scale, B.scale) |
|
364 |
- lines(ellipse(x=corr[i, p]*(CA>0)*(CB>0), centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
365 |
- scale=scale), col=ellipse.col, lwd=2) |
|
366 |
- } |
|
367 |
- } |
|
368 |
- legend("topright", lwd=3, col=col, legend=c("3 copies", "2 copies"), bty="n") |
|
369 |
-} |
|
370 |
-@ |
|
371 |
- |
|
372 |
-\section{Session information} |
|
373 |
-<<>>= |
|
374 |
-sessionInfo() |
|
375 |
-@ |
|
376 |
- |
|
377 |
- |
|
378 | 11 |
\end{document} |
379 | 12 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,10 @@ |
1 |
+%\VignetteIndexEntry{crlmm Vignette - Downstream Analysis} |
|
2 |
+%\VignetteKeywords{genotype, crlmm, SNP 5, SNP 6} |
|
3 |
+%\VignettePackage{crlmm} |
|
4 |
+ |
|
5 |
+%%% The real .Rnw is under inst/scripts. |
|
6 |
+%%% It cannot be added here because it depends on external |
|
7 |
+%%% data not yet available as a BioC package |
|
8 |
+\documentclass[12pt]{article} |
|
9 |
+\begin{document} |
|
10 |
+\end{document} |
2 | 13 |
similarity index 98% |
3 | 14 |
rename from inst/doc/crlmm.Rnw |
4 | 15 |
rename to inst/doc/genotyping.Rnw |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-%\VignetteIndexEntry{crlmm Vignette} |
|
1 |
+%\VignetteIndexEntry{crlmm Vignette - Genotyping} |
|
2 | 2 |
%\VignetteDepends{crlmm, hapmapsnp5, genomewidesnp5Crlmm} |
3 | 3 |
%\VignetteKeywords{genotype, crlmm, SNP 5, SNP 6} |
4 | 4 |
%\VignettePackage{crlmm} |
8 | 8 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,378 @@ |
1 |
+%\VignetteIndexEntry{crlmm Vignette - Copy Number} |
|
2 |
+%\VignetteDepends{crlmm, genomewidesnp6Crlmm} |
|
3 |
+%\VignetteKeywords{crlmm, SNP 6} |
|
4 |
+%\VignettePackage{crlmm} |
|
5 |
+\documentclass{article} |
|
6 |
+\usepackage{graphicx} |
|
7 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
8 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
9 |
+\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
10 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
11 |
+\newcommand{\Rpackage}[1]{{\textsf{#1}}} |
|
12 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
13 |
+\newcommand{\oligo}{\Rpackage{oligo }} |
|
14 |
+\newcommand{\R}{\textsf{R}} |
|
15 |
+ |
|
16 |
+\begin{document} |
|
17 |
+\title{Estimating copy number for Affymetrix 6.0 with the crlmm Package} |
|
18 |
+\date{February, 2009} |
|
19 |
+\author{Rob Scharpf} |
|
20 |
+\maketitle |
|
21 |
+ |
|
22 |
+<<setup, echo=FALSE, results=hide>>= |
|
23 |
+options(width=60) |
|
24 |
+options(continue=" ") |
|
25 |
+options(prompt="R> ") |
|
26 |
+@ |
|
27 |
+ |
|
28 |
+\section{Estimating copy number} |
|
29 |
+ |
|
30 |
+General requirements: in addition to the \R{} packages indicated |
|
31 |
+below, processing a large number of samples requires a high-end |
|
32 |
+computer with a large amount of RAM. Currently, the maximum number of |
|
33 |
+samples that can be genotyped at once is approximately 2000 and this |
|
34 |
+requires roughly 32G of RAM. |
|
35 |
+ |
|
36 |
+\paragraph{Suggested work flow.} I typically submit code for |
|
37 |
+preprocessing and genotyping as a batch job to a computing cluster. |
|
38 |
+When the preprocessing is complete, I pick a chromosome and work |
|
39 |
+interactively from the cluster to calculate copy number and make some |
|
40 |
+of the suggested diagnostic plots. |
|
41 |
+ |
|
42 |
+\subsection{Preprocess and genotype the samples} |
|
43 |
+ |
|
44 |
+First, load the required libraries. |
|
45 |
+ |
|
46 |
+<<requiredPackages>>= |
|
47 |
+library(crlmm) |
|
48 |
+library(genomewidesnp6Crlmm) |
|
49 |
+library(ellipse) |
|
50 |
+@ |
|
51 |
+ |
|
52 |
+Specify an output directory and provide the complete path for the CEL |
|
53 |
+file names. |
|
54 |
+ |
|
55 |
+<<>>= |
|
56 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/GRU-EA" |
|
57 |
+datadir <- "/thumper/ctsa/snpmicroarray/GAIN/Bipolar/GRU-EA" |
|
58 |
+fns <- list.files(datadir, pattern=".CEL", full.names=TRUE) |
|
59 |
+@ |
|
60 |
+ |
|
61 |
+Genotype the samples with CRLMM. Submit this job to the cluster and save the |
|
62 |
+output to \Robject{outdir}. |
|
63 |
+ |
|
64 |
+<<genotype, eval=FALSE>>= |
|
65 |
+res2 <- crlmm(filenames=fns, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda")) |
|
66 |
+save(res2, file=file.path(outdir, "res2.rda")) |
|
67 |
+@ |
|
68 |
+ |
|
69 |
+Quantile normalize the nonpolymorphic probes and save the output: |
|
70 |
+ |
|
71 |
+<<quantileNormalizeCnProbes, eval=FALSE>>= |
|
72 |
+res3 <- cnrma(fns) |
|
73 |
+save(res3, file=file.path(outdir, "res3.rda")) |
|
74 |
+@ |
|
75 |
+ |
|
76 |
+\subsection{Copy number} |
|
77 |
+ |
|
78 |
+Copy number can be assessed one chromosome at a time. Here we specify |
|
79 |
+chromosome 15 and and load a list of indices used to subset the files |
|
80 |
+saved in the previous section. The first element in the list |
|
81 |
+correspond to indices of polymorphic probes on chromosome 15; the |
|
82 |
+second element corresponds to indices of nonpolymorphic probes on |
|
83 |
+chromosome 15. |
|
84 |
+ |
|
85 |
+<<chromosomeIndex>>= |
|
86 |
+CHR <- 15 |
|
87 |
+CHR_INDEX <- paste(CHR, "index", sep="") |
|
88 |
+data(list=CHR_INDEX, package="genomewidesnp6Crlmm") |
|
89 |
+str(index) |
|
90 |
+@ |
|
91 |
+ |
|
92 |
+Next we load 3 files that were saved from the preprocessing step and |
|
93 |
+then subset these lists using the above indices to extract the |
|
94 |
+preprocessed intensities and genotypes needed for estimating copy |
|
95 |
+number. Specifically, we require 6 items: |
|
96 |
+ |
|
97 |
+\begin{itemize} |
|
98 |
+ \item quantile-normalized A intensities (I1 x J) |
|
99 |
+ \item quantile-normalized B intensities (I1 x J) |
|
100 |
+ \item quantile-normalized intensities from nonpolymorphic (NP) probes (I2 x J) |
|
101 |
+ \item genotype calls (I1 x J) |
|
102 |
+ \item confidence scores of the genotype calls (I1 x J) |
|
103 |
+ \item signal to noise ratio (SNR) of the samples (J) |
|
104 |
+ \end{itemize} |
|
105 |
+ |
|
106 |
+ These items are extracted as follows: |
|
107 |
+ |
|
108 |
+<<eval=FALSE>>= |
|
109 |
+load(file.path(outdir, "intensities.rda")) |
|
110 |
+load(file.path(outdir, "res2.rda")) |
|
111 |
+load(file.path(outdir, "res3.rda")) |
|
112 |
+@ |
|
113 |
+ |
|
114 |
+Depending on the number of samples, the above objects can be large and |
|
115 |
+take a while to load. For the figures in this vignette, I am loading |
|
116 |
+only 5000 SNP-level summaries from chromosome 22 (28MB) and the |
|
117 |
+quantile-normalized intensities for 5000 nonpolymorphic probes on |
|
118 |
+chromosome 22 (8 MB total). |
|
119 |
+ |
|
120 |
+<<loadTmpFiles, eval=FALSE, echo=FALSE>>= |
|
121 |
+load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmp.rda") |
|
122 |
+assign("res", tmp, envir=.GlobalEnv) |
|
123 |
+rm(tmp); gc() |
|
124 |
+load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmpcn.rda") |
|
125 |
+A <- res$A |
|
126 |
+B <- res$B |
|
127 |
+calls <- res$calls |
|
128 |
+conf <- res$conf |
|
129 |
+SNR <- res$SNR |
|
130 |
+NP <- res3$NP |
|
131 |
+@ |
|
132 |
+ |
|
133 |
+<<snpAndCnSummaries, eval=FALSE>>= |
|
134 |
+A <- res$A |
|
135 |
+B <- res$B |
|
136 |
+calls <- res2$calls |
|
137 |
+conf <- res2$conf |
|
138 |
+SNR <- res2$SNR |
|
139 |
+NP <- res3$NP |
|
140 |
+rm(res, res2, res3) |
|
141 |
+gc() |
|
142 |
+@ |
|
143 |
+ |
|
144 |
+Make a histogram of the signal to noise ratio for these samples: |
|
145 |
+ |
|
146 |
+<<plotSnr, fig=TRUE>>= |
|
147 |
+hist(SNR, xlab="SNR", main="") |
|
148 |
+@ |
|
149 |
+ |
|
150 |
+We suggest excluding samples with a signal to noise ratio less than 5. |
|
151 |
+We then extract the probes (rows) that are on chromosome 15 and the |
|
152 |
+samples (columns) that have a suitable signal to noise ratio. |
|
153 |
+ |
|
154 |
+<<subsetCrlmmOutput, eval=FALSE>>= |
|
155 |
+A <- A[index[[1]], SNR > 5] |
|
156 |
+B <- B[index[[1]], SNR > 5] |
|
157 |
+calls <- calls[index[[1]], SNR > 5] |
|
158 |
+conf <- conf[index[[1]], SNR > 5] |
|
159 |
+NP <- NP[index[[2]], SNR > 5] |
|
160 |
+rm(res, res2, res3); gc() |
|
161 |
+@ |
|
162 |
+ |
|
163 |
+We want to adjust for batch effects. In our experience, the chemistry |
|
164 |
+plate is a good surrogate for batch, although this should be assessed |
|
165 |
+on a study by study basis. For samples processed by Broad, the plate |
|
166 |
+is indicated by a capitalized five-letter word and is part of the |
|
167 |
+sample name. It is important that the plate (batch) variable is |
|
168 |
+ordered the same as filenames in the preprocessed data. For instance, |
|
169 |
+to indicate plate in this example one can use the command |
|
170 |
+ |
|
171 |
+<<specifyBatch>>= |
|
172 |
+sns <- colnames(calls) |
|
173 |
+sns[1] |
|
174 |
+plates <- sapply(sns, function(x) strsplit(x, "_")[[1]][2]) |
|
175 |
+table(plates) |
|
176 |
+@ |
|
177 |
+ |
|
178 |
+We are now ready to estimate the copy number for each batch. In the |
|
179 |
+current version of this package, one specifies an environment to which |
|
180 |
+intermediate \R{} objects for copy number estimation are |
|
181 |
+stored. Allele-specific estimates of copy number are also stored in |
|
182 |
+this environment. |
|
183 |
+ |
|
184 |
+<<copyNumberByBatch, eval=FALSE>>= |
|
185 |
+e1 <- new.env() |
|
186 |
+computeCnBatch(A=A, |
|
187 |
+ B=B, |
|
188 |
+ calls=calls, |
|
189 |
+ conf=conf, |
|
190 |
+ NP=NP, |
|
191 |
+ plate=plates, |
|
192 |
+ envir=e1, chrom=CHR, DF.PRIOR=75) |
|
193 |
+@ |
|
194 |
+ |
|
195 |
+The DF.PRIOR indicates how much we will shrink SNP-specific estimates |
|
196 |
+of the variance and correlation. |
|
197 |
+ |
|
198 |
+<<accessingEstimates>>= |
|
199 |
+copyA <- get("CA", e1) |
|
200 |
+copyB <- get("CB", e1) |
|
201 |
+copyT <- (copyA+copyB)/100 |
|
202 |
+copyT[copyT < 0] <- 0 |
|
203 |
+copyT[copyT > 6] <- 6 |
|
204 |
+@ |
|
205 |
+ |
|
206 |
+\section{Suggested plots} |
|
207 |
+ |
|
208 |
+\paragraph{One sample at a time} |
|
209 |
+ |
|
210 |
+<<annotation>>= |
|
211 |
+data(snpProbes, package="genomewidesnp6Crlmm") |
|
212 |
+data(cnProbes, package="genomewidesnp6Crlmm") |
|
213 |
+position <- snpProbes[match(rownames(calls), rownames(snpProbes)), "position"] |
|
214 |
+@ |
|
215 |
+ |
|
216 |
+To plot physical position versus copy number for the first sample: |
|
217 |
+ |
|
218 |
+<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>= |
|
219 |
+require(SNPchip) |
|
220 |
+par(las=1) |
|
221 |
+plotCytoband(as.character(CHR), ylim=c(0,7), cytoband.ycoords=c(6.5,7), label.cytoband=FALSE, main="Chr 15") |
|
222 |
+points(position, copyT[, 1], pch=".", cex=2, , xaxt="n", col="grey70") |
|
223 |
+axis(1, at=pretty(range(position)), labels=pretty(range(position))/1e6) |
|
224 |
+axis(2, at=0:5, labels=0:5) |
|
225 |
+mtext("position (Mb)", 1, line=2) |
|
226 |
+mtext(expression(C[A] + C[B]), 2, line=2, las=3) |
|
227 |
+@ |
|
228 |
+ |
|
229 |
+\begin{figure} |
|
230 |
+ \includegraphics[width=0.9\textwidth]{copynumber-oneSample} |
|
231 |
+ \caption{Total copy number (y-axis) for chromosome 22 plotted |
|
232 |
+ against physical position (x-axis) for one sample. } |
|
233 |
+\end{figure} |
|
234 |
+ |
|
235 |
+\paragraph{One SNP at a time} |
|
236 |
+ |
|
237 |
+<<intermediateFiles>>= |
|
238 |
+tau2A <- get("tau2A", e1) |
|
239 |
+tau2B <- get("tau2B", e1) |
|
240 |
+sig2A <- get("sig2A", e1) |
|
241 |
+sig2B <- get("sig2B", e1) |
|
242 |
+nuA <- get("nuA", e1) |
|
243 |
+phiA <- get("phiA", e1) |
|
244 |
+nuB <- get("nuB", e1) |
|
245 |
+phiB <- get("phiB", e1) |
|
246 |
+corr <- get("corr", e1) |
|
247 |
+corrA.BB <- get("corrA.BB", e1) |
|
248 |
+corrB.AA <- get("corrA.BB", e1) |
|
249 |
+A <- get("A", e1) |
|
250 |
+B <- get("B", e1) |
|
251 |
+@ |
|
252 |
+ |
|
253 |
+Here, we plot the prediction regions for total copy number 2 and 3 for |
|
254 |
+the first plate. Black plotting symbols are estimates from the first |
|
255 |
+plate; light grey are points from other plates. (You could also draw |
|
256 |
+prediction regions for 0-4 copies, but it gets crowded). Notice that |
|
257 |
+there is little evidence of a plate effect for this SNP. |
|
258 |
+ |
|
259 |
+<<predictionRegion, fig=TRUE, width=8, height=8, include=FALSE>>= |
|
260 |
+par(las=1) |
|
261 |
+p <- 1 ##indicates plate |
|
262 |
+J <- grep(unique(plates)[p], sns) ##sample indices for this plate |
|
263 |
+ylim <- c(6.5,13) |
|
264 |
+I <- which(phiA > 10 & phiB > 10) |
|
265 |
+i <- I[1] |
|
266 |
+plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B") |
|
267 |
+points(log2(A[i, J]), log2(B[i, J]), col="black", pch=as.character(calls[i, J])) |
|
268 |
+for(CT in 2:3){ |
|
269 |
+ if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple" |
|
270 |
+ for(CA in 0:CT){ |
|
271 |
+ CB <- CT-CA |
|
272 |
+ A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
273 |
+ B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
274 |
+ scale <- c(A.scale, B.scale) |
|
275 |
+ if(CA == 0 & CB > 0) rho <- corrA.BB[i, p] |
|
276 |
+ if(CA > 0 & CB == 0) rho <- corrB.AA[i, p] |
|
277 |
+ if(CA > 0 & CB > 0) rho <- corr[i, p] |
|
278 |
+ lines(ellipse(x=rho, centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
279 |
+ scale=scale), col=ellipse.col, lwd=2) |
|
280 |
+ } |
|
281 |
+} |
|
282 |
+legend("topright", lwd=3, col=c("black", "purple"), legend=c("2 copies", "3 copies"), bty="n") |
|
283 |
+@ |
|
284 |
+ |
|
285 |
+\begin{figure} |
|
286 |
+ \includegraphics[width=0.9\textwidth]{copynumber-predictionRegion} |
|
287 |
+ \caption{Prediction regions for copy number 2 and 3 for one SNP. |
|
288 |
+ The plate effects are negligible for this SNP and we only plot the |
|
289 |
+ prediction region for the first plate.} |
|
290 |
+\end{figure} |
|
291 |
+ |
|
292 |
+Here is one way to identify a SNP with a large plate effect -- look |
|
293 |
+for shifts in the prediction regions (here I'm lucking for large |
|
294 |
+shifts in the A direction). |
|
295 |
+ |
|
296 |
+<<shifts>>= |
|
297 |
+shiftA <- nuA+2*phiA |
|
298 |
+maxA <- apply(shiftA, 1, max, na.rm=T) |
|
299 |
+minA <- apply(shiftA, 1, min, na.rm=T) |
|
300 |
+d <- maxA - minA |
|
301 |
+hist(d) |
|
302 |
+index <- which(d > 3000)[2] |
|
303 |
+plate1 <- which(shiftA[index, ] == maxA[index]) |
|
304 |
+plate2 <- which(shiftA[index, ] == minA[index]) |
|
305 |
+@ |
|
306 |
+ |
|
307 |
+Now plot the predictions for the two plates. All the ellipses are |
|
308 |
+prediction regions for copy number 2. Note that while I looked for |
|
309 |
+shifts in the A direction, the shift often occurs in both the B and A |
|
310 |
+dimensions. |
|
311 |
+ |
|
312 |
+<<plateEffect, fig=TRUE, width=8, height=8, include=FALSE>>= |
|
313 |
+par(las=1) |
|
314 |
+i <- index |
|
315 |
+J1 <- grep(unique(plates)[plate1], sns) ##sample indices for this plate |
|
316 |
+J2 <- grep(unique(plates)[plate2], sns) ##sample indices for this plate |
|
317 |
+ylim <- c(6.5,13) |
|
318 |
+plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col="grey60", cex=0.9, ylim=ylim, xlim=ylim, xlab="A", ylab="B") |
|
319 |
+points(log2(A[i, J1]), log2(B[i, J1]), col="brown", pch=as.character(calls[i, J1])) |
|
320 |
+points(log2(A[i, J2]), log2(B[i, J2]), col="blue", pch=as.character(calls[i, J2])) |
|
321 |
+p <- plate1 |
|
322 |
+CT <- 2 |
|
323 |
+ellipse.col <- "brown" |
|
324 |
+for(CA in 0:CT){ |
|
325 |
+ CB <- CT-CA |
|
326 |
+ A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
327 |
+ B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
328 |
+ scale <- c(A.scale, B.scale) |
|
329 |
+ lines(ellipse(x=corr[i, p]*(CA>0)*(CB>0), centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
330 |
+ scale=scale), col=ellipse.col, lwd=2) |
|
331 |
+} |
|
332 |
+ellipse.col <- "blue" |
|
333 |
+p <- plate2 |
|
334 |
+for(CA in 0:CT){ |
|
335 |
+ CB <- CT-CA |
|
336 |
+ A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
337 |
+ B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
338 |
+ scale <- c(A.scale, B.scale) |
|
339 |
+ lines(ellipse(x=corr[i, p]*(CA>0)*(CB>0), centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
340 |
+ scale=scale), col=ellipse.col, lwd=2) |
|
341 |
+} |
|
342 |
+legend("topright", lwd=3, col=c("brown", "blue"), legend=c("2 copies, plate 1", "2 copies, plate 2 "), bty="n") |
|
343 |
+@ |
|
344 |
+ |
|
345 |
+\begin{figure} |
|
346 |
+ \includegraphics[width=0.9\textwidth]{copynumber-plateEffect} |
|
347 |
+ \caption{The prediction regions for copy number 2 shift when there |
|
348 |
+ is a large plate effect.} |
|
349 |
+\end{figure} |
|
350 |
+ |
|
351 |
+Alternatively, loop through a large number of SNPs to see how these |
|
352 |
+regions move |
|
353 |
+<<codeForLoop, eval=FALSE>>= |
|
354 |
+par(las=1, pty="s", ask=TRUE) |
|
355 |
+for(i in I[1:50]){ |
|
356 |
+ plot(log2(A[i, ]), log2(B[i, ]), pch=as.character(calls[i, ]), col=col[DS], cex=0.9, ylim=ylim, xlim=ylim) |
|
357 |
+ for(CT in 2:3){ |
|
358 |
+ if(CT == 2) ellipse.col <- "brown" else ellipse.col <- "purple" |
|
359 |
+ for(CA in 0:CT){ |
|
360 |
+ CB <- CT-CA |
|
361 |
+ A.scale <- sqrt(tau2A[i, p]*(CA==0) + sig2A[i, p]*(CA > 0)) |
|
362 |
+ B.scale <- sqrt(tau2B[i, p]*(CB==0) + sig2B[i, p]*(CB > 0)) |
|
363 |
+ scale <- c(A.scale, B.scale) |
|
364 |
+ lines(ellipse(x=corr[i, p]*(CA>0)*(CB>0), centre=c(log2(nuA[i, p]+CA*phiA[i, p]), log2(nuB[i, p]+CB*phiB[i, p])), |
|
365 |
+ scale=scale), col=ellipse.col, lwd=2) |
|
366 |
+ } |
|
367 |
+ } |
|
368 |
+ legend("topright", lwd=3, col=col, legend=c("3 copies", "2 copies"), bty="n") |
|
369 |
+} |
|
370 |
+@ |
|
371 |
+ |
|
372 |
+\section{Session information} |
|
373 |
+<<>>= |
|
374 |
+sessionInfo() |
|
375 |
+@ |
|
376 |
+ |
|
377 |
+ |
|
378 |
+\end{document} |
2 | 381 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,129 @@ |
1 |
+%\VignetteIndexEntry{crlmm Vignette - Downstream Analysis} |
|
2 |
+%\VignetteKeywords{genotype, crlmm, SNP 5, SNP 6} |
|
3 |
+%\VignettePackage{crlmm} |
|
4 |
+ |
|
5 |
+\documentclass[12pt]{article} |
|
6 |
+ |
|
7 |
+\usepackage{amsmath,pstricks} |
|
8 |
+\usepackage[authoryear,round]{natbib} |
|
9 |
+\usepackage{hyperref} |
|
10 |
+ |
|
11 |
+ |
|
12 |
+\textwidth=6.2in |
|
13 |
+\textheight=8.5in |
|
14 |
+%\parskip=.3cm |
|
15 |
+\oddsidemargin=.1in |
|
16 |
+\evensidemargin=.1in |
|
17 |
+\headheight=-.3in |
|
18 |
+ |
|
19 |
+\newcommand{\scscst}{\scriptscriptstyle} |
|
20 |
+\newcommand{\scst}{\scriptstyle} |
|
21 |
+ |
|
22 |
+ |
|
23 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
24 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
25 |
+\newcommand{\Rpackage}[1]{{\textit{#1}}} |
|
26 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
27 |
+\newcommand{\Rfunarg}[1]{{\texttt{#1}}} |
|
28 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
29 |
+ |
|
30 |
+\textwidth=6.2in |
|
31 |
+ |
|
32 |
+\bibliographystyle{plainnat} |
|
33 |
+ |
|
34 |
+\begin{document} |
|
35 |
+%\setkeys{Gin}{width=0.55\textwidth} |
|
36 |
+ |
|
37 |
+\title{crlmm to downstream data analysis} |
|
38 |
+\author{VJ Carey} |
|
39 |
+\maketitle |
|
40 |
+ |
|
41 |
+\section{Running CRLMM on a nontrivial set of CEL files} |
|
42 |
+ |
|
43 |
+We work with the 90 CEU samples hybridized to Affy 6.0 chips, which |
|
44 |
+are assumed to be in the current directory. First, we identify the |
|
45 |
+files and run \Rmethod{crlmm}. The results will be saved to the |
|
46 |
+variable \Robject{crlmmResult}. |
|
47 |
+<<lkd>>= |
|
48 |
+library(crlmm) |
|
49 |
+celFiles <- list.celfiles() |
|
50 |
+celFiles[1:4] |
|
51 |
+if (!exists("crlmmResult")) { |
|
52 |
+ if (file.exists("crlmmResult.rda")) load("crlmmResult.rda") |
|
53 |
+ else { |
|
54 |
+ crlmmResult <- crlmm(celFiles) |
|
55 |
+ save(crlmmResult, file="crlmmResult.rda") |
|
56 |
+ } |
|
57 |
+} |
|
58 |
+@ |
|
59 |
+ |
|
60 |
+This is currently a list. |
|
61 |
+<<lkj21>>= |
|
62 |
+ class(crlmmResult) |
|
63 |
+ sapply(crlmmResult, dim) |
|
64 |
+ sapply(crlmmResult, length) |
|
65 |
+@ |
|
66 |
+ |
|
67 |
+\section{Constructing an eSet extension} |
|
68 |
+ |
|
69 |
+We will use the \Rpackage{GGdata} package to obtain extra information |
|
70 |
+on the samples. This will be later used when building an \Rclass{eSet} |
|
71 |
+extension to store the genotyping results. |
|
72 |
+<<getpd>>= |
|
73 |
+ library(GGdata) |
|
74 |
+ if (!exists("hmceuB36")) data(hmceuB36) |
|
75 |
+ pd <- phenoData(hmceuB36) |
|
76 |
+ ggn <- sampleNames(pd) |
|
77 |
+ preSN <- colnames(crlmmResult[["calls"]]) |
|
78 |
+ simpSN <- gsub("_.*", "", preSN) |
|
79 |
+ if (!all.equal(simpSN, ggn)) stop("align GGdata phenoData with crlmmResult read") |
|
80 |
+@ |
|
81 |
+ |
|
82 |
+The list obtained as output of the \Rmethod{crlmm} method can be |
|
83 |
+easily coerced to an eSet extension with the help of the helper |
|
84 |
+function \Rfunction{list2crlmmSet}. |
|
85 |
+<<docl>>= |
|
86 |
+ colnames(crlmmResult[["calls"]]) <- colnames(crlmmResult[["confs"]]) <- simpSN |
|
87 |
+ crlmmResultSet <- list2crlmmSet(crlmmResult) |
|
88 |
+ phenoData(crlmmResultSet) <- combine(pd, phenoData(crlmmResultSet)) |
|
89 |
+ crlmmResultSet |
|
90 |
+ dim(calls(crlmmResultSet)) |
|
91 |
+ dim(confs(crlmmResultSet)) |
|
92 |
+ calls(crlmmResultSet)[1:10, 1:2] |
|
93 |
+ confs(crlmmResultSet)[1:10, 1:2] |
|
94 |
+@ |
|
95 |
+ |
|
96 |
+\section{Coercing to snp.matrix as a prelude to a GWAS} |
|
97 |
+ |
|
98 |
+<<lksnm>>= |
|
99 |
+library(snpMatrix) |
|
100 |
+crlmmSM <- as(t(calls(crlmmResultSet))-1, "snp.matrix") |
|
101 |
+crlmmSM |
|
102 |
+@ |
|
103 |
+ |
|
104 |
+\section{Conducting a GWAS} |
|
105 |
+ |
|
106 |
+We want to find SNP for which rare allele count is predictive of expression of CPNE1. |
|
107 |
+We will use expression data available from GGdata. This is a very naive analysis. |
|
108 |
+<<doa>>= |
|
109 |
+library(illuminaHumanv1.db) |
|
110 |
+rmm <- revmap(illuminaHumanv1SYMBOL) |
|
111 |
+mypr <- get("CPNE1", rmm) |
|
112 |
+ex <- as.numeric(exprs(hmceuB36)[mypr[1],]) |
|
113 |
+subjdata <- pData(hmceuB36) |
|
114 |
+subjdata[["ex"]] <- ex |
|
115 |
+gwas <- snp.rhs.tests(ex~male, data=subjdata, snp.data=crlmmSM, family="gaussian") |
|
116 |
+ok <- which(p.value(gwas) < 1e-10) |
|
117 |
+gwas[ok,] |
|
118 |
+<<dopl,fig=TRUE>>= |
|
119 |
+plot(ex~calls(crlmmResultSet)["SNP_A-4208858",], |
|
120 |
+ xlab="Genotype Call for SNP_A-4208858", |
|
121 |
+ ylab="Expression", xaxt="n") |
|
122 |
+axis(1, at=1:3, labels=c("AA", "AB", "BB")) |
|
123 |
+@ |
|
124 |
+ |
|
125 |
+<<lksess>>= |
|
126 |
+sessionInfo() |
|
127 |
+@ |
|
128 |
+ |
|
129 |
+\end{document} |
2 | 132 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,58 @@ |
1 |
+\name{crlmmSet-class} |
|
2 |
+\Rdversion{1.1} |
|
3 |
+\docType{class} |
|
4 |
+\alias{crlmmSet-class} |
|
5 |
+\alias{calls} |
|
6 |
+\alias{calls,crlmmSet-method} |
|
7 |
+\alias{confs} |
|
8 |
+\alias{confs,crlmmSet-method} |
|
9 |
+\alias{list2crlmmSet} |
|
10 |
+\alias{list2crlmmSet,list-method} |
|
11 |
+ |
|
12 |
+\title{Class "crlmmSet"} |
|
13 |
+\description{An eSet extension to store genotype results.} |
|
14 |
+\section{Objects from the Class}{ |
|
15 |
+ Objects can be created by calls of the form \code{new("crlmmSet", |
|
16 |
+ assayData, phenoData, featureData, experimentData, annotation, |
|
17 |
+ ...)}. |
|
18 |
+} |
|
19 |
+\section{Slots}{ |
|
20 |
+ \describe{ |
|
21 |
+ \item{\code{assayData}:}{Object of class \code{"AssayData"}. For an |
|
22 |
+ object to be valid, the assayData slot must have two matrices |
|
23 |
+ named 'calls' and 'confs'.} |
|
24 |
+ \item{\code{phenoData}:}{Object of class \code{"AnnotatedDataFrame"}} |
|
25 |
+ \item{\code{featureData}:}{Object of class \code{"AnnotatedDataFrame"}} |
|
26 |
+ \item{\code{experimentData}:}{Object of class \code{"MIAME"}} |
|
27 |
+ \item{\code{annotation}:}{Object of class \code{"character"}} |
|
28 |
+ \item{\code{.__classVersion__}:}{Object of class \code{"Versions"}} |
|
29 |
+ } |
|
30 |
+} |
|
31 |
+\section{Extends}{ |
|
32 |
+Class \code{"\link[Biobase]{eSet-class}"}, directly. |
|
33 |
+Class \code{"\link[Biobase]{VersionedBiobase-class}"}, by class "eSet", distance 2. |
|
34 |
+Class \code{"\link[Biobase]{Versioned-class}"}, by class "eSet", distance 3. |
|
35 |
+} |
|
36 |
+\section{Methods}{ |
|
37 |
+ \describe{ |
|
38 |
+ \item{calls}{\code{signature(x = "crlmmSet")}: extractor for |
|
39 |
+ genotype calls} |
|
40 |
+ \item{confs}{\code{signature(x = "crlmmSet")}: extractor for confidences} |
|
41 |
+ } |
|
42 |
+} |
|
43 |
+\examples{ |
|
44 |
+if (require(genomewidesnp5Crlmm) & require(hapmapsnp5)){ |
|
45 |
+ path <- system.file("celFiles", package="hapmapsnp5") |
|
46 |
+ |
|
47 |
+ ## the filenames with full path... |
|
48 |
+ ## very useful when genotyping samples not in the working directory |
|
49 |
+ cels <- list.celfiles(path, full.names=TRUE) |
|
50 |
+ crlmmOutput <- crlmm(cels) |
|
51 |
+ crlmmSetObject <- list2crlmmSet(crlmmOutput) |
|
52 |
+ crlmmSetObject |
|
53 |
+ calls(crlmmSetObject)[1:10, 1:2] |
|
54 |
+ confs(crlmmSetObject)[1:10, 1:2] |
|
55 |
+} |
|
56 |
+} |
|
57 |
+\keyword{classes} |
|
58 |
+\keyword{methods} |