git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@37686 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -12,17 +12,6 @@ rowMAD <- function(x, y, ...){ |
12 | 12 |
return(mad) |
13 | 13 |
} |
14 | 14 |
|
15 |
-getCnvFid <- function(chromosome, pkg="pd.genomewidesnp.6", max.size=1000){ |
|
16 |
- ##requires the pd.mapping package |
|
17 |
- ##best to drop this |
|
18 |
- require(pkg, character.only=TRUE) |
|
19 |
- conn <- db(get(pkg)) |
|
20 |
- sql <- paste("SELECT featureSetCNV.chrom_start, featureSetCNV.man_fsetid, featureSetCNV.fsetid, fid FROM featureSetCNV, pmfeatureCNV WHERE ", |
|
21 |
- "featureSetCNV.fsetid=pmfeatureCNV.fsetid AND featureSetCNV.chrom IN ('", chromosome, "')", sep="") |
|
22 |
- tmp <- dbGetQuery(conn, sql) |
|
23 |
- tmp |
|
24 |
-} |
|
25 |
- |
|
26 | 15 |
rowCors <- function(x, y, ...){ |
27 | 16 |
N <- rowSums(!is.na(x)) |
28 | 17 |
x <- suppressWarnings(log2(x)) |
... | ... |
@@ -121,16 +110,9 @@ cnrma <- function(filenames, sns, cdfName, seed=1){ |
121 | 110 |
} |
122 | 111 |
|
123 | 112 |
##I'm using the pd.mapping package |
124 |
- map <- list(); j <- 1 |
|
125 |
- for(chrom in c(1:22, "X")){ |
|
126 |
- map[[j]] <- getCnvFid(chrom) |
|
127 |
- j <- j+1 |
|
128 |
- } |
|
129 |
- map <- lapply(map, function(x) x[order(x[, "chrom_start"]), ]) |
|
130 |
- map <- do.call("rbind", map) |
|
131 |
- |
|
132 |
- |
|
133 |
- fid <- map[, "fid"] |
|
113 |
+ data(npProbesFid, package=pkgname) |
|
114 |
+ fid <- npProbesFid |
|
115 |
+ gc() |
|
134 | 116 |
set.seed(seed) |
135 | 117 |
idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
136 | 118 |
SKW <- vector("numeric", length(filenames)) |
... | ... |
@@ -141,7 +123,7 @@ cnrma <- function(filenames, sns, cdfName, seed=1){ |
141 | 123 |
if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
142 | 124 |
} |
143 | 125 |
##load reference distribution obtained from hapmap |
144 |
- load("/thumper/ctsa/snpmicroarray/hapmap/processed/affy/1m_reference_cn.rda") |
|
126 |
+ data(list="1m_reference_cn", pkgname) |
|
145 | 127 |
for(i in seq(along=filenames)){ |
146 | 128 |
y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
147 | 129 |
x <- log2(y[idx2]) |
... | ... |
@@ -155,7 +137,8 @@ cnrma <- function(filenames, sns, cdfName, seed=1){ |
155 | 137 |
if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
156 | 138 |
else cat(".") |
157 | 139 |
} |
158 |
- dimnames(NP) <- list(map[, "man_fsetid"], sns) |
|
140 |
+ dimnames(NP) <- list(names(fid), sns) |
|
141 |
+ ##dimnames(NP) <- list(map[, "man_fsetid"], sns) |
|
159 | 142 |
res3 <- list(NP=NP, SKW=SKW) |
160 | 143 |
return(res3) |
161 | 144 |
} |
... | ... |
@@ -255,17 +238,16 @@ instantiateObjects <- function(calls, NP, plate, envir, chrom){ |
255 | 238 |
assign("CB", CB, envir=envir) |
256 | 239 |
} |
257 | 240 |
|
258 |
-computeCnBatch <- function(A, |
|
259 |
- B, |
|
260 |
- calls, |
|
261 |
- conf, |
|
262 |
- NP, |
|
263 |
- plate, |
|
264 |
- fit.variance=NULL, |
|
265 |
- seed=123, |
|
266 |
- MIN.OBS=3, |
|
267 |
- envir, |
|
268 |
- chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim=0, upperTail=TRUE, ...){ |
|
241 |
+computeCopynumber <- function(A, |
|
242 |
+ B, |
|
243 |
+ calls, |
|
244 |
+ conf, |
|
245 |
+ NP, |
|
246 |
+ plate, |
|
247 |
+ fit.variance=NULL, |
|
248 |
+ MIN.OBS=3, |
|
249 |
+ envir, |
|
250 |
+ chrom, P, DF.PRIOR=50, CONF.THR=0.99, trim=0, upperTail=TRUE, ...){ |
|
269 | 251 |
if(length(ls(envir)) == 0) instantiateObjects(calls, NP, plate, envir, chrom) |
270 | 252 |
##require(genefilter) |
271 | 253 |
require(splines) |
... | ... |
@@ -297,7 +279,7 @@ computeCnBatch <- function(A, |
297 | 279 |
message("\nAllele specific copy number") |
298 | 280 |
for(p in P){ |
299 | 281 |
cat(".") |
300 |
- copynumber(plateIndex=p, |
|
282 |
+ polymorphic(plateIndex=p, |
|
301 | 283 |
A=A[, plate==uplate[p]], |
302 | 284 |
B=B[, plate==uplate[p]], |
303 | 285 |
envir=envir) |
... | ... |
@@ -872,7 +854,7 @@ coefs <- function(plateIndex, conf, MIN.OBS=3, envir, CONF.THR=0.99){ |
872 | 854 |
assign("phiB.se", phiB.se, envir) |
873 | 855 |
} |
874 | 856 |
|
875 |
-copynumber <- function(plateIndex, A, B, envir){ |
|
857 |
+polymorphic <- function(plateIndex, A, B, envir){ |
|
876 | 858 |
p <- plateIndex |
877 | 859 |
plates.completed <- get("plates.completed", envir) |
878 | 860 |
if(!plates.completed[p]) return() |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-%\VignetteIndexEntry{crlmm Vignette - Copy Number} |
|
1 |
+%\VignetteIndexEntry{crlmm copy number Vignette} |
|
2 | 2 |
%\VignetteDepends{crlmm, genomewidesnp6Crlmm} |
3 | 3 |
%\VignetteKeywords{crlmm, SNP 6} |
4 | 4 |
%\VignettePackage{crlmm} |
... | ... |
@@ -27,21 +27,14 @@ options(prompt="R> ") |
27 | 27 |
|
28 | 28 |
\section{Estimating copy number} |
29 | 29 |
|
30 |
-General requirements: in addition to the \R{} packages indicated |
|
31 |
-below, processing a large number of samples requires a high-end |
|
32 |
-computer with a large amount of RAM. Currently, the maximum number of |
|
33 |
-samples that can be genotyped at once is approximately 2000 and this |
|
34 |
-requires roughly 32G of RAM. |
|
35 |
- |
|
36 |
-\paragraph{Suggested work flow.} I typically submit code for |
|
37 |
-preprocessing and genotyping as a batch job to a computing cluster. |
|
38 |
-When the preprocessing is complete, I pick a chromosome and work |
|
39 |
-interactively from the cluster to calculate copy number and make some |
|
40 |
-of the suggested diagnostic plots. |
|
30 |
+At present, software for copy number estimation is provided only for |
|
31 |
+the Affymetrix 6.0 platform. This vignette estimates copy number for |
|
32 |
+the HapMap samples. |
|
41 | 33 |
|
42 | 34 |
\subsection{Preprocess and genotype the samples} |
43 | 35 |
|
44 |
-First, load the required libraries. |
|
36 |
+See the crlmm vignette for additional details on |
|
37 |
+preprocessing/genotyping. |
|
45 | 38 |
|
46 | 39 |
<<requiredPackages>>= |
47 | 40 |
library(crlmm) |
... | ... |
@@ -49,41 +42,49 @@ library(genomewidesnp6Crlmm) |
49 | 42 |
library(ellipse) |
50 | 43 |
@ |
51 | 44 |
|
52 |
-Specify an output directory and provide the complete path for the CEL |
|
53 |
-file names. |
|
54 |
- |
|
55 |
-<<>>= |
|
56 |
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/GRU-EA" |
|
57 |
-datadir <- "/thumper/ctsa/snpmicroarray/GAIN/Bipolar/GRU-EA" |
|
58 |
-fns <- list.files(datadir, pattern=".CEL", full.names=TRUE) |
|
45 |
+<<celfiles>>= |
|
46 |
+celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL") |
|
47 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/affy" |
|
59 | 48 |
@ |
60 | 49 |
|
61 |
-Genotype the samples with CRLMM. Submit this job to the cluster and save the |
|
62 |
-output to \Robject{outdir}. |
|
50 |
+Specify an output directory for storing the quantile-normalized |
|
51 |
+intensities from crlmm. |
|
63 | 52 |
|
64 |
-<<genotype, eval=FALSE>>= |
|
65 |
-res2 <- crlmm(filenames=fns, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda")) |
|
66 |
-save(res2, file=file.path(outdir, "res2.rda")) |
|
53 |
+<<genotype>>= |
|
54 |
+if (!exists("crlmmResult")) { |
|
55 |
+ if (file.exists(file.path(outdir, "crlmmResult.rda"))){ |
|
56 |
+ load(file.path(outdir, "intensities.rda")) |
|
57 |
+ load(file.path(outdir, "crlmmResult.rda")) |
|
58 |
+ } |
|
59 |
+ else { |
|
60 |
+ crlmmResult <- crlmm(celFiles, save.it=TRUE, intensityFile=file.path(outdir, "intensities.rda")) |
|
61 |
+ save(crlmmResult, file=file.path(outdir, "crlmmResult.rda")) |
|
62 |
+ } |
|
63 |
+} |
|
67 | 64 |
@ |
68 | 65 |
|
69 |
-Quantile normalize the nonpolymorphic probes and save the output: |
|
66 |
+Quantile normalize the nonpolymorphic probes and save the output. |
|
70 | 67 |
|
71 |
-<<quantileNormalizeCnProbes, eval=FALSE>>= |
|
72 |
-res3 <- cnrma(fns) |
|
73 |
-save(res3, file=file.path(outdir, "res3.rda")) |
|
68 |
+<<cnrma>>= |
|
69 |
+if(!exists("cnrmaResult")){ |
|
70 |
+ if(file.exists(file.path(outdir, "cnrmaResult.rda"))) load(file.path(outdir, "cnrmaResult.rda")) |
|
71 |
+ else { |
|
72 |
+ cnrmaResult <- cnrma(celFiles) |
|
73 |
+ save(cnrmaResult, file=file.path(outdir, "cnrmaResult.rda")) |
|
74 |
+ } |
|
75 |
+} |
|
74 | 76 |
@ |
75 | 77 |
|
76 | 78 |
\subsection{Copy number} |
77 | 79 |
|
78 | 80 |
Copy number can be assessed one chromosome at a time. Here we specify |
79 |
-chromosome 15 and and load a list of indices used to subset the files |
|
80 |
-saved in the previous section. The first element in the list |
|
81 |
-correspond to indices of polymorphic probes on chromosome 15; the |
|
82 |
-second element corresponds to indices of nonpolymorphic probes on |
|
83 |
-chromosome 15. |
|
81 |
+chromosome 15 and and load a list of indices to subset the data. The |
|
82 |
+first element in the list correspond to indices of polymorphic probes |
|
83 |
+on chromosome 15; the second element corresponds to indices of |
|
84 |
+nonpolymorphic probes on chromosome 15. |
|
84 | 85 |
|
85 | 86 |
<<chromosomeIndex>>= |
86 |
-CHR <- 15 |
|
87 |
+CHR <- 21 |
|
87 | 88 |
CHR_INDEX <- paste(CHR, "index", sep="") |
88 | 89 |
data(list=CHR_INDEX, package="genomewidesnp6Crlmm") |
89 | 90 |
str(index) |
... | ... |
@@ -105,39 +106,14 @@ number. Specifically, we require 6 items: |
105 | 106 |
|
106 | 107 |
These items are extracted as follows: |
107 | 108 |
|
108 |
-<<eval=FALSE>>= |
|
109 |
-load(file.path(outdir, "intensities.rda")) |
|
110 |
-load(file.path(outdir, "res2.rda")) |
|
111 |
-load(file.path(outdir, "res3.rda")) |
|
112 |
-@ |
|
113 |
- |
|
114 |
-Depending on the number of samples, the above objects can be large and |
|
115 |
-take a while to load. For the figures in this vignette, I am loading |
|
116 |
-only 5000 SNP-level summaries from chromosome 22 (28MB) and the |
|
117 |
-quantile-normalized intensities for 5000 nonpolymorphic probes on |
|
118 |
-chromosome 22 (8 MB total). |
|
119 |
- |
|
120 |
-<<loadTmpFiles, eval=FALSE, echo=FALSE>>= |
|
121 |
-load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmp.rda") |
|
122 |
-assign("res", tmp, envir=.GlobalEnv) |
|
123 |
-rm(tmp); gc() |
|
124 |
-load("/thumper/ctsa/snpmicroarray/rs/data/gain/bipolar/tmpcn.rda") |
|
125 |
-A <- res$A |
|
126 |
-B <- res$B |
|
127 |
-calls <- res$calls |
|
128 |
-conf <- res$conf |
|
129 |
-SNR <- res$SNR |
|
130 |
-NP <- res3$NP |
|
131 |
-@ |
|
132 |
- |
|
133 | 109 |
<<snpAndCnSummaries, eval=FALSE>>= |
134 | 110 |
A <- res$A |
135 | 111 |
B <- res$B |
136 |
-calls <- res2$calls |
|
137 |
-conf <- res2$conf |
|
138 |
-SNR <- res2$SNR |
|
139 |
-NP <- res3$NP |
|
140 |
-rm(res, res2, res3) |
|
112 |
+calls <- crlmmResult$calls |
|
113 |
+conf <- crlmmResult$conf |
|
114 |
+SNR <- crlmmResult$SNR |
|
115 |
+NP <- cnrmaResult$NP |
|
116 |
+rm(res, crlmmResult, cnrmaResult) |
|
141 | 117 |
gc() |
142 | 118 |
@ |
143 | 119 |
|
... | ... |
@@ -157,25 +133,22 @@ B <- B[index[[1]], SNR > 5] |
157 | 133 |
calls <- calls[index[[1]], SNR > 5] |
158 | 134 |
conf <- conf[index[[1]], SNR > 5] |
159 | 135 |
NP <- NP[index[[2]], SNR > 5] |
160 |
-rm(res, res2, res3); gc() |
|
161 | 136 |
@ |
162 | 137 |
|
163 |
-We want to adjust for batch effects. In our experience, the chemistry |
|
164 |
-plate is a good surrogate for batch, although this should be assessed |
|
165 |
-on a study by study basis. For samples processed by Broad, the plate |
|
166 |
-is indicated by a capitalized five-letter word and is part of the |
|
167 |
-sample name. It is important that the plate (batch) variable is |
|
168 |
-ordered the same as filenames in the preprocessed data. For instance, |
|
169 |
-to indicate plate in this example one can use the command |
|
138 |
+Batch effects can be very large in the quantile-normalized |
|
139 |
+intensities. Often the chemistry plate is a good surrogate for batch |
|
140 |
+effects, although it can also be the lab, etc. Here we define batch |
|
141 |
+by the chemistry plate. For the HapMap data, plate is often confounded |
|
142 |
+with ancestry. |
|
170 | 143 |
|
171 | 144 |
<<specifyBatch>>= |
172 | 145 |
sns <- colnames(calls) |
173 | 146 |
sns[1] |
174 |
-plates <- sapply(sns, function(x) strsplit(x, "_")[[1]][2]) |
|
175 |
-table(plates) |
|
147 |
+plate <- substr(basename(sns), 13, 13) |
|
148 |
+table(plate) |
|
176 | 149 |
@ |
177 | 150 |
|
178 |
-We are now ready to estimate the copy number for each batch. In the |
|
151 |
+We are now ready to estimate copy number for each batch. In the |
|
179 | 152 |
current version of this package, one specifies an environment to which |
180 | 153 |
intermediate \R{} objects for copy number estimation are |
181 | 154 |
stored. Allele-specific estimates of copy number are also stored in |
... | ... |
@@ -183,13 +156,13 @@ this environment. |
183 | 156 |
|
184 | 157 |
<<copyNumberByBatch, eval=FALSE>>= |
185 | 158 |
e1 <- new.env() |
186 |
-computeCnBatch(A=A, |
|
187 |
- B=B, |
|
188 |
- calls=calls, |
|
189 |
- conf=conf, |
|
190 |
- NP=NP, |
|
191 |
- plate=plates, |
|
192 |
- envir=e1, chrom=CHR, DF.PRIOR=75) |
|
159 |
+computeCopynumber(A=A, |
|
160 |
+ B=B, |
|
161 |
+ calls=calls, |
|
162 |
+ conf=conf, |
|
163 |
+ NP=NP, |
|
164 |
+ plate=plates, |
|
165 |
+ envir=e1, chrom=CHR, DF.PRIOR=75) |
|
193 | 166 |
@ |
194 | 167 |
|
195 | 168 |
The DF.PRIOR indicates how much we will shrink SNP-specific estimates |
... | ... |
@@ -290,8 +263,8 @@ legend("topright", lwd=3, col=c("black", "purple"), legend=c("2 copies", "3 copi |
290 | 263 |
\end{figure} |
291 | 264 |
|
292 | 265 |
Here is one way to identify a SNP with a large plate effect -- look |
293 |
-for shifts in the prediction regions (here I'm lucking for large |
|
294 |
-shifts in the A direction). |
|
266 |
+for shifts in the prediction regions (here I'm looking for large |
|
267 |
+shifts in the A direction). |
|
295 | 268 |
|
296 | 269 |
<<shifts>>= |
297 | 270 |
shiftA <- nuA+2*phiA |