git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@47121 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -519,3 +519,6 @@ function (which expects ff objects and supports parallel processing) |
519 | 519 |
2010-04-24 B Carvalho committed version 1.7.1 |
520 | 520 |
** fixed bug in gender prediction that cause a spike in memory usage |
521 | 521 |
|
522 |
+2010-05-25 R. Scharpf committed version 1.7.2 |
|
523 |
+** added example dataset |
|
524 |
+** fixed vignette: inst/scripts/copynumber.Rnw |
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.7.1 |
|
5 |
-Date: 2010-04-16 |
|
4 |
+Version: 1.7.2 |
|
5 |
+Date: 2010-05-25 |
|
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au> |
7 | 7 |
Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms |
... | ... |
@@ -23,7 +23,8 @@ Imports: affyio (>= 1.15.2), |
23 | 23 |
utils |
24 | 24 |
Suggests: hapmapsnp6, |
25 | 25 |
genomewidesnp6Crlmm (>= 1.0.2), |
26 |
- snpMatrix |
|
26 |
+ snpMatrix, |
|
27 |
+ ellipse |
|
27 | 28 |
Collate: AllGenerics.R |
28 | 29 |
AllClasses.R |
29 | 30 |
methods-CNSet.R |
... | ... |
@@ -33,6 +34,7 @@ Collate: AllGenerics.R |
33 | 34 |
crlmm-functions.R |
34 | 35 |
crlmm-illumina.R |
35 | 36 |
snprma-functions.R |
37 |
+ plot-methods.R |
|
36 | 38 |
utils.R |
37 | 39 |
zzz.R |
38 | 40 |
LazyLoad: yes |
... | ... |
@@ -8,3 +8,16 @@ setMethod("initialize", "CNSetLM", function(.Object, lM=new("list"), ...){ |
8 | 8 |
.Object@lM <- lM |
9 | 9 |
.Object <- callNextMethod(.Object, ...) |
10 | 10 |
}) |
11 |
+ |
|
12 |
+setValidity("CNSetLM", |
|
13 |
+ function(object){ |
|
14 |
+ if(!"batch" %in% varLabels(protocolData(object))) |
|
15 |
+ return("'batch' not defined in protocolData") |
|
16 |
+ if(!"chromosome" %in% fvarLabels(object)) |
|
17 |
+ return("'chromosome' not defined in featureData") |
|
18 |
+ if(!"position" %in% fvarLabels(object)) |
|
19 |
+ return("'position' not defined in featureData") |
|
20 |
+ if(!"isSnp" %in% fvarLabels(object)) |
|
21 |
+ return("'isSnp' not defined in featureData") |
|
22 |
+ return(TRUE) |
|
23 |
+ }) |
... | ... |
@@ -24,7 +24,6 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
24 | 24 |
gns <- getVarInEnv("gns") |
25 | 25 |
path <- system.file("extdata", package=paste(cdfName, "Crlmm", sep="")) |
26 | 26 |
load(file.path(path, "snpProbes.rda")) |
27 |
- |
|
28 | 27 |
if(copynumber){ |
29 | 28 |
load(file.path(path, "cnProbes.rda")) |
30 | 29 |
cnProbes <- get("cnProbes") |
... | ... |
@@ -40,7 +39,6 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
40 | 39 |
M[index, "isSnp"] <- 1L |
41 | 40 |
index <- which(is.na(M[, "isSnp"])) |
42 | 41 |
M[index, "isSnp"] <- 1L |
43 |
- |
|
44 | 42 |
if(copynumber){ |
45 | 43 |
index <- match(rownames(cnProbes), rownames(M)) #only snp probes in M get assigned position |
46 | 44 |
M[index, "position"] <- cnProbes[, grep("pos", colnames(cnProbes))] |
... | ... |
@@ -54,29 +52,42 @@ getFeatureData.Affy <- function(cdfName, copynumber=FALSE){ |
54 | 52 |
##crlmmOpts$snpRange <- range(snpIndex) |
55 | 53 |
##crlmmOpts$npRange <- range(npIndex) |
56 | 54 |
} |
55 |
+ |
|
57 | 56 |
construct <- function(filenames, cdfName, copynumber=FALSE, |
58 |
- sns, verbose=TRUE){ |
|
59 |
- if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability") |
|
57 |
+ sns, verbose=TRUE, batch){ |
|
58 |
+ if(verbose) message("Initializing container for assay data elements alleleA, alleleB, call, callProbability, CA, CB") |
|
60 | 59 |
if(missing(sns)) sns <- basename(filenames) |
61 | 60 |
protocolData <- getProtocolData.Affy(filenames) |
61 |
+ if(missing(batch)){ |
|
62 |
+ protocolData$batch <- as.numeric(as.factor(protocolData$ScanDate)) |
|
63 |
+ } else { |
|
64 |
+ if(length(batch) != length(filenames)) |
|
65 |
+ stop("batch variable must be the same length as the filenames") |
|
66 |
+ protocolData$batch <- batch |
|
67 |
+ } |
|
68 |
+ rownames(pData(protocolData)) <- NULL |
|
62 | 69 |
featureData <- getFeatureData.Affy(cdfName, copynumber=copynumber) |
63 | 70 |
nr <- nrow(featureData); nc <- length(filenames) |
64 |
- callSet <- new("SnpSuperSet", |
|
71 |
+ callSet <- new("CNSetLM", |
|
65 | 72 |
alleleA=initializeBigMatrix(name="A", nr, nc), |
66 | 73 |
alleleB=initializeBigMatrix(name="B", nr, nc), |
67 | 74 |
call=initializeBigMatrix(name="call", nr, nc), |
68 | 75 |
callProbability=initializeBigMatrix(name="callPr", nr,nc), |
76 |
+ CA=initializeBigMatrix(name="CA", nr, nc), |
|
77 |
+ CB=initializeBigMatrix(name="CB", nr, nc), |
|
78 |
+ protocolData=protocolData, |
|
69 | 79 |
featureData=featureData, |
70 | 80 |
annotation=cdfName) |
71 | 81 |
pd <- data.frame(matrix(NA, nc, 3), row.names=sns) |
72 | 82 |
colnames(pd)=c("SKW", "SNR", "gender") |
73 | 83 |
phenoData(callSet) <- new("AnnotatedDataFrame", data=pd) |
74 |
- protocolData(callSet) <- protocolData |
|
75 |
- sampleNames(callSet) <- sns |
|
84 |
+ rownames(pData(callSet)) <- NULL |
|
85 |
+ phenoData(callSet)$sampleNames <- sns |
|
76 | 86 |
return(callSet) |
77 | 87 |
} |
78 | 88 |
|
79 | 89 |
|
90 |
+ |
|
80 | 91 |
genotype <- function(filenames, |
81 | 92 |
cdfName, |
82 | 93 |
batch, |
... | ... |
@@ -97,27 +108,20 @@ genotype <- function(filenames, |
97 | 108 |
if(missing(cdfName)) stop("must specify cdfName") |
98 | 109 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
99 | 110 |
if(missing(sns)) sns <- basename(filenames) |
100 |
- if(missing(batch)){ |
|
101 |
- warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.") |
|
102 |
- } else { |
|
103 |
- if(length(batch) != length(filenames)) |
|
104 |
- stop("batch variable must be the same length as the filenames") |
|
105 |
- } |
|
106 | 111 |
## callSet contains potentially very big matrices |
107 | 112 |
## More big matrices are created within snprma, that will then be removed. |
108 | 113 |
callSet <- construct(filenames=filenames, |
109 | 114 |
cdfName=cdfName, |
110 | 115 |
copynumber=copynumber, |
111 | 116 |
sns=sns, |
112 |
- verbose=verbose) |
|
113 |
- if(missing(batch)){ |
|
114 |
- protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate)) |
|
115 |
- } |
|
117 |
+ verbose=verbose, |
|
118 |
+ batch=batch) |
|
116 | 119 |
mixtureParams <- matrix(NA, 4, length(filenames)) |
117 | 120 |
snp.index <- which(isSnp(callSet)==1) |
118 | 121 |
batches <- splitIndicesByLength(1:ncol(callSet), ocSamples()) |
119 | 122 |
iter <- 1 |
120 |
- for(j in batches){ |
|
123 |
+## for(j in batches){ |
|
124 |
+ j <- unlist(batches) |
|
121 | 125 |
if(verbose) message("Batch ", iter, " of ", length(batches)) |
122 | 126 |
snprmaRes <- snprma(filenames=filenames[j], |
123 | 127 |
mixtureSampleSize=mixtureSampleSize, |
... | ... |
@@ -167,7 +171,7 @@ genotype <- function(filenames, |
167 | 171 |
snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]] |
168 | 172 |
callSet$gender[j] <- tmp$gender |
169 | 173 |
iter <- iter+1 |
170 |
- } |
|
174 |
+## } |
|
171 | 175 |
return(callSet) |
172 | 176 |
} |
173 | 177 |
|
... | ... |
@@ -208,23 +212,15 @@ genotype2 <- function(filenames, |
208 | 212 |
} |
209 | 213 |
if(missing(cdfName)) stop("must specify cdfName") |
210 | 214 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
211 |
- if(missing(batch)){ |
|
212 |
- warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.") |
|
213 |
- } else { |
|
214 |
- if(length(batch) != length(filenames)) |
|
215 |
- stop("batch variable must be the same length as the filenames") |
|
216 |
- } |
|
217 | 215 |
if(missing(sns)) sns <- basename(filenames) |
218 | 216 |
## callSet contains potentially very big matrices |
219 | 217 |
callSet <- construct(filenames=filenames, |
220 | 218 |
cdfName=cdfName, |
221 | 219 |
copynumber=TRUE, |
222 | 220 |
sns=sns, |
223 |
- verbose=verbose) |
|
224 |
- if(missing(batch)){ |
|
225 |
- protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate)) |
|
226 |
- } |
|
227 |
- if(!missing(batch)) protocolData(callSet)$batch <- batch |
|
221 |
+ verbose=verbose, |
|
222 |
+ batch=batch) |
|
223 |
+ ##lM(callSet) <- initializeParamObject(list(featureNames(callSet), unique(protocolData(callSet)$batch))) |
|
228 | 224 |
mixtureParams <- matrix(NA, 4, length(filenames)) |
229 | 225 |
snp.index <- which(isSnp(callSet)==1) |
230 | 226 |
snprmaRes <- snprma2(filenames=filenames, |
... | ... |
@@ -242,16 +238,23 @@ genotype2 <- function(filenames, |
242 | 238 |
open(snprmaRes[["SKW"]]) |
243 | 239 |
open(snprmaRes[["mixtureParams"]]) |
244 | 240 |
if(verbose) message("Updating elements of callSet") |
245 |
- bb = ocProbesets()*ncol(A)*8 |
|
246 |
- ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb) |
|
247 |
- ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb) |
|
241 |
+## bb = ocProbesets()*ncol(A)*8 |
|
242 |
+## ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb) |
|
243 |
+## ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb) |
|
244 |
+ ##batches <- splitIndicesByLength(1:nrow(snprmaRes[["A"]]), ocProbesets()) |
|
245 |
+ AA <- A(callSet) |
|
246 |
+ BB <- B(callSet) |
|
247 |
+ for(j in 1:ncol(callSet)){ |
|
248 |
+ AA[snp.index, j] <- snprmaRes[["A"]][, j] |
|
249 |
+ BB[snp.index, j] <- snprmaRes[["B"]][, j] |
|
250 |
+ } |
|
248 | 251 |
if(verbose) message("Finished updating elements of callSet") |
249 | 252 |
stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns)) |
250 | 253 |
pData(callSet)$SKW <- snprmaRes$SKW |
251 | 254 |
pData(callSet)$SNR <- snprmaRes$SNR |
252 | 255 |
mixtureParams <- snprmaRes$mixtureParams |
253 | 256 |
np.index <- which(isSnp(callSet) == 0) |
254 |
- cnrmaRes <- cnrma2(A=A(callSet), |
|
257 |
+ cnrmaRes <- cnrma2(A=AA, |
|
255 | 258 |
filenames=filenames, |
256 | 259 |
row.names=featureNames(callSet)[np.index], |
257 | 260 |
cdfName=cdfName, |
... | ... |
@@ -261,32 +264,169 @@ genotype2 <- function(filenames, |
261 | 264 |
rm(cnrmaRes); gc() |
262 | 265 |
## as.matrix needed when ffdf is used |
263 | 266 |
tmp <- crlmmGT2(A=snprmaRes[["A"]], |
264 |
- B=snprmaRes[["B"]], |
|
265 |
- SNR=snprmaRes[["SNR"]], |
|
266 |
- mixtureParams=snprmaRes[["mixtureParams"]], |
|
267 |
- cdfName=cdfName, |
|
268 |
- row.names=NULL, ##featureNames(callSet),##[snp.index], |
|
269 |
- col.names=sampleNames(callSet), |
|
270 |
- probs=probs, |
|
271 |
- DF=DF, |
|
272 |
- SNRMin=SNRMin, |
|
273 |
- recallMin=recallMin, |
|
274 |
- recallRegMin=recallRegMin, |
|
275 |
- gender=gender, |
|
276 |
- verbose=verbose, |
|
277 |
- returnParams=returnParams, |
|
278 |
- badSNP=badSNP) |
|
267 |
+ B=snprmaRes[["B"]], |
|
268 |
+ SNR=snprmaRes[["SNR"]], |
|
269 |
+ mixtureParams=snprmaRes[["mixtureParams"]], |
|
270 |
+ cdfName=cdfName, |
|
271 |
+ row.names=NULL, ##featureNames(callSet),##[snp.index], |
|
272 |
+ col.names=sampleNames(callSet), |
|
273 |
+ probs=probs, |
|
274 |
+ DF=DF, |
|
275 |
+ SNRMin=SNRMin, |
|
276 |
+ recallMin=recallMin, |
|
277 |
+ recallRegMin=recallRegMin, |
|
278 |
+ gender=gender, |
|
279 |
+ verbose=verbose, |
|
280 |
+ returnParams=returnParams, |
|
281 |
+ badSNP=badSNP) |
|
282 |
+ if(verbose) message("Leaving crlmmGT2") |
|
279 | 283 |
open(tmp[["calls"]]) |
280 | 284 |
open(tmp[["confs"]]) |
281 |
- bb = ocProbesets()*ncol(A)*8 |
|
282 |
- ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb) |
|
283 |
- ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb) |
|
285 |
+ ##bb = ocProbesets()*ncol(A)*8 |
|
286 |
+ GT <- snpCall(callSet) |
|
287 |
+ GTP <- snpCallProbability(callSet) |
|
288 |
+ for(j in 1:ncol(callSet)){ |
|
289 |
+ GT[snp.index, j] <- tmp[["calls"]][, j] |
|
290 |
+ GTP[snp.index, j] <- tmp[["confs"]][, j] |
|
291 |
+ } |
|
292 |
+## ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb) |
|
293 |
+## ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb) |
|
284 | 294 |
callSet$gender <- tmp$gender |
285 |
- cnSet <- as(callSet, "CNSetLM") |
|
286 |
- return(cnSet) |
|
295 |
+ return(callSet) |
|
296 |
+} |
|
297 |
+ |
|
298 |
+ |
|
299 |
+ |
|
300 |
+genotype3 <- function(filenames, |
|
301 |
+ cdfName, |
|
302 |
+ batch, |
|
303 |
+ mixtureSampleSize=10^5, |
|
304 |
+ eps=0.1, |
|
305 |
+ verbose=TRUE, |
|
306 |
+ seed=1, |
|
307 |
+ sns, |
|
308 |
+ copynumber=FALSE, |
|
309 |
+ probs=rep(1/3, 3), |
|
310 |
+ DF=6, |
|
311 |
+ SNRMin=5, |
|
312 |
+ recallMin=10, |
|
313 |
+ recallRegMin=1000, |
|
314 |
+ gender=NULL, |
|
315 |
+ returnParams=TRUE, |
|
316 |
+ badSNP=0.7){ |
|
317 |
+ if(!isPackageLoaded("ff")) stop("Must load package 'ff'") |
|
318 |
+ if(!copynumber){ |
|
319 |
+ callSet <- crlmm2(filenames=filenames, |
|
320 |
+ cdfName=cdfName, |
|
321 |
+ mixtureSampleSize=mixtureSampleSize, |
|
322 |
+ eps=eps, |
|
323 |
+ verbose=verbose, |
|
324 |
+ sns=sns, |
|
325 |
+ probs=probs, |
|
326 |
+ DF=DF, |
|
327 |
+ SNRMin=SNRMin, |
|
328 |
+ recallMin=recallMin, |
|
329 |
+ recallRegMin=recallRegMin, |
|
330 |
+ gender=gender, |
|
331 |
+ returnParams=returnParams, |
|
332 |
+ badSNP=badSNP) |
|
333 |
+ return(callSet) |
|
334 |
+ } |
|
335 |
+ if(missing(cdfName)) stop("must specify cdfName") |
|
336 |
+ if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
|
337 |
+ if(missing(batch)){ |
|
338 |
+ warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.") |
|
339 |
+ } else { |
|
340 |
+ if(length(batch) != length(filenames)) |
|
341 |
+ stop("batch variable must be the same length as the filenames") |
|
342 |
+ } |
|
343 |
+ if(missing(sns)) sns <- basename(filenames) |
|
344 |
+ ## callSet contains potentially very big matrices |
|
345 |
+ callSet <- construct(filenames=filenames, |
|
346 |
+ cdfName=cdfName, |
|
347 |
+ copynumber=TRUE, |
|
348 |
+ sns=sns, |
|
349 |
+ verbose=verbose) |
|
350 |
+ if(missing(batch)){ |
|
351 |
+ protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate)) |
|
352 |
+ } |
|
353 |
+ if(!missing(batch)) protocolData(callSet)$batch <- batch |
|
354 |
+ ##lM(callSet) <- initializeParamObject(list(featureNames(callSet), unique(protocolData(callSet)$batch))) |
|
355 |
+ mixtureParams <- matrix(NA, 4, length(filenames)) |
|
356 |
+ snp.index <- which(isSnp(callSet)==1) |
|
357 |
+## snprmaRes <- snprma2(filenames=filenames, |
|
358 |
+## mixtureSampleSize=mixtureSampleSize, |
|
359 |
+## fitMixture=TRUE, |
|
360 |
+## eps=eps, |
|
361 |
+## verbose=verbose, |
|
362 |
+## seed=seed, |
|
363 |
+## cdfName=cdfName, |
|
364 |
+## sns=sns) |
|
365 |
+## if(verbose) message("Finished preprocessing.") |
|
366 |
+## open(snprmaRes[["A"]]) |
|
367 |
+## open(snprmaRes[["B"]]) |
|
368 |
+## open(snprmaRes[["SNR"]]) |
|
369 |
+## open(snprmaRes[["SKW"]]) |
|
370 |
+## open(snprmaRes[["mixtureParams"]]) |
|
371 |
+## if(verbose) message("Updating elements of callSet") |
|
372 |
+#### bb = ocProbesets()*ncol(A)*8 |
|
373 |
+#### ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb) |
|
374 |
+#### ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb) |
|
375 |
+## ##batches <- splitIndicesByLength(1:nrow(snprmaRes[["A"]]), ocProbesets()) |
|
376 |
+## for(j in 1:ncol(callSet)){ |
|
377 |
+## A(callSet)[snp.index, j] <- snprmaRes[["A"]][, j] |
|
378 |
+## B(callSet)[snp.index, j] <- snprmaRes[["B"]][, j] |
|
379 |
+## } |
|
380 |
+## if(verbose) message("Finished updating elements of callSet") |
|
381 |
+## stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns)) |
|
382 |
+## pData(callSet)$SKW <- snprmaRes$SKW |
|
383 |
+## pData(callSet)$SNR <- snprmaRes$SNR |
|
384 |
+## mixtureParams <- snprmaRes$mixtureParams |
|
385 |
+ np.index <- which(isSnp(callSet) == 0) |
|
386 |
+ cnrmaRes <- cnrma2(A=A(callSet), |
|
387 |
+ filenames=filenames, |
|
388 |
+ row.names=featureNames(callSet)[np.index], |
|
389 |
+ cdfName=cdfName, |
|
390 |
+ sns=sns, |
|
391 |
+ seed=seed, |
|
392 |
+ verbose=verbose) |
|
393 |
+## if(verbose) message("Entering crlmmGT2...") |
|
394 |
+## rm(cnrmaRes); gc() |
|
395 |
+## ## as.matrix needed when ffdf is used |
|
396 |
+## tmp <- crlmmGT2(A=snprmaRes[["A"]], |
|
397 |
+## B=snprmaRes[["B"]], |
|
398 |
+## SNR=snprmaRes[["SNR"]], |
|
399 |
+## mixtureParams=snprmaRes[["mixtureParams"]], |
|
400 |
+## cdfName=cdfName, |
|
401 |
+## row.names=NULL, ##featureNames(callSet),##[snp.index], |
|
402 |
+## col.names=sampleNames(callSet), |
|
403 |
+## probs=probs, |
|
404 |
+## DF=DF, |
|
405 |
+## SNRMin=SNRMin, |
|
406 |
+## recallMin=recallMin, |
|
407 |
+## recallRegMin=recallRegMin, |
|
408 |
+## gender=gender, |
|
409 |
+## verbose=verbose, |
|
410 |
+## returnParams=returnParams, |
|
411 |
+## badSNP=badSNP) |
|
412 |
+## if(verbose) message("Leaving crlmmGT2") |
|
413 |
+## open(tmp[["calls"]]) |
|
414 |
+## open(tmp[["confs"]]) |
|
415 |
+## ##bb = ocProbesets()*ncol(A)*8 |
|
416 |
+## for(j in 1:ncol(callSet)){ |
|
417 |
+## snpCall(callSet)[snp.index, j] <- tmp[["calls"]][, j] |
|
418 |
+## snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]][, j] |
|
419 |
+## } |
|
420 |
+#### ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb) |
|
421 |
+#### ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb) |
|
422 |
+## callSet$gender <- tmp$gender |
|
423 |
+#### cnSet <- as(callSet, "CNSetLM") |
|
424 |
+## return(callSet) |
|
425 |
+ return(cnrmaRes) |
|
287 | 426 |
} |
288 | 427 |
|
289 | 428 |
|
429 |
+ |
|
290 | 430 |
##--------------------------------------------------------------------------- |
291 | 431 |
##--------------------------------------------------------------------------- |
292 | 432 |
## For Illumina |
... | ... |
@@ -758,6 +898,7 @@ predictGender <- function(res, cdfName="genomewidesnp6", SNRMin=5){ |
758 | 898 |
} |
759 | 899 |
|
760 | 900 |
|
901 |
+##replace with release/crlmm/R/cnrma-functions |
|
761 | 902 |
crlmmCopynumber <- function(object, |
762 | 903 |
chromosome=1:23, |
763 | 904 |
which.batches, |
... | ... |
@@ -776,7 +917,8 @@ crlmmCopynumber <- function(object, |
776 | 917 |
MIN.PHI=2^3, |
777 | 918 |
THR.NU.PHI=TRUE, |
778 | 919 |
thresholdCopynumber=TRUE){ |
779 |
- if(isPackageLoaded("ff")){ |
|
920 |
+ ffIsLoaded <- class(calls(object))[[1]] == "ff" |
|
921 |
+ if(ffIsLoaded){ |
|
780 | 922 |
open(object) |
781 | 923 |
open(object$SKW) |
782 | 924 |
open(object$SNR) |
... | ... |
@@ -787,7 +929,7 @@ crlmmCopynumber <- function(object, |
787 | 929 |
stopifnot("isSnp" %in% fvarLabels(object)) |
788 | 930 |
##batch <- object$batch |
789 | 931 |
batch <- batch(object) |
790 |
- if(isPackageLoaded("ff")) { |
|
932 |
+ if(ffIsLoaded) { |
|
791 | 933 |
open(object$SNR) |
792 | 934 |
SNR <- object$SNR[, ] |
793 | 935 |
} else SNR <- object$SNR |
... | ... |
@@ -844,13 +986,19 @@ crlmmCopynumber <- function(object, |
844 | 986 |
fvarLabels(tmp)[13:14] <- paste(c("phiPrimeA", "phiPrimeB"), batchName, sep=".") |
845 | 987 |
fvarLabels(tmp) <- gsub("_", ".", fvarLabels(tmp)) |
846 | 988 |
##fvarLabels(tmp) <- gsub("\\.[1-9]", "", fvarLabels(tmp)) |
847 |
- fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))] |
|
848 |
- jj <- match(fvarLabels(tmp), names(lM(object))) |
|
849 |
- lM(object)[row.index, jj] <- fData(tmp) |
|
850 |
- ##labels.asis <- fvarLabels(tmp) |
|
851 |
- ##labels.asis <- gsub(paste("_", unique(tmp$batch), sep=""), paste(".", ii, sep=""), labels.asis) |
|
852 |
- ##k <- match(labels.asis, colnames(lM(object))) |
|
853 |
- ##lM(object)[row.index, k] <- fData(tmp) |
|
989 |
+ if(ffIsLoaded){ |
|
990 |
+ fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% names(physical(lM(object)))] |
|
991 |
+ jj <- match(fvarLabels(tmp), names(lM(object))) |
|
992 |
+ lM(object)[row.index, jj] <- fData(tmp) |
|
993 |
+ } else { |
|
994 |
+ nms <- paste(names(lM(object)), batchName, sep=".") |
|
995 |
+ fData(tmp) <- fData(tmp)[, fvarLabels(tmp) %in% nms] |
|
996 |
+ for(k in seq_along(fvarLabels(tmp))){ |
|
997 |
+ kk <- match(fvarLabels(tmp)[k], paste(names(lM(object)), batchName, sep=".")) |
|
998 |
+ column <- match(batchName, colnames(lM(object)[[k]])) |
|
999 |
+ lM(object)[[k]][row.index, column] <- fData(tmp)[, k] |
|
1000 |
+ } |
|
1001 |
+ } |
|
854 | 1002 |
rm(tmp); gc() |
855 | 1003 |
ii <- ii+1 |
856 | 1004 |
} |
... | ... |
@@ -879,12 +1027,15 @@ crlmmCopynumber2 <- function(object, |
879 | 1027 |
stopifnot("chromosome" %in% fvarLabels(object)) |
880 | 1028 |
stopifnot("position" %in% fvarLabels(object)) |
881 | 1029 |
stopifnot("isSnp" %in% fvarLabels(object)) |
1030 |
+ lM(object) <- initializeParamObject(list(featureNames(object), unique(protocolData(object)$batch))) |
|
882 | 1031 |
batch <- batch(object) |
883 |
- XIndex.snps <- (1:nrow(object))[chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object))] |
|
1032 |
+ XIndex.snps <- which(chromosome(object) == 23 & isSnp(object) & !is.na(chromosome(object))) |
|
884 | 1033 |
##YIndex.snps <- (1:nrow(object))[chromosome(object) == 24 & isSnp(object)] |
885 |
- XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))] |
|
886 |
- autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))] |
|
887 |
- autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))] |
|
1034 |
+ XIndex.nps <- which(chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))) |
|
1035 |
+ ##autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))] |
|
1036 |
+ autosomeIndex.snps <- which(chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))) |
|
1037 |
+ autosomeIndex.nps <- which(chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))) |
|
1038 |
+ ##autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))] |
|
888 | 1039 |
|
889 | 1040 |
## Do chromosome X in batches |
890 | 1041 |
Ns <- initializeBigMatrix("Ns", nrow(object), 5) |
... | ... |
@@ -1017,8 +1168,8 @@ fit.lm1 <- function(idxBatch, |
1017 | 1168 |
open(snpflags) |
1018 | 1169 |
open(normal) |
1019 | 1170 |
|
1020 |
- corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object)))) |
|
1021 |
- flags <- nuA <- nuB <- phiA <- phiB <- corr |
|
1171 |
+ corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object)))) |
|
1172 |
+ flags <- nuA <- nuB <- phiA <- phiB <- corrAB |
|
1022 | 1173 |
|
1023 | 1174 |
normal.snps <- normal[snps, ] |
1024 | 1175 |
cB <- cA <- matrix(NA, length(snps), ncol(object)) |
... | ... |
@@ -1053,9 +1204,9 @@ fit.lm1 <- function(idxBatch, |
1053 | 1204 |
tau2B[, J] <- shrink(taus[, 3, drop=FALSE], Ns[, 1], DF.PRIOR) |
1054 | 1205 |
sig2B[, J] <- shrink(taus[, 1, drop=FALSE], Ns[, 3], DF.PRIOR) |
1055 | 1206 |
|
1056 |
- corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR) |
|
1057 |
- corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR) |
|
1058 |
- corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR) |
|
1207 |
+ corrAB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR) |
|
1208 |
+ corrAA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR) |
|
1209 |
+ corrBB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR) |
|
1059 | 1210 |
|
1060 | 1211 |
##--------------------------------------------------------------------------- |
1061 | 1212 |
## Impute sufficient statistics for unobserved genotypes (plate-specific) |
... | ... |
@@ -1067,7 +1218,8 @@ fit.lm1 <- function(idxBatch, |
1067 | 1218 |
size <- min(5000, length(index.complete)) |
1068 | 1219 |
if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE) |
1069 | 1220 |
if(length(index.complete) < 200){ |
1070 |
- stop("fewer than 200 snps pass criteria for predicting the sufficient statistics") |
|
1221 |
+ warning("fewer than 200 snps pass criteria for predicting the sufficient statistics") |
|
1222 |
+ return() |
|
1071 | 1223 |
} |
1072 | 1224 |
index <- vector("list", 3) |
1073 | 1225 |
index[[1]] <- which(Ns[, 1] == 0 & (Ns[, 2] >= MIN.OBS & Ns[, 3] >= MIN.OBS)) |
... | ... |
@@ -1105,6 +1257,7 @@ fit.lm1 <- function(idxBatch, |
1105 | 1257 |
flags[, J] <- rowSums(Ns == 0) > 0 |
1106 | 1258 |
##flags[, J] <- index[[1]] | index[[2]] | index[[3]] | rowSums( |
1107 | 1259 |
|
1260 |
+ ## we're regressing on the medians using the standard errors (hence the division by N) as weights |
|
1108 | 1261 |
##formerly coefs() |
1109 | 1262 |
Np <- Ns |
1110 | 1263 |
Np[Np < 1] <- 1 |
... | ... |
@@ -1176,6 +1329,16 @@ fit.lm1 <- function(idxBatch, |
1176 | 1329 |
tmp <- physical(lM(object))$phiB |
1177 | 1330 |
tmp[snps, ] <- phiB |
1178 | 1331 |
lM(object)$phiB <- tmp |
1332 |
+ tmp <- physical(lM(object))$corrAB |
|
1333 |
+ tmp[snps, ] <- corrAB |
|
1334 |
+ lM(object)$corrAB <- tmp |
|
1335 |
+ tmp <- physical(lM(object))$corrAA |
|
1336 |
+ tmp[snps, ] <- corrAA |
|
1337 |
+ lM(object)$corrAA <- tmp |
|
1338 |
+ tmp <- physical(lM(object))$corrBB |
|
1339 |
+ tmp[snps, ] <- corrBB |
|
1340 |
+ lM(object)$corrAB <- tmp |
|
1341 |
+ |
|
1179 | 1342 |
lapply(assayData(object), close) |
1180 | 1343 |
lapply(lM(object), close) |
1181 | 1344 |
TRUE |
... | ... |
@@ -1380,13 +1543,14 @@ fit.lm3 <- function(idxBatch, |
1380 | 1543 |
##--------------------------------------------------------------------------- |
1381 | 1544 |
## Impute sufficient statistics for unobserved genotypes (plate-specific) |
1382 | 1545 |
##--------------------------------------------------------------------------- |
1383 |
- index <- apply(Ns.F, 2, function(x, MIN.OBS) which(x > MIN.OBS), MIN.OBS) |
|
1546 |
+ index <- apply(Ns.F, 2, function(x, MIN.OBS) which(x >= MIN.OBS), MIN.OBS) |
|
1384 | 1547 |
correct.orderA <- muA.F[, 1] > muA.F[, 3] |
1385 | 1548 |
correct.orderB <- muB.F[, 3] > muB.F[, 1] |
1386 | 1549 |
index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]]))) |
1387 | 1550 |
size <- min(5000, length(index.complete)) |
1388 | 1551 |
if(length(index.complete) < 200){ |
1389 |
- stop("fewer than 200 snps pass criteria for predicting the sufficient statistics") |
|
1552 |
+ warning("fewer than 200 snps pass criteria for predicting the sufficient statistics") |
|
1553 |
+ return() |
|
1390 | 1554 |
} |
1391 | 1555 |
if(size==5000) index.complete <- sample(index.complete, size) |
1392 | 1556 |
index <- vector("list", 3) |
... | ... |
@@ -1520,6 +1684,16 @@ fit.lm3 <- function(idxBatch, |
1520 | 1684 |
tmp <- physical(lM(object))$phiPrimeB |
1521 | 1685 |
tmp[snps, ] <- phiB2 |
1522 | 1686 |
lM(object)$phiPrimeB <- tmp |
1687 |
+ |
|
1688 |
+ tmp <- physical(lM(object))$corrAB |
|
1689 |
+ tmp[snps, ] <- corrAB |
|
1690 |
+ lM(object)$corrAB <- tmp |
|
1691 |
+ tmp <- physical(lM(object))$corrAA |
|
1692 |
+ tmp[snps, ] <- corrAA |
|
1693 |
+ lM(object)$corrAA <- tmp |
|
1694 |
+ tmp <- physical(lM(object))$corrBB |
|
1695 |
+ tmp[snps, ] <- corrBB |
|
1696 |
+ lM(object)$corrAB <- tmp |
|
1523 | 1697 |
## lM(object)$tau2A[snps, ] <- tau2A |
1524 | 1698 |
## lM(object)$tau2B[snps, ] <- tau2B |
1525 | 1699 |
## lM(object)$sig2A[snps, ] <- sig2A |
... | ... |
@@ -54,7 +54,6 @@ readIdatFiles <- function(sampleSheet=NULL, |
54 | 54 |
} |
55 | 55 |
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) |
56 | 56 |
} |
57 |
- |
|
58 | 57 |
narrays = length(arrayNames) |
59 | 58 |
grnfiles = paste(arrayNames, fileExt$green, sep=sep) |
60 | 59 |
redfiles = paste(arrayNames, fileExt$red, sep=sep) |
... | ... |
@@ -360,7 +359,6 @@ readIdatFiles2 <- function(sampleSheet=NULL, |
360 | 359 |
|
361 | 360 |
## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008 |
362 | 361 |
readIDAT <- function(idatFile){ |
363 |
- |
|
364 | 362 |
fileSize <- file.info(idatFile)$size |
365 | 363 |
|
366 | 364 |
tempCon <- file(idatFile,"rb") |
... | ... |
@@ -91,18 +91,34 @@ setMethod("computeCopynumber", "CNSet", |
91 | 91 |
} |
92 | 92 |
object |
93 | 93 |
}) |
94 |
- |
|
95 |
- |
|
96 | 94 |
setMethod("copyNumber", "CNSet", function(object){ |
97 | 95 |
I <- isSnp(object) |
96 |
+ ffIsLoaded <- class(calls(object))[[1]]=="ff" |
|
98 | 97 |
CA <- CA(object) |
99 | 98 |
CB <- CB(object) |
99 |
+ if(ffIsLoaded){ |
|
100 |
+ open(CA) |
|
101 |
+ open(CB) |
|
102 |
+ CA <- CA[,] |
|
103 |
+ CB <- CB[,] |
|
104 |
+ } |
|
100 | 105 |
CN <- CA + CB |
101 | 106 |
##For nonpolymorphic probes, CA is the total copy number |
102 | 107 |
CN[!I, ] <- CA(object)[!I, ] |
108 |
+ CN <- CN/100 |
|
103 | 109 |
CN |
104 | 110 |
}) |
105 | 111 |
|
112 |
+##setMethod("copyNumber", "CNSet", function(object){ |
|
113 |
+## I <- isSnp(object) |
|
114 |
+## CA <- CA(object) |
|
115 |
+## CB <- CB(object) |
|
116 |
+## CN <- CA + CB |
|
117 |
+## ##For nonpolymorphic probes, CA is the total copy number |
|
118 |
+## CN[!I, ] <- CA(object)[!I, ] |
|
119 |
+## CN |
|
120 |
+##}) |
|
121 |
+ |
|
106 | 122 |
|
107 | 123 |
setMethod("ellipse", "CNSet", function(x, copynumber, batch, ...){ |
108 | 124 |
ellipse.CNSet(x, copynumber, batch, ...) |
... | ... |
@@ -13,6 +13,7 @@ |
13 | 13 |
\newcommand{\Rclass}[1]{{\textit{#1}}} |
14 | 14 |
\newcommand{\oligo}{\Rpackage{oligo }} |
15 | 15 |
\newcommand{\R}{\textsf{R}} |
16 |
+\newcommand{\crlmm}{\Rpackage{crlmm}} |
|
16 | 17 |
\usepackage[margin=1in]{geometry} |
17 | 18 |
|
18 | 19 |
\begin{document} |
... | ... |
@@ -22,8 +23,7 @@ |
22 | 23 |
\maketitle |
23 | 24 |
|
24 | 25 |
<<setup, echo=FALSE, results=hide>>= |
25 |
-options(continue=" ") |
|
26 |
-options(prompt="R> ") |
|
26 |
+options(continue=" ", width=70) |
|
27 | 27 |
@ |
28 | 28 |
|
29 | 29 |
%\section{Estimating copy number} |
... | ... |
@@ -43,168 +43,304 @@ options(prompt="R> ") |
43 | 43 |
CRLMM supports the following platforms: |
44 | 44 |
|
45 | 45 |
<<cdfname>>= |
46 |
+library(oligoClasses) |
|
46 | 47 |
library(crlmm) |
47 | 48 |
crlmm:::validCdfNames() |
49 |
+cdfName <- "genomewidesnp6" |
|
48 | 50 |
@ |
49 | 51 |
|
50 | 52 |
\paragraph{Preprocess and genotype.} |
51 | 53 |
|
52 |
-Provide the complete path for the filenames: |
|
54 |
+In the following code chunk, we provide the complete path to the |
|
55 |
+Affymetrix CEL files, assign a path to store the normalized |
|
56 |
+intensities and genotype data, and define a 'batch' variable. The |
|
57 |
+batch variable will later be useful when we estimate copy number. We |
|
58 |
+typically use the 96 well chemistry plate or the scan date of the |
|
59 |
+array as a surrogate for batch. Adjusting by date or chemistry plate |
|
60 |
+can be helpful for limiting the influence of batch effects. |
|
53 | 61 |
|
54 | 62 |
<<celfiles>>= |
55 | 63 |
celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL") |
56 |
-## CEPH and Yoruban files |
|
64 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/crlmm" |
|
65 |
+if(!file.exists(outdir)) dir.create(outdir) |
|
66 |
+## CEPH and Yoruban |
|
57 | 67 |
batch <- substr(basename(celFiles), 13, 13) |
58 | 68 |
celFiles <- celFiles[batch %in% c("C", "Y")] |
59 | 69 |
batch <- batch[batch %in% c("C", "Y")] |
70 |
+stopifnot(length(batch) == length(celFiles)) |
|
60 | 71 |
@ |
61 | 72 |
|
62 |
-While genotyping with crlmm can be performed using a small number of |
|
63 |
-samples, copy number estimation requires at least 10 samples -- |
|
64 |
-preferably all of the samples that were processed together on the same |
|
65 |
-plate. Crlmm does not use a reference dataset when estimating model |
|
66 |
-parameters because of large batch effects \citep{Scharpf2009}. The |
|
67 |
-quantile normalization performed as part of the preprocessing of the raw |
|
68 |
-data is insufficient for removing batch effects. Processing a reference |
|
69 |
-dataset, such as HapMap samples, along with the experimental data will |
|
70 |
-not improve copy number estimation for the experimental dataset, and |
|
71 |
-should not be used as a means to increase the sample size. Furthermore, |
|
72 |
-processing a reference dataset without acknowledging that these samples |
|
73 |
-were derived from a different batch can result in incorrect copy number |
|
74 |
-estimates in both the experimental and reference datasets. The |
|
75 |
-appropriate way to acknowledge batch is to supply the batch name for |
|
76 |
-each sample to be processed in the argument to \Robject{cnOptions}: |
|
77 |
- |
|
78 |
-<<celfiles>>= |
|
79 |
-cnOpts <- cnOptions(cdfName="genomewidesnp6", |
|
80 |
- outdir="/thumper/ctsa/beaty/scharpf/crlmmOut/hapmap", |
|
81 |
- batch=batch) |
|
82 |
-str(cnOpts) |
|
83 |
-stopifnot(length(cnOpts$batch) == length(celFiles)) |
|
84 |
-names(cnOpts$batch) <- basename(celFiles) |
|
73 |
+The preprocessing steps for copy number estimation includes quantile |
|
74 |
+normalization of the polymorphic and nonpolymorphic markers and a |
|
75 |
+summarization step whereby the quantile normalized intenities are |
|
76 |
+summarized for each locus. As the markers at polymorphic loci on the |
|
77 |
+Affymetrix 6.0 platform are identical, we summarize the intensities by |
|
78 |
+the median. For the nonpolymorphic markers, no summarization step is |
|
79 |
+performs. Next, the \crlmm{} package provides a genotype call and a |
|
80 |
+confidence score at each polymorphic locus. Unless the dataset is small |
|
81 |
+(e.g., fewer than 50 samples), we suggest installing and loading the |
|
82 |
+\R{} package \Rpackage{ff}. The function \Rfunction{genotype} checks to |
|
83 |
+see whether the \Rpackage{ff} is loaded. If loaded, the normalized |
|
84 |
+intensities and genotype are stored as \Robject{ff} objects on disk. |
|
85 |
+Otherwise, the genotypes and normalized intensities are stored in |
|
86 |
+matrices. A word of caution: the \Rfunction{genotype} function requires |
|
87 |
+a potentially large amount of RAM even when the \R{} package |
|
88 |
+\Rpackage{ff} is loaded. Users with large datasets may want to skip |
|
89 |
+this part. For the 180 HapMap samples processed in this vignette, the |
|
90 |
+\Rfunction{genotype} function required 25MB RAM. The next two code |
|
91 |
+chunks should be submitted as batch jobs. |
|
92 |
+ |
|
93 |
+<<genotype>>= |
|
94 |
+if(!exists("cnSet")){ |
|
95 |
+ if(!file.exists(file.path(outdir, "cnSet.rda"))){ |
|
96 |
+ ##ops <- options(error=recover) |
|
97 |
+ cnSet <- genotype(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch) |
|
98 |
+ save(cnSet, file=file.path(outdir, "cnSet.rda")) |
|
99 |
+ } else load(file.path(outdir, "cnSet.rda")) |
|
100 |
+} |
|
85 | 101 |
@ |
86 | 102 |
|
87 |
-\noindent The next code chunk quantile normalizes the samples to a |
|
88 |
-target reference distribution, uses the crlmm algorithm to genotype, and |
|
89 |
-then estimates the copy number for each batch. Currently processing the |
|
90 |
-180 HapMap cel files will require less than 20G of RAM. We are working |
|
91 |
-on methods to reduce the memory footprint. |
|
103 |
+A more memory efficient approach to preprocessing and genotyping is |
|
104 |
+implemented in the \R{} function \Rfunction{genotype2}. The arguments |
|
105 |
+to this function are similar to \Rfunction{genotype} but requires |
|
106 |
+explicit loading of the \R{} package \Rpackage{ff} prior to calling |
|
107 |
+the function. The functions \Rfunction{ocProbesets} and |
|
108 |
+\Rfunction{ocSamples} can be used to manage how many probesets and |
|
109 |
+samples are to be loaded into memory and processed. Using the default |
|
110 |
+settings, the following code required 1.9Mb RAM to process 180 CEL |
|
111 |
+files. |
|
112 |
+ |
|
113 |
+<<genotype2>>= |
|
114 |
+library(ff) |
|
115 |
+ld.path <- file.path(outdir, "ffobjs") |
|
116 |
+if(!file.exists(ld.path)) dir.create(ld.path) |
|
117 |
+ldPath(ld.path) |
|
118 |
+if(!exists("cnSet2")){ |
|
119 |
+ if(!file.exists(file.path(outdir, "cnSet2.rda"))){ |
|
120 |
+ cnSet2 <- genotype2(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch) |
|
121 |
+ save(cnSet2, file=file.path(outdir, "cnSet2.rda")) |
|
122 |
+ } else load(file.path(outdir, "cnSet2.rda")) |
|
123 |
+} |
|
124 |
+@ |
|
92 | 125 |
|
93 |
-<<preprocessAndGenotype, echo=TRUE>>= |
|
94 |
-if(FALSE) crlmmCopynumber(celFiles, cnOpts) |
|
95 |
-load(file.path(cnOpts[["outdir"]], "cnSet_21.rda")) |
|
126 |
+The objects returned by the \Rfunction{genotype} and |
|
127 |
+\Rfunction{genotype2} differ in size as the former returns an object |
|
128 |
+with matrices in the assay data slot, whereas the latter return an |
|
129 |
+object with pointers to files on disk. The functions \Rfunction{open} |
|
130 |
+and \Rfunction{close} can be used to open and close the connections |
|
131 |
+stored in the assay data of the \Robject{cnSet2} object, respectively. |
|
132 |
+Subsetting the \Robject{ff} object pulls the data from disk into |
|
133 |
+memory. As subsetting operations pull the data from disk to memory, |
|
134 |
+these operations must be performed with care. In particular, |
|
135 |
+subsetting the \Robject{cnSet2} will subset each assay data element |
|
136 |
+and this can be slow. The preferred approach would be to extract the |
|
137 |
+assay data element that is needed, and then subset this function |
|
138 |
+object as illustrated below for the genotype calls. |
|
139 |
+ |
|
140 |
+<<check>>= |
|
141 |
+class(calls(cnSet)) |
|
142 |
+dims(cnSet) |
|
143 |
+class(calls(cnSet2)) |
|
144 |
+dims(cnSet2) |
|
145 |
+print(object.size(cnSet), units="Mb") |
|
146 |
+print(object.size(cnSet2), units="Mb") |
|
147 |
+if(FALSE){ |
|
148 |
+ replicate(5, system.time(gt1 <- calls(cnSet)[, 1:50])[[1]]) |
|
149 |
+ open(calls(cnSet2)) |
|
150 |
+ replicate(5, system.time(gt2 <- calls(cnSet2)[, 1:50])[[1]]) |
|
151 |
+} |
|
152 |
+gt1 <- calls(cnSet)[, 1:50] |
|
153 |
+gt2 <- calls(cnSet2)[, 1:50] |
|
154 |
+all.equal(gt1, gt2) |
|
96 | 155 |
@ |
97 | 156 |
|
98 |
-The following R objects are created from crlmmCopynumber: |
|
157 |
+\noindent With \Robject{ff} objects the additional I/O time required |
|
158 |
+for extracting data is substantial and will increase for larger |
|
159 |
+datasets. Patience is required. |
|
160 |
+ |
|
161 |
+Note that for the Affymetrix 6.0 platform the assay data elements each |
|
162 |
+have a row dimension corresponding to the total number of polymorphic |
|
163 |
+and nonpolymorphic markers interrogated by the Affymetrix 6.0 platform. |
|
164 |
+A consequence of keeping the rows of the assay data elements the same |
|
165 |
+for all of the statistical summaries is that the matrix used to store |
|
166 |
+genotype calls is larger than necessary. Also, note the additional |
|
167 |
+overhead of some operations when using \Robject{ff} objects. For |
|
168 |
+instance, the posterior probabilities for the CRLMM genotype calls are |
|
169 |
+represented as integers. The accessor \Robject{snpCallProbability} can |
|
170 |
+be used to access these confidence scores. When stored as matrices, |
|
171 |
+converting the integer representation back to the probability scale is |
|
172 |
+straightforward as shown below. However, for the \Robject{ff} objects |
|
173 |
+we must first bring the data into memory. To bring all the scores into |
|
174 |
+memory, one could use the function \Rfunction{[,]} but this could be |
|
175 |
+slow and require a lot of RAM depending on the size of the dataset. |
|
176 |
+Instead, we suggest pulling only the needed rows and columns from |
|
177 |
+memory. In the following example, we convert the integer scores to |
|
178 |
+probabilities for the CEPH samples. As genotype confidence scores are |
|
179 |
+not applicable to the nonpolymorphic markers, we extract only the |
|
180 |
+polymorphic markers using the \Rfunction{isSnp} function. |
|
181 |
+ |
|
182 |
+<<assayData>>= |
|
183 |
+rows <- which(isSnp(cnSet)) |
|
184 |
+cols <- which(batch == "C") |
|
185 |
+posterior.prob <- tryCatch(integerScoreToProbability(snpCallProbability), |
|
186 |
+ error=function(e) print("This will not work for an ff object.")) |
|
187 |
+posterior.prob1 <- integerScoreToProbability(snpCallProbability(cnSet)[rows, cols]) |
|
188 |
+posterior.prob2 <- integerScoreToProbability(snpCallProbability(cnSet2)[rows, cols]) |
|
189 |
+all.equal(posterior.prob1, posterior.prob2) |
|
190 |
+@ |
|
99 | 191 |
|
100 |
-<<crlmmSetListObjects>>= |
|
101 |
-fns <- list.files(cnOpts[["outdir"]], pattern="cnSet", full.name=TRUE) |
|
102 |
-basename(fns)[1:5] |
|
192 |
+Next, we obtain locus-level estimates of copy number by fitting a linear |
|
193 |
+model to each SNP. A variable named 'batch' must be indicated in the |
|
194 |
+\Robject{protocolData} of the \Robject{cnSet} object. As the inverse |
|
195 |
+variance of the within-genotype normalized intensities are used as |
|
196 |
+weights in the linear model (and hence the design matrix in the |
|
197 |
+regression changes for each SNP), the time is linear with the number of |
|
198 |
+markers on the array. Copy number estimation at nonpolymorphic markers |
|
199 |
+and polymorphic markers with unobserved genotypes is more difficult. We |
|
200 |
+refer the interested reader to the technical report \citep{Scharpf2009}. |
|
201 |
+Again, we peform the copy number estimation using the matrix version and |
|
202 |
+the ff version in parallel and encourage users with large datasets to |
|
203 |
+explore the latter. As with the preprocessing and genotyping, the |
|
204 |
+following code should be submitted as part of the batch job as it is too |
|
205 |
+slow for interactive analysis. |
|
206 |
+ |
|
207 |
+<<cn, echo=FALSE, eval=FALSE>>= |
|
208 |
+##Matrix format |
|
209 |
+cnSet <- crlmmCopynumber(cnSet) |
|
103 | 210 |
@ |
104 | 211 |
|
212 |
+<<save.cnset, echo=FALSE, eval=FALSE>>= |
|
213 |
+save(cnSet, file=file.path(outdir, "cnSet.rda")) |
|
214 |
+@ |
|
105 | 215 |
|
106 |
-The above algorithm for estimating copy number is predicated on the |
|
107 |
-assumption that most samples within a batch have copy number 2 at any |
|
108 |
-given locus. For common copy number variants, this assumption may not |
|
109 |
-hold. An additional iteration using a bias correction provides |
|
110 |
-additional robustness to this assumption. Set the \Robject{bias.adj} |
|
111 |
-argument to \Robject{TRUE}: |
|
216 |
+<<cn.ff, echo=FALSE, eval=FALSE>>= |
|
217 |
+rm(cnSet); gc() |
|
218 |
+ocProbesets(75e3) |
|
219 |
+##ff objects |
|
220 |
+system.time(cnSet2 <- crlmmCopynumber2(cnSet2)) |
|
221 |
+save(cnSet2, file=file.path(outdir, "cnSet2.rda")) |
|
222 |
+@ |
|
112 | 223 |
|
113 |
-<<biasAdjustment, eval=FALSE>>= |
|
114 |
-cnOpts[["bias.adj"]] <- TRUE |
|
115 |
-if(FALSE) crlmmCopynumber(celFiles, cnOpts) |
|
224 |
+<<timings, eval=FALSE>>= |
|
225 |
+tryCatch(print(paste("Time for matrix version:", time1)), error=function(e) print("timing for CN estimation not available")) |
|
226 |
+tryCatch(print(paste("Time for ff version:", time2)), error=function(e) print("timing for CN estimation not available")) |
|
227 |
+rm(cnSet2); gc() |
|
116 | 228 |
@ |
117 | 229 |
|
230 |
+ |
|
118 | 231 |
\section{Accessors} |
119 | 232 |
|
120 |
-\subsection{Assay data accessors} |
|
233 |
+For the remainder of this vignette, we illustrate accessors and |
|
234 |
+visualizations using the sample object provided in the |
|
235 |
+\Rpackage{crlmm} package. |
|
236 |
+ |
|
237 |
+<<sampleset>>= |
|
238 |
+data(sample.CNSetLM) |
|
239 |
+x <- sample.CNSetLM |
|
240 |
+@ |
|
121 | 241 |
|
122 |
-\paragraph{\Robject{ABset}: quantile normalized intensities} |
|
123 | 242 |
|
124 |
-An object of class \Robject{ABset} is stored in the first element of the |
|
125 |
-\Robject{crlmmSetList} object. The following accessors may be of use: |
|
243 |
+\subsection{Quantile-normalized intensities} |
|
126 | 244 |
|
127 | 245 |
Accessors for the quantile normalized intensities for the A allele at |
128 | 246 |
polymorphic loci: |
129 | 247 |
|
130 | 248 |
<<accessors>>= |
131 |
-a <- A(cnSet)[isSnp(cnSet), ] |
|
249 |
+snp.index <- which(isSnp(x)) |
|
250 |
+np.index <- which(!isSnp(x)) |
|
251 |
+a <- (A(x))[snp.index, ] |
|
132 | 252 |
dim(a) |
133 | 253 |
@ |
134 | 254 |
|
135 |
-The quantile-normalized intensities for nonpolymorphic loci are obtained by: |
|
255 |
+The extra set of parentheses surrounding \Robject{A(cnSet2)} above is |
|
256 |
+added to emphasize the appropriate order of operations. Subsetting the |
|
257 |
+entire \Robject{x} object in the following code should be avoided for |
|
258 |
+large datasets. |
|
259 |
+ |
|
260 |
+<<eval=FALSE>>= |
|
261 |
+a <- A(x[snp.index, ]) |
|
262 |
+@ |
|
263 |
+ |
|
264 |
+The quantile-normalized intensities for nonpolymorphic loci are obtained |
|
265 |
+by: |
|
136 | 266 |
|
137 | 267 |
<<>>= |
138 |
-npIntensities <- A(cnSet)[!isSnp(cnSet), ] |
|
268 |
+npIntensities <- (A(x))[np.index, ] |
|
139 | 269 |
@ |
140 | 270 |
|
141 | 271 |
Quantile normalized intensities for the B allele at polymorphic loci: |
142 | 272 |
|
143 | 273 |
<<B.polymorphic>>= |
144 |
-b.snps <- B(cnSet[isSnp(cnSet), ]) |
|
274 |
+b.snps <- (B(x))[snp.index, ] |
|
145 | 275 |
@ |
146 | 276 |
|
147 | 277 |
Note that NAs are recorded in the 'B' assay data element for |
148 | 278 |
nonpolymorphic loci: |
149 | 279 |
|
150 | 280 |
<<B.NAs>>= |
151 |
-all(is.na(B(cnSet[!isSnp(cnSet), ]))) |
|
281 |
+all(is.na((B(x))[np.index, ])) |
|
282 |
+@ |
|
283 |
+ |
|
284 |
+<<clean, echo=FALSE>>= |
|
285 |
+rm(b.snps, a, npIntensities); gc() |
|
152 | 286 |
@ |
153 | 287 |
|
154 | 288 |
\paragraph{\Robject{SnpSet}: Genotype calls and confidence scores} |
155 | 289 |
|
156 | 290 |
Genotype calls: |
157 | 291 |
<<genotypes>>= |
158 |
-genotypes <- snpCall(cnSet) |
|
292 |
+genotypes <- (snpCall(x))[snp.index, ] |
|
159 | 293 |
@ |
160 | 294 |
Confidence scores of the genotype calls: |
161 | 295 |
<<confidenceScores>>= |
162 |
-genotypeConf <- confs(cnSet[isSnp(cnSet), ]) |
|
296 |
+genotypeConf <- integerScoreToProbability(snpCallProbability(x)[snp.index[1:10], ]) |
|
163 | 297 |
@ |
164 | 298 |
|
165 | 299 |
\paragraph{\Robject{CopyNumberSet}: allele-specific copy number} |
166 | 300 |
|
167 | 301 |
Allele-specific copy number at polymorphic loci: |
168 | 302 |
<<ca>>= |
169 |
-ca <- CA(cnSet[isSnp(cnSet), ]) |
|
303 |
+ca <- CA(x[snp.index, ])/100 |
|
170 | 304 |
@ |
171 | 305 |
|
172 | 306 |
Total copy number at nonpolymorphic loci: |
173 | 307 |
<<ca>>= |
174 |
-cn.nonpolymorphic <- CA(cnSet[!isSnp(cnSet), ]) |
|
308 |
+cn.nonpolymorphic <- CA(x[np.index, ])/100 |
|
175 | 309 |
@ |
176 | 310 |
|
177 | 311 |
Total copy number at both polymorphic and nonpolymorphic loci: |
178 | 312 |
<<totalCopynumber>>= |
179 |
-cn <- copyNumber(cnSet) |
|
313 |
+cn <- copyNumber(x) |
|
314 |
+apply(cn, 2, median, na.rm=TRUE) |
|
180 | 315 |
@ |
181 | 316 |
|
182 | 317 |
\subsection{Other accessors} |
183 | 318 |
|
184 |
-Information on physical position and chromosome can be accessed by the |
|
185 |
-following accessors: |
|
319 |
+Information on physical position and chromosome can be accessed as follows: |
|
186 | 320 |
|
187 |
-<<positionChromosome, eval=FALSE>>= |
|
188 |
-xx <- position(cnSet) |
|
189 |
-yy <- chromosome(cnSet) |
|
321 |
+<<positionChromosome>>= |
|
322 |
+xx <- position(x) |
|
323 |
+yy <- chromosome(x) |
|
190 | 324 |
@ |
191 | 325 |
|
192 |
-There are many parameters computed during copy number estimation that |
|
193 |
-are at present stored in the \Robject{featureData} slot. In particular, |
|
194 |
-the estimation procedure fits a linear model to the normalized |
|
195 |
-intensities for each allele. These parameters are not generally meant to |
|
196 |
-be extracted by the user; for now we just mention where they are stored. |
|
326 |
+Parameters from the linear model used to estimate copy number are |
|
327 |
+stored in the slot \Robject{lM}. |
|
197 | 328 |
|
198 | 329 |
<<copynumberParameters>>= |
199 |
-fvarLabels(cnSet) |
|
330 |
+names(lM(x)) |
|
331 |
+dim(lM(x)[[1]]) |
|
200 | 332 |
@ |
201 | 333 |
|
334 |
+ |
|
335 |
+ |
|
336 |
+\section{Suggested visualizations} |
|
337 |
+ |
|
202 | 338 |
\paragraph{SNR.} |
203 | 339 |
|
204 | 340 |
A histogram of the signal to noise ratio for the HapMap samples: |
205 | 341 |
|
206 | 342 |
<<plotSnr, fig=TRUE, include=FALSE>>= |
207 |
-hist(cnSet$SNR, xlab="SNR", main="") |
|
343 |
+hist(x$SNR, xlab="SNR", main="", breaks=25) |
|
208 | 344 |
@ |
209 | 345 |
|
210 | 346 |
\begin{figure} |
... | ... |
@@ -213,22 +349,7 @@ hist(cnSet$SNR, xlab="SNR", main="") |
213 | 349 |
\caption{Signal to noise ratios for the HapMap samples.} |
214 | 350 |
\end{figure} |
215 | 351 |
|
216 |
-\paragraph{Data/Batch.} |
|
217 |
-For Affymetrix 6.0, we currently suggest excluding or flagging samples |
|
218 |
-with a signal to noise ratio less than 5. Adjusting by date or |
|
219 |
-chemistry plate can be helpful for limiting the influence of batch |
|
220 |
-effects. Ideally, one would have 70+ files in a given batch. Here we |
|
221 |
-make a table of date versus ancestry (batch): |
|
222 |
- |
|
223 |
-<<specifyBatch, eval=FALSE, echo=FALSE>>= |
|
224 |
-table(format(as.POSIXlt(protocolData(cnSet)[["ScanDate"]]), "%d %b %Y"), cnSet$batch) |
|
225 |
-@ |
|
226 |
- |
|
227 |
-As all of these samples were run on the first week of March, we would |
|
228 |
-expect that any systematic artifacts to the intensities that develop |
|
229 |
-over time to be minimal (a best case scenario). |
|
230 | 352 |
|
231 |
-\section{Suggested visualizations} |
|
232 | 353 |
|
233 | 354 |
\paragraph{One sample at a time: locus-level estimates} |
234 | 355 |
|
... | ... |
@@ -240,24 +361,24 @@ area of research. |
240 | 361 |
|
241 | 362 |
<<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>= |
242 | 363 |
par(las=1, mar=c(4, 5, 4, 2)) |
243 |
-plot(position(cnSet), copyNumber(cnSet)[, 1], pch=".", |
|
244 |
- cex=2, xaxt="n", col="grey20", ylim=c(0,6), |
|
364 |
+plot(position(x), copyNumber(x)[, 1], pch=".", |
|
365 |
+ cex=2, xaxt="n", col="grey20", ylim=c(0,4), |
|
245 | 366 |
ylab="copy number", xlab="physical position (Mb)", |
246 |
- main=paste(sampleNames(cnSet)[1], ", CHR:", unique(chromosome(cnSet)))) |
|
247 |
-points(position(cnSet)[!isSnp(cnSet)], copyNumber(cnSet)[!isSnp(cnSet), 1], |
|
367 |
+ main=paste(sampleNames(x)[1], ", CHR:", unique(chromosome(x)))) |
|
368 |
+points(position(x)[!isSnp(x)], copyNumber(x)[!isSnp(x), 1], |
|
248 | 369 |
pch=".", cex=2, col="lightblue") |
249 |
-axis(1, at=pretty(range(position(cnSet))), labels=pretty(range(position(cnSet)))/1e6) |
|
370 |
+axis(1, at=pretty(range(position(x))), labels=pretty(range(position(x)))/1e6) |
|
250 | 371 |
@ |
251 | 372 |
|
252 | 373 |
<<idiogram, eval=FALSE, echo=FALSE>>= |
253 | 374 |
require(SNPchip) |
254 |
-plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE) |
|
375 |
+plotCytoband(22, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE) |
|
255 | 376 |
@ |
256 | 377 |
|
257 | 378 |
\begin{figure} |
258 | 379 |
\includegraphics[width=0.9\textwidth]{copynumber-oneSample} |
259 | 380 |
\caption{\label{fig:oneSample} Total copy number (y-axis) for |
260 |
- chromosome 22 plotted against physical position (x-axis) for one |
|
381 |
+ chromosome 1 plotted against physical position (x-axis) for one |
|
261 | 382 |
sample. Estimates at nonpolymorphic loci are plotted in light |
262 | 383 |
blue.} |
263 | 384 |
\end{figure} |
... | ... |
@@ -267,16 +388,16 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE) |
267 | 388 |
%ask <- FALSE |
268 | 389 |
%op <- par(mfrow=c(3, 1), las=1, mar=c(1, 4, 1, 1), oma=c(3, 1, 1, 1), ask=ask) |
269 | 390 |
%##Put fit on the copy number scale |
270 |
-%cns <- copyNumber(cnSet) |
|
391 |
+%cns <- copyNumber(cnSet2) |
|
271 | 392 |
%cnState <- hmmPredictions - as.integer(1) |
272 |
-%xlim <- c(10*1e6, max(position(cnSet))) |
|
393 |
+%xlim <- c(10*1e6, max(position(cnSet2))) |
|
273 | 394 |
%cols <- brewer.pal(8, "Dark2")[1:4] |
274 | 395 |
%for(j in 1:3){ |
275 |
-% plot(position(cnSet), cnState[, j], pch=".", col=cols[2], xaxt="n", |
|
396 |
+% plot(position(cnSet2), cnState[, j], pch=".", col=cols[2], xaxt="n", |
|
276 | 397 |
% ylab="copy number", xlab="Physical position (Mb)", type="s", lwd=2, |
277 | 398 |
% ylim=c(0,6), xlim=xlim) |
278 |
-% points(position(cnSet), cns[, j], pch=".", col=cols[3]) |
|
279 |
-% lines(position(cnSet), cnState[,j], lwd=2, col=cols[2]) |
|
399 |
+% points(position(cnSet2), cns[, j], pch=".", col=cols[3]) |
|
400 |
+% lines(position(cnSet2), cnState[,j], lwd=2, col=cols[2]) |
|
280 | 401 |
% axis(1, at=pretty(position(cnSet)), |
281 | 402 |
% labels=pretty(position(cnSet))/1e6) |
282 | 403 |
% abline(h=c(1,3), lty=2, col=cols[1]) |
... | ... |
@@ -300,52 +421,58 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(5.8, 6.0), label.cytoband=FALSE) |
300 | 421 |
\clearpage |
301 | 422 |
\paragraph{One SNP at a time} |
302 | 423 |
|
303 |
-Scatterplots of the A and B allele intensities (log-scale) can be useful |
|
304 |
-for assessing the biallelic genotype calls. The following code chunk is |
|
305 |
-displayed in Figure \ref{fig:genotypeCalls}. |
|
306 |
- |
|
307 |
-<<genotypeCalls, fig=TRUE, width=8, height=8, include=FALSE>>= |
|
308 |
-myScatter <- function(object, add=FALSE, ...){ |
|
309 |
- A <- log2(A(object)) |
|
310 |
- B <- log2(B(object)) |
|
311 |
- if(!add){ |
|
312 |
- plot(A, B, ...) |
|
313 |
- } else{ |
|
314 |
- points(A, B, ...) |
|
315 |
- } |
|
316 |
-} |
|
317 |
-index <- which(isSnp(cnSet))[1:9] |
|
318 |
-xlim <- ylim <- c(6.5,13) |
|
319 |
-par(mfrow=c(3,3), las=1, pty="s", ask=FALSE, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1)) |
|
320 |
-for(i in index){ |
|
321 |
- gt <- calls(cnSet)[i, ] |
|
322 |
- if(i != 89){ |
|
323 |
- myScatter(cnSet[i, ], |
|
324 |
- pch=pch, |
|
325 |
- col=colors[snpCall(cnSet)[i, ]], |
|
326 |
- bg=colors[snpCall(cnSet)[i, ]], cex=cex, |
|
327 |
- xlim=xlim, ylim=ylim) |
|
328 |
- mtext("A", 1, outer=TRUE, line=1) |
|
329 |
- mtext("B", 2, outer=TRUE, line=1) |
|
330 |
- crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="C", lwd=2, col="black") |
|
331 |
- crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="Y", lwd=2, col="grey50") |
|
332 |
- } else { |
|
333 |
- plot(0:1, xlim=c(0,1), ylim=c(0,1), type="n", xaxt="n", yaxt="n") |
|
334 |
- legend("center", |
|
335 |
- legend=c("CN = 2, CEPH", "CN = 2, Yoruban"), |
|
336 |
- col=c("black", "grey50"), lwd=2, bty="n") |
|
337 |
- } |
|
338 |
-} |
|
424 |
+Scatterplots of the A and B allele intensities (log-scale) can be |
|
425 |
+useful for assessing the biallelic genotype calls. This section of |
|
426 |
+the vignette is currently under development. |
|
427 |
+% The following code chunk is |
|
428 |
+%displayed in Figure \ref{fig:prediction}. |
|
429 |
+ |
|
430 |
+<<predictionRegions, fig=TRUE, width=8, height=8, include=FALSE, eval=FALSE>>= |
|
431 |
+i <- snp.index[1] |
|
432 |
+#plotCNSetLM=crlmm:::plotCNSetLM |
|
433 |
+##trace(plotCNSetLM, browser) |
|
434 |
+plot(i, x, copynumber=2) |
|
435 |
+##myScatter <- function(object, add=FALSE, ...){ |
|
436 |
+## A <- log2(A(object)) |
|
437 |
+## B <- log2(B(object)) |
|
438 |
+## if(!add){ |
|
439 |
+## plot(A, B, ...) |
|
440 |
+## } else{ |
|
441 |
+## points(A, B, ...) |
|
442 |
+## } |
|
443 |
+##} |
|
444 |
+##index <- which(isSnp(cnSet))[1:9] |
|
445 |
+##xlim <- ylim <- c(6.5,13) |
|
446 |
+##par(mfrow=c(3,3), las=1, pty="s", ask=FALSE, mar=c(2, 2, 2, 2), oma=c(2, 2, 1, 1)) |
|
447 |
+##for(i in index){ |
|
448 |
+## gt <- calls(cnSet)[i, ] |
|
449 |
+## if(i != 89){ |
|
450 |
+## myScatter(cnSet[i, ], |
|
451 |
+## pch=pch, |
|
452 |
+## col=colors[snpCall(cnSet)[i, ]], |
|
453 |
+## bg=colors[snpCall(cnSet)[i, ]], cex=cex, |
|
454 |
+## xlim=xlim, ylim=ylim) |
|
455 |
+## mtext("A", 1, outer=TRUE, line=1) |
|
456 |
+## mtext("B", 2, outer=TRUE, line=1) |
|
457 |
+## crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="C", lwd=2, col="black") |
|
458 |
+## crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="Y", lwd=2, col="grey50") |
|
459 |
+## } else { |
|
460 |
+## plot(0:1, xlim=c(0,1), ylim=c(0,1), type="n", xaxt="n", yaxt="n") |
|
461 |
+## legend("center", |
|
462 |
+## legend=c("CN = 2, CEPH", "CN = 2, Yoruban"), |
|
463 |
+## col=c("black", "grey50"), lwd=2, bty="n") |
|
464 |
+## } |
|
465 |
+##} |
|
339 | 466 |
@ |
340 | 467 |
|
341 |
-\begin{figure} |
|
342 |
- \centering |
|
343 |
- \includegraphics[width=0.8\textwidth]{copynumber-genotypeCalls} |
|
344 |
- \caption{\label{fig:genotypeCalls} Scatterplots of A versus B |
|
345 |
- intensities. Each panel displays a single SNP. The ellipses |
|
346 |
- indicate the 95\% probability region for copy number 2 for the CEPH |
|
347 |
- (black) and Yoruban subjects (grey).} |
|
348 |
-\end{figure} |
|
468 |
+%\begin{figure} |
|
469 |
+% \centering |
|
470 |
+% \includegraphics[width=0.8\textwidth]{copynumber-predictionRegions} |
|
471 |
+% \caption{\label{fig:prediction} Scatterplots of A versus B |
|
472 |
+% intensities. Each panel displays a single SNP. The ellipses |
|
473 |
+% indicate the 95\% probability region for copy number 2 for the CEPH |
|
474 |
+% (black) and Yoruban subjects (grey).} |
|
475 |
+%\end{figure} |
|
349 | 476 |
|
350 | 477 |
%\section{Details for the copy number estimation procedure} |
351 | 478 |
% |
11 | 10 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,21 @@ |
1 |
+\name{CNSetLM-methods} |
|
2 |
+\alias{lM} |
|
3 |
+ |
|
4 |
+\title{Methods for CNSetLM class} |
|
5 |
+\description{ Accessors for CNSetLM class} |
|
6 |
+\usage{ |
|
7 |
+ lM(object) |
|
8 |
+} |
|
9 |
+ |
|
10 |
+\arguments{ |
|
11 |
+ \item{object}{\code{CNSetLM}} |
|
12 |
+} |
|
13 |
+\details{ |
|
14 |
+ \code{lM} returns a list (or an ffdf object if large data support is enabled) of the parameters estimated from a linear model fit for each SNP. The parameters are batch and locus-specific. |
|
15 |
+} |
|
16 |
+\value{object of class \code{ffdf} or \code{list}} |
|
17 |
+ |
|
18 |
+\author{R. Scharpf} |
|
19 |
+ |
|
20 |
+\seealso{\code{\link{crlmmCopynumber}}, \code{\link{crlmmCopynumber2}}, \code{\link{CNSetLM-class}}} |
|
21 |
+\keyword{manip} |
0 | 22 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,27 @@ |
1 |
+\name{sample.CNSetLM} |
|
2 |
+\alias{sample.CNSetLM} |
|
3 |
+\docType{data} |
|
4 |
+\title{ |
|
5 |
+ Dataset of class 'CNSetLM' |
|
6 |
+} |
|
7 |
+\description{ |
|
8 |
+ |
|
9 |
+ The data for 2119 polymorphic and nonpolymorphic markers on |
|
10 |
+ chromosome 1 for the CEPH and Yoruban HapMap samples. |
|
11 |
+ |
|
12 |
+} |
|
13 |
+\usage{data(sample.CNSetLM)} |
|
14 |
+\format{ |
|
15 |
+ |
|
16 |
+ The data illustrates the \code{\link{CNSetLM-class}}, with |
|
17 |
+ \code{assayData} containing the quantile-normalized |
|
18 |
+ intensities for the A and B alleles, genotype calls and |
|
19 |
+ confidence scores (call and callProbability), and |
|
20 |
+ allele-specific copy number (CA and CB). The parameters from |
|
21 |
+ the linear model are stored in the lM slot. |
|
22 |
+ |
|
23 |
+} |
|
24 |
+\examples{ |
|
25 |
+data(sample.CNSetLM) |
|
26 |
+} |
|
27 |
+\keyword{datasets} |