git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@42144 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -272,3 +272,13 @@ is decoded and scanned |
272 | 272 |
* labeled figures / displayed output of code chunks in the copy number vignette |
273 | 273 |
* added bibliography for copy number vignette. Added file inst/doc/refs.bib |
274 | 274 |
* added boxplot method |
275 |
+ |
|
276 |
+2009-10-03 R. Scharpf - committed version 1.3.21 |
|
277 |
+ |
|
278 |
+* modified crlmmWrapper function |
|
279 |
+* modified illumina copy number vignette (still needs debugging) |
|
280 |
+* changed title of copy number vignette |
|
281 |
+* added reference to the crlmm paper |
|
282 |
+* changed copyNumber() method so that CA + NA = NA, CB + NA = NA (previously had CA+NA=CA, but this can result in a lot of zeros, depending on the genotype) |
|
283 |
+* new method: addFeatureAnnotation |
|
284 |
+* support for snp5.0 |
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.3.20 |
|
5 |
-Date: 2009-09-22 |
|
4 |
+Version: 1.3.21 |
|
5 |
+Date: 2009-09-29 |
|
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
7 | 7 |
Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
8 | 8 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 6.0. |
... | ... |
@@ -18,7 +18,7 @@ importMethodsFrom(Biobase, annotation, "annotation<-", annotatedDataFrameFrom, |
18 | 18 |
|
19 | 19 |
|
20 | 20 |
##importMethodsFrom(oligoClasses, chromosome, copyNumber, position) |
21 |
-importFrom(oligoClasses, chromosome, copyNumber, position, calls) |
|
21 |
+importMethodsFrom(oligoClasses, chromosome, copyNumber, position, calls) |
|
22 | 22 |
|
23 | 23 |
##importMethodsFrom(methods, initialize, show) |
24 | 24 |
|
... | ... |
@@ -54,6 +54,8 @@ exportClasses(ABset, CopyNumberSet, CrlmmSetList) |
54 | 54 |
##S3method(ellipse, CopyNumberSet) |
55 | 55 |
exportMethods("[", ##"[[", |
56 | 56 |
"$", A, B, |
57 |
+ "A<-", "B<-", |
|
58 |
+ addFeatureAnnotation, |
|
57 | 59 |
calls, |
58 | 60 |
CA, |
59 | 61 |
"CA<-", |
... | ... |
@@ -1,5 +1,8 @@ |
1 | 1 |
setGeneric("A", function(object) standardGeneric("A")) |
2 | 2 |
setGeneric("B", function(object) standardGeneric("B")) |
3 |
+setGeneric("A<-", function(object, value) standardGeneric("A<-")) |
|
4 |
+setGeneric("addFeatureAnnotation", function(object, ...) standardGeneric("addFeatureAnnotation")) |
|
5 |
+setGeneric("B<-", function(object, value) standardGeneric("B<-")) |
|
3 | 6 |
setGeneric("batch", function(object) standardGeneric("batch")) |
4 | 7 |
##setGeneric("calls", function(x) standardGeneric("calls")) |
5 | 8 |
setGeneric("confs", function(object) standardGeneric("confs")) |
... | ... |
@@ -215,185 +215,6 @@ harmonizeDimnamesTo <- function(object1, object2){ |
215 | 215 |
return(object1) |
216 | 216 |
} |
217 | 217 |
|
218 |
-##crlmmIlluminaWrapper <- function(filenames, |
|
219 |
-## cdfName, |
|
220 |
-## load.it=FALSE, |
|
221 |
-## save.it=FALSE, |
|
222 |
-## splitByChr=TRUE, |
|
223 |
-## crlmmFile, |
|
224 |
-## intensityFile, ...){ |
|
225 |
-#### outdir="./", |
|
226 |
-#### cdfName, |
|
227 |
-#### save.intermediate=FALSE, |
|
228 |
-#### splitByChr=TRUE, |
|
229 |
-#### intensityFile, |
|
230 |
-#### ...){ ##additional arguments to readIdatFiles |
|
231 |
-#### stopifnot(basename(intensityFile) == "res.rda") |
|
232 |
-## if(!file.exists(outdir)) stop(outdir, " does not exist.") |
|
233 |
-## if(!isValidCdfName(cdfName, platform="illumina")) stop(cdfName, " not supported.") |
|
234 |
-## if(file.exists(file.path(outdir, "RG.rda"))) { |
|
235 |
-## message("Loading RG.rda...") |
|
236 |
-## load(file.path(outdir, "RG.rda")) |
|
237 |
-## } else { |
|
238 |
-## message("Reading Idat files...") |
|
239 |
-## RG <- readIdatFiles(...) |
|
240 |
-## ##J <- match(c("1_A", "3_A", "5_A", "7_A"), sampleNames(RG)) |
|
241 |
-## ##RG <- RG[, -J] |
|
242 |
-## if(save.intermediate) save(RG, file=file.path(outdir, "RG.rda")) ##935M for 91 samples...better not to save this |
|
243 |
-## } |
|
244 |
-## if(!file.exists(intensityFile)){ |
|
245 |
-## message("Quantile normalization / genotyping...") |
|
246 |
-## crlmmOut <- crlmmIllumina(RG=RG, |
|
247 |
-## cdfName=cdfName, |
|
248 |
-## sns=sampleNames(RG), |
|
249 |
-## returnParams=TRUE, |
|
250 |
-## save.it=TRUE, |
|
251 |
-## intensityFile=intensityFile) |
|
252 |
-## if(save.intermediate) save(crlmmOut, file=file.path(outdir, "crlmmOut.rda")) |
|
253 |
-## } else{ |
|
254 |
-## load(file.path(outdir, "crlmmOut.rda")) |
|
255 |
-## } |
|
256 |
-## message("Loading ", intensityFile, "...") |
|
257 |
-## load(intensityFile) |
|
258 |
-## if(exists("cnAB")){ |
|
259 |
-## np.A <- as.integer(cnAB$A) |
|
260 |
-## np.B <- as.integer(cnAB$B) |
|
261 |
-## np <- ifelse(np.A > np.B, np.A, np.B) |
|
262 |
-## np <- matrix(np, nrow(cnAB$A), ncol(cnAB$A)) |
|
263 |
-## rownames(np) <- cnAB$gns |
|
264 |
-## colnames(np) <- cnAB$sns |
|
265 |
-## cnAB$NP <- np |
|
266 |
-## } |
|
267 |
-## sampleNames(crlmmOut) <- res$sns |
|
268 |
-## if(exists("cnAB")){ |
|
269 |
-## ABset <- combineIntensities(get("res"), cnAB, cdfName=cdfName) |
|
270 |
-## } else{ |
|
271 |
-## ABset <- combineIntensities(get("res"), NULL, cdfName=cdfName) |
|
272 |
-## } |
|
273 |
-## ##protocolData(ABset)[["ScanDate"]] <- as.character(pData(RG)$ScanDate) |
|
274 |
-## crlmmResult <- harmonizeSnpSet(crlmmOut, ABset, cdfName) |
|
275 |
-## stopifnot(all.equal(dimnames(crlmmOut), dimnames(ABset))) |
|
276 |
-## crlmmList <- list(ABset, |
|
277 |
-## crlmmResult) |
|
278 |
-## crlmmList <- as(crlmmList, "CrlmmSetList") |
|
279 |
-## if(splitByChr){ |
|
280 |
-## message("Saving by chromosome") |
|
281 |
-## splitByChromosome(crlmmList, cdfName=cdfName, outdir=outdir) |
|
282 |
-## } else{ |
|
283 |
-## message("Saving crlmmList object to ", outdir) |
|
284 |
-## save(crlmmList, file=file.path(outdir, "crlmmList.rda")) |
|
285 |
-## } |
|
286 |
-## message("CrlmmSetList objects saved to ", outdir) |
|
287 |
-##} |
|
288 |
- |
|
289 |
-##quantileNormalize1m <- function(cel.files, |
|
290 |
-## outdir, |
|
291 |
-## pkgname, |
|
292 |
-## reference=TRUE, |
|
293 |
-## createCels=TRUE, |
|
294 |
-## verbose=TRUE, |
|
295 |
-## computeCopyNumberReference=FALSE, |
|
296 |
-## normalizeNonpolymorphic=FALSE){ |
|
297 |
-## if(computeCopyNumberReference){ |
|
298 |
-## stopifnot(file.exists("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m")) |
|
299 |
-## } else{ |
|
300 |
-## load(system.file("1m_reference_cn.rda", package="CN")) |
|
301 |
-## } |
|
302 |
-## conn <- db(get(pkgname)) |
|
303 |
-## tmp <- dbGetQuery(conn, paste("SELECT fid, man_fsetid, allele, featureSet.chrom, featureSet.physical_pos", |
|
304 |
-## "FROM pmfeature, featureSet", |
|
305 |
-## "WHERE pmfeature.fsetid = featureSet.fsetid")) |
|
306 |
-## tmp[is.na(tmp$chrom), "chrom"] <- 0 |
|
307 |
-## tmp[is.na(tmp$physical_pos), "physical_pos"] <- 0 |
|
308 |
-## tmp <- tmp[order(tmp$chrom, tmp$physical_pos, tmp$man_fsetid, tmp$allele),] |
|
309 |
-## |
|
310 |
-## if(reference){ |
|
311 |
-## load(system.file("extdata", paste(pkgname, "Ref.rda", sep=""), package=pkgname)) |
|
312 |
-## } else{ |
|
313 |
-## reference <- normalize.quantiles.determine.target(readCelIntensities(celFiles, |
|
314 |
-## indices=tmp$fid)) |
|
315 |
-## if (verbose) message("normalization vector created") |
|
316 |
-## } |
|
317 |
-## reference <- sort(reference) |
|
318 |
-## message("creating empty cel files") |
|
319 |
-## out.celFiles <- file.path(outdir, paste("QN-", basename(cel.files), sep="")) |
|
320 |
-## if(createCels){ |
|
321 |
-## hh <- readCelHeader(cel.files[1]) |
|
322 |
-## message(paste("creating empty cel files in", outdir)) |
|
323 |
-## out.celFiles <- sapply(out.celFiles, function(x) suppressWarnings(createCel(x, header=hh, overwrite=TRUE))) |
|
324 |
-## } |
|
325 |
-## |
|
326 |
-## message("Quantile normalizing SNP probes to hapmap target distribution") |
|
327 |
-## for(i in 1:length(cel.files)){ |
|
328 |
-## if(i%%10==0) cat(".") |
|
329 |
-## pms <- normalize.quantiles.use.target(readCelIntensities(cel.files[i], |
|
330 |
-## indices=tmp$fid), |
|
331 |
-## reference, copy=FALSE) |
|
332 |
-## updateCel(out.celFiles[i], indices=tmp$fid, intensities=as.integer(pms)) |
|
333 |
-## } |
|
334 |
-## |
|
335 |
-#### --------------------------------------------------------------------------- |
|
336 |
-#### copy number probes |
|
337 |
-#### --------------------------------------------------------------------------- |
|
338 |
-## if(normalizeNonpolymorphic){ |
|
339 |
-## cntmp <- dbGetQuery(conn, paste("SELECT fid, man_fsetid, featureSetCNV.chrom, featureSetCNV.chrom_start", |
|
340 |
-## "FROM pmfeatureCNV, featureSetCNV", |
|
341 |
-## "WHERE pmfeatureCNV.fsetid = featureSetCNV.fsetid")) |
|
342 |
-## cntmp[is.na(cntmp$chrom), "chrom"] <- 0 |
|
343 |
-## cntmp[is.na(cntmp$chrom_start), "chrom_start"] <- 0 |
|
344 |
-## cntmp <- cntmp[order(cntmp$chrom, cntmp$chrom_start, cntmp$man_fsetid), ] |
|
345 |
-## |
|
346 |
-## if(computeCopyNumberReference){ |
|
347 |
-## message("computing reference distribution for copy number probes. May take a while...") |
|
348 |
-## celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE) |
|
349 |
-## reference <- computeCopyNumberReference(cel.files=celFiles, fid=cntmp$fid) |
|
350 |
-## message("finished computing CN reference distribution.") |
|
351 |
-## } |
|
352 |
-## message("Quantile normalizing copy number probes to hapmap target distribution") |
|
353 |
-## for(i in 1:length(cel.files)){ |
|
354 |
-## if(i%%10==0) cat(".") |
|
355 |
-## pms <- normalize.quantiles.use.target(readCelIntensities(cel.files[i], |
|
356 |
-## indices=cntmp$fid), |
|
357 |
-## reference, copy=FALSE) |
|
358 |
-## updateCel(out.celFiles[i], indices=cntmp$fid, intensities=as.integer(pms)) |
|
359 |
-## } |
|
360 |
-## } else { |
|
361 |
-## message("Not quantile normalizing the nonpolymorphic probes") |
|
362 |
-## } |
|
363 |
-##} |
|
364 |
- |
|
365 |
-validCdfNames <- function(platform){ |
|
366 |
- if(missing(platform)) stop("missing platform") |
|
367 |
- if(!platform %in% c("illumina", "affymetrix")) |
|
368 |
- stop("only illumina and affymetrix platforms are supported.") |
|
369 |
- if(platform=="illumina"){ |
|
370 |
- chipList = c("human1mv1c", # 1M |
|
371 |
- "human370v1c", # 370CNV |
|
372 |
- "human650v3a", # 650Y |
|
373 |
- "human610quadv1b", # 610 quad |
|
374 |
- "human660quadv1a", # 660 quad |
|
375 |
- "human370quadv3c", # 370CNV quad |
|
376 |
- "human550v3b", # 550K |
|
377 |
- "human1mduov3b") # 1M Duo |
|
378 |
- } |
|
379 |
- if(platform=="affymetrix"){ |
|
380 |
- chipList=c("genomewidesnp6") |
|
381 |
- } |
|
382 |
- return(chipList) |
|
383 |
-} |
|
384 |
- |
|
385 |
-isValidCdfName <- function(cdfName, platform){ |
|
386 |
- chipList <- validCdfNames(platform) |
|
387 |
- if(!(cdfName %in% chipList)){ |
|
388 |
- warning("cdfName must be one of the following: ", |
|
389 |
- chipList) |
|
390 |
- } |
|
391 |
- result <- cdfName %in% chipList |
|
392 |
- return(result) |
|
393 |
-} |
|
394 |
- |
|
395 |
- |
|
396 |
- |
|
397 | 218 |
crlmmWrapper <- function(filenames, |
398 | 219 |
cdfName="genomewidesnp6", |
399 | 220 |
load.it=FALSE, |
... | ... |
@@ -404,12 +225,19 @@ crlmmWrapper <- function(filenames, |
404 | 225 |
rgFile, |
405 | 226 |
platform=c("affymetrix", "illumina")[1], |
406 | 227 |
...){ |
407 |
- if(!(platform %in% c("affymetrix", "illumina"))) |
|
228 |
+ if(!(platform %in% c("affymetrix", "illumina"))){ |
|
408 | 229 |
stop("Only 'affymetrix' and 'illumina' platforms are supported at this time.") |
230 |
+ } else { |
|
231 |
+ message("Checking whether annotation package for the ", platform, " platform is available") |
|
232 |
+ } |
|
409 | 233 |
if(missing(intensityFile)) stop("must specify 'intensityFile'.") |
410 | 234 |
if(missing(crlmmFile)) stop("must specify 'crlmmFile'.") |
411 | 235 |
if(platform == "illumina"){ |
412 | 236 |
if(missing(rgFile)) stop("must specify 'rgFile'.") |
237 |
+ if(!load.it){ |
|
238 |
+ RG <- readIdatFiles(...) |
|
239 |
+ if(save.it) save(RG, file=rgFile) |
|
240 |
+ } |
|
413 | 241 |
if(load.it & !file.exists(rgFile)){ |
414 | 242 |
message("load.it is TRUE, bug rgFile not present. Attempting to read the idatFiles.") |
415 | 243 |
RG <- readIdatFiles(...) |
... | ... |
@@ -456,6 +284,7 @@ crlmmWrapper <- function(filenames, |
456 | 284 |
} |
457 | 285 |
} else { |
458 | 286 |
message("Loading ", crlmmFile, "...") |
287 |
+ load(intensityFile) |
|
459 | 288 |
load(crlmmFile) |
460 | 289 |
crlmmResult <- get("crlmmResult") |
461 | 290 |
cnrmaResult <- get("cnrmaResult") |
... | ... |
@@ -463,23 +292,38 @@ crlmmWrapper <- function(filenames, |
463 | 292 |
} |
464 | 293 |
if(platform == "illumina"){ |
465 | 294 |
if(!file.exists(crlmmFile) | !load.it){ |
466 |
- crlmmOut <- crlmmIllumina(RG=RG, |
|
467 |
- cdfName=cdfName, |
|
468 |
- sns=sampleNames(RG), |
|
469 |
- returnParams=TRUE, |
|
470 |
- save.it=TRUE, |
|
471 |
- intensityFile=intensityFile) |
|
472 |
- if(save.it) save(crlmmOut, file=crlmmFile) |
|
295 |
+ crlmmResult <- crlmmIllumina(RG=RG, |
|
296 |
+ cdfName=cdfName, |
|
297 |
+ sns=sampleNames(RG), |
|
298 |
+ returnParams=TRUE, |
|
299 |
+ save.it=TRUE, |
|
300 |
+ intensityFile=intensityFile) |
|
301 |
+ if(save.it) save(crlmmResult, file=crlmmFile) |
|
473 | 302 |
} else { |
474 | 303 |
message("Loading ", crlmmFile, "...") |
475 | 304 |
load(crlmmFile) |
476 | 305 |
crlmmResult <- get("crlmmResult") |
477 |
- if(exists("cnAB")){ |
|
478 |
- cnrmaResult <- get("cnAB") |
|
479 |
- } else cnrmaResult <- NULL |
|
480 | 306 |
} |
481 | 307 |
} |
482 | 308 |
load(intensityFile) |
309 |
+ if(platform=="illumina"){ |
|
310 |
+ if(exists("cnAB")){ |
|
311 |
+ np.A <- as.integer(cnAB$A) |
|
312 |
+ np.B <- as.integer(cnAB$B) |
|
313 |
+ np <- ifelse(np.A > np.B, np.A, np.B) |
|
314 |
+ np <- matrix(np, nrow(cnAB$A), ncol(cnAB$A)) |
|
315 |
+ rownames(np) <- cnAB$gns |
|
316 |
+ colnames(np) <- cnAB$sns |
|
317 |
+ cnAB$NP <- np |
|
318 |
+ sampleNames(crlmmResult) <- res$sns |
|
319 |
+ cnrmaResult <- get("cnAB") |
|
320 |
+ } else cnrmaResult <- NULL |
|
321 |
+ } |
|
322 |
+ if(platform=="affymetrix"){ |
|
323 |
+ if(exists("cnrmaResult")){ |
|
324 |
+ cnrmaResult <- get("cnrmaResult") |
|
325 |
+ } else cnrmaResult <- NULL |
|
326 |
+ } |
|
483 | 327 |
ABset <- combineIntensities(get("res"), cnrmaResult, cdfName) |
484 | 328 |
if(platform=="affymetrix") protocolData(ABset)[["ScanDate"]] <- as.character(celDates(filenames)) |
485 | 329 |
crlmmResult <- harmonizeSnpSet(crlmmResult, ABset, cdfName) |
... | ... |
@@ -496,9 +340,52 @@ crlmmWrapper <- function(filenames, |
496 | 340 |
return() |
497 | 341 |
} |
498 | 342 |
|
343 |
+validCdfNames <- function(platform){ |
|
344 |
+ if(!missing(platform)){ |
|
345 |
+ if(!platform %in% c("illumina", "affymetrix")) |
|
346 |
+ stop("only illumina and affymetrix platforms are supported.") |
|
347 |
+ if(platform=="illumina"){ |
|
348 |
+ chipList = c("human1mv1c", # 1M |
|
349 |
+ "human370v1c", # 370CNV |
|
350 |
+ "human650v3a", # 650Y |
|
351 |
+ "human610quadv1b", # 610 quad |
|
352 |
+ "human660quadv1a", # 660 quad |
|
353 |
+ "human370quadv3c", # 370CNV quad |
|
354 |
+ "human550v3b", # 550K |
|
355 |
+ "human1mduov3b") # 1M Duo |
|
356 |
+ } |
|
357 |
+ if(platform=="affymetrix"){ |
|
358 |
+ chipList=c("genomewidesnp6", "genomewidesnp5") |
|
359 |
+ } |
|
360 |
+ } else{ |
|
361 |
+ chipList <- list() |
|
362 |
+ chipList$affymetrix <- c("genomewidesnp6","genomewidesnp5") |
|
363 |
+ chipList$illumina <- c("human370v1c", |
|
364 |
+ "human370quadv3c", |
|
365 |
+ "human550v3b", |
|
366 |
+ "human650v3a", |
|
367 |
+ "human610quadv1b", |
|
368 |
+ "human660quadv1a", |
|
369 |
+ "human1mduov3b") |
|
370 |
+ } |
|
371 |
+ return(chipList) |
|
372 |
+} |
|
499 | 373 |
|
500 |
- |
|
501 |
-cnrma <- function(filenames, cdfName="genomewidesnp6", sns, seed=1, verbose=FALSE){ |
|
374 |
+isValidCdfName <- function(cdfName, platform){ |
|
375 |
+ chipList <- validCdfNames(platform) |
|
376 |
+ if(!(cdfName %in% chipList)){ |
|
377 |
+ warning("cdfName must be one of the following: ", |
|
378 |
+ chipList) |
|
379 |
+ } |
|
380 |
+ result <- cdfName %in% chipList |
|
381 |
+ return(result) |
|
382 |
+} |
|
383 |
+ |
|
384 |
+ |
|
385 |
+ |
|
386 |
+# steps: quantile normalize hapmap: create 1m_reference_cn.rda object |
|
387 |
+cnrma <- function(filenames, cdfName, sns, seed=1, verbose=FALSE){ |
|
388 |
+ if(missing(cdfName)) stop("must specify cdfName") |
|
502 | 389 |
pkgname <- getCrlmmAnnotationName(cdfName) |
503 | 390 |
require(pkgname, character.only=TRUE) || stop("Package ", pkgname, " not available") |
504 | 391 |
if (missing(sns)) sns <- basename(filenames) |
... | ... |
@@ -513,8 +400,14 @@ cnrma <- function(filenames, cdfName="genomewidesnp6", sns, seed=1, verbose=FALS |
513 | 400 |
message("Processing ", length(filenames), " files.") |
514 | 401 |
if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
515 | 402 |
} |
516 |
- loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname) |
|
403 |
+ if(cdfName=="genomewidesnp6"){ |
|
404 |
+ loader("1m_reference_cn.rda", .crlmmPkgEnv, pkgname) |
|
405 |
+ } |
|
406 |
+ if(cdfName=="genomewidesnp5"){ |
|
407 |
+ loader("5.0_reference_cn.rda", .crlmmPkgEnv, pkgname) |
|
408 |
+ } |
|
517 | 409 |
reference <- getVarInEnv("reference") |
410 |
+ ##if(!is.matrix(reference)) stop("target distribution for quantile normalization not available.") |
|
518 | 411 |
for(i in seq(along=filenames)){ |
519 | 412 |
y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
520 | 413 |
x <- log2(y[idx2]) |
... | ... |
@@ -528,6 +421,7 @@ cnrma <- function(filenames, cdfName="genomewidesnp6", sns, seed=1, verbose=FALS |
528 | 421 |
dimnames(NP) <- list(names(fid), sns) |
529 | 422 |
##dimnames(NP) <- list(map[, "man_fsetid"], sns) |
530 | 423 |
res3 <- list(NP=NP, SKW=SKW) |
424 |
+ cat("\n") |
|
531 | 425 |
return(res3) |
532 | 426 |
} |
533 | 427 |
|
... | ... |
@@ -664,11 +558,12 @@ computeCopynumber <- function(object, |
664 | 558 |
bias.adj=FALSE, |
665 | 559 |
batch, |
666 | 560 |
SNRmin=5, |
667 |
- cdfName, ...){ |
|
561 |
+ cdfName, |
|
562 |
+ platform=c("affymetrix", "illumina")[1], ...){ |
|
668 | 563 |
if(class(object) != "CrlmmSetList") stop("object must be of class ClrmmSetList") |
669 | 564 |
if(missing(cdfName)) |
670 | 565 |
cdfName <- annotation(object) |
671 |
- if(!isValidCdfName(cdfName, platform="affymetrix")) stop(cdfName, " not supported.") |
|
566 |
+ if(!isValidCdfName(cdfName, platform=platform)) stop(cdfName, " not supported.") |
|
672 | 567 |
if(ncol(object) < 10) |
673 | 568 |
stop("Must have at least 10 samples in each batch to estimate model parameters....preferably closer to 90 samples per batch") |
674 | 569 |
##require(oligoClasses) |
... | ... |
@@ -691,7 +586,6 @@ computeCopynumber <- function(object, |
691 | 586 |
##previous version of compute copy number |
692 | 587 |
envir <- new.env() |
693 | 588 |
message("Fitting model for copy number estimation...") |
694 |
- |
|
695 | 589 |
.computeCopynumber(chrom=CHR, |
696 | 590 |
A=A(ABset), |
697 | 591 |
B=B(ABset), |
... | ... |
@@ -705,7 +599,6 @@ computeCopynumber <- function(object, |
705 | 599 |
SNRmin=SNRmin, |
706 | 600 |
cdfName=cdfName, |
707 | 601 |
...) |
708 |
- |
|
709 | 602 |
if(bias.adj){ |
710 | 603 |
message("Running bias adjustment...") |
711 | 604 |
.computeCopynumber(chrom=CHR, |
... | ... |
@@ -991,6 +884,8 @@ list2locusSet <- function(envir, ABset, NPset, CHR, cdfName="genomewidesnp6"){ |
991 | 884 |
return(cnset) |
992 | 885 |
} |
993 | 886 |
|
887 |
+ |
|
888 |
+ |
|
994 | 889 |
thresholdCopyNumberSet <- function(object){ |
995 | 890 |
ca <- CA(object) |
996 | 891 |
cb <- CB(object) |
... | ... |
@@ -1795,195 +1690,6 @@ biasAdj <- function(plateIndex, envir, priorProb, PROP=0.75){ |
1795 | 1690 |
} |
1796 | 1691 |
|
1797 | 1692 |
|
1798 |
-posteriorNonpolymorphic <- function(plateIndex, envir, priorProb, cnStates=0:6){ |
|
1799 |
- p <- plateIndex |
|
1800 |
- CHR <- envir[["chrom"]] |
|
1801 |
- if(missing(priorProb)) priorProb <- rep(1/length(cnStates), length(cnStates)) ##uniform |
|
1802 |
- plate <- envir[["plate"]] |
|
1803 |
- uplate <- envir[["plate"]] |
|
1804 |
- NP <- envir[["NP"]][, plate==uplate[p]] |
|
1805 |
- nuT <- envir[["nuT"]][, p] |
|
1806 |
- phiT <- envir[["phiT"]][, p] |
|
1807 |
- sig2T <- envir[["sig2T"]][, p] |
|
1808 |
- ##Assuming background variance for np probes is the same on the log-scale |
|
1809 |
- emit <- array(NA, dim=c(nrow(NP), ncol(NP), length(cnStates)))##SNPs x sample x 'truth' |
|
1810 |
- lT <- log2(NP) |
|
1811 |
- sds <- sqrt(sig2T) |
|
1812 |
- counter <- 1##state counter |
|
1813 |
- for(CT in cnStates){ |
|
1814 |
- cat(".") |
|
1815 |
- if(CHR == 23) browser() |
|
1816 |
- means <- suppressWarnings(log2(nuT + CT*phiT)) |
|
1817 |
- emit[, , counter] <- dnorm(lT, mean=means, sd=sds) |
|
1818 |
- counter <- counter+1 |
|
1819 |
- } |
|
1820 |
- for(j in seq(along=cnStates)){ |
|
1821 |
- emit[, , j] <- priorProb[j]*emit[, , j] |
|
1822 |
- } |
|
1823 |
- homDel <- emit[, , 1] |
|
1824 |
- hemDel <- emit[, , 2] |
|
1825 |
- norm <- emit[, , 3] |
|
1826 |
- amp <- emit[, , 4] |
|
1827 |
- amp4 <- emit[, , 5] |
|
1828 |
- amp5 <- emit[, , 6] |
|
1829 |
- amp6 <- emit[, , 7] |
|
1830 |
- total <- homDel+hemDel+norm+amp+amp4+amp5+amp6 |
|
1831 |
- weights <- array(NA, dim=c(nrow(NP), ncol(NP), length(cnStates))) |
|
1832 |
- weights[, , 1] <- homDel/total |
|
1833 |
- weights[, , 2] <- hemDel/total |
|
1834 |
- weights[, , 3] <- norm/total |
|
1835 |
- weights[, , 4] <- amp/total |
|
1836 |
- weights[, , 5] <- amp4/total |
|
1837 |
- weights[, , 6] <- amp5/total |
|
1838 |
- weights[, , 7] <- amp6/total |
|
1839 |
- ##posterior mode |
|
1840 |
- posteriorMode <- apply(weights, c(1, 2), function(x) order(x, decreasing=TRUE)[1]) |
|
1841 |
- posteriorMode <- posteriorMode-1 |
|
1842 |
- ##sns <- envir[["sns"]] |
|
1843 |
- ##colnames(posteriorMode) <- sns |
|
1844 |
- ##envir[["np.posteriorMode"]] <- posteriorMode |
|
1845 |
- ##envir[["np.weights"]] <- weights |
|
1846 |
- posteriorMeans <- 0*homDel/total + 1*hemDel/total + 2*norm/total + 3*amp/total + 4*amp4/total + 5*amp5/total + 6*amp6/total |
|
1847 |
- ##colnames(posteriorMeans) <- sns |
|
1848 |
- ##envir[["np.posteriorMeans"]] <- posteriorMeans |
|
1849 |
- return(posteriorMode) |
|
1850 |
-} |
|
1851 |
- |
|
1852 |
-posteriorWrapper <- function(envir){ |
|
1853 |
- snp.PM <- matrix(NA, length(envir[["snps"]]), length(envir[["sns"]])) |
|
1854 |
- np.PM <- matrix(NA, length(envir[["cnvs"]]), length(envir[["sns"]])) |
|
1855 |
- plate <- envir[["plate"]] |
|
1856 |
- uplate <- envir[["uplate"]] |
|
1857 |
- for(p in seq(along=uplate)){ |
|
1858 |
- tmp <- expectedC(plateIndex=p, envir=envir) |
|
1859 |
- snp.PM[, plate==uplate[p]] <- tmp |
|
1860 |
- ##snp.pm <- env[["posteriorMode"]] |
|
1861 |
- ##trace(posteriorNonpolymorphic, browser) |
|
1862 |
- tmp <- posteriorNonpolymorphic(plateIndex=p, envir=envir) |
|
1863 |
- np.PM[, plate==uplate[p]] <- tmp##env[["np.posteriorMode"]] |
|
1864 |
- ##pMode <- rbind(snp.pm, np.pm) |
|
1865 |
- ##rownames(pMode) <- c(env[["snps"]], env[["cnvs"]]) |
|
1866 |
- ##dn <- dimnames(pMode) |
|
1867 |
- ##pMode <- matrix(as.integer(pMode), nrow(pMode), ncol(pMode)) |
|
1868 |
- } |
|
1869 |
- PM <- rbind(snp.PM, np.PM) |
|
1870 |
- PM <- matrix(as.integer(PM), nrow(PM), ncol(PM)) |
|
1871 |
- dns <- list(c(envir[["snps"]], envir[["cnvs"]]), envir[["sns"]]) |
|
1872 |
- dimnames(PM) <- dns |
|
1873 |
- return(PM) |
|
1874 |
-} |
|
1875 |
- |
|
1876 |
- |
|
1877 |
- |
|
1878 |
- |
|
1879 |
- |
|
1880 |
-##for polymorphic probes |
|
1881 |
-expectedC <- function(plateIndex, envir, priorProb, cnStates=0:6){ |
|
1882 |
- p <- plateIndex |
|
1883 |
- CHR <- envir[["chrom"]] |
|
1884 |
- if(missing(priorProb)) priorProb <- rep(1/length(cnStates), length(cnStates)) ##uniform |
|
1885 |
- plate <- envir[["plate"]] |
|
1886 |
- uplate <- envir[["uplate"]] |
|
1887 |
- A <- envir[["A"]] |
|
1888 |
- B <- envir[["B"]] |
|
1889 |
- A <- A[, plate==uplate[p]] |
|
1890 |
- B <- B[, plate==uplate[p]] |
|
1891 |
- calls <- envir[["calls"]] |
|
1892 |
- calls <- calls[, plate==unique(plate)[p]] |
|
1893 |
- probA <- sqrt(rowMeans(calls == 1, na.rm=TRUE)) |
|
1894 |
- probB <- sqrt(rowMeans(calls == 3, na.rm=TRUE)) |
|
1895 |
- sig2A <- envir[["sig2A"]] |
|
1896 |
- sig2B <- envir[["sig2B"]] |
|
1897 |
- tau2A <- envir[["tau2A"]] |
|
1898 |
- tau2B <- envir[["tau2B"]] |
|
1899 |
- corrA.BB <- envir[["corrA.BB"]] |
|
1900 |
- corrB.AA <- envir[["corrB.AA"]] |
|
1901 |
- corr <- envir[["corr"]] |
|
1902 |
- nuA <- envir[["nuA"]] |
|
1903 |
- nuB <- envir[["nuB"]] |
|
1904 |
- phiA <- envir[["phiA"]] |
|
1905 |
- phiB <- envir[["phiB"]] |
|
1906 |
- emit <- array(NA, dim=c(nrow(A), ncol(A), 28))##SNPs x sample x 'truth' |
|
1907 |
- ##AAAA, AAAB, AABB, ABBB, BBBB |
|
1908 |
- ##AAAAA, AAAAB, AAABB, AABBB, ABBBB, BBBBB |
|
1909 |
- ##AAAAAA, AAAAAB, AAAABB, AAABBB, AABBBB, ABBBBB, BBBBBB |
|
1910 |
- lA <- log2(A) |
|
1911 |
- lB <- log2(B) |
|
1912 |
- X <- cbind(lA, lB) |
|
1913 |
- counter <- 1##state counter |
|
1914 |
- for(CT in cnStates){ |
|
1915 |
- cat(".") |
|
1916 |
- for(CA in 0:CT){ |
|
1917 |
- CB <- CT-CA |
|
1918 |
- A.scale <- sqrt(tau2A[, p]*(CA==0) + sig2A[, p]*(CA > 0)) |
|
1919 |
- B.scale <- sqrt(tau2B[, p]*(CB==0) + sig2B[, p]*(CB > 0)) |
|
1920 |
- scale <- c(A.scale, B.scale) |
|
1921 |
- if(CA == 0 & CB == 0) rho <- 0 |
|
1922 |
- if(CA == 0 & CB > 0) rho <- corrA.BB[, p] |
|
1923 |
- if(CA > 0 & CB == 0) rho <- corrB.AA[, p] |
|
1924 |
- if(CA > 0 & CB > 0) rho <- corr[, p] |
|
1925 |
- if(CHR == 23) browser() |
|
1926 |
- means <- cbind(suppressWarnings(log2(nuA[, p]+CA*phiA[, p])), suppressWarnings(log2(nuB[, p]+CB*phiB[, p]))) |
|
1927 |
- covs <- rho*A.scale*B.scale |
|
1928 |
- A.scale2 <- A.scale^2 |
|
1929 |
- B.scale2 <- B.scale^2 |
|
1930 |
- ##ensure positive definite |
|
1931 |
- ##Sigma <- as.matrix(nearPD(matrix(c(A.scale^2, covs, |
|
1932 |
- ##covs, B.scale^2), 2, 2))[[1]]) |
|
1933 |
- m <- 1##snp counter |
|
1934 |
- for(i in 1:nrow(A)){ |
|
1935 |
- Sigma <- matrix(c(A.scale2[i], covs[i], covs[i], B.scale2[i]), 2,2) |
|
1936 |
- xx <- matrix(X[i, ], ncol=2) |
|
1937 |
- tmp <- dmvnorm(xx, mean=means[i, ], sigma=Sigma) |
|
1938 |
- ##Using HWE: P(CA=ca, CB=cb|CT=c) |
|
1939 |
- ptmp <- (probA[i]^CA)*(probB[i]^CB)*tmp |
|
1940 |
- emit[m, , counter] <- ptmp |
|
1941 |
- m <- m+1 |
|
1942 |
- } |
|
1943 |
- counter <- counter+1 |
|
1944 |
- } |
|
1945 |
- } |
|
1946 |
- ##priorProb=P(CT=c) |
|
1947 |
- homDel <- priorProb[1]*emit[, , 1] |
|
1948 |
- hemDel <- priorProb[2]*emit[, , c(2, 3)] # + priorProb[3]*emit[, c(4, 5, 6)] + priorProb[4]*emit[, c(7:10)] |
|
1949 |
- norm <- priorProb[3]*emit[, , 4:6] |
|
1950 |
- amp <- priorProb[4]*emit[, , 7:10] |
|
1951 |
- amp4 <- priorProb[5]*emit[, , 11:15] |
|
1952 |
- amp5 <- priorProb[6]*emit[, , 16:21] |
|
1953 |
- amp6 <- priorProb[7]*emit[, , 22:28] |
|
1954 |
- ##sum over the different combinations within each copy number state |
|
1955 |
- hemDel <- apply(hemDel, c(1,2), sum) |
|
1956 |
- norm <- apply(norm, c(1, 2), sum) |
|
1957 |
- amp <- apply(amp, c(1,2), sum) |
|
1958 |
- amp4 <- apply(amp4, c(1,2), sum) |
|
1959 |
- amp5 <- apply(amp5, c(1,2), sum) |
|
1960 |
- amp6 <- apply(amp6, c(1,2), sum) |
|
1961 |
- total <- homDel+hemDel+norm+amp+amp4+amp5+amp6 |
|
1962 |
- weights <- array(NA, dim=c(nrow(homDel), ncol(A), 7)) |
|
1963 |
- weights[, , 1] <- homDel/total |
|
1964 |
- weights[, , 2] <- hemDel/total |
|
1965 |
- weights[, , 3] <- norm/total |
|
1966 |
- weights[, , 4] <- amp/total |
|
1967 |
- weights[, , 5] <- amp4/total |
|
1968 |
- weights[, , 6] <- amp5/total |
|
1969 |
- weights[, , 7] <- amp6/total |
|
1970 |
- ##posterior mode |
|
1971 |
- posteriorMode <- apply(weights, c(1, 2), function(x) order(x, decreasing=TRUE)[1]) |
|
1972 |
- posteriorMode <- posteriorMode-1 |
|
1973 |
- ##This is for one plate. Need to instantiate a much bigger |
|
1974 |
- ##object in the environment |
|
1975 |
- |
|
1976 |
- ##envir[["posteriorMode"]] <- posteriorMode |
|
1977 |
- ##weights <- list(homDel/total, hemDel/total, norm/total, amp/total, amp4/total, amp5/total, amp6/total) |
|
1978 |
- ##names(weights) <- c(cnStates) |
|
1979 |
- ##envir[["weights"]] <- weights |
|
1980 |
- posteriorMeans <- 0*homDel/total + 1*hemDel/total + 2*norm/total + 3*amp/total + 4*amp4/total + 5*amp5/total + 6*amp6/total |
|
1981 |
- ##sns <- envir[["sns"]] |
|
1982 |
- ##colnames(posteriorMeans) <- sns |
|
1983 |
- ##envir[["posteriorMeans"]] <- posteriorMeans |
|
1984 |
- return(posteriorMode) |
|
1985 |
-} |
|
1986 |
- |
|
1987 | 1693 |
biasAdjNP <- function(plateIndex, envir, priorProb){ |
1988 | 1694 |
p <- plateIndex |
1989 | 1695 |
normalNP <- envir[["normalNP"]] |
... | ... |
@@ -2079,7 +1785,6 @@ computeEmission <- function(object, |
2079 | 1785 |
##threshold small nu's and phis |
2080 | 1786 |
cnset <- thresholdModelParams(object[[3]], MIN=MIN) |
2081 | 1787 |
index <- order(chromosome(cnset), position(cnset)) |
2082 |
- |
|
2083 | 1788 |
if(any(diff(index) > 1)) stop("must be ordered by chromosome and physical position") |
2084 | 1789 |
emissionProbs <- array(NA, dim=c(nrow(cnset), ncol(cnset), length(copyNumberStates))) |
2085 | 1790 |
dimnames(emissionProbs) <- list(featureNames(object), |
... | ... |
@@ -2223,3 +1928,15 @@ thresholdModelParams <- function(object, MIN=2^3){ |
2223 | 1928 |
} |
2224 | 1929 |
emissionProbs |
2225 | 1930 |
} |
1931 |
+ |
|
1932 |
+setMethod("update", "character", function(object, ...){ |
|
1933 |
+ crlmmFile <- object |
|
1934 |
+ for(i in seq(along=crlmmFile)){ |
|
1935 |
+ cat("Processing ", clrmmFile[i], "...\n") |
|
1936 |
+ load(crlmmFile[i]) |
|
1937 |
+ crlmmSetList <- get("crlmmSetList") |
|
1938 |
+ crlmmSetList <- update(crlmmSetList, ...) |
|
1939 |
+ save(crlmmSetList, file=crlmmFile[i]) |
|
1940 |
+ rm(crlmmSetList); gc(); |
|
1941 |
+ } |
|
1942 |
+}) |
... | ... |
@@ -12,6 +12,14 @@ readIdatFiles <- function(sampleSheet=NULL, |
12 | 12 |
sep="_", |
13 | 13 |
fileExt=list(green="Grn.idat", red="Red.idat"), |
14 | 14 |
saveDate=FALSE) { |
15 |
+ if(is.null(arrayNames)) { |
|
16 |
+ arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) |
|
17 |
+ if(!is.null(sampleSheet)) { |
|
18 |
+ sampleSheet=NULL |
|
19 |
+ cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n") |
|
20 |
+ } |
|
21 |
+ pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) |
|
22 |
+ } |
|
15 | 23 |
if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet |
16 | 24 |
if(!is.null(arrayNames)){ |
17 | 25 |
##arrayNames=NULL |
... | ... |
@@ -35,14 +43,6 @@ readIdatFiles <- function(sampleSheet=NULL, |
35 | 43 |
pd = new("AnnotatedDataFrame", data = sampleSheet) |
36 | 44 |
sampleNames(pd) <- arrayNames |
37 | 45 |
} |
38 |
- if(is.null(arrayNames)) { |
|
39 |
- arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) |
|
40 |
- if(!is.null(sampleSheet)) { |
|
41 |
- sampleSheet=NULL |
|
42 |
- cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n") |
|
43 |
- } |
|
44 |
- pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) |
|
45 |
- } |
|
46 | 46 |
narrays = length(arrayNames) |
47 | 47 |
grnfiles = paste(arrayNames, fileExt$green, sep=sep) |
48 | 48 |
redfiles = paste(arrayNames, fileExt$red, sep=sep) |
... | ... |
@@ -50,9 +50,10 @@ readIdatFiles <- function(sampleSheet=NULL, |
50 | 50 |
stop("Cannot find .idat files") |
51 | 51 |
if(length(grnfiles)!=length(redfiles)) |
52 | 52 |
stop("Cannot find matching .idat files") |
53 |
- if(!all(c(redfiles,grnfiles) %in% dir(path=path))) |
|
53 |
+ if(!all(c(redfiles,grnfiles) %in% dir(path=path))){ |
|
54 | 54 |
stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n", |
55 | 55 |
paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" ")) |
56 |
+ } |
|
56 | 57 |
grnidats = file.path(path, grnfiles) |
57 | 58 |
redidats = file.path(path, redfiles) |
58 | 59 |
headerInfo = list(nProbes = rep(NA, narrays), |
... | ... |
@@ -5,7 +5,14 @@ setValidity("ABset", function(object) { |
5 | 5 |
}) |
6 | 6 |
setMethod("A", "ABset", function(object) assayData(object)[["A"]]) |
7 | 7 |
setMethod("B", "ABset", function(object) assayData(object)[["B"]]) |
8 |
- |
|
8 |
+setReplaceMethod("A", signature(object="ABset", value="matrix"), |
|
9 |
+ function(object, value){ |
|
10 |
+ assayDataElementReplace(object, "A", value) |
|
11 |
+ }) |
|
12 |
+setReplaceMethod("B", signature(object="ABset", value="matrix"), |
|
13 |
+ function(object, value){ |
|
14 |
+ assayDataElementReplace(object, "B", value) |
|
15 |
+ }) |
|
9 | 16 |
|
10 | 17 |
|
11 | 18 |
|
... | ... |
@@ -6,12 +6,14 @@ setValidity("CopyNumberSet", function(object) { |
6 | 6 |
##may want to allow thresholding here (... arg) |
7 | 7 |
setMethod("CA", "CopyNumberSet", function(object, ...) assayData(object)[["CA"]]/100) |
8 | 8 |
setMethod("CB", "CopyNumberSet", function(object, ...) assayData(object)[["CB"]]/100) |
9 |
-setReplaceMethod("CB", signature(object="CopyNumberSet", value="matrix"), |
|
10 |
- function(object, value) assayDataElementReplace(object, "CB", value)) |
|
9 |
+ |
|
11 | 10 |
setReplaceMethod("CA", signature(object="CopyNumberSet", value="matrix"), |
12 |
- function(object, value) assayDataElementReplace(object, "CA", value)) |
|
11 |
+ function(object, value) assayDataElementReplace(object, "CA", value*100)) |
|
12 |
+setReplaceMethod("CB", signature(object="CopyNumberSet", value="matrix"), |
|
13 |
+ function(object, value) assayDataElementReplace(object, "CB", value*100)) |
|
14 |
+ |
|
15 |
+ |
|
13 | 16 |
|
14 |
-setMethod("chromosome", "CopyNumberSet", function(object) fData(object)$chromosome) |
|
15 | 17 |
|
16 | 18 |
setMethod("batch", "CopyNumberSet", function(object){ |
17 | 19 |
if("batch" %in% varLabels(object)){ |
... | ... |
@@ -23,19 +25,23 @@ setMethod("batch", "CopyNumberSet", function(object){ |
23 | 25 |
}) |
24 | 26 |
|
25 | 27 |
setMethod("copyNumber", "CopyNumberSet", function(object){ |
28 |
+ require(paste(annotation(object), "Crlmm", sep=""), character.only=TRUE) || stop(paste("Annotation package ", annotation(object), "Crlmm not available", sep="")) |
|
26 | 29 |
##ensure that 2 + NA = 2 by replacing NA's with zero |
30 |
+ ##the above results in copy number 0, 1, or 2 depending on the genotype....safer just to drop |
|
27 | 31 |
CA <- CA(object) |
28 | 32 |
CB <- CB(object) |
29 |
- nas <- is.na(CA) & is.na(CB) |
|
30 |
- CA[is.na(CA)] <- 0 |
|
31 |
- CB[is.na(CB)] <- 0 |
|
33 |
+ ##nas <- is.na(CA) & is.na(CB) |
|
34 |
+ ##CA[is.na(CA)] <- 0 |
|
35 |
+ ##CB[is.na(CB)] <- 0 |
|
32 | 36 |
CN <- CA + CB |
37 |
+ ##For nonpolymorphic probes, CA is the total copy number |
|
38 |
+ CN[cnIndex(object, annotation(object)), ] <- CA(object)[cnIndex(object, annotation(object)), ] |
|
33 | 39 |
##if both CA and CB are NA, report NA |
34 |
- CN[nas] <- NA |
|
40 |
+ ##CN[nas] <- NA |
|
35 | 41 |
CN |
36 | 42 |
}) |
37 | 43 |
|
38 |
-setMethod("position", "CopyNumberSet", function(object) fData(object)$position) |
|
44 |
+ |
|
39 | 45 |
|
40 | 46 |
##setMethod("ellipse", "CopyNumberSet", function(x, copynumber, ...){ |
41 | 47 |
ellipse.CopyNumberSet <- function(x, copynumber, ...){ |
... | ... |
@@ -43,7 +49,15 @@ ellipse.CopyNumberSet <- function(x, copynumber, ...){ |
43 | 49 |
##fittedOrder <- unique(sapply(basename(celFiles), function(x) strsplit(x, "_")[[1]][2])) |
44 | 50 |
##index <- match(plates, fittedOrder) |
45 | 51 |
if(nrow(x) > 1) stop("only 1 snp at a time") |
46 |
- batch <- unique(x$batch) |
|
52 |
+ ##batch <- unique(x$batch) |
|
53 |
+ args <- list(...) |
|
54 |
+ if(!"batch" %in% names(args)){ |
|
55 |
+ jj <- match("batch", varLabels(x)) |
|
56 |
+ if(length(jj) < 1) stop("batch not in varLabels") |
|
57 |
+ batch <- unique(pData(x)[, jj]) |
|
58 |
+ } else{ |
|
59 |
+ batch <- unique(args$batch) |
|
60 |
+ } |
|
47 | 61 |
if(length(batch) > 1) stop("batch variable not unique") |
48 | 62 |
nuA <- as.numeric(fData(x)[, match(paste("nuA", batch, sep="_"), fvarLabels(x))]) |
49 | 63 |
nuB <- as.numeric(fData(x)[, match(paste("nuB", batch, sep="_"), fvarLabels(x))]) |
... | ... |
@@ -65,6 +79,7 @@ ellipse.CopyNumberSet <- function(x, copynumber, ...){ |
65 | 79 |
if(CA == 0 & CB > 0) rho <- corrA.BB |
66 | 80 |
if(CA > 0 & CB == 0) rho <- corrB.AA |
67 | 81 |
if(CA > 0 & CB > 0) rho <- corr |
82 |
+ if(CA == 0 & CB == 0) rho <- 0 |
|
68 | 83 |
lines(ellipse(x=rho, centre=c(log2(nuA+CA*phiA), |
69 | 84 |
log2(nuB+CB*phiB)), |
70 | 85 |
scale=scale), ...) |
... | ... |
@@ -48,13 +48,78 @@ setMethod(".harmonizeDimnames", "CrlmmSetList", function(object){ |
48 | 48 |
}) |
49 | 49 |
|
50 | 50 |
setMethod("A", "CrlmmSetList", function(object) A(object[[1]])) |
51 |
+ |
|
52 |
+setMethod("addFeatureAnnotation", "CrlmmSetList", function(object, CHR){ |
|
53 |
+ if(missing(CHR)) stop("Must specificy chromosome") |
|
54 |
+ cdfName <- annotation(object) |
|
55 |
+ pkgname <- paste(cdfName, "Crlmm", sep="") |
|
56 |
+ path <- system.file("extdata", package=pkgname) |
|
57 |
+ loader("cnProbes.rda", pkgname=pkgname, envir=.crlmmPkgEnv) |
|
58 |
+ cnProbes <- get("cnProbes", envir=.crlmmPkgEnv) |
|
59 |
+ loader("snpProbes.rda", pkgname=pkgname, envir=.crlmmPkgEnv) |
|
60 |
+ snpProbes <- get("snpProbes", envir=.crlmmPkgEnv) |
|
61 |
+ |
|
62 |
+ ##Feature Data |
|
63 |
+ snps <- featureNames(object)[snpIndex(object)] |
|
64 |
+ nps <- featureNames(object)[cnIndex(object)] |
|
65 |
+ position.snp <- snpProbes[match(snps, rownames(snpProbes)), "position"] |
|
66 |
+ names(position.snp) <- snps |
|
67 |
+ position.np <- cnProbes[match(nps, rownames(cnProbes)), "position"] |
|
68 |
+ names(position.np) <- nps |
|
69 |
+ |
|
70 |
+ position <- c(position.snp, position.np) |
|
71 |
+ position <- position[match(featureNames(object), names(position))] |
|
72 |
+ stopifnot(identical(names(position), featureNames(object))) |
|
73 |
+ if(sum(duplicated(names(position))) > 0){ |
|
74 |
+ warning("Removing rows with NA identifiers...") |
|
75 |
+ ##RS: fix this |
|
76 |
+ I <- which(!is.na(names(position))) |
|
77 |
+ } else I <- seq(along=names(position)) |
|
78 |
+ fd <- data.frame(cbind(CHR, |
|
79 |
+ position[I])) |
|
80 |
+ colnames(fd) <- c("chromosome", "position") |
|
81 |
+ rownames(fd) <- featureNames(object) |
|
82 |
+ fD <- new("AnnotatedDataFrame", |
|
83 |
+ data=fd, |
|
84 |
+ varMetadata=data.frame(labelDescription=colnames(fd))) |
|
85 |
+ return(fD) |
|
86 |
+}) |
|
87 |
+ |
|
51 | 88 |
setMethod("annotation", "CrlmmSetList", function(object) annotation(object[[1]])) |
52 | 89 |
setMethod("B", "CrlmmSetList", function(object) B(object[[1]])) |
53 | 90 |
setMethod("batch", "CrlmmSetList", function(object) batch(object[[3]])) |
54 | 91 |
setMethod("CA", "CrlmmSetList", function(object, ...) CA(object[[3]], ...)) |
55 | 92 |
setMethod("CB", "CrlmmSetList", function(object, ...) CB(object[[3]], ...)) |
56 | 93 |
setMethod("calls", "CrlmmSetList", function(object) calls(object[[2]])) |
57 |
-setMethod("chromosome", "CrlmmSetList", function(object) chromosome(object[[3]])) |
|
94 |
+setMethod("chromosome", "CrlmmSetList", function(object){ |
|
95 |
+ chr <- NULL |
|
96 |
+ for(i in 1:length(object)){ |
|
97 |
+ if(length(fvarLabels(object[[i]])) > 0){ |
|
98 |
+ if("chromosome" %in% fvarLabels(object[[i]])){ |
|
99 |
+ chr <- chromosome(object[[i]]) |
|
100 |
+ break() |
|
101 |
+ } |
|
102 |
+ } |
|
103 |
+ } |
|
104 |
+ if(is.null(chr)) warning("fvarLabel 'chromosome' not in any element of the CrlmmSetList object") |
|
105 |
+ return(chr) |
|
106 |
+ ##chromosome(object[[3]]) |
|
107 |
+ }) |
|
108 |
+ |
|
109 |
+setMethod("position", "CrlmmSetList", function(object){ |
|
110 |
+ pos <- NULL |
|
111 |
+ for(i in 1:length(object)){ |
|
112 |
+ if(length(fvarLabels(object[[i]])) > 0){ |
|
113 |
+ if("position" %in% fvarLabels(object[[i]])){ |
|
114 |
+ pos <- position(object[[i]]) |
|
115 |
+ break() |
|
116 |
+ } |
|
117 |
+ } else next() |
|
118 |
+ } |
|
119 |
+ if(is.null(pos)) warning("fvarLabel 'position' not in any element of the CrlmmSetList object") |
|
120 |
+ return(pos) |
|
121 |
+ }) |
|
122 |
+ |
|
58 | 123 |
setMethod("cnIndex", "CrlmmSetList", function(object, ...) { |
59 | 124 |
match(cnNames(object[[1]], annotation(object)), featureNames(object)) |
60 | 125 |
}) |
... | ... |
@@ -94,7 +159,6 @@ setMethod("points", signature(x="CrlmmSetList"), |
94 | 159 |
B <- log2(B(x)) |
95 | 160 |
points(A, B, ...) |
96 | 161 |
}) |
97 |
-setMethod("position", "CrlmmSetList", function(object) position(object[[3]])) |
|
98 | 162 |
setMethod("sampleNames", "CrlmmSetList", function(object) sampleNames(object[[1]])) |
99 | 163 |
setMethod("show", "CrlmmSetList", function(object){ |
100 | 164 |
cat("\n Elements in CrlmmSetList object: \n") |
... | ... |
@@ -130,6 +194,29 @@ setMethod("update", "CrlmmSetList", function(object, ...){ |
130 | 194 |
computeCopynumber(object, ...) |
131 | 195 |
}) |
132 | 196 |
|
197 |
+setReplaceMethod("CA", signature(object="CrlmmSetList", value="matrix"), |
|
198 |
+ function(object, value){ |
|
199 |
+ CA(object[[3]]) <- value |
|
200 |
+ object |
|
201 |
+ }) |
|
202 |
+setReplaceMethod("CB", signature(object="CrlmmSetList", value="matrix"), |
|
203 |
+ function(object, value){ |
|
204 |
+ CB(object[[3]]) <- value |
|
205 |
+ object |
|
206 |
+ }) |
|
207 |
+ |
|
208 |
+setReplaceMethod("A", signature(object="CrlmmSetList", value="matrix"), |
|
209 |
+ function(object, value){ |
|
210 |
+ A(object[[1]]) <- value |
|
211 |
+ object |
|
212 |
+ }) |
|
213 |
+setReplaceMethod("B", signature(object="CrlmmSetList", value="matrix"), |
|
214 |
+ function(object, value){ |
|
215 |
+ B(object[[1]]) <- value |
|
216 |
+ object |
|
217 |
+ }) |
|
218 |
+ |
|
219 |
+ |
|
133 | 220 |
|
134 | 221 |
setMethod("boxplot", "CrlmmSetList", function(x, ...){ |
135 | 222 |
##boxplot.CrlmmSetList <- function(x, ...){ |
... | ... |
@@ -138,7 +225,7 @@ setMethod("boxplot", "CrlmmSetList", function(x, ...){ |
138 | 225 |
A1 <- A(x) |
139 | 226 |
B1 <- B(x) |
140 | 227 |
Alist <- split(A1, genotypes) |
141 |
- Alist <- rev(Alist) |
|
228 |
+ Alist <- as(rev(Alist), "data.frame") |
|
142 | 229 |
Blist <- split(B1, genotypes) |
143 | 230 |
ylim <- range(unlist(Alist)) |
144 | 231 |
boxplot(Alist, xaxt="n", ylab=expression(I[A]), |
... | ... |
@@ -15,6 +15,8 @@ setMethod("snpNames", "eSet", function(object, cdfName){ |
15 | 15 |
snps <- snps[snps %in% featureNames(object)] |
16 | 16 |
featureNames(object)[match(snps, featureNames(object))] |
17 | 17 |
}) |
18 |
+setMethod("chromosome", "eSet", function(object) fData(object)$chromosome) |
|
19 |
+setMethod("position", "eSet", function(object) fData(object)$position) |
|
18 | 20 |
##setMethod("combine", signature=signature(x="eSet", y="eSet"), |
19 | 21 |
## function(x, y, ...){ |
20 | 22 |
## ##Check that both x and y are valid objects |
... | ... |
@@ -1,90 +1,85 @@ |
1 | 1 |
snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns){ |
2 |
- if (missing(sns)) sns <- basename(filenames) |
|
3 |
- ##ADD CHECK TO SEE IF LOADED |
|
4 |
- if (missing(cdfName)) |
|
5 |
- cdfName <- read.celfile.header(filenames[1])$cdfName |
|
6 |
-## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
7 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
8 |
- if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
9 |
- suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
10 |
- msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
11 |
- message(strwrap(msg)) |
|
12 |
- stop("Package ", pkgname, " could not be found.") |
|
13 |
- rm(suggCall, msg) |
|
14 |
- } |
|
15 |
- |
|
16 |
- if(verbose) message("Loading annotations and mixture model parameters.") |
|
17 |
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
18 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
19 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
20 |
-## data(preprocStuff, genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
21 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
22 |
- pnsa <- getVarInEnv("pnsa") |
|
23 |
- pnsb <- getVarInEnv("pnsb") |
|
24 |
- fid <- getVarInEnv("fid") |
|
25 |
- reference <- getVarInEnv("reference") |
|
26 |
- aIndex <- getVarInEnv("aIndex") |
|
27 |
- bIndex <- getVarInEnv("bIndex") |
|
28 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
29 |
- theKnots <- getVarInEnv("theKnots") |
|
30 |
- gns <- getVarInEnv("gns") |
|
2 |
+ if (missing(sns)) sns <- basename(filenames) |
|
3 |
+ ##ADD CHECK TO SEE IF LOADED |
|
4 |
+ if (missing(cdfName)) |
|
5 |
+ cdfName <- read.celfile.header(filenames[1])$cdfName |
|
6 |
+ ## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
7 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
8 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
9 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
10 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
11 |
+ message(strwrap(msg)) |
|
12 |
+ stop("Package ", pkgname, " could not be found.") |
|
13 |
+ rm(suggCall, msg) |
|
14 |
+ } |
|
15 |
+ if(verbose) message("Loading annotations and mixture model parameters.") |
|
16 |
+ loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
17 |
+ loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
18 |
+ loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
19 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
20 |
+ pnsa <- getVarInEnv("pnsa") |
|
21 |
+ pnsb <- getVarInEnv("pnsb") |
|
22 |
+ fid <- getVarInEnv("fid") |
|
23 |
+ reference <- getVarInEnv("reference") |
|
24 |
+ aIndex <- getVarInEnv("aIndex") |
|
25 |
+ bIndex <- getVarInEnv("bIndex") |
|
26 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
27 |
+ theKnots <- getVarInEnv("theKnots") |
|
28 |
+ gns <- getVarInEnv("gns") |
|
31 | 29 |
|
32 |
- ##We will read each cel file, summarize, and run EM one by one |
|
33 |
- ##We will save parameters of EM to use later |
|
34 |
- mixtureParams <- matrix(0, 4, length(filenames)) |
|
35 |
- SNR <- vector("numeric", length(filenames)) |
|
36 |
- SKW <- vector("numeric", length(filenames)) |
|
37 |
- |
|
38 |
- ## This is the sample for the fitting of splines |
|
39 |
- ## BC: I like better the idea of the user passing the seed, |
|
40 |
- ## because this might intefere with other analyses |
|
41 |
- ## (like what happened to GCRMA) |
|
42 |
- set.seed(seed) |
|
43 |
- |
|
44 |
- idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
30 |
+ ##We will read each cel file, summarize, and run EM one by one |
|
31 |
+ ##We will save parameters of EM to use later |
|
32 |
+ mixtureParams <- matrix(0, 4, length(filenames)) |
|
33 |
+ SNR <- vector("numeric", length(filenames)) |
|
34 |
+ SKW <- vector("numeric", length(filenames)) |
|
45 | 35 |
|
46 |
- ##S will hold (A+B)/2 and M will hold A-B |
|
47 |
- ##NOTE: We actually dont need to save S. Only for pics etc... |
|
48 |
- ##f is the correction. we save to avoid recomputing |
|
49 |
- A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
50 |
- B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
36 |
+ ## This is the sample for the fitting of splines |
|
37 |
+ ## BC: I like better the idea of the user passing the seed, |
|
38 |
+ ## because this might intefere with other analyses |
|
39 |
+ ## (like what happened to GCRMA) |
|
40 |
+ set.seed(seed) |
|
41 |
+ idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
42 |
+ ##S will hold (A+B)/2 and M will hold A-B |
|
43 |
+ ##NOTE: We actually dont need to save S. Only for pics etc... |
|
44 |
+ ##f is the correction. we save to avoid recomputing |
|
45 |
+ A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
46 |
+ B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
51 | 47 |
|
52 |
- if(verbose){ |
|
53 |
- message("Processing ", length(filenames), " files.") |
|
54 |
- if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
|
55 |
- } |
|
56 |
- ##We start looping throug cel files |
|
57 |
- idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
|
58 |
- for(i in seq(along=filenames)){ |
|
59 |
- y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
60 |
- x <- log2(y[idx2]) |
|
61 |
- SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
62 |
- rm(x) |
|
63 |
- y <- normalize.quantiles.use.target(y, target=reference) |
|
64 |
- A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
65 |
- B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
66 |
- |
|
67 |
- ##Now to fit the EM |
|
68 |
- if(fitMixture){ |
|
69 |
- S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
|
70 |
- M <- log2(A[idx, i])-log2(B[idx, i]) |
|
48 |
+ if(verbose){ |
|
49 |
+ message("Processing ", length(filenames), " files.") |
|
50 |
+ if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
|
51 |
+ } |
|
52 |
+ ##We start looping throug cel files |
|
53 |
+ idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
|
54 |
+ for(i in seq(along=filenames)){ |
|
55 |
+ y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
56 |
+ x <- log2(y[idx2]) |
|
57 |
+ SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
58 |
+ rm(x) |
|
59 |
+ y <- normalize.quantiles.use.target(y, target=reference) |
|
60 |
+ A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
61 |
+ B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
62 |
+ ##Now to fit the EM |
|
63 |
+ if(fitMixture){ |
|
64 |
+ S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
|
65 |
+ M <- log2(A[idx, i])-log2(B[idx, i]) |
|
71 | 66 |
|
72 |
- ##we need to test the choice of eps.. it is not the max diff between funcs |
|
73 |
- tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
74 |
- |
|
75 |
- mixtureParams[, i] <- tmp[["coef"]] |
|
76 |
- SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
77 |
- } |
|
78 |
- if (verbose) |
|
79 |
- if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
80 |
- else cat(".") |
|
81 |
- } |
|
82 |
- if (verbose) |
|
83 |
- if (getRversion() > '2.7.0') close(pb) |
|
84 |
- else cat("\n") |
|
85 |
- if (!fitMixture) SNR <- mixtureParams <- NA |
|
86 |
- ## gns comes from preprocStuff.rda |
|
87 |
- list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
67 |
+ ##we need to test the choice of eps.. it is not the max diff between funcs |
|
68 |
+ tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
69 |
+ |
|
70 |
+ mixtureParams[, i] <- tmp[["coef"]] |
|
71 |
+ SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
72 |
+ } |
|
73 |
+ if (verbose) |
|
74 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
75 |
+ else cat(".") |
|
76 |
+ } |
|
77 |
+ if (verbose) |
|
78 |
+ if (getRversion() > '2.7.0') close(pb) |
|
79 |
+ else cat("\n") |
|
80 |
+ if (!fitMixture) SNR <- mixtureParams <- NA |
|
81 |
+ ## gns comes from preprocStuff.rda |
|
82 |
+ list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
88 | 83 |
} |
89 | 84 |
|
90 | 85 |
fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=10, verbose=FALSE){ |
... | ... |
@@ -4,7 +4,7 @@ |
4 | 4 |
%\VignettePackage{crlmm} |
5 | 5 |
\documentclass{article} |
6 | 6 |
\usepackage{graphicx} |
7 |
-\usepackage[authoryear,round,numbers]{natbib} |
|
7 |
+\usepackage{natbib} |
|
8 | 8 |
\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
9 | 9 |
\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
10 | 10 |
\newcommand{\Rcode}[1]{{\texttt{#1}}} |
... | ... |
@@ -15,7 +15,7 @@ |
15 | 15 |
\newcommand{\R}{\textsf{R}} |
16 | 16 |
|
17 | 17 |
\begin{document} |
18 |
-\title{Copy number estimation} |
|
18 |
+\title{Copy number estimation and genotype calling with \Rpackage{crlmm}} |
|
19 | 19 |
\date{May, 2009} |
20 | 20 |
\author{Rob Scharpf} |
21 | 21 |
\maketitle |
... | ... |
@@ -40,12 +40,13 @@ options(prompt="R> ") |
40 | 40 |
|
41 | 41 |
\section{Simple usage} |
42 | 42 |
|
43 |
-The following packages are required: |
|
43 |
+CRLMM supports the following platforms: |
|
44 | 44 |
|
45 |
-<<requiredPackages>>= |
|
45 |
+<<cdfname>>= |
|
46 | 46 |
library(crlmm) |
47 |
-library(genomewidesnp6Crlmm) |
|
47 |
+crlmm:::validCdfNames() |
|
48 | 48 |
@ |
49 |
+ |
|
49 | 50 |
\paragraph{Preprocess and genotype.} |
50 | 51 |
|
51 | 52 |
Specify the coordinates of Affymetrix cel files and where to put |
... | ... |
@@ -55,17 +56,26 @@ number estimation. |
55 | 56 |
<<celfiles>>= |
56 | 57 |
myPath <- "/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m" |
57 | 58 |
celFiles <- list.celfiles(myPath, full.names=TRUE, pattern=".CEL") |
58 |
-outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/affy" |
|
59 |
+#outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/affy" |
|
60 |
+outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/1m/tmp" |
|
61 |
+dir.create(outdir) |
|
59 | 62 |
@ |
60 | 63 |
|
61 |
-\noindent Preprocess and genotype (for more info see the crlmm vignette): |
|
62 | 64 |
|
63 |
-<<preprocessAndGenotype>>= |
|
64 |
-crlmmWrapper(celFiles[1:15], |
|
65 |
+ |
|
66 |
+\noindent The next code chunk quantile normalizes the samples to a |
|
67 |
+target reference distribution and uses the crlmm algorithm to genotype. |
|
68 |
+See \cite{Carvalho2007a} for details regarding the crlmm genotyping |
|
69 |
+algorithm. |
|
70 |
+ |
|
71 |
+<<preprocessAndGenotype, eval=FALSE>>= |
|
72 |
+crlmmWrapper(celFiles,##[1:15], |
|
73 |
+ cdfName="genomewidesnp6", |
|
65 | 74 |
save.it=TRUE, |
66 |
- load.it=TRUE, |
|
75 |
+ load.it=FALSE, |
|
67 | 76 |
intensityFile=file.path(outdir, "normalizedIntensities.rda"), |
68 |
- crlmmFile=file.path(outdir, "snpsetObject.rda")) |
|
77 |
+ crlmmFile=file.path(outdir, "snpsetObject.rda"), |
|
78 |
+ platform="affymetrix") |
|
69 | 79 |
@ |
70 | 80 |
|
71 | 81 |
As a result of the above wrapper, the following R objects are now created: |
... | ... |
@@ -110,9 +120,10 @@ if(!exists("emission.cn")){ |
110 | 120 |
} |
111 | 121 |
@ |
112 | 122 |
|
113 |
-*Warning: more can be done than currently implemented to protect against |
|
114 |
-outliers. In addition, improved estimates of uncertainty for the copy |
|
115 |
-number prediction regions will also help.* |
|
123 |
+*Before smoothing the estimates as a function of physical position by |
|
124 |
+segmentation hidden Markov models, one should consider how to handle |
|
125 |
+outliers. In particular, samples with low signal to noise ratios are |
|
126 |
+likely to have many copy number outliers. See suggested visualizations. |
|
116 | 127 |
|
117 | 128 |
Initial state probabilities and transition probabilities for the HMM: |
118 | 129 |
|
... | ... |
@@ -265,8 +276,6 @@ as well as better containers for storing these parameters. See |
265 | 276 |
<<copynumberParameters>>= |
266 | 277 |
fvarLabels(crlmmSetList[[3]]) |
267 | 278 |
@ |
268 |
- |
|
269 |
- |
|
270 | 279 |
\section{Suggested visualizations} |
271 | 280 |
|
272 | 281 |
A histogram of the signal to noise ratio for the HapMap samples: |
... | ... |
@@ -285,11 +294,11 @@ hist(crlmmSetList[[2]]$SNR, xlab="SNR", main="") |
285 | 294 |
SNRmin <- 5 |
286 | 295 |
@ |
287 | 296 |
|
288 |
-We suggest excluding samples with a signal to noise ratio less than |
|
289 |
-\Sexpr{SNRmin}. As batch effects can be very large in the |
|
290 |
-quantile-normalized intensities, we suggest adjusting for date or |
|
291 |
-chemistry plate. Ideally, one would have 70+ files in a given |
|
292 |
-batch. Here we make a table of date versus ancestry: |
|
297 |
+For Affymetrix 6.0, we suggest excluding samples with a signal to noise |
|
298 |
+ratio less than \Sexpr{SNRmin}. Adjusting by date or chemistry plate |
|
299 |
+can be helpful for limiting the influence of batch effects. Ideally, |
|
300 |
+one would have 70+ files in a given batch. Here we make a table of date |
|
301 |
+versus ancestry: |
|
293 | 302 |
|
294 | 303 |
<<specifyBatch, eval=FALSE, echo=FALSE>>= |
295 | 304 |
sns <- sampleNames(crlmmResults) |
... | ... |
@@ -307,7 +316,7 @@ not the case, we illustrate how one may adjust for batch using the |
307 | 316 |
chemistry plate as an argument for \Robject{batch} in the |
308 | 317 |
\Robject{computeCopynumber} function. |
309 | 318 |
|
310 |
-**Note: the number of samples in the \Robject{CrlmmSetList} object after |
|
319 |
+*Note: the number of samples in the \Robject{CrlmmSetList} object after |
|
311 | 320 |
copy number estimation may be fewer than the number of samples in the |
312 | 321 |
\Robject{CrlmmSetList} object after preprocessing/genotyping. This |
313 | 322 |
occurs when 1 or more samples have a signal-to-noise ratio less than |
... | ... |
@@ -1,8 +1,32 @@ |
1 |
+%\VignetteIndexEntry{crlmm copy number Vignette for Illumina} |
|
2 |
+%\VignetteDepends{crlmm} |
|
3 |
+%\VignetteKeywords{crlmm, illumina} |
|
4 |
+%\VignettePackage{crlmm} |
|
5 |
+\documentclass{article} |
|
6 |
+\usepackage{graphicx} |
|
7 |
+\usepackage{natbib} |
|
8 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
9 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
10 |
+\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
11 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
12 |
+\newcommand{\Rpackage}[1]{{\textsf{#1}}} |
|
13 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
14 |
+\newcommand{\oligo}{\Rpackage{oligo }} |
|
15 |
+\newcommand{\R}{\textsf{R}} |
|
16 |
+ |
|
17 |
+\begin{document} |
|
18 |
+\title{Copy number estimation and genotype calling with \Rpackage{crlmm}} |
|
19 |
+\date{Oct, 2009} |
|
20 |
+\author{Rob Scharpf} |
|
21 |
+\maketitle |
|
22 |
+ |
|
23 |
+<<setup, echo=FALSE, results=hide>>= |
|
24 |
+options(width=60) |
|
25 |
+options(continue=" ") |
|
26 |
+options(prompt="R> ") |
|
27 |
+@ |
|
28 |
+ |
|
1 | 29 |
<<>>= |
2 |
-# start R from /home/bst/other/mritchie/R/R-300309/bin/R |
|
3 |
-# as this has current crlmm and region package installed |
|
4 |
-# if you want to ue your own R, the current region package is at |
|
5 |
-# /thumper/ctsa/snpmicroarray/illumina/crlmm/370k/human370v1cCrlmm/ |
|
6 | 30 |
library(Biobase) |
7 | 31 |
library(crlmm) |
8 | 32 |
setwd("/thumper/ctsa/snpmicroarray/illumina/IDATS/370k") |
... | ... |
@@ -11,71 +35,37 @@ setwd("/thumper/ctsa/snpmicroarray/illumina/IDATS/370k") |
11 | 35 |
<<readIdat>>= |
12 | 36 |
samplesheet5 = read.csv("HumanHap370Duo_Sample_Map.csv", header=TRUE, as.is=TRUE)[-c(28:46,61:75,78:79),] |
13 | 37 |
if(!exists("RG")){ |
14 |
- ## remove samples which I don't have .idats for |
|
15 |
- ## subset further to allow for quicker testing |
|
16 |
- ##samplesheet5 = samplesheet5[1:5,] |
|
17 |
- ## read in the .idats |
|
18 | 38 |
RG <- readIdatFiles(samplesheet5, arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), saveDate=TRUE) |
19 | 39 |
} |
20 | 40 |
@ |
21 | 41 |
|
22 |
-Get dates for each array - scan date is likely to be most meaningful |
|
42 |
+Alternatively, arguments to the readIdatFiles can be passed through the |
|
43 |
+{\tt ...} argument of the \R{} function \Robject{crlmmWrapper}. |
|
23 | 44 |
|
24 |
-<<dates>>= |
|
25 |
-pd <- pData(RG) |
|
26 |
-scandatetime = strptime(as.character(pd$ScanDate), "%m/%d/%Y %H:%M:%S %p") |
|
27 |
-decodedatetime = strptime(as.character(pd$DecodeDate), "%m/%d/%Y %H:%M:%S %p") |
|
28 |
-table(format(scandatetime, "%d %b %Y")) |
|
29 |
-table(format(scandatetime, "%b")) |
|
45 |
+<<wrapper>>= |
|
46 |
+crlmmWrapper(sampleSheet=samplesheet5, |
|
47 |
+ arrayInfoColNames=list(barcode=NULL, position="SentrixPosition"), |
|
48 |
+ saveDate=TRUE, |
|
49 |
+ cdfName=cdfName, |
|
50 |
+ load.it=FALSE, |
|
51 |
+ save.it=TRUE, |
|
52 |
+ intensityFile=file.path(platedir, "normalizedIntensities.rda"), |
|
53 |
+ crlmmFile=file.path(platedir, "snpsetObject.rda"), |
|
54 |
+ rgFile=file.path(platedir, "rgFile.rda"), |
|
55 |
+ platform="illumina") |
|
30 | 56 |
@ |
31 | 57 |
|
32 | 58 |
Note: the code below is run for testing purposes only. None of these |
33 | 59 |
functions are exported, so would not routinely be run directly by the |
34 |
-user. A typical analysis would involve runnning readIdatFiles() |
|
35 |
-followed by crlmmIllumina() |
|
36 |
- |
|
37 |
-<<crlmm>>= |
|
38 |
-if(!exists("res.rda")){ |
|
39 |
- outdir <- "/thumper/ctsa/snpmicroarray/rs/data/hapmap/illumina/HumanCNV370-Duo" |
|
40 |
- if(!file.exists(file.path(outdir, "res.rda"))){ |
|
41 |
- crlmmOut <- crlmmIllumina(RG=RG, cdfName="human370v1c", sns=pData(RG)$ID, returnParams=TRUE, save.it=TRUE, intensityFile=file.path(outdir, "res.rda")) |
|
42 |
- save(crlmmOut, file=file.path(outdir, "crlmmOut.rda")) |
|
43 |
- } else{ |
|
44 |
- message("Loading...") |
|
45 |
- load(file.path(outdir, "res.rda")) |
|
46 |
- load(file.path(outdir, "crlmmOut.rda")) |
|
47 |
- } |
|
48 |
-} |
|
49 |
-@ |
|
60 |
+user. A typical analysis would involve runnning crlmmWrapper() followed |
|
61 |
+by update() to obtain copy number. |
|
50 | 62 |
|
51 |
-TODO: The code below needs to be udpated for the Illumina platform. |
|
52 | 63 |
|
53 | 64 |
<<SNR>>= |
54 | 65 |
hist(crlmmOut$SNR) ##approx. 5-fold higher than what we see in Affy! |
55 | 66 |
@ |
56 | 67 |
|
57 |
-<<>>= |
|
58 |
-##cnAB was assigned to the global environment |
|
59 |
-NP = (cnAB$A+cnAB$B)/2 # average normalized A and B intensities |
|
60 |
-colnames(NP) <- colnames(crlmm:::calls(crlmmOut)) |
|
61 |
-chr22 <- computeCopynumber(chrom=22, A=res[["A"]], B=res[["B"]], |
|
62 |
- calls=crlmm:::calls(crlmmOut), |
|
63 |
- conf=confs(crlmmOut), |
|
64 |
- plate=rep(1, ncol(NP)), |
|
65 |
- NP=NP, |
|
66 |
- cdfName="human370v1c", |
|
67 |
- SNR=res[["SNR"]], |
|
68 |
- envir=cnenv) |
|
69 |
-@ |
|
70 |
- |
|
71 |
-<<>>= |
|
72 |
-copyA=cnenv[["CA"]] |
|
73 |
-copyB=cnenv[["CB"]] |
|
74 |
-copyT=(copyA + copyB)/100 |
|
75 |
-copyT[copyT > 6] <- 6 |
|
76 |
-copyT[copyT < 0] <- 0 |
|
77 |
-plot(copyT[, 1], pch=".") |
|
78 |
-@ |
|
68 |
+\end{document} |
|
79 | 69 |
|
80 | 70 |
|
81 | 71 |
|
... | ... |
@@ -4,5 +4,52 @@ |
4 | 4 |
title = {A multilevel model to address batch effects in copy number estimation |
5 | 5 |
using SNP arrays}, |
6 | 6 |
month = {May}, |
7 |
- year = {2009} |
|
7 |
+ year = {2009}, |
|
8 |
+ url={http://www.bepress.com/cgi/viewcontent.cgi?article=1193&context=jhubiostat} |
|
9 |
+} |
|
10 |
+ |
|
11 |
+@ARTICLE{Carvalho2007a, |
|
12 |
+ author = {Benilton Carvalho and Henrik Bengtsson and Terence P Speed and Rafael |
|
13 |
+ A Irizarry}, |
|
14 |
+ title = {Exploration, normalization, and genotype calls of high-density oligonucleotide |
|
15 |
+ SNP array data.}, |
|
16 |
+ journal = {Biostatistics}, |
|
17 |
+ year = {2007}, |
|
18 |
+ volume = {8}, |
|
19 |
+ pages = {485--499}, |
|
20 |
+ number = {2}, |
|
21 |
+ month = {Apr}, |
|
22 |
+ abstract = {In most microarray technologies, a number of critical steps are required |
|
23 |
+ to convert raw intensity measurements into the data relied upon by |
|
24 |
+ data analysts, biologists, and clinicians. These data manipulations, |
|
25 |
+ referred to as preprocessing, can influence the quality of the ultimate |
|
26 |
+ measurements. In the last few years, the high-throughput measurement |
|
27 |
+ of gene expression is the most popular application of microarray |
|
28 |
+ technology. For this application, various groups have demonstrated |
|
29 |
+ that the use of modern statistical methodology can substantially |
|
30 |
+ improve accuracy and precision of the gene expression measurements, |
|
31 |
+ relative to ad hoc procedures introduced by designers and manufacturers |
|
32 |
+ of the technology. Currently, other applications of microarrays are |
|
33 |
+ becoming more and more popular. In this paper, we describe a preprocessing |
|
34 |
+ methodology for a technology designed for the identification of DNA |
|
35 |
+ sequence variants in specific genes or regions of the human genome |
|
36 |
+ that are associated with phenotypes of interest such as disease. |
|
37 |
+ In particular, we describe a methodology useful for preprocessing |
|
38 |
+ Affymetrix single-nucleotide polymorphism chips and obtaining genotype |
|
39 |
+ calls with the preprocessed data. We demonstrate how our procedure |
|
40 |
+ improves existing approaches using data from 3 relatively large studies |
|
41 |
+ including the one in which large numbers of independent calls are |
|
42 |
+ available. The proposed methods are implemented in the package oligo |
|
43 |
+ available from Bioconductor.}, |
|
44 |
+ doi = {10.1093/biostatistics/kxl042}, |
|
45 |
+ institution = {Department of Biostatistics, Johns Hopkins University, Baltimore, |
|
46 |
+ MD 21205, USA.}, |
|
47 |
+ keywords = {Algorithms; Alleles; Data Interpretation, Statistical; Genotype; Humans; |
|
48 |
+ Oligonucleotide Array Sequence Analysis; Oligonucleotides; Polymorphism, |
|
49 |
+ Single Nucleotide}, |
|
50 |
+ owner = {rscharpf}, |
|
51 |
+ pii = {kxl042}, |
|
52 |
+ pmid = {17189563}, |
|
53 |
+ timestamp = {2008.08.07}, |
|
54 |
+ url = {http://dx.doi.org/10.1093/biostatistics/kxl042} |
|
8 |