* collab:
add warning in vignette about NAs with BafLrrSetList function
Added Human Omni Express Exome 8 v1.1b as a supported chip
updated version number of pacakge and man pages to reflect these changes
skeleton for krlmm capability added. genotype.Illumina() can now take and XY object as input
update copynumber.Rnw to use BafLrrSetList
updates to vignettes
update namespace
# Please enter a commit message to explain why this merge is necessary,
# especially if it merges an updated upstream into a topic branch.
#
# Lines starting with '#' will be ignored, and an empty message aborts
# the commit.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@79138 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.19.2 |
|
4 |
+Version: 1.19.6 |
|
5 | 5 |
Author: Benilton S Carvalho, Robert Scharpf, Matt Ritchie, Ingo Ruczinski, Rafael A Irizarry |
6 | 6 |
Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
7 | 7 |
Description: Faster implementation of CRLMM specific to SNP 5.0 and 6.0 arrays, as well as a copy number tool specific to 5.0, 6.0, and Illumina platforms |
... | ... |
@@ -22,7 +22,7 @@ Imports: methods, |
22 | 22 |
lattice, |
23 | 23 |
ff, |
24 | 24 |
foreach, |
25 |
- RcppEigen, |
|
25 |
+ RcppEigen (>= 0.3.1.2.1), |
|
26 | 26 |
matrixStats |
27 | 27 |
Suggests: hapmapsnp6, |
28 | 28 |
genomewidesnp6Crlmm (>= 1.0.7), |
... | ... |
@@ -43,6 +43,7 @@ Collate: AllGenerics.R |
43 | 43 |
crlmm-functions.R |
44 | 44 |
crlmmGT2.R |
45 | 45 |
crlmm-illumina.R |
46 |
+ krlmm.R |
|
46 | 47 |
snprma-functions.R |
47 | 48 |
utils.R |
48 | 49 |
zzz.R |
... | ... |
@@ -63,7 +63,7 @@ importFrom(oligoClasses, celfileDate, chromosome2integer, i2p, |
63 | 63 |
parStatus, |
64 | 64 |
splitIndicesByLength, splitIndicesByNode, AssayDataList) |
65 | 65 |
|
66 |
-importFrom(preprocessCore, normalize.quantiles, |
|
66 |
+importFrom(preprocessCore, normalize.quantiles, normalize.quantiles.determine.target, |
|
67 | 67 |
normalize.quantiles.use.target, subColSummarizeMedian) |
68 | 68 |
|
69 | 69 |
importFrom(stats, coef, cov, dnorm, kmeans, lm, mad, median, quantile, |
... | ... |
@@ -2787,12 +2787,12 @@ calculateRTheta <- function(object, ##genotype=c("AA", "AB", "BB"), |
2787 | 2787 |
centersMatrix <- lapply(centers, function(x, i, j) x[i, j], j=j, i=feature.index) |
2788 | 2788 |
lapply(centers, close) |
2789 | 2789 |
centersMatrix <- do.call(cbind, centersMatrix) |
2790 |
- centersA <- centersMatrix[, 1:3] |
|
2791 |
- centersB <- centersMatrix[, 4:6] |
|
2790 |
+ centersA <- centersMatrix[, 1:3, drop=FALSE] |
|
2791 |
+ centersB <- centersMatrix[, 4:6, drop=FALSE] |
|
2792 | 2792 |
rm(centers, centersMatrix); gc() |
2793 | 2793 |
centersA[centersA < 64] <- 64 |
2794 | 2794 |
centersB[centersB < 64] <- 64 |
2795 |
- theta <- atan2(centersB, centersA) * 2/pi |
|
2795 |
+ theta <- as.matrix(atan2(centersB, centersA) * 2/pi) |
|
2796 | 2796 |
centersA[is.na(centersA)] <- 0 |
2797 | 2797 |
centersB[is.na(centersB)] <- 0 |
2798 | 2798 |
if(length(index.np) > 0) theta[index.np, ] <- 1 |
... | ... |
@@ -2844,3 +2844,5 @@ calculateRTheta <- function(object, ##genotype=c("AA", "AB", "BB"), |
2844 | 2844 |
lrr <- integerMatrix(lrr, 100) |
2845 | 2845 |
return(list(baf=bf, lrr=lrr)) |
2846 | 2846 |
} |
2847 |
+ |
|
2848 |
+truncate <- func |
... | ... |
@@ -1,4 +1,4 @@ |
1 |
- # function below works OK provided all .idat files are in the current working directory |
|
1 |
+# function below works OK provided all .idat files are in the current working directory |
|
2 | 2 |
# - could add an option to allow files in Illumina directory structure to be handled |
3 | 3 |
# or to use the optional 'Path' column in sampleSheet |
4 | 4 |
# - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet |
... | ... |
@@ -243,10 +243,12 @@ RGtoXY = function(RG, chipType, verbose=TRUE) { |
243 | 243 |
"human1mduov3b", # 1M Duo |
244 | 244 |
"humanomni1quadv1b", # Omni1 quad |
245 | 245 |
"humanomni25quadv1b", # Omni2.5 quad |
246 |
- "humanomni258v1a", # Omni2.5 8 sample |
|
246 |
+ "humanomni258v1a", # Omni2.5 8 |
|
247 |
+ "humanomni5quadv1b", # Omni5 quad |
|
247 | 248 |
"humanomniexpress12v1b", # Omni express 12 |
248 | 249 |
"humanimmuno12v1b", # Immuno chip 12 |
249 |
- "humancytosnp12v2p1h") # CytoSNP 12 |
|
250 |
+ "humancytosnp12v2p1h", # CytoSNP 12 |
|
251 |
+ "humanomniexpexome8v1p1b") # Omni Express Exome 8 v1.1b |
|
250 | 252 |
## RS: added cleancdfname() |
251 | 253 |
if(missing(chipType)){ |
252 | 254 |
chipType = match.arg(cleancdfname(annotation(RG)), chipList) |
... | ... |
@@ -353,12 +355,13 @@ RGtoXY = function(RG, chipType, verbose=TRUE) { |
353 | 355 |
} |
354 | 356 |
|
355 | 357 |
|
356 |
-stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) { |
|
357 |
- if(useTarget){ |
|
358 |
+stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE, quantile.method="between") { |
|
359 |
+ if(quantile.method=="between") { |
|
360 |
+ if(useTarget){ |
|
358 | 361 |
objectsNeeded <- c("stripnum", "reference") |
359 |
- } else objectsNeeded <- "stripnum" |
|
360 |
- needToLoad <- !all(sapply(objectsNeeded, isLoaded)) |
|
361 |
- if(needToLoad){ |
|
362 |
+ } else objectsNeeded <- "stripnum" |
|
363 |
+ needToLoad <- !all(sapply(objectsNeeded, isLoaded)) |
|
364 |
+ if(needToLoad){ |
|
362 | 365 |
pkgname = getCrlmmAnnotationName(annotation(XY)) |
363 | 366 |
if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
364 | 367 |
suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
... | ... |
@@ -369,36 +372,86 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) { |
369 | 372 |
} |
370 | 373 |
if(verbose) message("Loading strip and reference normalization information.") |
371 | 374 |
loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
372 |
- } |
|
373 |
- stripnum = getVarInEnv("stripnum") |
|
374 |
- if(useTarget) |
|
375 |
+ } |
|
376 |
+ stripnum = getVarInEnv("stripnum") |
|
377 |
+ if(useTarget) |
|
375 | 378 |
targetdist = getVarInEnv("reference") |
376 |
- if(verbose){ |
|
379 |
+ if(verbose){ |
|
377 | 380 |
message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.") |
378 | 381 |
if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=max(stripnum), style=3) |
379 |
- } |
|
380 |
- for(s in 1:max(stripnum)) { |
|
381 |
- if(verbose) { |
|
382 |
- if (getRversion() > '2.7.0') setTxtProgressBar(pb, s) |
|
383 |
- else cat(".") |
|
384 |
- } |
|
385 |
- sel = stripnum==s |
|
386 |
- ##RS: faster to access data directly |
|
387 |
- ##subX = as.matrix(exprs(channel(XY, "X"))[sel,]) |
|
388 |
- ##subY = as.matrix(exprs(channel(XY, "Y"))[sel,]) |
|
389 |
- subX <- as.matrix(assayData(XY)[["X"]][sel, ]) |
|
390 |
- subY <- as.matrix(assayData(XY)[["Y"]][sel, ]) |
|
391 |
- if(useTarget) |
|
392 |
- tmp = normalize.quantiles.use.target(cbind(subX, subY), targetdist[[s]]) |
|
393 |
- else |
|
394 |
- tmp = normalize.quantiles(cbind(subX, subY)) |
|
395 |
- XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2) |
|
396 |
- XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2) |
|
397 |
- rm(subX, subY, tmp, sel) |
|
398 |
- gc(verbose=FALSE) |
|
399 |
- } |
|
400 |
- if(verbose) |
|
401 |
- cat("\n") |
|
382 |
+ } |
|
383 |
+ for(s in 1:max(stripnum)) { |
|
384 |
+ if(verbose) { |
|
385 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, s) |
|
386 |
+ else cat(".") |
|
387 |
+ } |
|
388 |
+ sel = stripnum==s |
|
389 |
+ ##RS: faster to access data directly |
|
390 |
+ ##subX = as.matrix(exprs(channel(XY, "X"))[sel,]) |
|
391 |
+ ##subY = as.matrix(exprs(channel(XY, "Y"))[sel,]) |
|
392 |
+ subX <- as.matrix(assayData(XY)[["X"]][sel, ]) |
|
393 |
+ subY <- as.matrix(assayData(XY)[["Y"]][sel, ]) |
|
394 |
+ if(useTarget) |
|
395 |
+ tmp = normalize.quantiles.use.target(cbind(subX, subY), targetdist[[s]]) |
|
396 |
+ else |
|
397 |
+ tmp = normalize.quantiles(cbind(subX, subY)) |
|
398 |
+ XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2) |
|
399 |
+ XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2) |
|
400 |
+ rm(subX, subY, tmp, sel) |
|
401 |
+ gc(verbose=FALSE) |
|
402 |
+ } |
|
403 |
+ if(verbose) |
|
404 |
+ cat("\n") |
|
405 |
+ } |
|
406 |
+ if(quantile.method=="within") { # ignore strip information |
|
407 |
+ if(useTarget){ |
|
408 |
+ objectsNeeded <- c("Xref", "Yref") |
|
409 |
+ } else objectsNeeded <- "" |
|
410 |
+ needToLoad <- !all(sapply(objectsNeeded, isLoaded)) |
|
411 |
+ if(needToLoad){ |
|
412 |
+ pkgname = getCrlmmAnnotationName(annotation(XY)) |
|
413 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
414 |
+ suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
415 |
+ msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
416 |
+ message(strwrap(msg)) |
|
417 |
+ stop("Package ", pkgname, " could not be found.") |
|
418 |
+ rm(suggCall, msg) |
|
419 |
+ } |
|
420 |
+ if(verbose) message("Loading reference normalization information.") |
|
421 |
+ loader("targetXY.rda", .crlmmPkgEnv, pkgname) |
|
422 |
+ } |
|
423 |
+ if(useTarget) { |
|
424 |
+ Xref = getVarInEnv("Xref") |
|
425 |
+ Yref = getVarInEnv("Yref") |
|
426 |
+ } else{ |
|
427 |
+ Xref = normalize.quantiles.determine.target(as.matrix(assayData(XY)[["X"]])) |
|
428 |
+ gc(verbose=FALSE) |
|
429 |
+ Yref = normalize.quantiles.determine.target(as.matrix(assayData(XY)[["Y"]])) |
|
430 |
+ gc(verbose=FALSE) |
|
431 |
+ } |
|
432 |
+ if(verbose){ |
|
433 |
+ message("Quantile normalizing ", ncol(XY), " arrays, one at a time.") |
|
434 |
+ if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=ncol(XY), style=3) |
|
435 |
+ } |
|
436 |
+ for(s in 1:ncol(XY)) { |
|
437 |
+ if(verbose) { |
|
438 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, s) |
|
439 |
+ else cat(".") |
|
440 |
+ } |
|
441 |
+ ##RS: faster to access data directly |
|
442 |
+ ##subX = as.matrix(exprs(channel(XY, "X"))[sel,]) |
|
443 |
+ ##subY = as.matrix(exprs(channel(XY, "Y"))[sel,]) |
|
444 |
+ subX <- as.matrix(assayData(XY)[["X"]][,s]) |
|
445 |
+ subY <- as.matrix(assayData(XY)[["Y"]][,s]) |
|
446 |
+ tmpX = normalize.quantiles.use.target(subX, as.integer(Xref)) |
|
447 |
+ tmpY = normalize.quantiles.use.target(subY, as.integer(Yref)) |
|
448 |
+ XY@assayData$X[,s] = as.integer(tmpX+16) |
|
449 |
+ XY@assayData$Y[,s] = as.integer(tmpY+16) |
|
450 |
+ rm(subX, subY, tmpX, tmpY) |
|
451 |
+ } |
|
452 |
+ if(verbose) |
|
453 |
+ cat("\n") |
|
454 |
+ } |
|
402 | 455 |
XY |
403 | 456 |
} |
404 | 457 |
|
... | ... |
@@ -411,13 +464,14 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, |
411 | 464 |
cdfName, |
412 | 465 |
sns, |
413 | 466 |
stripNorm=TRUE, |
414 |
- useTarget=TRUE) { #, |
|
467 |
+ useTarget=TRUE, |
|
468 |
+ quantile.method="between") { #, |
|
415 | 469 |
# outdir=".") { |
416 | 470 |
# save.it=FALSE, |
417 | 471 |
# snpFile, |
418 | 472 |
# cnFile) { |
419 | 473 |
if(stripNorm) |
420 |
- XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose) |
|
474 |
+ XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose, quantile.method=quantile.method) |
|
421 | 475 |
## MR: the code below is mostly straight from snprma.R |
422 | 476 |
if (missing(sns)) sns = sampleNames(XY) #$X |
423 | 477 |
if(missing(cdfName)) |
... | ... |
@@ -437,14 +491,17 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, |
437 | 491 |
} |
438 | 492 |
if(verbose) message("Loading snp annotation and mixture model parameters.") |
439 | 493 |
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
440 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
494 |
+ if(fitMixture) |
|
495 |
+ loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
441 | 496 |
loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname) |
442 | 497 |
loader("npProbesFid.rda", .crlmmPkgEnv, pkgname) |
443 | 498 |
} |
444 | 499 |
autosomeIndex = getVarInEnv("autosomeIndex") |
445 |
- SMEDIAN = getVarInEnv("SMEDIAN") |
|
446 |
- theKnots = getVarInEnv("theKnots") |
|
447 |
- npIndex = getVarInEnv("npProbesFid") |
|
500 |
+ if(fitMixture) { |
|
501 |
+ SMEDIAN = getVarInEnv("SMEDIAN") |
|
502 |
+ theKnots = getVarInEnv("theKnots") |
|
503 |
+ } |
|
504 |
+ npIndex = getVarInEnv("npProbesFid") |
|
448 | 505 |
snpIndex = getVarInEnv("snpProbesFid") |
449 | 506 |
narrays = ncol(XY) |
450 | 507 |
nprobes = length(npIndex) |
... | ... |
@@ -490,7 +547,7 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, |
490 | 547 |
B = matrix(NA, nprobes, narrays) |
491 | 548 |
zero = matrix(NA, nprobes, narrays) |
492 | 549 |
|
493 |
- if(verbose){ |
|
550 |
+ if(verbose && fitMixture){ |
|
494 | 551 |
message("Calibrating ", narrays, " arrays.") |
495 | 552 |
if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=narrays, style=3) |
496 | 553 |
} |
... | ... |
@@ -510,15 +567,15 @@ preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, |
510 | 567 |
tmp = fitAffySnpMixture56(S, M, theKnots, eps=eps) |
511 | 568 |
mixtureParams[, i] = tmp[["coef"]] |
512 | 569 |
SNR[i] = tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
513 |
- } |
|
514 |
- if(verbose) { |
|
515 |
- if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
516 |
- else cat(".") |
|
570 |
+ if(verbose) { |
|
571 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
572 |
+ else cat(".") |
|
573 |
+ } |
|
517 | 574 |
} |
518 | 575 |
## run garbage collection every now and then |
519 | 576 |
if(i %% 100 == 0) gc(verbose=FALSE); |
520 | 577 |
} |
521 |
- if (verbose) { |
|
578 |
+ if (verbose && fitMixture) { |
|
522 | 579 |
if (getRversion() > '2.7.0') close(pb) |
523 | 580 |
else cat("\n") |
524 | 581 |
} |
... | ... |
@@ -800,15 +857,17 @@ constructInf <- function(sampleSheet=NULL, |
800 | 857 |
highDensity=FALSE, |
801 | 858 |
sep="_", |
802 | 859 |
fileExt=list(green="Grn.idat", red="Red.idat"), |
860 |
+ XY, |
|
803 | 861 |
cdfName, |
804 | 862 |
verbose=FALSE, |
805 |
- batch, #fns, |
|
863 |
+ batch=NULL, #fns, |
|
806 | 864 |
saveDate=TRUE) { #, outdir="."){ |
807 | 865 |
verbose <- FALSE |
808 |
- if(!is.null(arrayNames)) { |
|
866 |
+ if(is.null(XY)) { |
|
867 |
+ if(!is.null(arrayNames)) { |
|
809 | 868 |
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) |
810 |
- } |
|
811 |
- if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet |
|
869 |
+ } |
|
870 |
+ if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet |
|
812 | 871 |
if(is.null(arrayNames)) { |
813 | 872 |
if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) { |
814 | 873 |
barcode = sampleSheet[,arrayInfoColNames$barcode] |
... | ... |
@@ -826,61 +885,62 @@ constructInf <- function(sampleSheet=NULL, |
826 | 885 |
arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames) |
827 | 886 |
} |
828 | 887 |
} |
829 |
- } |
|
830 |
- pd = new("AnnotatedDataFrame", data = sampleSheet) |
|
831 |
- sampleNames(pd) = make.unique(basename(arrayNames)) |
|
832 |
- } |
|
833 |
- if(is.null(arrayNames)) { |
|
888 |
+ } |
|
889 |
+ pd = new("AnnotatedDataFrame", data = sampleSheet) |
|
890 |
+ sampleNames(pd) = make.unique(basename(arrayNames)) |
|
891 |
+ } |
|
892 |
+ if(is.null(arrayNames)) { |
|
834 | 893 |
arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) |
835 | 894 |
if(!is.null(sampleSheet)) { |
836 | 895 |
sampleSheet=NULL |
837 | 896 |
cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n") |
838 | 897 |
} |
839 | 898 |
pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) |
840 |
- } |
|
841 |
- narrays = length(arrayNames) |
|
899 |
+ } |
|
900 |
+ narrays = length(arrayNames) |
|
842 | 901 |
|
843 |
- if(!missing(batch)) { |
|
902 |
+ if(!is.null(batch)) { |
|
844 | 903 |
stopifnot(length(batch) == narrays) |
845 |
- } |
|
846 |
- if(missing(batch)) { |
|
847 |
- stop("Must specify 'batch'") # batch = as.factor(rep(1, narrays)) |
|
848 |
- } |
|
904 |
+ } |
|
905 |
+ if(is.null(batch)) { |
|
906 |
+ batch = rep("batch1", narrays) # assume only one batch stop("Must specify 'batch'") |
|
907 |
+ } |
|
908 |
+ if(is(batch, "factor")) batch = as.character(batch) |
|
849 | 909 |
|
850 |
- grnfiles = paste(arrayNames, fileExt$green, sep=sep) |
|
851 |
- redfiles = paste(arrayNames, fileExt$red, sep=sep) |
|
852 |
- if(length(grnfiles)==0 || length(redfiles)==0) |
|
910 |
+ grnfiles = paste(arrayNames, fileExt$green, sep=sep) |
|
911 |
+ redfiles = paste(arrayNames, fileExt$red, sep=sep) |
|
912 |
+ if(length(grnfiles)==0 || length(redfiles)==0) |
|
853 | 913 |
stop("Cannot find .idat files") |
854 |
- if(length(grnfiles)!=length(redfiles)) |
|
914 |
+ if(length(grnfiles)!=length(redfiles)) |
|
855 | 915 |
stop("Cannot find matching .idat files") |
856 |
- if(path[1] != "." & path[1] != ""){ |
|
916 |
+ if(path[1] != "." & path[1] != ""){ |
|
857 | 917 |
grnidats = file.path(path, grnfiles) |
858 | 918 |
redidats = file.path(path, redfiles) |
859 |
- } else { |
|
919 |
+ } else { |
|
860 | 920 |
message("path arg not set. Assuming files are in local directory, or that complete path is provided") |
861 | 921 |
grnidats = grnfiles |
862 | 922 |
redidats = redfiles |
863 |
- } |
|
864 |
- if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files") |
|
865 |
- if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files") |
|
866 |
- |
|
867 |
- if(verbose) message("Initializing container for genotyping and copy number estimation") |
|
868 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
869 |
- path <- system.file("extdata", package=pkgname) |
|
870 |
- genome <- getAvailableIlluminaGenomeBuild(path) |
|
871 |
- featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome) |
|
872 |
- nr = nrow(featureData); nc = narrays |
|
873 |
- sns <- if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { |
|
923 |
+ } |
|
924 |
+ if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files") |
|
925 |
+ if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files") |
|
926 |
+ |
|
927 |
+ if(verbose) message("Initializing container for genotyping and copy number estimation") |
|
928 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
929 |
+ path <- system.file("extdata", package=pkgname) |
|
930 |
+ genome <- getAvailableIlluminaGenomeBuild(path) |
|
931 |
+ featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome) |
|
932 |
+ nr = nrow(featureData); nc = narrays |
|
933 |
+ sns <- if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { |
|
874 | 934 |
make.unique(sampleSheet$Sample_ID) |
875 |
- } else{ |
|
935 |
+ } else{ |
|
876 | 936 |
make.unique(basename(arrayNames)) |
877 |
- } |
|
878 |
- biga <- initializeBigMatrix(name="A", nr, nc) |
|
879 |
- bigb <- initializeBigMatrix(name="B", nr, nc) |
|
880 |
- bigc <- initializeBigMatrix(name="call", nr, nc) |
|
881 |
- bigd <- initializeBigMatrix(name="callPr", nr,nc) |
|
882 |
- colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns |
|
883 |
- cnSet <- new("CNSet", |
|
937 |
+ } |
|
938 |
+ biga <- initializeBigMatrix(name="A", nr, nc) |
|
939 |
+ bigb <- initializeBigMatrix(name="B", nr, nc) |
|
940 |
+ bigc <- initializeBigMatrix(name="call", nr, nc) |
|
941 |
+ bigd <- initializeBigMatrix(name="callPr", nr,nc) |
|
942 |
+ colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns |
|
943 |
+ cnSet <- new("CNSet", |
|
884 | 944 |
alleleA=biga, |
885 | 945 |
alleleB=bigb, |
886 | 946 |
call=bigc, |
... | ... |
@@ -894,25 +954,63 @@ constructInf <- function(sampleSheet=NULL, |
894 | 954 |
## } else{ |
895 | 955 |
## sampleNames(cnSet) <- make.unique(basename(arrayNames)) |
896 | 956 |
## } |
897 |
- if(saveDate){ |
|
957 |
+ if(saveDate){ |
|
898 | 958 |
protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose) |
899 |
- } else{ |
|
959 |
+ } else{ |
|
900 | 960 |
protocolData = annotatedDataFrameFrom(A(cnSet), byrow=FALSE) |
901 |
- } |
|
902 |
- rownames(pData(protocolData)) = sampleNames(cnSet) |
|
903 |
- protocolData(cnSet) = protocolData |
|
961 |
+ } |
|
962 |
+ rownames(pData(protocolData)) = sampleNames(cnSet) |
|
963 |
+ protocolData(cnSet) = protocolData |
|
904 | 964 |
##featureData(cnSet) = featureData |
905 |
- featureNames(cnSet) = featureNames(featureData) |
|
906 |
- cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer") |
|
907 |
- cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double") |
|
908 |
- cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double") |
|
965 |
+ featureNames(cnSet) = featureNames(featureData) |
|
966 |
+ cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer") |
|
967 |
+ cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double") |
|
968 |
+ cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double") |
|
909 | 969 |
##sampleNames(cnSet) <- basename(sampleNames(cnSet)) |
970 |
+ } else{ # if XY specified, easier set-up of cnSet |
|
971 |
+ narrays = ncol(XY) |
|
972 |
+ if(verbose) message("Initializing container for genotyping and copy number estimation") |
|
973 |
+ if(!is.null(batch)) { |
|
974 |
+ stopifnot(length(batch) == narrays) |
|
975 |
+ } |
|
976 |
+ if(is.null(batch)) { |
|
977 |
+ batch = rep("batch1", narrays) # assume only one batch stop("Must specify 'batch'") |
|
978 |
+ } |
|
979 |
+ if(is(batch, "factor")) batch = as.character(batch) |
|
980 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
981 |
+ path <- system.file("extdata", package=pkgname) |
|
982 |
+ genome <- getAvailableIlluminaGenomeBuild(path) |
|
983 |
+ featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome) |
|
984 |
+ nr = nrow(featureData); nc = narrays |
|
985 |
+ sns <- sampleNames(XY) |
|
986 |
+ biga <- initializeBigMatrix(name="A", nr, nc) |
|
987 |
+ bigb <- initializeBigMatrix(name="B", nr, nc) |
|
988 |
+ bigc <- initializeBigMatrix(name="call", nr, nc) |
|
989 |
+ bigd <- initializeBigMatrix(name="callPr", nr,nc) |
|
990 |
+ colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns |
|
991 |
+ cnSet <- new("CNSet", |
|
992 |
+ alleleA=biga, |
|
993 |
+ alleleB=bigb, |
|
994 |
+ call=bigc, |
|
995 |
+ callProbability=bigd, |
|
996 |
+ annotation=cdfName, |
|
997 |
+ featureData=featureData, |
|
998 |
+ batch=batch, |
|
999 |
+ genome=genome) |
|
1000 |
+ protocolData = annotatedDataFrameFrom(A(cnSet), byrow=FALSE) |
|
1001 |
+ rownames(pData(protocolData)) = sampleNames(cnSet) |
|
1002 |
+ protocolData(cnSet) = protocolData |
|
1003 |
+ featureNames(cnSet) = featureNames(featureData) |
|
1004 |
+ cnSet$gender = initializeBigVector("gender", ncol(cnSet), vmode="integer") |
|
1005 |
+ cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double") |
|
1006 |
+ cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double") |
|
1007 |
+ } |
|
910 | 1008 |
return(cnSet) |
911 | 1009 |
} |
912 | 1010 |
construct.Illumina <- constructInf |
913 | 1011 |
|
914 | 1012 |
preprocessInf <- function(cnSet, |
915 |
- sampleSheet=NULL, |
|
1013 |
+ sampleSheet=NULL, |
|
916 | 1014 |
arrayNames=NULL, |
917 | 1015 |
ids=NULL, |
918 | 1016 |
path=".", |
... | ... |
@@ -920,11 +1018,13 @@ preprocessInf <- function(cnSet, |
920 | 1018 |
highDensity=TRUE, |
921 | 1019 |
sep="_", |
922 | 1020 |
fileExt=list(green="Grn.idat", red="Red.idat"), |
1021 |
+ XY, |
|
923 | 1022 |
saveDate=TRUE, |
924 | 1023 |
stripNorm=TRUE, |
925 | 1024 |
useTarget=TRUE, |
926 | 1025 |
mixtureSampleSize=10^5, |
927 |
- fitMixture=TRUE, |
|
1026 |
+ fitMixture=TRUE, |
|
1027 |
+ quantile.method="between", |
|
928 | 1028 |
eps=0.1, |
929 | 1029 |
verbose=TRUE, |
930 | 1030 |
seed=1, cdfName){ |
... | ... |
@@ -949,6 +1049,7 @@ preprocessInf <- function(cnSet, |
949 | 1049 |
highDensity=highDensity, |
950 | 1050 |
sep=sep, |
951 | 1051 |
fileExt=fileExt, |
1052 |
+ XY=XY, |
|
952 | 1053 |
saveDate=saveDate, |
953 | 1054 |
verbose=verbose, |
954 | 1055 |
mixtureSampleSize=mixtureSampleSize, |
... | ... |
@@ -959,6 +1060,7 @@ preprocessInf <- function(cnSet, |
959 | 1060 |
sns=sns, |
960 | 1061 |
stripNorm=stripNorm, |
961 | 1062 |
useTarget=useTarget, |
1063 |
+ quantile.method=quantile.method, |
|
962 | 1064 |
A=A(cnSet), |
963 | 1065 |
B=B(cnSet), |
964 | 1066 |
GT=calls(cnSet), |
... | ... |
@@ -981,7 +1083,8 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3), |
981 | 1083 |
badSNP=0.7, |
982 | 1084 |
gender=NULL, |
983 | 1085 |
DF=6, |
984 |
- cdfName){ |
|
1086 |
+ cdfName, |
|
1087 |
+ call.method="crlmm"){ |
|
985 | 1088 |
is.snp = isSnp(cnSet) |
986 | 1089 |
snp.index = which(is.snp) |
987 | 1090 |
## narrays = ncol(cnSet) |
... | ... |
@@ -998,7 +1101,8 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3), |
998 | 1101 |
## message("Writing complete. Begin genotyping...") |
999 | 1102 |
## close(A(cnSet)) |
1000 | 1103 |
## close(B(cnSet)) |
1001 |
- tmp <- crlmmGT2(A=A(cnSet), |
|
1104 |
+ if(call.method=="crlmm") { |
|
1105 |
+ tmp <- crlmmGT2(A=A(cnSet), |
|
1002 | 1106 |
B=B(cnSet), |
1003 | 1107 |
SNR=cnSet$SNR, |
1004 | 1108 |
mixtureParams=mixtureParams, |
... | ... |
@@ -1016,11 +1120,14 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3), |
1016 | 1120 |
callsGt=calls(cnSet), |
1017 | 1121 |
callsPr=snpCallProbability(cnSet))#, |
1018 | 1122 |
##RS: I changed the API for crlmmGT2 to be consistent with crlmmGT |
1123 |
+ open(cnSet$gender) |
|
1124 |
+ cnSet$gender[,] = tmp$gender |
|
1125 |
+ close(cnSet$gender) |
|
1126 |
+ } |
|
1127 |
+ if(call.method=="krlmm") |
|
1128 |
+ tmp <- krlmm(cnSet, cdfName, gender=gender) # new function required... cnSet, cdfName and gender are probably the only arguments you need... |
|
1019 | 1129 |
## snp.names=featureNames(cnSet)[snp.index]) |
1020 |
- if(verbose) message("Genotyping finished. Updating container with genotype calls and confidence scores.") |
|
1021 |
- open(cnSet$gender) |
|
1022 |
- cnSet$gender[,] = tmp$gender |
|
1023 |
- close(cnSet$gender) |
|
1130 |
+ if(verbose) message("Genotyping finished.") # Updating container with genotype calls and confidence scores.") |
|
1024 | 1131 |
TRUE |
1025 | 1132 |
} |
1026 | 1133 |
|
... | ... |
@@ -1033,12 +1140,15 @@ genotype.Illumina <- function(sampleSheet=NULL, |
1033 | 1140 |
highDensity=FALSE, |
1034 | 1141 |
sep="_", |
1035 | 1142 |
fileExt=list(green="Grn.idat", red="Red.idat"), |
1143 |
+ XY=NULL, |
|
1144 |
+ call.method="crlmm", |
|
1036 | 1145 |
cdfName, |
1037 | 1146 |
copynumber=TRUE, |
1038 |
- batch, |
|
1147 |
+ batch=NULL, |
|
1039 | 1148 |
saveDate=TRUE, |
1040 | 1149 |
stripNorm=TRUE, |
1041 | 1150 |
useTarget=TRUE, |
1151 |
+ quantile.method="between", |
|
1042 | 1152 |
mixtureSampleSize=10^5, |
1043 | 1153 |
fitMixture=TRUE, |
1044 | 1154 |
eps=0.1, |
... | ... |
@@ -1053,25 +1163,58 @@ genotype.Illumina <- function(sampleSheet=NULL, |
1053 | 1163 |
gender=NULL, |
1054 | 1164 |
returnParams=TRUE, |
1055 | 1165 |
badSNP=0.7) { |
1166 |
+ krlmm.supported = c("humanomni1quadv1b", # Omni1 quad |
|
1167 |
+ "humanomni25quadv1b", # Omni2.5 quad |
|
1168 |
+ "humanomni258v1a", # Omni2.5 8 |
|
1169 |
+ "humanomni5quadv1b") # Omni5 quad |
|
1170 |
+ crlmm.supported = c("human370v1c", # 370CNV |
|
1171 |
+ "human650v3a", # 650Y |
|
1172 |
+ "human610quadv1b", # 610 quad |
|
1173 |
+ "human660quadv1a", # 660 quad |
|
1174 |
+ "human370quadv3c", # 370CNV quad |
|
1175 |
+ "human550v3b", # 550K |
|
1176 |
+ "human1mduov3b", # 1M Duo |
|
1177 |
+ "humanomni1quadv1b", # Omni1 quad |
|
1178 |
+ "humanomniexpress12v1b", # Omni express 12 |
|
1179 |
+ "humanimmuno12v1b", # Immuno chip 12 |
|
1180 |
+ "humancytosnp12v2p1h", # CytoSNP 12 |
|
1181 |
+ "humanomniexpexome8v1p1b") # Omni Express Exome 8 v1.1b |
|
1182 |
+ if(call.method=="krlmm") { |
|
1183 |
+ if(!any(cdfName==krlmm.supported)) |
|
1184 |
+ stop(cdfName, " platform not supported by krlmm. Consider setting call.method=\'crlmm\'") |
|
1185 |
+ fitMixture=FALSE |
|
1186 |
+ quantile.method="within" |
|
1187 |
+ } |
|
1188 |
+ if(call.method=="crlmm") { |
|
1189 |
+ if(!any(cdfName==crlmm.supported)) |
|
1190 |
+ stop(cdfName, " platform not supported by crlmm. Consider setting call.method=\'krlmm\'") |
|
1191 |
+ fitMixture=TRUE |
|
1192 |
+ # quantile.method="between" |
|
1193 |
+ } |
|
1056 | 1194 |
is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE) |
1057 | 1195 |
stopifnot(is.lds) |
1196 |
+ if(!is.null(XY) && missing(cdfName)) |
|
1197 |
+ cdfName = getCrlmmAnnotationName(annotation(cnSet)) |
|
1058 | 1198 |
if(missing(cdfName)) stop("must specify cdfName") |
1059 | 1199 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
1060 |
- stopifnot(!missing(batch)) |
|
1061 |
- if(is(batch, "factor")) batch <- as.character(batch) |
|
1200 |
+# stopifnot(!missing(batch)) |
|
1201 |
+# if(is(batch, "factor")) batch <- as.character(batch) |
|
1062 | 1202 |
pkgname <- getCrlmmAnnotationName(cdfName) |
1063 |
- message("Instantiate CNSet container.") |
|
1064 |
- cnSet <- constructInf(sampleSheet=sampleSheet, |
|
1203 |
+# if(is.null(cnSet)) { |
|
1204 |
+ message("Instantiate CNSet container.") |
|
1205 |
+ cnSet <- constructInf(sampleSheet=sampleSheet, |
|
1065 | 1206 |
arrayNames=arrayNames, |
1066 | 1207 |
path=path, |
1067 | 1208 |
arrayInfoColNames=arrayInfoColNames, |
1068 | 1209 |
highDensity=highDensity, |
1069 | 1210 |
sep=sep, |
1070 | 1211 |
fileExt=fileExt, |
1212 |
+ XY=XY, |
|
1071 | 1213 |
cdfName=cdfName, |
1072 | 1214 |
verbose=verbose, |
1073 | 1215 |
batch=batch, |
1074 | 1216 |
saveDate=saveDate) |
1217 |
+# } |
|
1075 | 1218 |
mixtureParams <- preprocessInf(cnSet=cnSet, |
1076 | 1219 |
sampleSheet=sampleSheet, |
1077 | 1220 |
arrayNames=arrayNames, |
... | ... |
@@ -1082,32 +1225,33 @@ genotype.Illumina <- function(sampleSheet=NULL, |
1082 | 1225 |
sep=sep, |
1083 | 1226 |
fileExt=fileExt, |
1084 | 1227 |
saveDate=saveDate, |
1228 |
+ XY=XY, |
|
1085 | 1229 |
stripNorm=stripNorm, |
1086 | 1230 |
useTarget=useTarget, |
1087 | 1231 |
mixtureSampleSize=mixtureSampleSize, |
1088 | 1232 |
fitMixture=fitMixture, |
1233 |
+ quantile.method=quantile.method, |
|
1089 | 1234 |
eps=eps, |
1090 | 1235 |
verbose=verbose, |
1091 | 1236 |
seed=seed, |
1092 |
- cdfName=cdfName) |
|
1237 |
+ cdfName=cdfName) |
|
1093 | 1238 |
message("Preprocessing complete. Begin genotyping...") |
1094 | 1239 |
genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams, |
1095 | 1240 |
probs=probs, |
1096 |
- SNRMin=SNRMin, |
|
1097 |
- recallMin=recallMin, |
|
1098 |
- recallRegMin=recallRegMin, |
|
1099 |
- verbose=verbose, |
|
1100 |
- returnParams=returnParams, |
|
1101 |
- badSNP=badSNP, |
|
1102 |
- gender=gender, |
|
1103 |
- DF=DF, |
|
1104 |
- cdfName=cdfName) |
|
1241 |
+ SNRMin=SNRMin, |
|
1242 |
+ recallMin=recallMin, |
|
1243 |
+ recallRegMin=recallRegMin, |
|
1244 |
+ verbose=verbose, |
|
1245 |
+ returnParams=returnParams, |
|
1246 |
+ badSNP=badSNP, |
|
1247 |
+ gender=gender, |
|
1248 |
+ DF=DF, |
|
1249 |
+ cdfName=cdfName, |
|
1250 |
+ call.method=call.method) |
|
1105 | 1251 |
return(cnSet) |
1106 | 1252 |
} |
1107 | 1253 |
|
1108 | 1254 |
|
1109 |
- |
|
1110 |
- |
|
1111 | 1255 |
processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL, |
1112 | 1256 |
arrayNames=NULL, |
1113 | 1257 |
ids=NULL, |
... | ... |
@@ -1116,6 +1260,7 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL, |
1116 | 1260 |
highDensity=FALSE, |
1117 | 1261 |
sep="_", |
1118 | 1262 |
fileExt=list(green="Grn.idat", red="Red.idat"), |
1263 |
+ XY, |
|
1119 | 1264 |
saveDate=FALSE, |
1120 | 1265 |
verbose=TRUE, |
1121 | 1266 |
mixtureSampleSize=10^5, |
... | ... |
@@ -1126,6 +1271,7 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL, |
1126 | 1271 |
sns, |
1127 | 1272 |
stripNorm=TRUE, |
1128 | 1273 |
useTarget=TRUE, |
1274 |
+ quantile.method=quantile.method, |
|
1129 | 1275 |
A, B, |
1130 | 1276 |
GT, |
1131 | 1277 |
GTP, |
... | ... |
@@ -1133,16 +1279,23 @@ processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL, |
1133 | 1279 |
message("Processing sample stratum ", stratum, " of ", length(sampleBatches)) |
1134 | 1280 |
sel <- sampleBatches[[stratum]] |
1135 | 1281 |
if(length(path)>= length(sel)) path = path[sel] |
1136 |
- RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel], |
|
1282 |
+ if(is.null(XY)) { |
|
1283 |
+ RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel], |
|
1137 | 1284 |
ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, |
1138 | 1285 |
highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose) |
1139 |
- XY = RGtoXY(RG, chipType=cdfName) |
|
1140 |
- rm(RG) |
|
1141 |
- gc(verbose=FALSE) |
|
1142 |
- if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY) |
|
1143 |
- res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
|
1144 |
- seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) |
|
1145 |
- rm(XY) |
|
1286 |
+ XY = RGtoXY(RG, chipType=cdfName) |
|
1287 |
+ rm(RG) |
|
1288 |
+ gc(verbose=FALSE) |
|
1289 |
+ if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY) |
|
1290 |
+ res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
|
1291 |
+ seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget, |
|
1292 |
+ quantile.method=quantile.method) #, outdir=outdir) |
|
1293 |
+ rm(XY) |
|
1294 |
+ }else{ #XY already available |
|
1295 |
+ res = preprocessInfinium2(XY[,sel], mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, |
|
1296 |
+ seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget, |
|
1297 |
+ quantile.method=quantile.method) |
|
1298 |
+ } |
|
1146 | 1299 |
gc(verbose=FALSE) |
1147 | 1300 |
if(verbose) message("Finished preprocessing.") |
1148 | 1301 |
snp.index = which(is.snp) |
1149 | 1302 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,10 @@ |
1 |
+krlmm = function(cnSet, cdfName, gender) { # Doesn't do anything as yet - Jenny/Cynthia to add functionality based on their code |
|
2 |
+ if(is.null(gender)) { |
|
3 |
+ gender=rep(1, ncol(cnSet)) # Need to impute gender if not specified - Cynthia is working on this using Y chr SNPs |
|
4 |
+ } |
|
5 |
+ open(cnSet$gender) |
|
6 |
+ cnSet$gender[,] = gender |
|
7 |
+ close(cnSet$gender) |
|
8 |
+ |
|
9 |
+ TRUE |
|
10 |
+} |
... | ... |
@@ -166,10 +166,12 @@ illuminaCdfNames <- function(){ |
166 | 166 |
"human1mduov3b", # 1M Duo |
167 | 167 |
"humanomni1quadv1b", # Omni1 quad |
168 | 168 |
"humanomni25quadv1b", # Omni2.5 quad |
169 |
- "humanomni258v1a", # Omni2.5 8 sample |
|
169 |
+ "humanomni258v1a", # Omni2.5 8 |
|
170 |
+ "humanomni5quadv1b", # Omni5 quad |
|
170 | 171 |
"humanomniexpress12v1b", # Omni express 12 |
171 | 172 |
"humanimmuno12v1b", # Immuno chip 12 |
172 |
- "humancytosnp12v2p1h") # CytoSNP 12 |
|
173 |
+ "humancytosnp12v2p1h", # CytoSNP 12 |
|
174 |
+ "humanomniexpexome8v1p1b") |
|
173 | 175 |
} |
174 | 176 |
|
175 | 177 |
affyCdfNames <- function(){ |
... | ... |
@@ -194,16 +194,6 @@ table(c("male", "female")[cnSet$gender[]]) |
194 | 194 |
if(any(is.na(cnSet$gender[]))) stop("missing genders") |
195 | 195 |
@ |
196 | 196 |
|
197 |
-%Parameters for determining log R ratios and B allele frequencies are |
|
198 |
-%estimated using the function \Rfunction{crlmmCopynumber}: |
|
199 |
-% |
|
200 |
-%<<crlmmCopynumber>>= |
|
201 |
-%crlmmCopynumber(cnSet, fit.linearModel=FALSE) |
|
202 |
-%@ |
|
203 |
-%<<LDS_genotype, cache=TRUE>>= |
|
204 |
-%cnSet <- genotype(celFiles, batch=plates, cdfName=cdfName, genome="hg19") |
|
205 |
-%@ |
|
206 |
- |
|
207 | 197 |
The normalized intensities, genotype calls, and confidence scores are |
208 | 198 |
stored as \verb+ff+ objects in the \verb+assayData+ slot. A concise |
209 | 199 |
summary of this object can be obtained throught the \Rfunction{print} |
... | ... |
@@ -220,10 +210,6 @@ genotype calls are stored on disk rather than in active memory. |
220 | 210 |
print(object.size(cnSet), units="Mb") |
221 | 211 |
@ |
222 | 212 |
|
223 |
-%Users can proceed to the \verb+copynumber+ vignette for copy number |
|
224 |
-%analyses. See the \verb+Infrastructure+ vignette for additional |
|
225 |
-%details on the \Robject{CNSet} class, including an overview of the |
|
226 |
-%available accessors. |
|
227 | 213 |
\SweaveInput{copynumber} |
228 | 214 |
|
229 | 215 |
A sample-specific estimate of the signal to noise ratio (SNR) |
... | ... |
@@ -53,15 +53,24 @@ print(histogram(~snr, |
53 | 53 |
|
54 | 54 |
\section{Copy number estimation} |
55 | 55 |
|
56 |
-As described in \cite{Scharpf2011}, the CRLMM-CopyNumber algorithm |
|
57 |
-fits a linear model to the normalized intensities stratified by the |
|
58 |
-diallic genotype call. The intercept and slope from the linear model |
|
59 |
-are both SNP- and batch-specific. The implementation in the \crlmm{} |
|
60 |
-package is encapsulated by the function \Rfunction{crlmmCopynumber} |
|
61 |
-that, using the default settings, can be called by passing a single |
|
62 |
-object of class \Rclass{CNSet}. See the appropriate |
|
63 |
-preprocessing/genotyping vignette for the construction of an object of |
|
64 |
-class \Rclass{CNSet}. |
|
56 |
+There are two ways to obtain marker-level estimates of copy number |
|
57 |
+that are supported by \Rpackage{crlmm}. One approach is to fit a |
|
58 |
+linear model to the normalized intensities stratified by the diallelic |
|
59 |
+genotype call at each SNP, as described in \cite{Scharpf2011}. |
|
60 |
+Another alternative is to compute log R ratio and B allele |
|
61 |
+frequencies. The latter is often better supported by downstream hidden |
|
62 |
+Markov models such as those in the \Rpackage{VanillaICE} package. We |
|
63 |
+describe each approach in the following two sections. |
|
64 |
+ |
|
65 |
+\subsection{Linear model for normalized intensities} |
|
66 |
+ |
|
67 |
+In this section, we fit a linear model to the normalized intensities |
|
68 |
+stratified by the diallic genotype call. The intercept and slope from |
|
69 |
+the linear model are both SNP- and batch-specific. The implementation |
|
70 |
+in the \crlmm{} package is encapsulated by the function |
|
71 |
+\Rfunction{crlmmCopynumber} that, using the default settings, can be |
|
72 |
+called by passing a single object of class \Rclass{CNSet}. |
|
73 |
+ |
|
65 | 74 |
<<LDS_copynumber,cache=TRUE>>= |
66 | 75 |
crlmmCopynumber(cnSet) |
67 | 76 |
@ |
... | ... |
@@ -105,12 +114,7 @@ described in the \R{} package \Rpackage{ff}. The value returned by |
105 | 114 |
\Rfunction{crlmmCopynumber} is \verb+TRUE+, indicating that the files |
106 | 115 |
on disk have been successfully updated. Note that while the |
107 | 116 |
\Robject{cnSet} object is unchanged, the values on disk are different. |
108 |
-On the other hand, subsetting the \Robject{cnSet} with the `[' method |
|
109 |
-coerces all of the elements to class \Rclass{matrix}. The |
|
110 |
-batch-specific summaries are now ordinary matrices stored in RAM. The |
|
111 |
-object returned by \Robject{crlmmCopynumber} is an object of class |
|
112 |
-\Rclass{CNSet} with the matrices in the \verb+batchStatistics+ slot |
|
113 |
-updated. |
|
117 |
+ |
|
114 | 118 |
|
115 | 119 |
<<chr1index>>= |
116 | 120 |
chr1.index <- which(chromosome(cnSet) == 1) |
... | ... |
@@ -127,20 +131,10 @@ for(i in seq_along(nms)) cls[i] <- class(batchStatistics(cnSet2)[[nms[i]]])[1] |
127 | 131 |
all(cls == "matrix") |
128 | 132 |
@ |
129 | 133 |
|
130 |
-<<matrix_copynumber,cache=TRUE>>= |
|
131 |
-cnSet3 <- crlmmCopynumber(cnSet2) |
|
132 |
-class(cnSet3) |
|
133 |
-@ |
|
134 |
- |
|
135 | 134 |
<<clean, echo=FALSE, results=hide>>= |
136 | 135 |
rm(cnSet2); gc() |
137 | 136 |
@ |
138 | 137 |
|
139 |
-\subsection{Marker-specific estimates} |
|
140 |
-%\subsection{Raw copy number} |
|
141 |
- |
|
142 |
-%\paragraph{Log R ratios and B allele frequencies.} |
|
143 |
- |
|
144 | 138 |
\paragraph{Raw total copy number.} |
145 | 139 |
Several functions are available that will compute relatively quickly |
146 | 140 |
the allele-specific, \emph{raw} copy number estimates. At allele $k$, |
... | ... |
@@ -186,38 +180,49 @@ cb <- CB(cnSet, i=snp.index, j=1:2) |
186 | 180 |
|
187 | 181 |
|
188 | 182 |
\subsection{Container for log R ratios and B allele frequencies} |
183 |
+\label{sec:lrrBaf} |
|
189 | 184 |
|
190 | 185 |
A useful container for storing the \crlmm{} genotypes, genotype |
191 | 186 |
confidence scores, and the total or relative copy number at each |
192 |
-marker is the \Rclass{oligoSetList} class. Coercion of a |
|
193 |
-\Rclass{CNSet} object to a \Rfunction{oligoSnpSet} object can be |
|
194 |
-acheived by using the function \Rfunction{constructOligoSetFrom} as |
|
187 |
+marker is the \Rclass{BafLrrSetList} class. Coercion of a |
|
188 |
+\Rclass{CNSet} object to a \Rfunction{BafLrrSetList} object can be |
|
189 |
+acheived by the function \Rfunction{BafLrrSetList} as |
|
195 | 190 |
illustrated below. Users should note that if the \verb+assayData+ |
196 | 191 |
elements in the \Rclass{CNSet} instance are \Rpackage{ff} objects, the |
197 |
-\verb+assayData+ elements of each element in the \Rclass{oligoSetList} |
|
192 |
+\verb+assayData+ elements of each element in the \Rclass{BafLrrSetList} |
|
198 | 193 |
object will be \Rpackage{ff}-dervied objects (a new |
199 | 194 |
\verb+total_cn*.ff+ file will be created in the \verb+ldPath()+ |
200 |
-directory). |
|
195 |
+directory). The following code-chunk is not evalutated. |
|
196 |
+ |
|
197 |
+Estimation of log R ratios and B allele frequencies from an object of class \Rclass{CNSet} instantiated from genotyping Affymetrix CEL files or Illumina iDat files requires running the \R{} function \Rfunction{crlmmCopynumber} as a preliminary step. Specifying \texttt{fit.linearModel=FALSE} will speed up computation as this function will by default compute summary statistics unnecessary for computing BAFs and log R ratios. |
|
201 | 198 |
|
202 |
-<<oligoSnpSet>>= |
|
199 |
+<<callingCrlmmCopynumber,eval=FALSE>>= |
|
200 |
+crlmmCopynumber(cnSet, fit.linearModel=FALSE) |
|
201 |
+@ |
|
202 |
+ |
|
203 |
+<<oligoSnpSet,eval=FALSE>>= |
|
203 | 204 |
library(VanillaICE) |
204 |
-open(cnSet3) |
|
205 |
-oligoSetList <- constructOligoSetListFrom(cnSet3, batch.name=batch(cnSet3)[1]) |
|
206 |
-close(cnSet3) |
|
205 |
+open(cnSet) |
|
206 |
+oligoSetList <- BafLrrSetList(cnSet) |
|
207 |
+close(cnSet) |
|
207 | 208 |
show(oligoSetList) |
208 | 209 |
class(oligoSetList) |
209 | 210 |
## oligoSnpSet of first chromosome |
210 | 211 |
oligoSetList[[1]] |
211 | 212 |
@ |
212 | 213 |
|
213 |
-\noindent Note that log R ratios stored in the \Robject{oligoSnpSet} |
|
214 |
-object can be retrieved by the \Rfunction{copyNumber} accessor. B |
|
215 |
-allele frequences are retrieved by the \Rfunction{baf} accessor. |
|
214 |
+\noindent Log R ratios and B allele frequences can be retrieved by the |
|
215 |
+accessors \Rfunction{lrr} and \Rfunction{baf}, respectively. |
|
216 | 216 |
|
217 |
-<<testEqual>>= |
|
218 |
-lrrList <- copyNumber(oligoSetList) |
|
217 |
+<<testEqual,eval=FALSE>>= |
|
218 |
+lrrList <- lrr(oligoSetList) |
|
219 | 219 |
class(lrrList) |
220 | 220 |
dim(lrrList[[1]]) ## log R ratios for chromosome 1. |
221 | 221 |
bafList <- baf(oligoSetList) |
222 | 222 |
dim(bafList[[1]]) ## B allele frequencies for chromosome 1 |
223 | 223 |
@ |
224 |
+ |
|
225 |
+If the \Rfunction{crlmmCopynumber} function is not run prior to the |
|
226 |
+\Rfunction{BafLrrSetList} construction, the log R ratios and BAFs will |
|
227 |
+be initialized as NAs. |
|
228 |
+ |
... | ... |
@@ -9,7 +9,9 @@ |
9 | 9 |
\code{batchStatistics} will be \code{ff} objects. See details. |
10 | 10 |
} |
11 | 11 |
\usage{ |
12 |
-constructInf(sampleSheet = NULL, arrayNames = NULL, path = ".", arrayInfoColNames = list(barcode = "SentrixBarcode_A", position = "SentrixPosition_A"), highDensity = FALSE, sep = "_", fileExt = list(green = "Grn.idat", red = "Red.idat"), cdfName, verbose = FALSE, batch, saveDate = TRUE) |
|
12 |
+constructInf(sampleSheet = NULL, arrayNames = NULL, path = ".", |
|
13 |
+ arrayInfoColNames = list(barcode="SentrixBarcode_A",position="SentrixPosition_A"), highDensity = FALSE, sep = "_", |
|
14 |
+ fileExt = list(green = "Grn.idat", red = "Red.idat"), XY, cdfName, verbose = FALSE, batch=NULL, saveDate = TRUE) |
|
13 | 15 |
} |
14 | 16 |
\arguments{ |
15 | 17 |
\item{sampleSheet}{\code{data.frame} containing Illumina sample sheet |
... | ... |
@@ -35,6 +37,7 @@ constructInf(sampleSheet = NULL, arrayNames = NULL, path = ".", arrayInfoColName |
35 | 37 |
names.} |
36 | 38 |
\item{fileExt}{list containing elements 'Green' and 'Red' which |
37 | 39 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
40 |
+ \item{XY}{an \code{NChannelSet} containing X and Y intensities.} |
|
38 | 41 |
\item{cdfName}{ annotation package (see also \code{validCdfNames})} |
39 | 42 |
\item{verbose}{ 'logical.' Whether to print descriptive messages |
40 | 43 |
during processing.} |
... | ... |
@@ -10,11 +10,11 @@ |
10 | 10 |
\usage{ |
11 | 11 |
genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
12 | 12 |
arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), |
13 |
- highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), |
|
14 |
- cdfName, copynumber=TRUE, batch, saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, |
|
15 |
- mixtureSampleSize=10^5, fitMixture=TRUE, eps =0.1, verbose = TRUE, seed = 1, |
|
16 |
- sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
|
17 |
- gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
13 |
+ highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), XY=NULL, |
|
14 |
+ call.method="crlmm", cdfName, copynumber=TRUE, batch=NULL, saveDate=TRUE, stripNorm=TRUE, |
|
15 |
+ useTarget=TRUE, quantile.method="between", mixtureSampleSize=10^5, fitMixture=TRUE, |
|
16 |
+ eps =0.1, verbose = TRUE, seed = 1, sns, probs = rep(1/3, 3), DF = 6, SNRMin = 5, |
|
17 |
+ recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
18 | 18 |
} |
19 | 19 |
\arguments{ |
20 | 20 |
\item{sampleSheet}{\code{data.frame} containing Illumina sample sheet |
... | ... |
@@ -41,6 +41,8 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
41 | 41 |
names.} |
42 | 42 |
\item{fileExt}{list containing elements 'Green' and 'Red' which |
43 | 43 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
44 |
+ \item{XY}{\code{NChannelSet} containing X and Y intensities.} |
|
45 |
+ \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').} |
|
44 | 46 |
\item{cdfName}{ annotation package (see also \code{validCdfNames})} |
45 | 47 |
\item{copynumber}{ 'logical.' Whether to store copy number intensities with SNP output.} |
46 | 48 |
\item{batch}{ character vector indicating the batch variable. Must be |
... | ... |
@@ -50,6 +52,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
50 | 52 |
\item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
51 | 53 |
\item{useTarget}{'logical' (only used when \code{stripNorm=TRUE}). |
52 | 54 |
Should the reference HapMap intensities be used in strip-level normalization?} |
55 |
+ \item{quantile.method}{character string specifying the quantile normalization method to use ('within' or 'between' channels).} |
|
53 | 56 |
\item{mixtureSampleSize}{ Sample size to be use when fitting the mixture model.} |
54 | 57 |
\item{fitMixture}{ 'logical.' Whether to fit per-array mixture model.} |
55 | 58 |
\item{eps}{ Stop criteria.} |
... | ... |
@@ -88,8 +91,7 @@ genotype.Illumina(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", |
88 | 91 |
Rather, \code{batch} is required in order to initialize a |
89 | 92 |
\code{CNSet} container with the appropriate dimensions. |
90 | 93 |
|
91 |
- |
|
92 |
- |
|
94 |
+ The new 'krlmm' option is available for certain chip types. |
|
93 | 95 |
} |
94 | 96 |
|
95 | 97 |
\value{ A \code{SnpSuperSet} instance.} |
... | ... |
@@ -9,7 +9,7 @@ |
9 | 9 |
\description{ |
10 | 10 |
|
11 | 11 |
Genotyping of Illumina Infinium II arrays. This function |
12 |
- provides CRLMM genotypes and confidence scores for the the |
|
12 |
+ provides CRLMM/KRLMM genotypes and confidence scores for the the |
|
13 | 13 |
polymorphic markers and is a required step prior to copy |
14 | 14 |
number estimation. |
15 | 15 |
|
... | ... |
@@ -17,7 +17,7 @@ |
17 | 17 |
\usage{ |
18 | 18 |
genotypeInf(cnSet, mixtureParams, probs = rep(1/3, 3), SNRMin = 5, |
19 | 19 |
recallMin = 10, recallRegMin = 1000, verbose = TRUE, returnParams = |
20 |
-TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName) |
|
20 |
+TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName, call.method="crlmm") |
|
21 | 21 |
} |
22 | 22 |
%- maybe also 'usage' for other objects documented here. |
23 | 23 |
\arguments{ |
... | ... |
@@ -43,11 +43,12 @@ TRUE, badSNP = 0.7, gender = NULL, DF = 6, cdfName) |
43 | 43 |
t-distribution.} |
44 | 44 |
\item{cdfName}{\code{character} string indicating which annotation |
45 | 45 |
package to load.} |
46 |
+ \item{call.method}{character string specifying the genotype calling algorithm to use ('crlmm' or 'krlmm').} |
|
46 | 47 |
} |
47 | 48 |
|
48 | 49 |
\details{ |
49 | 50 |
|
50 |
- The CRLMM genotype calls and confidence scores are written to |
|
51 |
+ The genotype calls and confidence scores are written to |
|
51 | 52 |
file using \code{ff} protocols for I/O. For the most part, |
52 | 53 |
the calls and confidence scores can be accessed as though the |
53 | 54 |
data is in memory through the methods \code{snpCall} and |
... | ... |
@@ -19,9 +19,9 @@ |
19 | 19 |
preprocessInf(cnSet, sampleSheet=NULL, arrayNames = NULL, ids = NULL, |
20 | 20 |
path = ".", arrayInfoColNames = list(barcode = "SentrixBarcode_A", |
21 | 21 |
position = "SentrixPosition_A"), highDensity = TRUE, sep = "_", fileExt |
22 |
-= list(green = "Grn.idat", red = "Red.idat"), saveDate = TRUE, stripNorm |
|
23 |
-= TRUE, useTarget = TRUE, mixtureSampleSize = 10^5, fitMixture = TRUE, |
|
24 |
-eps = 0.1, verbose = TRUE, seed = 1, cdfName) |
|
22 |
+= list(green = "Grn.idat", red = "Red.idat"), XY, saveDate = TRUE, stripNorm |
|
23 |
+= TRUE, useTarget = TRUE, mixtureSampleSize = 10^5, fitMixture = TRUE, |
|
24 |
+quantile.method="between", eps = 0.1, verbose = TRUE, seed = 1, cdfName) |
|
25 | 25 |
} |
26 | 26 |
|
27 | 27 |
\arguments{ |
... | ... |
@@ -58,6 +58,7 @@ eps = 0.1, verbose = TRUE, seed = 1, cdfName) |
58 | 58 |
names.} |
59 | 59 |
\item{fileExt}{list containing elements 'Green' and 'Red' which |
60 | 60 |
specify the .idat file extension for the Cy3 and Cy5 channels.} |
61 |
+ \item{XY}{an \code{NChannelSet} object containing X and Y intensities.} |
|
61 | 62 |
\item{saveDate}{'logical'. Should the dates from each .idat be saved |
62 | 63 |
with sample information?} |
63 | 64 |
\item{stripNorm}{'logical'. Should the data be strip-level normalized?} |
... | ... |
@@ -65,7 +66,8 @@ eps = 0.1, verbose = TRUE, seed = 1, cdfName) |
65 | 66 |
Should the reference HapMap intensities be used in strip-level normalization?} |
66 | 67 |
\item{mixtureSampleSize}{ Sample size to be use when fitting the mixture model.} |
67 | 68 |
\item{fitMixture}{ 'logical.' Whether to fit per-array mixture |
68 |
- model. } |
|
69 |
+ model.} |
|
70 |
+ \item{quantile.method}{character string specifying the quantile normalization method to use ('within' or 'between' channels).} |
|
69 | 71 |
\item{eps}{ Stop criteria.} |
70 | 72 |
\item{verbose}{ 'logical.' Whether to print descriptive messages during processing.} |
71 | 73 |
\item{seed}{ Seed to be used when sampling. Useful for |