git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45865 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -509,3 +509,9 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error. |
509 | 509 |
** crlmmIllumina2() and crlmmIlluminaV2() now use the crlmmGT2() |
510 | 510 |
function (which expects ff objects and supports parallel processing) |
511 | 511 |
** updated crlmmIllumina.pdf vignette |
512 |
+ |
|
513 |
+2010-04-11 R. Scharpf committed version 1.5.48 |
|
514 |
+** added a few .Rd files |
|
515 |
+ |
|
516 |
+ |
|
517 |
+ |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.5.47 |
|
4 |
+Version: 1.5.48 |
|
5 | 5 |
Date: 2010-04-09 |
6 | 6 |
Author: Rafael A Irizarry, Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au> |
7 | 7 |
Maintainer: Benilton S Carvalho <carvalho@bclab.org>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -23,8 +23,7 @@ Imports: affyio (>= 1.15.2), |
23 | 23 |
utils |
24 | 24 |
Suggests: hapmapsnp6, |
25 | 25 |
genomewidesnp6Crlmm (>= 1.0.2), |
26 |
- snpMatrix, |
|
27 |
- metaArray |
|
26 |
+ snpMatrix |
|
28 | 27 |
Collate: AllGenerics.R |
29 | 28 |
AllClasses.R |
30 | 29 |
methods-CNSet.R |
... | ... |
@@ -29,7 +29,7 @@ importClassesFrom(oligoClasses, SnpSuperSet, AlleleSet, CNSet) |
29 | 29 |
importMethodsFrom(oligoClasses, allele, calls, "calls<-", confs, |
30 | 30 |
"confs<-", cnConfidence, "cnConfidence<-", isSnp, |
31 | 31 |
chromosome, position, CA, "CA<-", CB, "CB<-", A, B, |
32 |
- "A<-", "B<-") |
|
32 |
+ "A<-", "B<-", open, close) |
|
33 | 33 |
|
34 | 34 |
importFrom(oligoClasses, chromosome2integer, celfileDate, list.celfiles, |
35 | 35 |
copyNumber, initializeBigMatrix, initializeBigVector) |
... | ... |
@@ -56,25 +56,20 @@ importFrom(ellipse, ellipse) |
56 | 56 |
importFrom(ff, ffdf) |
57 | 57 |
|
58 | 58 |
exportClasses(CNSetLM) |
59 |
-exportMethods(copyNumber, open, "[", show, lM, "lM<-") |
|
59 |
+exportMethods(open, "[", show, lM) |
|
60 | 60 |
export(crlmm, |
61 | 61 |
crlmmCopynumber, |
62 | 62 |
crlmmIllumina, |
63 |
- crlmmIlluminaRS, |
|
64 |
- ellipse, |
|
63 |
+## ellipse, |
|
65 | 64 |
genotype, |
66 |
- getParam, |
|
67 | 65 |
readIdatFiles, |
68 | 66 |
snprma, |
69 | 67 |
snprma2, |
70 | 68 |
crlmm2, |
71 | 69 |
genotype2, |
72 |
- cnrma2, |
|
73 |
- processCEL2, |
|
74 | 70 |
batch, |
75 | 71 |
crlmmCopynumber2) |
76 |
- |
|
77 |
-export(initializeParamObject) |
|
72 |
+##export(initializeParamObject, biasAdjNP2) |
|
78 | 73 |
|
79 | 74 |
|
80 | 75 |
|
... | ... |
@@ -75,25 +75,11 @@ construct <- function(filenames, cdfName, copynumber=FALSE, |
75 | 75 |
sampleNames(callSet) <- sns |
76 | 76 |
return(callSet) |
77 | 77 |
} |
78 |
-setMethod("close", "AlleleSet", function(con, ...){ |
|
79 |
- object <- con |
|
80 |
- names <- ls(assayData(object)) |
|
81 |
- L <- length(names) |
|
82 |
- for(i in 1:L) close(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i])))) |
|
83 |
- return() |
|
84 |
-}) |
|
85 |
-setMethod("open", "AlleleSet", function(con, ...){ |
|
86 |
- object <- con |
|
87 |
- names <- ls(assayData(object)) |
|
88 |
- L <- length(names) |
|
89 |
- for(i in 1:L) open(eval(substitute(assayData(object)[[NAME]], list(NAME=names[i])))) |
|
90 |
- return() |
|
91 |
-}) |
|
92 |
-##setReplaceMethod("calls", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "call", value)) |
|
93 |
-##setReplaceMethod("confs", "SnpSuperSet", function(object, value) assayDataElementReplace(object, "callProbability", value)) |
|
94 |
-##setMethod("confs", "SnpSuperSet", function(object) assayDataElement(object, "callProbability")) |
|
78 |
+ |
|
79 |
+ |
|
95 | 80 |
genotype <- function(filenames, |
96 | 81 |
cdfName, |
82 |
+ batch, |
|
97 | 83 |
mixtureSampleSize=10^5, |
98 | 84 |
eps=0.1, |
99 | 85 |
verbose=TRUE, |
... | ... |
@@ -111,6 +97,12 @@ genotype <- function(filenames, |
111 | 97 |
if(missing(cdfName)) stop("must specify cdfName") |
112 | 98 |
if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") |
113 | 99 |
if(missing(sns)) sns <- basename(filenames) |
100 |
+ if(missing(batch)){ |
|
101 |
+ warning("The batch variable is not specified. The scan date of the array will be used as a surrogate for batch. The batch variable does not affect the preprocessing or genotyping, but is important for copy number estimation.") |
|
102 |
+ } else { |
|
103 |
+ if(length(batch) != length(filenames)) |
|
104 |
+ stop("batch variable must be the same length as the filenames") |
|
105 |
+ } |
|
114 | 106 |
## callSet contains potentially very big matrices |
115 | 107 |
## More big matrices are created within snprma, that will then be removed. |
116 | 108 |
callSet <- construct(filenames=filenames, |
... | ... |
@@ -118,6 +110,9 @@ genotype <- function(filenames, |
118 | 110 |
copynumber=copynumber, |
119 | 111 |
sns=sns, |
120 | 112 |
verbose=verbose) |
113 |
+ if(missing(batch)){ |
|
114 |
+ protocolData(callSet)$batch <- as.numeric(as.factor(protocolData(callSet)$ScanDate)) |
|
115 |
+ } |
|
121 | 116 |
mixtureParams <- matrix(NA, 4, length(filenames)) |
122 | 117 |
snp.index <- which(isSnp(callSet)==1) |
123 | 118 |
batches <- splitIndicesByLength(1:ncol(callSet), ocSamples()) |
... | ... |
@@ -193,13 +188,13 @@ genotype2 <- function(filenames, |
193 | 188 |
gender=NULL, |
194 | 189 |
returnParams=TRUE, |
195 | 190 |
badSNP=0.7){ |
191 |
+ if(!isPackageLoaded("ff")) stop("Must load package 'ff'") |
|
196 | 192 |
if(!copynumber){ |
197 | 193 |
callSet <- crlmm2(filenames=filenames, |
198 | 194 |
cdfName=cdfName, |
199 | 195 |
mixtureSampleSize=mixtureSampleSize, |
200 | 196 |
eps=eps, |
201 | 197 |
verbose=verbose, |
202 |
- seed=seed, |
|
203 | 198 |
sns=sns, |
204 | 199 |
probs=probs, |
205 | 200 |
DF=DF, |
... | ... |
@@ -232,9 +227,7 @@ genotype2 <- function(filenames, |
232 | 227 |
if(!missing(batch)) protocolData(callSet)$batch <- batch |
233 | 228 |
mixtureParams <- matrix(NA, 4, length(filenames)) |
234 | 229 |
snp.index <- which(isSnp(callSet)==1) |
235 |
- snprmaRes <- snprma2(##A=A(callSet), |
|
236 |
- ##B=B(callSet), |
|
237 |
- filenames=filenames, |
|
230 |
+ snprmaRes <- snprma2(filenames=filenames, |
|
238 | 231 |
mixtureSampleSize=mixtureSampleSize, |
239 | 232 |
fitMixture=TRUE, |
240 | 233 |
eps=eps, |
... | ... |
@@ -249,13 +242,9 @@ genotype2 <- function(filenames, |
249 | 242 |
open(snprmaRes[["SKW"]]) |
250 | 243 |
open(snprmaRes[["mixtureParams"]]) |
251 | 244 |
if(verbose) message("Updating elements of callSet") |
252 |
-## A(callSet) <- snprmaRes[["A"]] |
|
253 |
-## B(callSet) <- snprmaRes[["B"]] |
|
254 | 245 |
bb = ocProbesets()*ncol(A)*8 |
255 | 246 |
ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]], BATCHBYTES=bb) |
256 | 247 |
ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]], BATCHBYTES=bb) |
257 |
- ##ffrowapply(A(callSet)[i1:i2, ] <- snprmaRes[["A"]][i1:i2, ], X=snprmaRes[["A"]]) |
|
258 |
- ##ffrowapply(B(callSet)[i1:i2, ] <- snprmaRes[["B"]][i1:i2, ], X=snprmaRes[["B"]]) |
|
259 | 248 |
if(verbose) message("Finished updating elements of callSet") |
260 | 249 |
stopifnot(identical(featureNames(callSet)[snp.index], snprmaRes$gns)) |
261 | 250 |
pData(callSet)$SKW <- snprmaRes$SKW |
... | ... |
@@ -289,435 +278,14 @@ genotype2 <- function(filenames, |
289 | 278 |
badSNP=badSNP) |
290 | 279 |
open(tmp[["calls"]]) |
291 | 280 |
open(tmp[["confs"]]) |
292 |
- ##snpCall(callSet) <- tmp[["calls"]] |
|
293 |
- ##snpCallProbability(callSet) <- tmp[["confs"]] |
|
294 | 281 |
bb = ocProbesets()*ncol(A)*8 |
295 | 282 |
ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]], BATCHBYTES=bb) |
296 | 283 |
ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]], BATCHBYTES=bb) |
297 |
-## ffrowapply(snpCall(callSet)[i1:i2, ] <- tmp[["calls"]][i1:i2, ], X=tmp[["calls"]]) |
|
298 |
-## ffrowapply(snpCallProbability(callSet)[i1:i2, ] <- tmp[["confs"]][i1:i2, ], X=tmp[["confs"]]) |
|
299 | 284 |
callSet$gender <- tmp$gender |
300 | 285 |
cnSet <- as(callSet, "CNSetLM") |
301 | 286 |
return(cnSet) |
302 | 287 |
} |
303 | 288 |
|
304 |
-snprma2RS <- function(A, B, |
|
305 |
- filenames, mixtureSampleSize=10^5, fitMixture=FALSE, |
|
306 |
- eps=0.1, verbose=TRUE, seed=1, cdfName, sns){ |
|
307 |
- if (missing(sns)) sns <- basename(filenames) |
|
308 |
- if (missing(cdfName)) |
|
309 |
- cdfName <- read.celfile.header(filenames[1])[["cdfName"]] |
|
310 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
311 |
- stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
312 |
- |
|
313 |
- if(verbose) message("Loading annotations and mixture model parameters.") |
|
314 |
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
315 |
- pnsa <- getVarInEnv("pnsa") |
|
316 |
- pnsb <- getVarInEnv("pnsb") |
|
317 |
- gns <- getVarInEnv("gns") |
|
318 |
- |
|
319 |
- ##We will read each cel file, summarize, and run EM one by one |
|
320 |
- ##We will save parameters of EM to use later |
|
321 |
- if(verbose) message("Initializing objects.") |
|
322 |
- mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double") |
|
323 |
- SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double") |
|
324 |
- SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double") |
|
325 |
- |
|
326 |
- ## This is the sample for the fitting of splines |
|
327 |
- ## BC: I like better the idea of the user passing the seed, |
|
328 |
- ## because this might intefere with other analyses |
|
329 |
- ## (like what happened to GCRMA) |
|
330 |
- ##S will hold (A+B)/2 and M will hold A-B |
|
331 |
- ##NOTE: We actually dont need to save S. Only for pics etc... |
|
332 |
- ##f is the correction. we save to avoid recomputing |
|
333 |
-## A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer") |
|
334 |
-## B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer") |
|
335 |
- |
|
336 |
- sampleBatches <- splitIndicesByNode(seq(along=filenames)) |
|
337 |
- |
|
338 |
- if(verbose) message("Processing ", length(filenames), " files.") |
|
339 |
- |
|
340 |
- ocLapply(sampleBatches, processCEL_RS, filenames=filenames, |
|
341 |
- fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR, |
|
342 |
- mixtureParams=mixtureParams, eps=eps, seed=seed, |
|
343 |
- mixtureSampleSize=mixtureSampleSize, pkgname=pkgname, |
|
344 |
- neededPkgs=c("crlmm", pkgname)) |
|
345 |
- |
|
346 |
- list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
347 |
-} |
|
348 |
- |
|
349 |
-processCEL_RS <- function(i, filenames, fitMixture, A, B, SKW, SNR, |
|
350 |
- mixtureParams, eps, seed, mixtureSampleSize, |
|
351 |
- pkgname){ |
|
352 |
- |
|
353 |
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
354 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
355 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
356 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
357 |
- pnsa <- getVarInEnv("pnsa") |
|
358 |
- pnsb <- getVarInEnv("pnsb") |
|
359 |
- fid <- getVarInEnv("fid") |
|
360 |
- reference <- getVarInEnv("reference") |
|
361 |
- aIndex <- getVarInEnv("aIndex") |
|
362 |
- bIndex <- getVarInEnv("bIndex") |
|
363 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
364 |
- theKnots <- getVarInEnv("theKnots") |
|
365 |
- gns <- getVarInEnv("gns") |
|
366 |
- |
|
367 |
- ## for mixture |
|
368 |
- set.seed(seed) |
|
369 |
- idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
370 |
- ##for skewness. no need to do everything |
|
371 |
- idx2 <- sample(length(fid), 10^5) |
|
372 |
- |
|
373 |
- open(A) |
|
374 |
- open(B) |
|
375 |
- open(SKW) |
|
376 |
- open(mixtureParams) |
|
377 |
- open(SNR) |
|
378 |
- ##RS |
|
379 |
- iii <- seq(along=pnsa) |
|
380 |
- |
|
381 |
- for (k in i){ |
|
382 |
- y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
383 |
- x <- log2(y[idx2]) |
|
384 |
- SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
385 |
- rm(x) |
|
386 |
- y <- normalize.quantiles.use.target(y, target=reference) |
|
387 |
- A[iii, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
388 |
- B[iii, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
389 |
- rm(y) |
|
390 |
- |
|
391 |
- if(fitMixture){ |
|
392 |
- S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN |
|
393 |
- M <- log2(A[idx, k])-log2(B[idx, k]) |
|
394 |
- tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
395 |
- mixtureParams[, k] <- tmp[["coef"]] |
|
396 |
- SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
397 |
- } else { |
|
398 |
- mixtureParams[, k] <- NA |
|
399 |
- SNR[k] <- NA |
|
400 |
- } |
|
401 |
- } |
|
402 |
- close(A) |
|
403 |
- close(B) |
|
404 |
- close(SKW) |
|
405 |
- close(mixtureParams) |
|
406 |
- close(SNR) |
|
407 |
- TRUE |
|
408 |
-} |
|
409 |
- |
|
410 |
-crlmmGT2_RS<- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
411 |
- col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
412 |
- SNRMin=5, recallMin=10, recallRegMin=1000, |
|
413 |
- gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
414 |
- returnParams=FALSE, badSNP=.7){ |
|
415 |
- open(SNR) |
|
416 |
- open(A) |
|
417 |
- open(B) |
|
418 |
- open(mixtureParams) |
|
419 |
- ## expect objects to be ff |
|
420 |
- |
|
421 |
- keepIndex <- which( SNR[] > SNRMin) |
|
422 |
- if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
423 |
- |
|
424 |
- NC <- ncol(A) |
|
425 |
- NR <- nrow(A) |
|
426 |
- |
|
427 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
428 |
- stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
429 |
- |
|
430 |
- if(verbose) message("Loading annotations.") |
|
431 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
432 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
433 |
- ## this is toget rid of the 'no visible binding' notes |
|
434 |
- ## variable definitions |
|
435 |
- XIndex <- getVarInEnv("XIndex") |
|
436 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
437 |
- YIndex <- getVarInEnv("YIndex") |
|
438 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
439 |
- theKnots <- getVarInEnv("theKnots") |
|
440 |
- regionInfo <- getVarInEnv("regionInfo") |
|
441 |
- params <- getVarInEnv("params") |
|
442 |
- ##RS |
|
443 |
- pnsa <- getVarInEnv("pnsa") |
|
444 |
- NR <- length(pnsa) |
|
445 |
- |
|
446 |
- ##IF gender not provide, we predict |
|
447 |
- if(is.null(gender)){ |
|
448 |
- if(verbose) message("Determining gender.") |
|
449 |
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
450 |
- if(sum(SNR[] > SNRMin)==1){ |
|
451 |
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
452 |
- }else{ |
|
453 |
- gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]] |
|
454 |
- } |
|
455 |
- } |
|
456 |
- |
|
457 |
- Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
458 |
- cIndexes <- list(keepIndex, |
|
459 |
- keepIndex[which(gender[keepIndex]==2)], |
|
460 |
- keepIndex[which(gender[keepIndex]==1)]) |
|
461 |
- |
|
462 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
463 |
- |
|
464 |
- ## call C |
|
465 |
- fIndex <- which(gender==2) |
|
466 |
- mIndex <- which(gender==1) |
|
467 |
- |
|
468 |
- ## different here |
|
469 |
- ## use gtypeCallerR in batches |
|
470 |
- ##RS |
|
471 |
- snpBatches <- splitIndicesByLength(1:NR, ocProbesets()) |
|
472 |
- newparamsBatch <- vector("list", length(snpBatches)) |
|
473 |
- |
|
474 |
- process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
475 |
- YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
476 |
- params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){ |
|
477 |
- open(A) |
|
478 |
- open(B) |
|
479 |
- open(mixtureParams) |
|
480 |
- snps <- snpBatches[[idxBatch]] |
|
481 |
- rSnps <- range(snps) |
|
482 |
- last <- (idxBatch-1)*batchSize |
|
483 |
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
484 |
- XIndex[XIndex %in% snps]-last, |
|
485 |
- YIndex[YIndex %in% snps]-last) |
|
486 |
- IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
487 |
- tmpA <- as.matrix(A[snps,]) |
|
488 |
- tmpB <- as.matrix(B[snps,]) |
|
489 |
- ## newparamsBatch[[idxBatch]] |
|
490 |
- |
|
491 |
- tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
492 |
- params[["centers"]][snps,], |
|
493 |
- params[["scales"]][snps,], |
|
494 |
- params[["N"]][snps,], |
|
495 |
- IndexesBatch, cIndexes, |
|
496 |
- sapply(IndexesBatch, length), |
|
497 |
- sapply(cIndexes, length), SMEDIAN, |
|
498 |
- theKnots, mixtureParams[], DF, probs, 0.025) |
|
499 |
- |
|
500 |
- last <- rSnps[2] |
|
501 |
- rm(snps, rSnps, IndexesBatch, tmpA, tmpB) |
|
502 |
- close(A) |
|
503 |
- close(B) |
|
504 |
- close(mixtureParams) |
|
505 |
- tmp |
|
506 |
- } |
|
507 |
- |
|
508 |
- newparamsBatch <- ocLapply(seq(along=snpBatches), process1, |
|
509 |
- snpBatches=snpBatches, |
|
510 |
- autosomeIndex=autosomeIndex, XIndex=XIndex, |
|
511 |
- YIndex=YIndex, A=A, B=B, |
|
512 |
- mixtureParams=mixtureParams, fIndex=fIndex, |
|
513 |
- mIndex=mIndex, params=params, |
|
514 |
- cIndexes=cIndexes, SMEDIAN=SMEDIAN, |
|
515 |
- theKnots=theKnots, DF=DF, probs=probs, |
|
516 |
- batchSize=ocProbesets()) |
|
517 |
-## last <- 0 |
|
518 |
-## for (idxBatch in seq(along=snpBatches)){ |
|
519 |
-## snps <- snpBatches[[idxBatch]] |
|
520 |
-## rSnps <- range(snps) |
|
521 |
-## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
522 |
-## XIndex[XIndex %in% snps]-last, |
|
523 |
-## YIndex[YIndex %in% snps]-last) |
|
524 |
-## IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
525 |
-## tmpA <- A[snps,] |
|
526 |
-## tmpB <- B[snps,] |
|
527 |
-## newparamsBatch[[idxBatch]] <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
528 |
-## params[["centers"]][snps,], |
|
529 |
-## params[["scales"]][snps,], |
|
530 |
-## params[["N"]][snps,], |
|
531 |
-## IndexesBatch, cIndexes, |
|
532 |
-## sapply(IndexesBatch, length), |
|
533 |
-## sapply(cIndexes, length), |
|
534 |
-## SMEDIAN, theKnots, |
|
535 |
-## mixtureParams[], DF, probs, 0.025) |
|
536 |
-## last <- rSnps[2] |
|
537 |
-## rm(snps, rSnps, IndexesBatch, tmpA, tmpB) |
|
538 |
-## } |
|
539 |
-## rm(last) |
|
540 |
- |
|
541 |
- newparams <- vector("list", 3) |
|
542 |
- names(newparams) <- c("centers", "scales", "N") |
|
543 |
- newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1)) |
|
544 |
- newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2)) |
|
545 |
- newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3)) |
|
546 |
- rm(newparamsBatch) |
|
547 |
- if(verbose) message("Done.") |
|
548 |
- if(verbose) message("Estimating recalibration parameters.") |
|
549 |
- d <- newparams[["centers"]] - params$centers |
|
550 |
- |
|
551 |
- ##regression |
|
552 |
- Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
553 |
- newparams[["N"]][, 2], |
|
554 |
- newparams[["N"]][, 3]) > recallMin & |
|
555 |
- !apply(regionInfo, 1, any)), |
|
556 |
- autosomeIndex) |
|
557 |
- if(length(Index) < recallRegMin){ |
|
558 |
- warning("Recalibration not possible. Possible cause: small sample size.") |
|
559 |
- newparams <- params |
|
560 |
- dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
561 |
- SS <- matrix(Inf, 3, 3) |
|
562 |
- DD <- 0 |
|
563 |
- }else{ |
|
564 |
- data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
565 |
- names(data4reg) <- c("AA", "AB", "BB") |
|
566 |
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
567 |
- c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
568 |
- coef(lm(BB~AA*AB, data=data4reg))) |
|
569 |
- rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
570 |
- rm(data4reg) |
|
571 |
- |
|
572 |
- minN <- 3 |
|
573 |
- newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
574 |
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
575 |
- if(verbose) cat("Filling out empty centers") |
|
576 |
- for(i in Index){ |
|
577 |
- if(verbose) if(i%%10000==0)cat(".") |
|
578 |
- mu <- newparams[["centers"]][i, ] |
|
579 |
- j <- which(is.na(mu)) |
|
580 |
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
581 |
- } |
|
582 |
- |
|
583 |
- ##remaing NAs are made like originals |
|
584 |
- if(length(YIndex)>0){ |
|
585 |
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
586 |
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
587 |
- } |
|
588 |
- snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0) |
|
589 |
- snps2keep <- setdiff(autosomeIndex, snps2ignore) |
|
590 |
- rm(snps2ignore) |
|
591 |
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
592 |
- if(verbose) cat("\n") |
|
593 |
- |
|
594 |
- if(verbose) message("Calculating and standardizing size of shift.") |
|
595 |
- GG <- DD <- newparams[["centers"]] - params[["centers"]] |
|
596 |
- DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
597 |
- SS <- cov(DD[autosomeIndex, ]) |
|
598 |
- SSI <- solve(SS) |
|
599 |
- dev <- vector("numeric", nrow(DD)) |
|
600 |
- if(length(YIndex)){ |
|
601 |
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
602 |
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
603 |
- ##Now Y (only two params) |
|
604 |
- SSY <- SS[c(1, 3), c(1, 3)] |
|
605 |
- SSI <- solve(SSY) |
|
606 |
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
607 |
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
608 |
- } else { |
|
609 |
- dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
610 |
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
611 |
- } |
|
612 |
- } |
|
613 |
- |
|
614 |
- ## BC: must keep SD |
|
615 |
- params[-2] <- newparams[-2] |
|
616 |
- |
|
617 |
- rm(newparams);gc(verbose=FALSE) |
|
618 |
- if(verbose) cat("Calling", NR, "SNPs... ") |
|
619 |
- ## ################### |
|
620 |
- ## ## MOVE TO C####### |
|
621 |
- |
|
622 |
- ## running in batches |
|
623 |
- ## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
624 |
- |
|
625 |
- process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
626 |
- YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
627 |
- params, cIndexes, SMEDIAN, theKnots, DF, probs, |
|
628 |
- regionInfo, batchSize){ |
|
629 |
- open(A) |
|
630 |
- open(B) |
|
631 |
- open(mixtureParams) |
|
632 |
- snps <- snpBatches[[idxBatch]] |
|
633 |
- tmpA <- as.matrix(A[snps,]) |
|
634 |
- tmpB <- as.matrix(B[snps,]) |
|
635 |
- rSnps <- range(snps) |
|
636 |
- last <- (idxBatch-1)*batchSize |
|
637 |
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
638 |
- XIndex[XIndex %in% snps]-last, |
|
639 |
- YIndex[YIndex %in% snps]-last) |
|
640 |
- IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
641 |
- ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
642 |
- params[["centers"]][snps,], |
|
643 |
- params[["scales"]][snps,], |
|
644 |
- params[["N"]][snps,], |
|
645 |
- IndexesBatch, cIndexes, |
|
646 |
- sapply(IndexesBatch, length), |
|
647 |
- sapply(cIndexes, length), |
|
648 |
- SMEDIAN, theKnots, mixtureParams[], |
|
649 |
- DF, probs, 0.025, |
|
650 |
- which(regionInfo[snps, 2]), |
|
651 |
- which(regionInfo[snps, 1])) |
|
652 |
- A[snps,] <- tmpA |
|
653 |
- B[snps,] <- tmpB |
|
654 |
- last <- rSnps[2] |
|
655 |
- rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull) |
|
656 |
- close(A) |
|
657 |
- close(B) |
|
658 |
- close(mixtureParams) |
|
659 |
- } |
|
660 |
- |
|
661 |
- ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches, |
|
662 |
- autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex, |
|
663 |
- A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex, |
|
664 |
- mIndex=mIndex, params=params, cIndexes=cIndexes, |
|
665 |
- SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs, |
|
666 |
- regionInfo=regionInfo, batchSize=ocProbesets()) |
|
667 |
- |
|
668 |
-## last <- 0 |
|
669 |
-## for (idxBatch in seq(along=snpBatches)){ |
|
670 |
-## snps <- snpBatches[[idxBatch]] |
|
671 |
-## tmpA <- A[snps,] |
|
672 |
-## tmpB <- B[snps,] |
|
673 |
-## rSnps <- range(snps) |
|
674 |
-## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
675 |
-## XIndex[XIndex %in% snps]-last, |
|
676 |
-## YIndex[YIndex %in% snps]-last) |
|
677 |
-## IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
678 |
-## ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
679 |
-## params[["centers"]][snps,], |
|
680 |
-## params[["scales"]][snps,], |
|
681 |
-## params[["N"]][snps,], |
|
682 |
-## IndexesBatch, cIndexes, |
|
683 |
-## sapply(IndexesBatch, length), |
|
684 |
-## sapply(cIndexes, length), |
|
685 |
-## SMEDIAN, theKnots, mixtureParams[], |
|
686 |
-## DF, probs, 0.025, |
|
687 |
-## which(regionInfo[snps, 2]), |
|
688 |
-## which(regionInfo[snps, 1])) |
|
689 |
-## A[snps,] <- tmpA |
|
690 |
-## B[snps,] <- tmpB |
|
691 |
-## last <- rSnps[2] |
|
692 |
-## rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull) |
|
693 |
-## } |
|
694 |
-## |
|
695 |
-## gc(verbose=FALSE) |
|
696 |
- ## END MOVE TO C####### |
|
697 |
- ## ################## |
|
698 |
- |
|
699 |
- dev <- dev/(dev+1/383) |
|
700 |
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
701 |
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
702 |
- |
|
703 |
- if(length(Index) >= recallRegMin){ |
|
704 |
- tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
|
705 |
- tmpSnpQc <- dev[autosomeIndex] |
|
706 |
- SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
|
707 |
- batchQC <- mean(diag(SS)) |
|
708 |
- }else{ |
|
709 |
- SS <- matrix(0, 3, 3) |
|
710 |
- batchQC <- Inf |
|
711 |
- } |
|
712 |
- |
|
713 |
- if(verbose) message("Done.") |
|
714 |
- if (returnParams){ |
|
715 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
716 |
- }else{ |
|
717 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
718 |
- } |
|
719 |
-} |
|
720 |
- |
|
721 | 289 |
|
722 | 290 |
##--------------------------------------------------------------------------- |
723 | 291 |
##--------------------------------------------------------------------------- |
... | ... |
@@ -853,17 +421,13 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
853 | 421 |
## Here, I'm just using the # of rows returned from the above function |
854 | 422 |
if(k == 1){ |
855 | 423 |
if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability") |
856 |
- load.obj <- loadObject("callSet", load.it) |
|
857 |
- if(!load.obj){ |
|
858 |
- callSet <- new("SnpSuperSet", |
|
859 |
- alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
860 |
- alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
861 |
- call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
862 |
- callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
863 |
- annotation=cdfName) |
|
864 |
- sampleNames(callSet) <- sns |
|
865 |
- save(callSet, file=file.path(ldPath(), "callSet.rda")) |
|
866 |
- } else load(file.path(ldPath(), "callSet.rda")) |
|
424 |
+ callSet <- new("SnpSuperSet", |
|
425 |
+ alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)), |
|
426 |
+ alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)), |
|
427 |
+ call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)), |
|
428 |
+ callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)), |
|
429 |
+ annotation=cdfName) |
|
430 |
+ sampleNames(callSet) <- sns |
|
867 | 431 |
phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet, |
868 | 432 |
arrayNames=sns, |
869 | 433 |
arrayInfoColNames=arrayInfoColNames) |
... | ... |
@@ -878,6 +442,7 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
878 | 442 |
##pData(callSet)$SNR <- rep(NA, length(sns)) |
879 | 443 |
pData(callSet)$gender <- rep(NA, length(sns)) |
880 | 444 |
mixtureParams <- initializeBigMatrix("crlmmMixt-", nr=4, nc=ncol(callSet), vmode="double") |
445 |
+ save(mixtureParams, file=file.path(ldPath(), "mixtureParams.rda")) |
|
881 | 446 |
if(missing(batch)){ |
882 | 447 |
protocolData(callSet)$batch <- rep(NA, length(sns)) |
883 | 448 |
} else{ |
... | ... |
@@ -907,6 +472,16 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
907 | 472 |
rm(res); gc() |
908 | 473 |
k <- k+1 |
909 | 474 |
} |
475 |
+ save(callSet, file=file.path(ldPath(), "callSet.rda")) |
|
476 |
+ ##otherwise, A and B get overwritten |
|
477 |
+ ##AA <- initializeBigMatrix("crlmmA", nrow(callSet), ncol(callSet), "integer") |
|
478 |
+ ##BB <- initializeBigMatrix("crlmmB", nrow(callSet), ncol(callSet), "integer") |
|
479 |
+ ##bb = ocProbesets()*ncol(A)*8 |
|
480 |
+ AA <- clone(A(callSet)) |
|
481 |
+ BB <- clone(B(callSet)) |
|
482 |
+ ##ffrowapply(AA[i1:i2, ] <- A(callSet)[i1:i2, ], X=A(callSet), BATCHBYTES=bb) |
|
483 |
+ ##ffrowapply(BB[i1:i2, ] <- B(callSet)[i1:i2, ], X=B(callSet), BATCHBYTES=bb) |
|
484 |
+ ##crlmmGT2 overwrites A and B. |
|
910 | 485 |
tmp <- crlmmGT2(A=A(callSet), |
911 | 486 |
B=B(callSet), |
912 | 487 |
SNR=callSet$SNR, |
... | ... |
@@ -925,27 +500,13 @@ crlmmIlluminaRS <- function(sampleSheet=NULL, |
925 | 500 |
badSNP=badSNP) |
926 | 501 |
open(tmp[["calls"]]) |
927 | 502 |
open(tmp[["confs"]]) |
503 |
+ A(callSet) <- AA |
|
504 |
+ B(callSet) <- BB |
|
928 | 505 |
snpCall(callSet) <- tmp[["calls"]] |
929 | 506 |
## MR: many zeros in the conf. scores (?) |
930 | 507 |
snpCallProbability(callSet) <- tmp[["confs"]] |
931 | 508 |
callSet$gender <- tmp$gender |
932 |
- if(copynumber){ |
|
933 |
- load.obj <- loadObject("cnSet", load.it) |
|
934 |
- if(!load.obj){ |
|
935 |
- cnSet <- as(callSet, "CNSetLM") |
|
936 |
- } else { |
|
937 |
- load(file.path(ldPath(), "cnSet.rda")) |
|
938 |
- A(cnSet) <- A(callSet) |
|
939 |
- B(cnSet) <- B(callSet) |
|
940 |
- snpCall(cnSet) <- snpCall(callSet) |
|
941 |
- snpCallProbability(cnSet) <- snpCallProbability(callSet) |
|
942 |
- annotation(cnSet) <- annotation(callSet) |
|
943 |
- featureData(cnSet) <- featureData(callSet) |
|
944 |
- protocolData(cnSet) <- protocolData(callSet) |
|
945 |
- phenoData(cnSet) <- phenoData(callSet) |
|
946 |
- experimentData(cnSet) <- experimentData(callSet) |
|
947 |
- } |
|
948 |
- } |
|
509 |
+ if(copynumber) cnSet <- as(callSet, "CNSetLM") |
|
949 | 510 |
close(mixtureParams) |
950 | 511 |
rm(tmp); gc() |
951 | 512 |
return(cnSet) |
... | ... |
@@ -1298,7 +859,7 @@ crlmmCopynumber <- function(object, |
1298 | 859 |
} |
1299 | 860 |
|
1300 | 861 |
crlmmCopynumber2 <- function(object, |
1301 |
- which.batches, |
|
862 |
+ which.batches, |
|
1302 | 863 |
MIN.SAMPLES=10, |
1303 | 864 |
SNRMin=5, |
1304 | 865 |
MIN.OBS=1, |
... | ... |
@@ -1313,8 +874,7 @@ crlmmCopynumber2 <- function(object, |
1313 | 874 |
MIN.NU=2^3, |
1314 | 875 |
MIN.PHI=2^3, |
1315 | 876 |
THR.NU.PHI=TRUE, |
1316 |
- thresholdCopynumber=TRUE, |
|
1317 |
- load.it=TRUE){ |
|
877 |
+ thresholdCopynumber=TRUE){ |
|
1318 | 878 |
stopifnot("batch" %in% varLabels(protocolData(object))) |
1319 | 879 |
stopifnot("chromosome" %in% fvarLabels(object)) |
1320 | 880 |
stopifnot("position" %in% fvarLabels(object)) |
... | ... |
@@ -1325,9 +885,7 @@ crlmmCopynumber2 <- function(object, |
1325 | 885 |
XIndex.nps <- (1:nrow(object))[chromosome(object) == 23 & !isSnp(object) & !is.na(chromosome(object))] |
1326 | 886 |
autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))] |
1327 | 887 |
autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))] |
1328 |
- ##Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
1329 |
- ##snpBatches <- splitIndicesByLength(1:nrow(object), ocProbesets()) |
|
1330 |
- snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets()) |
|
888 |
+ |
|
1331 | 889 |
## Do chromosome X in batches |
1332 | 890 |
Ns <- initializeBigMatrix("Ns", nrow(object), 5) |
1333 | 891 |
colnames(Ns) <- c("A", "B", "AA", "AB", "BB") |
... | ... |
@@ -1345,25 +903,26 @@ crlmmCopynumber2 <- function(object, |
1345 | 903 |
|
1346 | 904 |
} |
1347 | 905 |
if(verbose) message("Estimating allele-specific copy number at autosomal SNPs") |
906 |
+ snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets()) |
|
1348 | 907 |
ocLapply(seq(along=snpBatches), |
1349 |
- fit.lm1, |
|
1350 |
- autosomeIndex=autosomeIndex.snps, |
|
1351 |
- object=object, |
|
1352 |
- Ns=Ns, |
|
1353 |
- normal=normal, |
|
1354 |
- snpflags=snpflags, |
|
1355 |
- snpBatches=snpBatches, |
|
1356 |
- batchSize=ocProbesets(), |
|
1357 |
- SNRMin=SNRMin, |
|
1358 |
- MIN.SAMPLES=MIN.SAMPLES, |
|
1359 |
- MIN.OBS=MIN.OBS, |
|
1360 |
- DF=DF.PRIOR, |
|
1361 |
- GT.CONF.THR=GT.CONF.THR, |
|
1362 |
- THR.NU.PHI=THR.NU.PHI, |
|
1363 |
- MIN.NU=MIN.NU, |
|
1364 |
- MIN.PHI=MIN.PHI, |
|
1365 |
- verbose=verbose, |
|
1366 |
- neededPkgs="crlmm") |
|
908 |
+ fit.lm1, |
|
909 |
+ autosomeIndex=autosomeIndex.snps, |
|
910 |
+ object=object, |
|
911 |
+ Ns=Ns, |
|
912 |
+ normal=normal, |
|
913 |
+ snpflags=snpflags, |
|
914 |
+ snpBatches=snpBatches, |
|
915 |
+ batchSize=ocProbesets(), |
|
916 |
+ SNRMin=SNRMin, |
|
917 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
918 |
+ MIN.OBS=MIN.OBS, |
|
919 |
+ DF=DF.PRIOR, |
|
920 |
+ GT.CONF.THR=GT.CONF.THR, |
|
921 |
+ THR.NU.PHI=THR.NU.PHI, |
|
922 |
+ MIN.NU=MIN.NU, |
|
923 |
+ MIN.PHI=MIN.PHI, |
|
924 |
+ verbose=verbose, |
|
925 |
+ neededPkgs="crlmm") |
|
1367 | 926 |
## autosomal NPs |
1368 | 927 |
snpBatches <- splitIndicesByLength(autosomeIndex.nps, ocProbesets()) |
1369 | 928 |
if(verbose) message("Estimating total copy number at nonpolymorphic loci") |
... | ... |
@@ -1387,7 +946,7 @@ crlmmCopynumber2 <- function(object, |
1387 | 946 |
verbose=verbose, |
1388 | 947 |
neededPkgs="crlmm") |
1389 | 948 |
snpBatches <- splitIndicesByLength(XIndex.snps, ocProbesets()) |
1390 |
- if(verbose) message("Estimating total copy number at polymorphic loci on chromosome X") |
|
949 |
+ if(verbose) message("Estimating allele-specific copy number at polymorphic loci on chromosome X") |
|
1391 | 950 |
ocLapply(seq(along=snpBatches), |
1392 | 951 |
fit.lm3, |
1393 | 952 |
autosomeIndex=XIndex.snps, |
... | ... |
@@ -1409,7 +968,6 @@ crlmmCopynumber2 <- function(object, |
1409 | 968 |
neededPkgs="crlmm") |
1410 | 969 |
if(verbose) message("Estimating total copy number for nonpolymorphic loci on chromosome X") |
1411 | 970 |
snpBatches <- splitIndicesByLength(XIndex.nps, ocProbesets()) |
1412 |
- if(verbose) message("Estimating total copy number at nonpolymorphic loci on chromosome X") |
|
1413 | 971 |
tmp <- ocLapply(seq(along=snpBatches), |
1414 | 972 |
fit.lm4, |
1415 | 973 |
XIndex=XIndex.nps, |
... | ... |
@@ -1432,6 +990,7 @@ crlmmCopynumber2 <- function(object, |
1432 | 990 |
return(object) |
1433 | 991 |
} |
1434 | 992 |
|
993 |
+ |
|
1435 | 994 |
fit.lm1 <- function(idxBatch, |
1436 | 995 |
snpBatches, |
1437 | 996 |
autosomeIndex, |
... | ... |
@@ -1449,25 +1008,32 @@ fit.lm1 <- function(idxBatch, |
1449 | 1008 |
MIN.NU, |
1450 | 1009 |
MIN.PHI, |
1451 | 1010 |
verbose, ...){ |
1452 |
- ## which.batches, ...){ |
|
1453 | 1011 |
if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches)) |
1012 |
+ snps <- snpBatches[[idxBatch]] |
|
1013 |
+ batches <- split(seq(along=batch(object)), batch(object)) |
|
1014 |
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
1015 |
+ |
|
1454 | 1016 |
open(object) |
1455 | 1017 |
open(snpflags) |
1456 | 1018 |
open(normal) |
1457 |
- snps <- snpBatches[[idxBatch]] |
|
1458 |
- batches <- split(seq(along=batch(object)), batch(object)) |
|
1459 |
- batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
1460 |
- corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches))) |
|
1019 |
+ |
|
1020 |
+ corr <- corrA.BB <- corrB.AA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object)))) |
|
1461 | 1021 |
flags <- nuA <- nuB <- phiA <- phiB <- corr |
1022 |
+ |
|
1023 |
+ normal.snps <- normal[snps, ] |
|
1462 | 1024 |
cB <- cA <- matrix(NA, length(snps), ncol(object)) |
1025 |
+ GG <- as.matrix(calls(object)[snps, ]) |
|
1026 |
+ CP <- as.matrix(snpCallProbability(object)[snps, ]) |
|
1027 |
+ AA <- as.matrix(A(object)[snps, ]) |
|
1028 |
+ BB <- as.matrix(B(object)[snps, ]) |
|
1463 | 1029 |
for(k in batches){ |
1464 |
- G <- calls(object)[snps, k] |
|
1465 |
- NORM <- normal[snps, k] |
|
1466 |
- xx <- snpCallProbability(object)[snps, k] |
|
1030 |
+ G <- GG[, k] |
|
1031 |
+ NORM <- normal.snps[, k] |
|
1032 |
+ xx <- CP[, k] |
|
1467 | 1033 |
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
1468 | 1034 |
G <- G*highConf*NORM |
1469 |
- A <- A(object)[snps, k] |
|
1470 |
- B <- B(object)[snps, k] |
|
1035 |
+ A <- AA[, k] |
|
1036 |
+ B <- BB[, k] |
|
1471 | 1037 |
##index <- GT.B <- GT.A <- vector("list", 3) |
1472 | 1038 |
##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB") |
1473 | 1039 |
Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G) |
... | ... |
@@ -1490,7 +1056,7 @@ fit.lm1 <- function(idxBatch, |
1490 | 1056 |
corr[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=2, DF.PRIOR) |
1491 | 1057 |
corrB.AA[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=1, DF.PRIOR) |
1492 | 1058 |
corrA.BB[, J] <- corByGenotype(A=A, B=B, G=G, Ns=Ns, which.cluster=3, DF.PRIOR) |
1493 |
- ##formerly oneBatch()... |
|
1059 |
+ |
|
1494 | 1060 |
##--------------------------------------------------------------------------- |
1495 | 1061 |
## Impute sufficient statistics for unobserved genotypes (plate-specific) |
1496 | 1062 |
##--------------------------------------------------------------------------- |
... | ... |
@@ -1532,6 +1098,8 @@ fit.lm1 <- function(idxBatch, |
1532 | 1098 |
muA[index[[j]], -kk] <- mus[, 1:2] |
1533 | 1099 |
muB[index[[j]], -kk] <- mus[, 3:4] |
1534 | 1100 |
} |
1101 |
+ rm(betahat, X, Y, mus, index, noAA, noAB, noBB, res) |
|
1102 |
+ gc() |
|
1535 | 1103 |
negA <- rowSums(muA < 0) > 0 |
1536 | 1104 |
negB <- rowSums(muB < 0) > 0 |
1537 | 1105 |
flags[, J] <- rowSums(Ns == 0) > 0 |
... | ... |
@@ -1561,21 +1129,53 @@ fit.lm1 <- function(idxBatch, |
1561 | 1129 |
## formerly polymorphic(): calculate copy number |
1562 | 1130 |
cA[, k] <- matrix((1/phiA[, J]*(A-nuA[, J])), nrow(A), ncol(A)) |
1563 | 1131 |
cB[, k] <- matrix((1/phiB[, J]*(B-nuB[, J])), nrow(B), ncol(B)) |
1132 |
+ rm(G, A, B, NORM, wA, wB, YA,YB, res, negA, negB, Np, Ns) |
|
1133 |
+ gc() |
|
1564 | 1134 |
} |
1135 |
+ |
|
1136 |
+ cA[cA < 0.05] <- 0.05 |
|
1137 |
+ cB[cB < 0.05] <- 0.05 |
|
1138 |
+ cA[cA > 5] <- 5 |
|
1139 |
+ cB[cB > 5] <- 5 |
|
1565 | 1140 |
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA)) |
1566 | 1141 |
cB <- matrix(as.integer(cB*100), nrow(cB), ncol(cB)) |
1142 |
+ |
|
1143 |
+ |
|
1144 |
+ |
|
1567 | 1145 |
CA(object)[snps, ] <- cA |
1568 | 1146 |
CB(object)[snps, ] <- cB |
1147 |
+ |
|
1148 |
+ |
|
1569 | 1149 |
snpflags[snps, ] <- flags |
1570 | 1150 |
lapply(lM(object), open) |
1571 |
- lM(object)$tau2A[snps, ] <- tau2A |
|
1572 |
- lM(object)$tau2B[snps, ] <- tau2B |
|
1573 |
- lM(object)$sig2A[snps, ] <- sig2A |
|
1574 |
- lM(object)$sig2B[snps, ] <- sig2B |
|
1575 |
- lM(object)$nuA[snps, ] <- nuA |
|
1576 |
- lM(object)$nuB[snps, ] <- nuB |
|
1577 |
- lM(object)$phiA[snps, ] <- phiA |
|
1578 |
- lM(object)$phiB[snps, ] <- phiB |
|
1151 |
+ |
|
1152 |
+ tmp <- physical(lM(object))$tau2A |
|
1153 |
+ tmp[snps, ] <- tau2A |
|
1154 |
+ lM(object)$tau2A <- tmp |
|
1155 |
+ tmp <- physical(lM(object))$tau2B |
|
1156 |
+ tmp[snps, ] <- tau2B |
|
1157 |
+ lM(object)$tau2B <- tmp |
|
1158 |
+ tmp <- physical(lM(object))$tau2B |
|
1159 |
+ tmp[snps, ] <- tau2B |
|
1160 |
+ lM(object)$tau2B <- tmp |
|
1161 |
+ tmp <- physical(lM(object))$sig2A |
|
1162 |
+ tmp[snps, ] <- sig2A |
|
1163 |
+ lM(object)$sig2A <- tmp |
|
1164 |
+ tmp <- physical(lM(object))$sig2B |
|
1165 |
+ tmp[snps, ] <- sig2B |
|
1166 |
+ lM(object)$sig2B <- tmp |
|
1167 |
+ tmp <- physical(lM(object))$nuA |
|
1168 |
+ tmp[snps, ] <- nuA |
|
1169 |
+ lM(object)$nuA <- tmp |
|
1170 |
+ tmp <- physical(lM(object))$nuB |
|
1171 |
+ tmp[snps, ] <- nuB |
|
1172 |
+ lM(object)$nuB <- tmp |
|
1173 |
+ tmp <- physical(lM(object))$phiA |
|
1174 |
+ tmp[snps, ] <- phiA |
|
1175 |
+ lM(object)$phiA <- tmp |
|
1176 |
+ tmp <- physical(lM(object))$phiB |
|
1177 |
+ tmp[snps, ] <- phiB |
|
1178 |
+ lM(object)$phiB <- tmp |
|
1579 | 1179 |
lapply(assayData(object), close) |
1580 | 1180 |
lapply(lM(object), close) |
1581 | 1181 |
TRUE |
... | ... |
@@ -1600,45 +1200,56 @@ fit.lm2 <- function(idxBatch, |
1600 | 1200 |
verbose, ...){ |
1601 | 1201 |
## which.batches, ...){ |
1602 | 1202 |
if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches)) |
1603 |
- open(object) |
|
1604 |
- open(snpflags) |
|
1605 |
- open(normal) |
|
1606 | 1203 |
snps <- snpBatches[[idxBatch]] |
1607 | 1204 |
batches <- split(seq(along=batch(object)), batch(object)) |
1608 | 1205 |
batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
1206 |
+ |
|
1207 |
+ open(object) |
|
1208 |
+ open(snpflags) |
|
1209 |
+ open(normal) |
|
1210 |
+ |
|
1211 |
+ |
|
1609 | 1212 |
cA <- matrix(NA, length(snps), ncol(object)) |
1610 | 1213 |
ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object)) |
1611 |
- flags <- snpflags[, ] |
|
1612 |
- noflags <- rowSums(flags) == 0 |
|
1214 |
+ flags <- as.matrix(snpflags[,]) |
|
1215 |
+ noflags <- rowSums(flags, na.rm=TRUE) == 0 ##NA's for unevaluated batches |
|
1613 | 1216 |
## We do not want to write to discuss for each batch. More efficient to |
1614 | 1217 |
## write to disk after estimating these parameters for all batches. |
1615 |
- nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batches))) |
|
1218 |
+ nuA.np <- phiA.np <- sig2A.np <- matrix(NA, length(snps), length(unique(batch(object)))) |
|
1616 | 1219 |
## for imputation, we need the corresponding parameters of the snps |
1617 |
- nuA <- lM(object)$nuA |
|
1618 |
- phiA <- lM(object)$phiA |
|
1619 |
- nuB <- lM(object)$nuB |
|
1620 |
- phiB <- lM(object)$phiB |
|
1621 |
- snp.ind <- ii & noflags |
|
1622 |
- NORM.np <- normal[snps, ] |
|
1220 |
+ NN <- min(10e3, length(which(ii & noflags))) |
|
1221 |
+ snp.ind <- sample(which(ii & noflags), NN) |
|
1222 |
+ nnuA.snp <- as.matrix(physical(lM(object))$nuA[snp.ind,]) |
|
1223 |
+ pphiA.snp <- as.matrix(physical(lM(object))$phiA[snp.ind,]) |
|
1224 |
+ nnuB.snp <- as.matrix(physical(lM(object))$nuB[snp.ind,]) |
|
1225 |
+ pphiB.snp <- as.matrix(physical(lM(object))$phiB[snp.ind,]) |
|
1226 |
+ |
|
1227 |
+ AA.snp <- as.matrix(A(object)[snp.ind, ]) |
|
1228 |
+ BB.snp <- as.matrix(B(object)[snp.ind, ]) |
|
1229 |
+ NNORM.snp <- as.matrix(normal[snp.ind, ]) |
|
1230 |
+ NORM.np <- as.matrix(normal[snps, ]) |
|
1231 |
+ AA.np <- as.matrix(A(object)[snps, ]) |
|
1232 |
+ GG <- as.matrix(calls(object)[snp.ind, ]) |
|
1233 |
+ CP <- as.matrix(snpCallProbability(object)[snp.ind, ]) |
|
1623 | 1234 |
for(k in batches){ |
1624 | 1235 |
##if(verbose) message("SNP batch ", ii, " of ", length(batches)) |
1625 | 1236 |
J <- match(unique(batch(object)[k]), unique(batch(object))) |
1626 |
- snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20 |
|
1627 |
- if(sum(snp.index) >= 5000){ |
|
1628 |
- snp.index <- sample(which(snp.index), 5000) |
|
1629 |
- } |
|
1630 |
- phiA.snp <- phiA[snp.index, J] |
|
1631 |
- phiB.snp <- phiB[snp.index, J] |
|
1632 |
- A.snp <- A(object)[snp.index, k] |
|
1633 |
- B.snp <- B(object)[snp.index, k] |
|
1634 |
- NORM.snp <- normal[snp.index, k] |
|
1635 |
- G <- calls(object)[snp.index, k] |
|
1636 |
- xx <- snpCallProbability(object)[snp.index, k] |
|
1237 |
+## snp.index <- snp.ind & nuA[, J] > 20 & nuB[, J] > 20 & phiA[, J] > 20 & phiB[, J] > 20 |
|
1238 |
+## if(sum(snp.index) >= 5000){ |
|
1239 |
+## snp.index <- sample(which(snp.index), 5000) |
|
1240 |
+## } else snp.index <- which(snp.index) |
|
1241 |
+ phiA.snp <- pphiA.snp[, J] |
|
1242 |
+ phiB.snp <- pphiB.snp[, J] |
|
1243 |
+ A.snp <- AA.snp[, k] |
|
1244 |
+ B.snp <- BB.snp[, k] |
|
1245 |
+ NORM.snp <- NNORM.snp[, k] |
|
1246 |
+ G <- GG[, k] |
|
1247 |
+ xx <- CP[, k] |
|
1637 | 1248 |
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
1638 |
- G <- G*highConf*NORM.snp[, k] |
|
1249 |
+ G <- G*highConf*NORM.snp |
|
1639 | 1250 |
G[G==0] <- NA |
1640 | 1251 |
##nonpolymorphic |
1641 |
- A.np <- A(object)[snps, k] |
|
1252 |
+ A.np <- AA.np[, k] |
|
1642 | 1253 |
Ns <- applyByGenotype(matrix(1, nrow(G), ncol(G)), rowSums, G) |
1643 | 1254 |
muA <- applyByGenotype(A.snp, rowMedians, G) |
1644 | 1255 |
muB <- applyByGenotype(B.snp, rowMedians, G) |
... | ... |
@@ -1660,13 +1271,27 @@ fit.lm2 <- function(idxBatch, |
1660 | 1271 |
} |
1661 | 1272 |
cA[, k] <- 1/phiA.np[, J] * (A.np - nuA.np[, J]) |
1662 | 1273 |
sig2A.np[, J] <- rowMAD(log2(A.np*NORM.np[, k]), na.rm=TRUE) |
1274 |
+ rm(NORM.snp, highConf, xx, G, Ns, A.np, X, Y, betahat, mus, logPhiT) |
|
1275 |
+ gc() |
|
1663 | 1276 |
} |
1277 |
+ |
|
1278 |
+ cA[cA < 0.05] <- 0.05 |
|
1279 |
+ cA[cA > 5] <- 5 |
|
1664 | 1280 |
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA)) |
1665 | 1281 |
CA(object)[snps, ] <- cA |
1666 | 1282 |
##open(lM(object)) |
1667 |
- lM(object)$sig2A[snps, ] <- sig2A.np |
|
1668 |
- lM(object)$nuA[snps, ] <- nuA.np |
|
1669 |
- lM(object)$phiA[snps, ] <- phiA.np |
|
1283 |
+ tmp <- physical(lM(object))$nuA |
|
1284 |
+ tmp[snps, ] <- nuA.np |
|
1285 |
+ lM(object)$nuA <- tmp |
|
1286 |
+ tmp <- physical(lM(object))$sig2A |
|
1287 |
+ tmp[snps, ] <- sig2A.np |
|
1288 |
+ lM(object)$sig2A <- tmp |
|
1289 |
+ tmp <- physical(lM(object))$phiA |
|
1290 |
+ tmp[snps, ] <- phiA.np |
|
1291 |
+ lM(object)$sig2A <- tmp |
|
1292 |
+## lM(object)$sig2A[snps, ] <- sig2A.np |
|
1293 |
+## lM(object)$nuA[snps, ] <- nuA.np |
|
1294 |
+## lM(object)$phiA[snps, ] <- phiA.np |
|
1670 | 1295 |
lapply(assayData(object), close) |
1671 | 1296 |
lapply(lM(object), close) |
1672 | 1297 |
TRUE |
... | ... |
@@ -1692,13 +1317,14 @@ fit.lm3 <- function(idxBatch, |
1692 | 1317 |
verbose, ...){ |
1693 | 1318 |
## which.batches, ...){ |
1694 | 1319 |
if(verbose) message("Probe batch ", idxBatch, " of ", length(snpBatches)) |
1320 |
+ snps <- snpBatches[[idxBatch]] |
|
1321 |
+ batches <- split(seq(along=batch(object)), batch(object)) |
|
1322 |
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
1323 |
+ |
|
1695 | 1324 |
open(snpflags) |
1696 | 1325 |
open(normal) |
1697 | 1326 |
open(object) |
1698 |
- snps <- snpBatches[[idxBatch]] |
|
1699 |
- batches <- split(seq(along=batch(object)), batch(object)) |
|
1700 |
- batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
1701 |
- corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batches))) |
|
1327 |
+ corrAB <- corrBB <- corrAA <- sig2B <- sig2A <- tau2B <- tau2A <- matrix(NA, length(snps), length(unique(batch(object)))) |
|
1702 | 1328 |
phiA2 <- phiB2 <- tau2A |
1703 | 1329 |
flags <- nuA <- nuB <- phiA <- phiB <- corrAB |
1704 | 1330 |
cB <- cA <- matrix(NA, length(snps), ncol(object)) |
... | ... |
@@ -1706,15 +1332,21 @@ fit.lm3 <- function(idxBatch, |
1706 | 1332 |
IX <- matrix(gender, length(snps), ncol(object)) |
1707 | 1333 |
NORM <- normal[snps,] |
1708 | 1334 |
IX <- IX==2 |
1335 |
+ |
|
1336 |
+ GG <- as.matrix(calls(object)[snps, ]) |
|
1337 |
+ CP <- as.matrix(snpCallProbability(object)[snps,]) |
|
1338 |
+ AA <- as.matrix(A(object)[snps, ]) |
|
1339 |
+ BB <- as.matrix(B(object)[snps, ]) |
|
1709 | 1340 |
for(k in batches){ |
1710 | 1341 |
##if(verbose) message("SNP batch ", ii, " of ", length(batches)) |
1711 | 1342 |
## within-genotype moments |
1712 |
- G <- calls(object)[snps, k] |
|
1713 |
- xx <- snpCallProbability(object)[snps, k] |
|
1343 |
+ gender <- object$gender[k] |
|
1344 |
+ G <- GG[, k] |
|
1345 |
+ xx <- CP[, k] |
|
1714 | 1346 |
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
1715 | 1347 |
G <- G*highConf*NORM[, k] |
1716 |
- A <- A(object)[snps, k] |
|
1717 |
- B <- B(object)[snps, k] |
|
1348 |
+ A <- AA[, k] |
|
1349 |
+ B <- BB[, k] |
|
1718 | 1350 |
##index <- GT.B <- GT.A <- vector("list", 3) |
1719 | 1351 |
##names(index) <- names(GT.B) <- names(GT.A) <- c("AA", "AB", "BB") |
1720 | 1352 |
Ns.F <- applyByGenotype(matrix(1, nrow(G), sum(gender==2)), rowSums, G[, gender==2]) |
... | ... |
@@ -1753,10 +1385,10 @@ fit.lm3 <- function(idxBatch, |
1753 | 1385 |
correct.orderB <- muB.F[, 3] > muB.F[, 1] |
1754 | 1386 |
index.complete <- intersect(which(correct.orderA & correct.orderB), intersect(index[[1]], intersect(index[[2]], index[[3]]))) |
1755 | 1387 |
size <- min(5000, length(index.complete)) |
1756 |
- if(size == 5000) index.complete <- sample(index.complete, 5000, replace=TRUE) |
|
1757 | 1388 |
if(length(index.complete) < 200){ |
1758 | 1389 |
stop("fewer than 200 snps pass criteria for predicting the sufficient statistics") |
1759 | 1390 |
} |
1391 |
+ if(size==5000) index.complete <- sample(index.complete, size) |
|
1760 | 1392 |
index <- vector("list", 3) |
1761 | 1393 |
index[[1]] <- which(Ns.F[, 1] == 0 & (Ns.F[, 2] >= MIN.OBS & Ns.F[, 3] >= MIN.OBS)) |
1762 | 1394 |
index[[2]] <- which(Ns.F[, 2] == 0 & (Ns.F[, 1] >= MIN.OBS & Ns.F[, 3] >= MIN.OBS)) |
... | ... |
@@ -1772,10 +1404,11 @@ fit.lm3 <- function(idxBatch, |
1772 | 1404 |
complete[[2]] <- which(correct.orderA & correct.orderB & nobsB & notMissing) ##be selective here |
1773 | 1405 |
size <- min(5000, length(complete[[1]])) |
1774 | 1406 |
if(size > 5000) complete <- lapply(complete, function(x) sample(x, size)) |
1407 |
+ ## |
|
1775 | 1408 |
res <- imputeCenterX(muA.M, muB.M, Ns.M, complete, MIN.OBS) |
1776 | 1409 |
muA.M <- res[[1]] |
1777 | 1410 |
muB.M <- res[[2]] |
1778 |
- |
|
1411 |
+ ## |
|
1779 | 1412 |
## Monomorphic SNPs. Mixture model may be better |
1780 | 1413 |
## Improve estimation by borrowing strength across batch |
1781 | 1414 |
noAA <- Ns.F[, 1] < MIN.OBS |
... | ... |
@@ -1833,9 +1466,17 @@ fit.lm3 <- function(idxBatch, |
1833 | 1466 |
phistar <- phiB2[, J]/phiA[, J] |
1834 | 1467 |
tmp <- (B-nuB[, J] - phistar*A + phistar*nuA[, J])/phiB[, J] |
1835 | 1468 |
cB[, k] <- tmp/(1-phistar*phiA2[, J]/phiB[, J]) |
1836 |
- cA[, k] <- (A-nuA[, J]-phiA2[, J]*cB)/phiA[, J] |
|
1469 |
+ cA[, k] <- (A-nuA[, J]-phiA2[, J]*cB[, k])/phiA[, J] |
|
1837 | 1470 |
##some of the snps are called for the men, but not the women |
1471 |
+ rm(YA, YB, wA, wB, res, tmp, phistar, A, B, G, index) |
|
1472 |
+ gc() |
|
1838 | 1473 |
} |
1474 |
+ |
|
1475 |
+ cA[cA < 0.05] <- 0.05 |
|
1476 |
+ cB[cB < 0.05] <- 0.05 |
|
1477 |
+ cA[cA > 5] <- 5 |
|
1478 |
+ cB[cB > 5] <- 5 |
|
1479 |
+ |
|
1839 | 1480 |
##-------------------------------------------------- |
1840 | 1481 |
##RS: need to fix. why are there NA's by coercion |
1841 | 1482 |
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA)) |
... | ... |
@@ -1846,16 +1487,49 @@ fit.lm3 <- function(idxBatch, |
1846 | 1487 |
CA(object)[snps, ] <- cA |
1847 | 1488 |
CB(object)[snps, ] <- cB |
1848 | 1489 |
snpflags[snps, ] <- flags |
1849 |
- lM(object)$tau2A[snps, ] <- tau2A |
|
1850 |
- lM(object)$tau2B[snps, ] <- tau2B |
|
1851 |
- lM(object)$sig2A[snps, ] <- sig2A |
|
1852 |
- lM(object)$sig2B[snps, ] <- sig2B |
|
1853 |
- lM(object)$nuA[snps, ] <- nuA |
|
1854 |
- lM(object)$nuB[snps, ] <- nuB |
|
1855 |
- lM(object)$phiA[snps, ] <- phiA |
|
1856 |
- lM(object)$phiB[snps, ] <- phiB |
|
1857 |
- lM(object)$phiPrimeA[snps, ] <- phiA2 |
|
1858 |
- lM(object)$phiPrimeB[snps, ] <- phiB2 |
|
1490 |
+ tmp <- physical(lM(object))$tau2A |
|
1491 |
+ tmp[snps, ] <- tau2A |
|
1492 |
+ lM(object)$tau2A <- tmp |
|
1493 |
+ tmp <- physical(lM(object))$tau2B |
|
1494 |
+ tmp[snps, ] <- tau2B |
|
1495 |
+ lM(object)$tau2B <- tmp |
|
1496 |
+ tmp <- physical(lM(object))$tau2B |
|
1497 |
+ tmp[snps, ] <- tau2B |
|
1498 |
+ lM(object)$tau2B <- tmp |
|
1499 |
+ tmp <- physical(lM(object))$sig2A |
|
1500 |
+ tmp[snps, ] <- sig2A |
|
1501 |
+ lM(object)$sig2A <- tmp |
|
1502 |
+ tmp <- physical(lM(object))$sig2B |
|
1503 |
+ tmp[snps, ] <- sig2B |
|
1504 |
+ lM(object)$sig2B <- tmp |
|
1505 |
+ tmp <- physical(lM(object))$nuA |
|
1506 |
+ tmp[snps, ] <- nuA |
|
1507 |
+ lM(object)$nuA <- tmp |
|
1508 |
+ tmp <- physical(lM(object))$nuB |
|
1509 |
+ tmp[snps, ] <- nuB |
|
1510 |
+ lM(object)$nuB <- tmp |
|
1511 |
+ tmp <- physical(lM(object))$phiA |
|
1512 |
+ tmp[snps, ] <- phiA |
|
1513 |
+ lM(object)$phiA <- tmp |
|
1514 |
+ tmp <- physical(lM(object))$phiB |
|
1515 |
+ tmp[snps, ] <- phiB |
|
1516 |
+ lM(object)$phiB <- tmp |
|
1517 |
+ tmp <- physical(lM(object))$phiPrimeA |
|
1518 |
+ tmp[snps, ] <- phiA2 |
|
1519 |
+ lM(object)$phiPrimeA <- tmp |
|
1520 |
+ tmp <- physical(lM(object))$phiPrimeB |
|
1521 |
+ tmp[snps, ] <- phiB2 |
|
1522 |
+ lM(object)$phiPrimeB <- tmp |
|
1523 |
+## lM(object)$tau2A[snps, ] <- tau2A |
|
1524 |
+## lM(object)$tau2B[snps, ] <- tau2B |
|
1525 |
+## lM(object)$sig2A[snps, ] <- sig2A |
|
1526 |
+## lM(object)$sig2B[snps, ] <- sig2B |
|
1527 |
+## lM(object)$nuA[snps, ] <- nuA |
|
1528 |
+## lM(object)$nuB[snps, ] <- nuB |
|
1529 |
+## lM(object)$phiA[snps, ] <- phiA |
|
1530 |
+## lM(object)$phiB[snps, ] <- phiB |
|
1531 |
+## lM(object)$phiPrimeA[snps, ] <- phiA2 |
|
1532 |
+## lM(object)$phiPrimeB[snps, ] <- phiB2 |
|
1859 | 1533 |
lapply(assayData(object), close) |
1860 | 1534 |
lapply(lM(object), close) |
1861 | 1535 |
TRUE |
... | ... |
@@ -1885,39 +1559,51 @@ fit.lm4 <- function(idxBatch, |
1885 | 1559 |
snps <- snpBatches[[idxBatch]] |
1886 | 1560 |
batches <- split(seq(along=batch(object)), batch(object)) |
1887 | 1561 |
batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
1888 |
- nuA <- phiA <- sig2A <- tau2A <- matrix(NA, length(snps), length(unique(batches))) |
|
1562 |
+ nuA <- phiA <- sig2A <- tau2A <- matrix(NA, length(snps), length(unique(batch(object)))) |
|
1889 | 1563 |
cA <- matrix(NA, length(snps), ncol(object)) |
1890 | 1564 |
ii <- isSnp(object) & chromosome(object) < 23 & !is.na(chromosome(object)) |
1891 | 1565 |
flags <- snpflags[ii, , drop=FALSE] |
1892 |
- noflags <- rowSums(flags) == 0 |
|
1566 |
+ noflags <- rowSums(flags, na.rm=TRUE) == 0 |
|
1893 | 1567 |
lapply(lM(object), open) |
1894 |
- nuIA <- lM(object)$nuA[ii, ] |
|
1895 |
- nuIB <- lM(object)$nuB[ii, ] |
|
1896 |
- phiIA <- lM(object)$phiA[ii,] |
|
1897 |
- phiIB <- lM(object)$phiB[ii,] |
|
1898 |
- |
|
1899 |
- snp.index <- which(nuIA > 20 & nuIB & 20 & phiIA > 20 & phiIB > 20 & noflags) |
|
1568 |
+ nuIA <- physical(lM(object))$nuA[ii, ] |
|
1569 |
+ nuIB <- physical(lM(object))$nuB[ii, ] |
|
1570 |
+ phiIA <- physical(lM(object))$phiA[ii,] |
|
1571 |
+ phiIB <- physical(lM(object))$phiB[ii,] |
|
1572 |
+ |
|
1573 |
+ i1 <- rowSums(nuIA < 20, na.rm=TRUE) == 0 |
|
1574 |
+ i2 <- rowSums(nuIB < 20, na.rm=TRUE) == 0 |
|
1575 |
+ i3 <- rowSums(phiIA < 20, na.rm=TRUE) == 0 |
|
1576 |
+ i4 <- rowSums(phiIB < 20, na.rm=TRUE) == 0 |
|
1577 |
+ |
|
1578 |
+ snp.index <- which(i1 & i2 & i3 & i4 & noflags) |
|
1579 |
+ if(length(snp.index) == 0){ |
|
1580 |
+ warning("No snps meet the following criteria: (1) nu and phi > 20 and (2) at least MIN.OBS in each genotype cluster. CN not estimated for nonpolymorphic loci on X") |
|
1581 |
+ return(TRUE) |
|
1582 |
+ } |
|
1900 | 1583 |
if(length(snp.index) >= 5000){ |
1901 | 1584 |
snp.index <- sample(snp.index, 5000) |
1902 | 1585 |
} |
1903 |
- phiA.snp <- lM(object)$phiA[snp.index, , drop=FALSE] |
|
1904 |
- phiB.snp <- lM(object)$phiB[snp.index, , drop=FALSE] |
|
1905 |
- A.snp <- A(object)[snp.index, ] |
|
1906 |
- B.snp <- B(object)[snp.index, ] |
|
1907 |
- NORM.snp <- normal[snp.index, ] |
|
1908 |
- NORM.np <- normal[snps, ] |
|
1586 |
+ phiA.snp <- physical(lM(object))$phiA[snp.index, , drop=FALSE] |
|
1587 |
+ phiB.snp <- physical(lM(object))$phiB[snp.index, , drop=FALSE] |
|
1588 |
+ A.snp <- as.matrix(A(object)[snp.index, ]) |
|
1589 |
+ B.snp <- as.matrix(B(object)[snp.index, ]) |
|
1590 |
+ NORM.snp <- as.matrix(normal[snp.index, ]) |
|
1591 |
+ NORM.np <- as.matrix(normal[snps, ]) |
|
1909 | 1592 |
gender <- object$gender |
1910 | 1593 |
|
1911 | 1594 |
|
1912 | 1595 |
pseudoAR <- position(object)[snps] < 2709520 | (position(object)[snps] > 154584237 & position(object)[snps] < 154913754) |
1913 | 1596 |
pseudoAR[is.na(pseudoAR)] <- FALSE |
1914 |
- |
|
1597 |
+ |
|
1598 |
+ GG <- as.matrix(calls(object)[snp.index, ]) |
|
1599 |
+ CP <- as.matrix(snpCallProbability(object)[snp.index, ]) |
|
1600 |
+ AA.np <- as.matrix(A(object)[snps, ]) |
|
1915 | 1601 |
##if(missing(which.batches)) which.batches <- seq(along=batches) |
1916 | 1602 |
##batches <- batches[which.batches] |
1917 | 1603 |
for(k in batches){ |
1918 | 1604 |
##if(verbose) message("SNP batch ", ii, " of ", length(batches)) |
1919 |
- G <- calls(object)[snp.index, k] |
|
1920 |
- xx <- snpCallProbability(object)[snp.index, k] |
|
1605 |
+ G <- GG[, k] |
|
1606 |
+ xx <- CP[, k] |
|
1921 | 1607 |
highConf <- (1-exp(-xx/1000)) > GT.CONF.THR |
1922 | 1608 |
G <- G*highConf*NORM.snp[, k] |
1923 | 1609 |
##snps |
... | ... |
@@ -1947,7 +1633,7 @@ fit.lm4 <- function(idxBatch, |
1947 | 1633 |
|
1948 | 1634 |
|
1949 | 1635 |
##nonpolymorphic |
1950 |
- A <- A(object)[snps, k] |
|
1636 |
+ A <- AA.np[, k] |
|
1951 | 1637 |
gend <- gender[k] |
1952 | 1638 |
A.M <- A[, gend==1] |
1953 | 1639 |
mu1 <- rowMedians(A.M, na.rm=TRUE) |
... | ... |
@@ -1985,185 +1671,32 @@ fit.lm4 <- function(idxBatch, |
1985 | 1671 |
tmp[, gend==1] <- CT1 |
1986 | 1672 |
tmp[, gend==2] <- CT2 |
1987 | 1673 |
cA[, k] <- tmp |
1674 |
+ rm(tmp, CT1, CT2, A.F, normal.f, G, AA, BB, Y, X, Ns) |
|
1675 |
+ gc() |
|
1988 | 1676 |
} |
1677 |
+ |
|
1678 |
+ cA[cA < 0.05] <- 0.05 |
|
1679 |
+ cA[cA > 5] <- 5 |
|
1989 | 1680 |
cA <- matrix(as.integer(cA*100), nrow(cA), ncol(cA)) |
1990 | 1681 |
CA(object)[snps, ] <- cA |
1991 | 1682 |
open(lM(object)) |
1992 |
- lM(object)$sig2A[snps, ] <- sig2A |
|
1993 |
- lM(object)$nuA[snps, ] <- nuA |
|
1994 |
- lM(object)$phiA[snps, ] <- phiA |
|
1683 |
+ tmp <- physical(lM(object))$nuA |
|
1684 |
+ tmp[snps, ] <- nuA |
|
1685 |
+ lM(object)$nuA <- tmp |
|
1686 |
+ tmp <- physical(lM(object))$sig2A |
|
1687 |
+ tmp[snps, ] <- sig2A |
|
1688 |
+ lM(object)$sig2A <- tmp |
|
1689 |
+ tmp <- physical(lM(object))$phiA |
|
1690 |
+ tmp[snps, ] <- phiA |
|
1691 |
+ lM(object)$sig2A <- tmp |
|
1692 |
+## lM(object)$sig2A[snps, ] <- sig2A |
|
1693 |
+## lM(object)$nuA[snps, ] <- nuA |
|
1694 |
+## lM(object)$phiA[snps, ] <- phiA |
|
1995 | 1695 |
lapply(assayData(object), close) |
1996 | 1696 |
lapply(lM(object), close) |
1997 | 1697 |
TRUE |
1998 | 1698 |
} |
1999 | 1699 |
|
2000 |
- |
|
2001 |
-crlmmWrapper <- function(filenames, cnOptions, ...){ |
|
2002 |
- cdfName <- cnOptions[["cdfName"]] |
|
2003 |
- load.it <- cnOptions[["load.it"]] |
|
2004 |
- save.it <- cnOptions[["save.it"]] |
|
2005 |
- splitByChr <- cnOptions[["splitByChr"]] |
|
2006 |
- crlmmFile <- cnOptions[["crlmmFile"]] |
|
2007 |
- intensityFile=cnOptions[["intensityFile"]] |
|
2008 |
- rgFile=cnOptions[["rgFile"]] |
|
2009 |
- ##use.ff=cnOptions[["use.ff"]] |
|
2010 |
- outdir <- cnOptions[["outdir"]] |
|
2011 |
- if(missing(cdfName)) stop("cdfName is missing -- a valid cdfName is required. See crlmm:::validCdfNames()") |
|
2012 |
- platform <- whichPlatform(cdfName) |
|
2013 |
- if(!(platform %in% c("affymetrix", "illumina"))){ |
|
2014 |
- stop("Only 'affymetrix' and 'illumina' platforms are supported at this time.") |
|
2015 |
- } else { |
|
2016 |
- message("Checking whether annotation package for the ", platform, " platform is available") |
|
2017 |
- if(!isValidCdfName(cdfName)){ |
|
2018 |
- cat("FALSE\n") |
|
2019 |
- stop(cdfName, " is not a valid entry. See crlmm:::validCdfNames(platform)") |
|
2020 |
- } else cat("TRUE\n") |
|
2021 |
- } |
|
2022 |
- if(missing(intensityFile)) stop("must specify 'intensityFile'.") |
|
2023 |
- if(missing(crlmmFile)) stop("must specify 'crlmmFile'.") |
|
2024 |
- if(platform == "illumina"){ |
|
2025 |
- if(missing(rgFile)){ |
|
2026 |
- ##stop("must specify 'rgFile'.") |
|
2027 |
- rgFile <- file.path(dirname(crlmmFile), "rgFile.rda") |
|
2028 |
- message("rgFile not specified. Using ", rgFile) |
|
2029 |
- } |
|
2030 |
- if(!load.it){ |
|
2031 |
- RG <- readIdatFiles(...) |
|
2032 |
- if(save.it) save(RG, file=rgFile) |
|
2033 |
- } |
|
2034 |
- if(load.it & !file.exists(rgFile)){ |
|
2035 |
- message("load.it is TRUE, but rgFile not present. Attempting to read the idatFiles.") |
|
2036 |
- RG <- readIdatFiles(...) |
|
2037 |
- if(save.it) save(RG, file=rgFile) |
|
2038 |
- } |
|
2039 |
- if(load.it & file.exists(rgFile)){ |
|
2040 |
- message("Loading RG file") |
|
2041 |
- load(rgFile) |
|
2042 |
- RG <- get("RG") |
|
2043 |
- } |
|
2044 |
- } |
|
2045 |
- if(!(file.exists(dirname(crlmmFile)))) stop(dirname(crlmmFile), " does not exist.") |
|
2046 |
- if(!(file.exists(dirname(intensityFile)))) stop(dirname(intensityFile), " does not exist.") |
|
2047 |
- ##--------------------------------------------------------------------------- |
|
2048 |
- ## FIX |
|
2049 |
- outfiles <- file.path(dirname(crlmmFile), paste("crlmmSetList_", 1:24, ".rda", sep="")) |
|
2050 |
- if(load.it & all(file.exists(outfiles))){ |
|
2051 |
- load(outfiles[1]) |
|
2052 |
- crlmmSetList <- get("crlmmSetList") |
|
2053 |
- if(!all(sampleNames(crlmmSetList) == basename(filenames))){ |
|
2054 |
- stop("load.it is TRUE, but sampleNames(crlmmSetList != basename(filenames))") |
|
2055 |
- } else{ |
|
2056 |
- return("load.it is TRUE and 'crlmmSetList_<CHR>.rda' objects found. Nothing to do...") |
|
2057 |
- } |
|
2058 |
- } |
|
2059 |
- if(load.it){ |
|
2060 |
- if(!file.exists(crlmmFile)){ |
|
2061 |
- message("load.it is TRUE, but ", crlmmFile, " does not exist. Rerunning the genotype calling algorithm") |
|
2062 |
- load.it <- FALSE |
|
2063 |
- } |
|
2064 |
- } |
|
2065 |
- if(platform == "affymetrix"){ |
|
2066 |
- if(!file.exists(crlmmFile) | !load.it){ |
|
2067 |
- callSet <- crlmm(filenames=filenames, |
|
2068 |
- cdfName=cdfName, |
|
2069 |
- save.it=TRUE, |
|
2070 |
- load.it=load.it, |
|
2071 |
- intensityFile=intensityFile) |
|
2072 |
- message("Quantile normalizing the copy number probes...") |
|
2073 |
- cnrmaResult <- cnrma(filenames=filenames, cdfName=cdfName, outdir=outdir) |
|
2074 |
- if(save.it){ |
|
2075 |
- message("Saving callSet and cnrmaResult to", crlmmFile) |
|
2076 |
- save(callSet, cnrmaResult, file=crlmmFile) |
|
2077 |
- } |
|
2078 |
- } else { |
|
2079 |
- message("Loading ", crlmmFile, "...") |
|
2080 |
- load(intensityFile) |
|
2081 |
- load(crlmmFile) |
|
2082 |
- callSet <- get("callSet") |
|
2083 |
- cnrmaResult <- get("cnrmaResult") |
|
2084 |
- } |
|
2085 |
- scanDates <- data.frame(ScanDate=sapply(filenames, celfileDate)) |
|
2086 |
- protocolData(callSet) <- new("AnnotatedDataFrame", |
|
2087 |
- data=scanDates, |
|
2088 |
- varMetadata=data.frame(labelDescription=colnames(scanDates), |
|
2089 |
- row.names=colnames(scanDates))) |
|
2090 |
- } |
|
2091 |
- if(platform == "illumina"){ |
|
2092 |
- if(!file.exists(crlmmFile) | !load.it){ |
|
2093 |
- callSet <- crlmmIllumina(RG=RG, |
|
2094 |
- cdfName=cdfName, |
|
2095 |
- sns=sampleNames(RG), |
|
2096 |
- returnParams=TRUE, |
|
2097 |
- save.it=TRUE, |
|
2098 |
- intensityFile=intensityFile) |
|
2099 |
- if(save.it) save(callSet, file=crlmmFile) |
|
2100 |
- } else { |
|
2101 |
- message("Loading ", crlmmFile, "...") |
|
2102 |
- load(crlmmFile) |
|
2103 |
- callSet <- get("callSet") |
|
2104 |
- } |
|
2105 |
- protocolData(callSet) <- protocolData(RG) |
|
2106 |
- } |
|
2107 |
- if(platform=="affymetrix") { |
|
2108 |
- protocolData(callSet)[["ScanDate"]] <- as.character(celDates(filenames)) |
|
2109 |
- sampleNames(protocolData(callSet)) <- sampleNames(callSet) |
|
2110 |
- } |
|
2111 |
- load(intensityFile) |
|
2112 |
- snprmaResult <- get("res") |
|
2113 |
- if(platform=="illumina"){ |
|
2114 |
- if(exists("cnAB")){ |
|
2115 |
- np.A <- as.integer(cnAB$A) |
|
2116 |
- np.B <- as.integer(cnAB$B) |
|
2117 |
- np <- ifelse(np.A > np.B, np.A, np.B) |
|
2118 |
- np <- matrix(np, nrow(cnAB$A), ncol(cnAB$A)) |
|
2119 |
- rownames(np) <- cnAB$gns |
|
2120 |
- colnames(np) <- cnAB$sns |
|
2121 |
- cnAB$NP <- np |
|
2122 |
- ##sampleNames(callSet) <- res$sns |
|
2123 |
- sampleNames(callSet) <- cnAB$sns |
|
2124 |
- cnrmaResult <- get("cnAB") |
|
2125 |
- } else cnrmaResult <- NULL |
|
2126 |
- } |
|
2127 |
- if(platform=="affymetrix"){ |
|
2128 |
- if(exists("cnrmaResult")){ |
|
2129 |
- cnrmaResult <- get("cnrmaResult") |
|
2130 |
- } else cnrmaResult <- NULL |
|
2131 |
- } |
|
2132 |
- crlmmResults <- list(snprmaResult=snprmaResult, |
|
2133 |
- cnrmaResult=cnrmaResult, |
|
2134 |
- callSet=callSet) |
|
2135 |
- |
|
2136 |
- if(!save.it){ |
|
2137 |
- message("Cleaning up") |
|
2138 |
- unlink(intensityFile) |
|
2139 |
- } |
|
2140 |
- return(crlmmResults) |
|
2141 |
-} |
|
2142 |
- |
|
2143 |
-## NOTE: THIS IS ALSO IN UTILS.R |
|
2144 |
-## validCdfNames <- function(){ |
|
2145 |
-## c("genomewidesnp6", |
|
2146 |
-## "genomewidesnp5", |
|
2147 |
-## "human370v1c", |
|
2148 |
-## "human370quadv3c", |
|
2149 |
-## "human550v3b", |
|
2150 |
-## "human650v3a", |
|
2151 |
-## "human610quadv1b", |
|
2152 |
-## "human660quadv1a", |
|
2153 |
-## "human1mduov3b") |
|
2154 |
-## } |
|
2155 |
- |
|
2156 |
-## NOTE: THIS IS ALSO IN UTILS.R |
|
2157 |
-## isValidCdfName <- function(cdfName){ |
|
2158 |
-## chipList <- validCdfNames() |
|
2159 |
-## result <- cdfName %in% chipList |
|
2160 |
-## if(!(result)){ |
|
2161 |
-## warning("cdfName must be one of the following: ", |
|
2162 |
-## chipList) |
|
2163 |
-## } |
|
2164 |
-## return(result) |
|
2165 |
-## } |
|
2166 |
- |
|
2167 | 1700 |
whichPlatform <- function(cdfName){ |
2168 | 1701 |
index <- grep("genomewidesnp", cdfName) |
2169 | 1702 |
if(length(index) > 0){ |
... | ... |
@@ -2316,36 +1849,6 @@ thresholdCopynumber <- function(object){ |
2316 | 1849 |
return(object) |
2317 | 1850 |
} |
2318 | 1851 |
|
2319 |
-cnOptions <- function( |
|
2320 |
- MIN.OBS=3, |
|
2321 |
- DF.PRIOR=50, |
|
2322 |
- bias.adj=FALSE, |
|
2323 |
- prior.prob=rep(1/4, 4), |
|
2324 |
- seed=123, |
|
2325 |
- verbose=TRUE, |
|
2326 |
- GT.CONF.THR=0.99, |
|
2327 |
- PHI.THR=2^6,##used in nonpolymorphic fxn for training |
|
2328 |
- nHOM.THR=5, ##used in nonpolymorphic fxn for training |
|
2329 |
- MIN.NU=2^3, |
|
2330 |
- MIN.PHI=2^3, |
|
2331 |
- THR.NU.PHI=TRUE, |
|
2332 |
- thresholdCopynumber=TRUE){ |
|
2333 |
- list( |
|
2334 |
- MIN.OBS=MIN.OBS, |
|
2335 |
- DF.PRIOR=DF.PRIOR, |
|
2336 |
- GT.CONF.THR=GT.CONF.THR, |
|
2337 |
- bias.adj=bias.adj, |
|
2338 |
- prior.prob=prior.prob, |
|
2339 |
- seed=seed, |
|
2340 |
- verbose=verbose, |
|
2341 |
- PHI.THR=PHI.THR, |
|
2342 |
- nHOM.THR=nHOM.THR, |
|
2343 |
- MIN.NU=MIN.NU, |
|
2344 |
- MIN.PHI=MIN.PHI, |
|
2345 |
- THR.NU.PHI=THR.NU.PHI, |
|
2346 |
- thresholdCopynumber=thresholdCopynumber) |
|
2347 |
-} |
|
2348 |
- |
|
2349 | 1852 |
##linear model parameters |
2350 | 1853 |
lm.parameters <- function(object, batch){##cnOptions){ |
2351 | 1854 |
fD <- fData(object) |
... | ... |
@@ -2661,23 +2164,27 @@ imputeCenter <- function(muA, muB, index.complete, index.missing){ |
2661 | 2164 |
|
2662 | 2165 |
imputeCenterX <- function(muA, muB, Ns, index.complete, MIN.OBS){ |
2663 | 2166 |
index1 <- which(Ns[, 1] == 0 & Ns[, 3] > MIN.OBS) |
2664 |
- X <- cbind(1, muA[index.complete[[1]], 3], muB[index.complete[[1]], 3]) |
|
2665 |
- Y <- cbind(1, muA[index.complete[[1]], 1], muB[index.complete[[1]], 1]) |
|
2666 |
- betahat <- solve(crossprod(X), crossprod(X,Y)) |
|
2667 |
- ##now with the incomplete SNPs |
|
2668 |
- X <- cbind(1, muA[index1, 3], muB[index1, 3]) |
|
2669 |
- mus <- X %*% betahat |
|
2670 |
- muA[index1, 1] <- mus[, 2] |
|
2671 |
- muB[index1, 1] <- mus[, 3] |
|
2167 |
+ if(length(index1) > 0){ |
|
2168 |
+ X <- cbind(1, muA[index.complete[[1]], 3], muB[index.complete[[1]], 3]) |
|
2169 |
+ Y <- cbind(1, muA[index.complete[[1]], 1], muB[index.complete[[1]], 1]) |
|
2170 |
+ betahat <- solve(crossprod(X), crossprod(X,Y)) |
|
2171 |
+ ##now with the incomplete SNPs |
|
2172 |
+ X <- cbind(1, muA[index1, 3], muB[index1, 3]) |
|
2173 |
+ mus <- X %*% betahat |
|
2174 |
+ muA[index1, 1] <- mus[, 2] |
|
2175 |
+ muB[index1, 1] <- mus[, 3] |
|
2176 |
+ } |
|
2672 | 2177 |
index1 <- which(Ns[, 3] == 0) |
2673 |
- X <- cbind(1, muA[index.complete[[2]], 1], muB[index.complete[[2]], 1]) |
|
2674 |
- Y <- cbind(1, muA[index.complete[[2]], 3], muB[index.complete[[2]], 3]) |
|
2675 |
- betahat <- solve(crossprod(X), crossprod(X,Y)) |
|
2676 |
- ##now with the incomplete SNPs |
|
2677 |
- X <- cbind(1, muA[index1, 1], muB[index1, 1]) |
|
2678 |
- mus <- X %*% betahat |
|
2679 |
- muA[index1, 3] <- mus[, 2] |
|
2680 |
- muB[index1, 3] <- mus[, 3] |
|
2178 |
+ if(length(index1) > 0){ |
|
2179 |
+ X <- cbind(1, muA[index.complete[[2]], 1], muB[index.complete[[2]], 1]) |
|
2180 |
+ Y <- cbind(1, muA[index.complete[[2]], 3], muB[index.complete[[2]], 3]) |
|
2181 |
+ betahat <- solve(crossprod(X), crossprod(X,Y)) |
|
2182 |
+ ##now with the incomplete SNPs |
|
2183 |
+ X <- cbind(1, muA[index1, 1], muB[index1, 1]) |
|
2184 |
+ mus <- X %*% betahat |
|
2185 |
+ muA[index1, 3] <- mus[, 2] |
|
2186 |
+ muB[index1, 3] <- mus[, 3] |
|
2187 |
+ } |
|
2681 | 2188 |
list(muA, muB) |
2682 | 2189 |
} |
2683 | 2190 |
|
... | ... |
@@ -3095,6 +2602,113 @@ biasAdj <- function(object, cnOptions, tmp.objects){ |
3095 | 2602 |
} |
3096 | 2603 |
|
3097 | 2604 |
|
2605 |
+bias1 <- function(idxBatch, |
|
2606 |
+ snpBatches, |
|
2607 |
+ index, |
|
2608 |
+ object, |
|
2609 |
+ normal, |
|
2610 |
+ emit, |
|
2611 |
+ prior.prob, |
|
2612 |
+ MIN.SAMPLES, |
|
2613 |
+ verbose){ |
|
2614 |
+ |
|
2615 |
+} |
|
2616 |
+ |
|
2617 |
+bias2 <- function(idxBatch, |
|
2618 |
+ snpBatches, |
|
2619 |
+ index, |
|
2620 |
+ object, |
|
2621 |
+ normal, |
|
2622 |
+ prior.prob, |
|
2623 |
+ MIN.SAMPLES, |
|
2624 |
+ verbose){ |
|
2625 |
+ open(object) |
|
2626 |
+ open(normal) |
|
2627 |
+ |
|
2628 |
+ nps <- snpBatches[[idxBatch]] |
|
2629 |
+ nuA <- lM(object)$nuA[nps, , drop=FALSE] |
|
2630 |
+ phiA <- lM(object)$phiA[nps, , drop=FALSE] |
|
2631 |
+ sig2A <- lM(object)$sig2A[nps, , drop=FALSE] |
|
2632 |
+ AA <- as.matrix(A(object)[nps, ]) |
|
2633 |
+ batches <- split(seq(along=batch(object)), batch(object)) |
|
2634 |
+ batches <- batches[sapply(batches, length) >= MIN.SAMPLES] |
|
2635 |
+ |
|
2636 |
+ cn.lik <- matrix(NA, length(nps)*ncol(object), 4) |
|
2637 |
+ argmax.cn <- emit[nps, ] |
|
2638 |
+ norm <- matrix(1L, length(nps), ncol(object)) |
|
2639 |
+ |
|
2640 |
+ for(k in batches){ |
|
2641 |
+ J <- match(unique(batch(object)[k]), unique(batch(object))) |
|
2642 |
+ lT <- log2(AA[, k]) |
|
2643 |
+ counter <- 1 ##state counter |
|
2644 |
+ for(CT in c(0, 1.5, 2, 2.5)){ |
|
2645 |
+ ##sds <- sqrt(sig2A[, J]*(CT==0) + sig2A[ , J]*(CT > 0)) |
|
2646 |
+ sds <- sqrt(sig2A[, J]) |
|
2647 |
+ means <- suppressWarnings(log2(nuA[, J]+CT*phiA[, J])) |
|
2648 |
+ lik <- log(dnorm(lT, mean=means, sd=sds)) |
|
2649 |
+ ##emit[[counter]][nps, ] <- tmp |
|
2650 |
+ cn.lik[, counter] <- as.numeric(lik) |
|
2651 |
+ counter <- counter+1 |
|
2652 |
+ } |
|
2653 |
+ outlier <- matrix(rowSums(cn.lik < -10) == 4, length(nps), ncol(object)) |
|
2654 |
+ argmax.cn.lik <- apply(cn.lik, 1, function(x) order(x, decreasing=TRUE)[1]) |
|
2655 |
+ argmax.cn <- matrix(argmax.cn.lik, length(nps), length(k)) |
|
2656 |
+ |
|
2657 |
+ isUp <- argmax.cn > 3 |
|
2658 |
+ prUp <- rowMeans(isUp) |
|
2659 |
+ |
|
2660 |
+ isDn <- argmax.cn < 3 |
|
2661 |
+ prDn <- rowMeans(isDn) |
|
2662 |
+ |
|
2663 |
+ index <- which(prUp > 0.05 & prUp > prDn) |
|
2664 |
+ ##if proportion up greater than 5%, trim the high cn est. |
|
2665 |
+ norm[index, k] <- argmax.cn[index, ] > 3 |
|
2666 |
+ |
|
2667 |
+ index <- which(prDn > 0.05 & prDn > prUp) |
|
2668 |
+ norm[index, k] <- argmax.cn[index, ] < 3 |
|
2669 |
+ norm[index, k] <- norm[index, k]*!outlier |
|
2670 |
+ } |
|
2671 |
+ normal[nps, ] <- norm |
|
2672 |
+ TRUE |
|
2673 |
+} |
|
2674 |
+ |
|
2675 |
+ |
|
2676 |
+biasAdjust <- function(object, prior.prob=rep(1/4, 4), MIN.SAMPLES=10, verbose=TRUE){ |
|
2677 |
+ load(file.path(ldPath(), "normal.rda")) |
|
2678 |
+ autosomeIndex.nps <- (1:nrow(object))[chromosome(object) < 23 & !isSnp(object) & !is.na(chromosome(object))] |
|
2679 |
+ |
|
2680 |
+## emit <- initializeBigMatrix("emit", |
|
2681 |
+## nrow(object), |
|
2682 |
+## ncol(object), |
|
2683 |
+## vmode="double") |
|
2684 |
+ if(verbose) message("Bias adjustment for nonpolymorphic loci on chromosomes 1-22.") |
|
2685 |
+ snpBatches <- splitIndicesByLength(autosomeIndex.nps, ocProbesets()) |
|
2686 |
+ ocLapply(seq(along=snpBatches), |
|
2687 |
+ bias2, |
|
2688 |
+ index=autosomeIndex.nps, |
|
2689 |
+ snpBatches=snpBatches, |
|
2690 |
+ object=object, |
|
2691 |
+ normal=normal, |
|
2692 |
+ prior.prob=prior.prob, |
|
2693 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
2694 |
+ verbose=verbose) |
|
2695 |
+ |
|
2696 |
+ if(verbose) message("Bias adjustment for polymorphic loci on chromosomes 1-22.") |
|
2697 |
+ autosomeIndex.snps <- (1:nrow(object))[chromosome(object) < 23 & isSnp(object) & !is.na(chromosome(object))] |
|
2698 |
+ snpBatches <- splitIndicesByLength(autosomeIndex.snps, ocProbesets()) |
|
2699 |
+ ocLapply(seq(along=snpBatches), |
|
2700 |
+ bias1, |
|
2701 |
+ index=autosomeIndex.snps, |
|
2702 |
+ snpBatches=snpBatches, |
|
2703 |
+ object=object, |
|
2704 |
+ normal=normal, |
|
2705 |
+ prior.prob=prior.prob, |
|
2706 |
+ emit=emit, |
|
2707 |
+ MIN.SAMPLES=MIN.SAMPLES, |
|
2708 |
+ verbose=verbose) |
|
2709 |
+} |
|
2710 |
+ |
|
2711 |
+ |
|
3098 | 2712 |
##biasAdjNP <- function(plateIndex, envir, priorProb){ |
3099 | 2713 |
biasAdjNP <- function(object, cnOptions, tmp.objects){ |
3100 | 2714 |
##batch <- unique(object$batch) |
22 | 23 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,62 @@ |
1 |
+\name{CNSetLM-class} |
|
2 |
+\Rdversion{1.1} |
|
3 |
+\docType{class} |
|
4 |
+\alias{CNSetLM-class} |
|
5 |
+\alias{[,CNSetLM-method} |
|
6 |
+\alias{lM,CNSetLM-method} |
|
7 |
+\alias{lM<-,CNSetLM,list_or_ffdf-method} |
|
8 |
+\alias{open,CNSetLM-method} |
|
9 |
+\alias{show,CNSetLM-method} |
|
10 |
+ |
|
11 |
+\title{CNSetLM class} |
|
12 |
+\description{Container for allele-specific copy number and linear model |
|
13 |
+parameters} |
|
14 |
+\section{Objects from the Class}{ |
|
15 |
+ Objects from the class can be created by calls of the |
|
16 |
+ form \code{ new("CNSetLM", CA=matrix(), CB=matrix(), alleleA=matrix(), alleleB=matrix(), call=matrix(), callProbability=matrix())} |
|
17 |
+ |
|
18 |
+} |
|
19 |
+\section{Slots}{ |
|
20 |
+ \describe{ |
|
21 |
+ \item{\code{lM}:}{Object of class \code{"list_or_ffdf"}} |
|
22 |
+ \item{\code{assayData}:}{Object of class \code{"AssayData"} } |
|
23 |
+ \item{\code{phenoData}:}{Object of class \code{"AnnotatedDataFrame"} } |
|
24 |
+ \item{\code{featureData}:}{Object of class \code{"AnnotatedDataFrame"} } |
|
25 |
+ \item{\code{experimentData}:}{Object of class \code{"MIAME"} } |
|
26 |
+ \item{\code{annotation}:}{Object of class \code{"character"} } |
|
27 |
+ \item{\code{protocolData}:}{Object of class \code{"AnnotatedDataFrame"} } |
|
28 |
+ \item{\code{.__classVersion__}:}{Object of class \code{"Versions"} } |
|
29 |
+ } |
|
30 |
+} |
|
31 |
+\section{Extends}{ |
|
32 |
+Class \code{"\linkS4class{CNSet}"}, directly. |
|
33 |
+Class \code{"\linkS4class{SnpSuperSet}"}, by class "CNSet", distance 2. |
|
34 |
+Class \code{"\linkS4class{AlleleSet}"}, by class "CNSet", distance 3. |
|
35 |
+Class \code{"\linkS4class{SnpSet}"}, by class "CNSet", distance 3. |
|
36 |
+Class \code{"\linkS4class{eSet}"}, by class "CNSet", distance 4. |
|
37 |
+Class \code{"\linkS4class{VersionedBiobase}"}, by class "CNSet", distance 5. |
|
38 |
+Class \code{"\linkS4class{Versioned}"}, by class "CNSet", distance 6. |
|
39 |
+} |
|
40 |
+\section{Methods}{ |
|
41 |
+ \describe{ |
|
42 |
+ \item{[}{\code{signature(x = "CNSetLM")}: subset \code{CNSetLM} objects} |
|
43 |
+ \item{lM}{\code{signature(object = "CNSetLM")}: Extract list or |
|
44 |
+ ffdf object containing linear model parameters} |
|
45 |
+## \item{lM<-}{\code{signature(object = "CNSetLM", value = "list_or_ffdf")}: ... } |
|
46 |
+ \item{open}{\code{signature(con = "CNSetLM")}: opens file connects |
|
47 |
+ to ff objects for assayData elements and linear model parameters} |
|
48 |
+ \item{show}{\code{signature(object = "CNSetLM")}: print method |
|
49 |
+ for the class } |
|
50 |
+ } |
|
51 |
+} |
|
52 |
+\author{ R. Scharpf} |
|
53 |
+\seealso{ |
|
54 |
+ \code{\linkS4class{SnpSuperSet}}, \code{\linkS4class{CNSet}} |
|
55 |
+} |
|
56 |
+% ~~objects to See Also as \code{\link{~~fun~~}}, ~~~ |
|
57 |
+% or \code{\linkS4class{CLASSNAME}} for links to other classes |
|
58 |
+%} |
|
59 |
+\examples{ |
|
60 |
+showClass("CNSetLM") |
|
61 |
+} |
|
62 |
+\keyword{classes} |
0 | 63 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+\name{batch} |
|
2 |
+\alias{batch} |
|
3 |
+\alias{batch,eSet-method} |
|
4 |
+\title{ |
|
5 |
+ Function to extract batch information. |
|
6 |
+} |
|
7 |
+\description{ |
|
8 |
+ Checks the phenoData and protocolData for a variable named |
|
9 |
+ batch and, if present, returns the vector. |
|
10 |
+} |
|
11 |
+\usage{ |
|
12 |
+batch(object) |
|
13 |
+} |
|
14 |
+\arguments{ |
|
15 |
+ \item{object}{ |
|
16 |
+ An object extending the \code{eSet} class. |
|
17 |
+} |
|
18 |
+} |
|
19 |
+\details{ |
|
20 |
+ |
|
21 |
+ For copy number estimation, a batch variable must be |
|
22 |
+ specified. Currently, we suggest storing this variable in the |
|
23 |
+ protocolData. |
|
24 |
+ |
|
25 |
+ Batch represents groups of samples that were processed (DNA |
|
26 |
+ preparation and collection, PCR amplification, scan date) at |
|
27 |
+ similar times. Often, the 96 well chemistry plate or scan date |
|
28 |
+ is a useful surrogate for batch. |
|
29 |
+ |
|
30 |
+} |
|
31 |
+\value{ |
|
32 |
+ |
|
33 |
+ Vector indicating batch. |
|
34 |
+ |
|
35 |
+} |
|
36 |
+ |
|
37 |
+\author{ |
|
38 |
+R. Scharpf |
|
39 |
+} |
|
40 |
+ |
|
41 |
+\seealso{ |
|
42 |
+ \code{\link{genotype}}, \code{\link{genotype2}} |
|
43 |
+} |
|
44 |
+\keyword{manip} |
|
45 |
+ |
... | ... |
@@ -1,29 +1,53 @@ |
1 | 1 |
\name{crlmmCopynumber} |
2 | 2 |
\alias{crlmmCopynumber} |
3 |
+\alias{crlmmCopynumber2} |
|
3 | 4 |
\title{Locus- and allele-specific estimation of copy number} |
4 | 5 |
\description{ |
5 | 6 |
Locus- and allele-specific estimation of copy number. |
6 | 7 |
} |
7 | 8 |
\usage{ |
8 |
-crlmmCopynumber(object, chromosome = 1:23, which.batches, MIN.SAMPLES = 10, SNRMin = 5, MIN.OBS = 3, DF.PRIOR = 50, bias.adj = FALSE, prior.prob = rep(1/4, 4), seed = 1, verbose = TRUE, GT.CONF.THR = 0.99, PHI.THR = 2^6, nHOM.THR = 5, MIN.NU = 2^3, MIN.PHI = 2^3, THR.NU.PHI = TRUE, thresholdCopynumber = TRUE) |
|
9 |
+crlmmCopynumber(object, chromosome = 1:23, which.batches, MIN.SAMPLES |
|
10 |
+= 10, SNRMin = 5, MIN.OBS = 3, DF.PRIOR = 50, bias.adj = FALSE, |
|
11 |
+prior.prob = rep(1/4, 4), seed = 1, verbose = TRUE, GT.CONF.THR = |
|
12 |
+0.99, PHI.THR = 2^6, nHOM.THR = 5, MIN.NU = 2^3, MIN.PHI = 2^3, |
|
13 |
+THR.NU.PHI = TRUE, thresholdCopynumber = TRUE) |
|
14 |
+ |
|
15 |
+crlmmCopynumber2(object, which.batches, MIN.SAMPLES = 10, SNRMin = 5, |
|
16 |
+MIN.OBS = 1, DF.PRIOR = 50, bias.adj = FALSE, prior.prob = rep(1/4, |
|
17 |
+4), seed = 1, verbose = TRUE, GT.CONF.THR = 0.99, PHI.THR = 2^6, |
|
18 |
+nHOM.THR = 5, MIN.NU = 2^3, MIN.PHI = 2^3, THR.NU.PHI = TRUE, |
|
19 |
+thresholdCopynumber = TRUE) |
|
20 |
+ |
|
9 | 21 |
} |
10 | 22 |
\arguments{ |
11 | 23 |
\item{object}{object of class \code{SnpSuperSet}. |
12 | 24 |
} |
25 |
+ |
|
13 | 26 |
\item{which.batches}{ Character vector with length equal to the number of |
14 | 27 |
samples. Used to adjust for batch effects. Chemistry plate or |
15 | 28 |
date often work well. See examples. |
29 |
+ |
|
30 |
+ Ignored in crlmmCopynumber2. |
|
31 |
+ |
|
16 | 32 |
} |
33 |
+ |
|
17 | 34 |
\item{chromosome}{Numeric vector indicating which chromosomes to |
18 | 35 |
process (length <= 23). For chromosome X, use 23. A copy number |
19 | 36 |
method for chromosome Y is not yet available. |
20 | 37 |
} |
38 |
+ |
|
21 | 39 |
\item{MIN.SAMPLES}{ 'Integer'. The minimum number of samples in a |
22 |
- batch. Bathes with fewer than MIN.SAMPLES are skipped. |
|
40 |
+ batch. Bathes with fewer than MIN.SAMPLES are skipped. Therefore, |
|
41 |
+ samples in batches with fewer than MIN.SAMPLES have NA's for the |
|
42 |
+ allele-specific copy number and NA's for the linear model |
|
43 |
+ parameters. |
|
44 |
+ |
|
23 | 45 |
} |
46 |
+ |
|
24 | 47 |
\item{SNRMin}{ Samples with low signal to noise ratios are |
25 | 48 |
excluded. |
26 | 49 |
} |
50 |
+ |
|
27 | 51 |
\item{MIN.OBS}{ |
28 | 52 |
|
29 | 53 |
For genotypes with fewer than \code{MIN.OBS}, the within-genotype |
... | ... |
@@ -56,6 +80,8 @@ crlmmCopynumber(object, chromosome = 1:23, which.batches, MIN.SAMPLES = 10, SNRM |
56 | 80 |
of the samples have a copy number alteration. For additional |
57 | 81 |
information, see Scharpf et al., 2009. |
58 | 82 |
|
83 |
+ This argument is ignored in crlmmCopynumber2. |
|
84 |
+ |
|
59 | 85 |
} |
60 | 86 |
\item{prior.prob}{ |
61 | 87 |
|
... | ... |
@@ -64,10 +90,9 @@ crlmmCopynumber(object, chromosome = 1:23, which.batches, MIN.SAMPLES = 10, SNRM |
64 | 90 |
normal copy number, and amplification, respectively. |
65 | 91 |
|
66 | 92 |
} |
67 |
- \item{seed}{ Seed for sampling. |
|
68 |
-} |
|
69 |
- \item{verbose}{ Logical. |
|
70 |
-} |
|
93 |
+ \item{seed}{ Seed for random number generation.} |
|
94 |
+ |
|
95 |
+ \item{verbose}{ Logical. } |
|
71 | 96 |
|
72 | 97 |
\item{GT.CONF.THR}{ |
73 | 98 |
|
... | ... |
@@ -101,20 +126,39 @@ crlmmCopynumber(object, chromosome = 1:23, which.batches, MIN.SAMPLES = 10, SNRM |
101 | 126 |
\code{MIN.PHI} are ignored. |
102 | 127 |
} |
103 | 128 |
\item{thresholdCopynumber}{ |
129 |
+ |
|
104 | 130 |
If \code{TRUE}, allele-specific number estimates are truncated. |
105 | 131 |
Values less than 0.05 are assigned the value 0.05; values exceeding |
106 | 132 |
5 are assigned the value 5. |
133 |
+ |
|
134 |
+ Ignored in crlmmCopynumber2. Extreme values are automatically |
|
135 |
+ truncated. |
|
136 |
+ |
|
137 |
+} |
|
107 | 138 |
} |
139 |
+ |
|
140 |
+\details{ |
|
141 |
+ |
|
142 |
+ The function crlmmCopynumber uses matrices instead of ff |
|
143 |
+ objects if the ff library is not loaded. |
|
144 |
+ |
|
145 |
+ The function crlmmCopynumber2 allows parallel processing via |
|
146 |
+ and requires large data support via the ff package. |
|
147 |
+ |
|
148 |
+ We plan to phase out crlmmCopynumber and replace this function |
|
149 |
+ by crlmmCopynumber2. |
|
150 |
+ |
|
108 | 151 |
} |
109 | 152 |
\author{R. Scharpf} |
110 | 153 |
\examples{ |
111 | 154 |
## data(example.callSet) |
112 |
-## cnSet <- crlmmCopynumber(example.callSet) |
|
155 |
+## cnSet <- crlmmCopynumber2(example.callSet) |
|
113 | 156 |
## total copy number |
114 | 157 |
## cn <- copyNumber(cnSet) |
115 | 158 |
## allele-specific copy number |
116 |
-## ca <- CA(cnSet) ## A dosage |
|
117 |
-## cb <- CB(cnSet) ## B dosage |
|
159 |
+## ca <- CA(cnSet)/100 ## A dosage |
|
160 |
+## cb <- CB(cnSet)/100 ## B dosage |
|
118 | 161 |
} |
119 | 162 |
% Add one or more standard keywords, see file 'KEYWORDS' in the |
120 | 163 |
% R documentation directory. |
164 |
+\keyword{manip} |
... | ... |
@@ -1,5 +1,6 @@ |
1 | 1 |
\name{genotype} |
2 | 2 |
\alias{genotype} |
3 |
+\alias{genotype2} |
|
3 | 4 |
\title{ |
4 | 5 |
Preprocessing and genotyping of Affymetrix arrays. |
5 | 6 |
} |
... | ... |
@@ -7,11 +8,18 @@ |
7 | 8 |
Preprocessing and genotyping of Affymetrix arrays. |
8 | 9 |
} |
9 | 10 |
\usage{ |
10 |
-genotype(filenames, cdfName, mixtureSampleSize = 10^5, eps = 0.1, verbose = TRUE, seed = 1, sns, copynumber = FALSE, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
11 |
+genotype(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps = |
|
12 |
+0.1, verbose = TRUE, seed = 1, sns, copynumber = FALSE, probs = |
|
13 |
+rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, |
|
14 |
+gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
15 |
+ |
|
16 |
+genotype2(filenames, cdfName, batch, mixtureSampleSize = 10^5, eps = 0.1, verbose = TRUE, seed = 1, sns, copynumber = FALSE, probs = rep(1/3, 3), DF = 6, SNRMin = 5, recallMin = 10, recallRegMin = 1000, gender = NULL, returnParams = TRUE, badSNP = 0.7) |
|
11 | 17 |
} |
12 | 18 |
\arguments{ |
13 | 19 |
\item{filenames}{ complete path to CEL files} |
14 |
- \item{cdfName}{ annotation package (see also \code{validCdfNames})} |
|
20 |
+ \item{cdfName}{ annotation package (see also |
|
21 |
+ \code{validCdfNames})} |
|
22 |
+ \item{batch}{ batch variable. See details. } |
|
15 | 23 |
\item{mixtureSampleSize}{ Sample size to be use when fitting the mixture model.} |
16 | 24 |
\item{eps}{ Stop criteria.} |
17 | 25 |
\item{verbose}{ Logical. Whether to print descriptive messages during processing.} |
... | ... |
@@ -30,6 +38,23 @@ genotype(filenames, cdfName, mixtureSampleSize = 10^5, eps = 0.1, verbose = TRUE |
30 | 38 |
\item{badSNP}{'numeric'. Threshold to flag as bad SNP (affects batchQC)} |
31 | 39 |
} |
32 | 40 |
|
41 |
+\details{ |
|
42 |
+ |
|
43 |
+ For large datasets it is important to utilize the large data |
|
44 |
+ support by installing and loading the ff package before |
|
45 |
+ calling the genotype or genotype2 function. |
|
46 |
+ |
|
47 |
+ Currently, two functions are provided for preprocessing and |
|
48 |
+ genotyping Affymetrix platforms: genotype and genotype2. For |
|
49 |
+ small datasets, genotype and genotype2 are identical. For |
|
50 |
+ large datasets, genotype2 provides large data support (via ff) |
|
51 |
+ and permits the use of clusters or multiple cores (via snow |
|
52 |
+ package) to speed up genotyping (similar to \code{crlmm2}). |
|
53 |
+ The \code{genotype} function will be phased out in the future |
|
54 |
+ and replaced by \code{genotype2}. |
|
55 |
+ |
|
56 |
+} |
|
57 |
+ |
|
33 | 58 |
\value{ A \code{SnpSuperSet} instance.} |
34 | 59 |
\references{ |
35 | 60 |
|
... | ... |
@@ -51,15 +76,33 @@ this will greatly reduce the RAM required for big jobs. See |
51 | 76 |
\seealso{ |
52 | 77 |
\code{\link{snprma}}, \code{\link{crlmm}}, |
53 | 78 |
\code{\link[oligoClasses]{ocSamples}}, |
54 |
- \code{\link[oligoClasses]{ldOpts}} |
|
79 |
+ \code{\link[oligoClasses]{ldOpts}}, |
|
80 |
+ \code{\link{batch}}, |
|
81 |
+ \code{\link{crlmmCopynumber}} |
|
55 | 82 |
} |
56 | 83 |
\examples{ |
57 |
-if (require(genomewidesnp6Crlmm) & require(hapmapsnp6)){ |
|
84 |
+if (require(ff) & require(genomewidesnp6Crlmm) & require(hapmapsnp6)){ |
|
85 |
+ |
|
58 | 86 |
path <- system.file("celFiles", package="hapmapsnp6") |
59 | 87 |
## the filenames with full path... |
60 | 88 |
## very useful when genotyping samples not in the working directory |
61 | 89 |
cels <- list.celfiles(path, full.names=TRUE) |
62 |
- (crlmmOutput <- genotype(cels, cdfName="genomewidesnp6")) |
|
90 |
+ |
|
91 |
+ ## To use less RAM, specify a smaller argument to ocProbesets |
|
92 |
+ ocProbesets(50e3) |
|
93 |
+ (cnSet <- genotype2(cels, cdfName="genomewidesnp6", |
|
94 |
+ copynumber=TRUE)) |
|
95 |
+ |
|
96 |
+ dim(cnSet) |
|
97 |
+ table(isSnp(cnSet)) |
|
98 |
+ |
|
99 |
+## The above is a trivial example. Typically you may have a large |
|
100 |
+## number of cel files, many of which were processed at different |
|
101 |
+## times. For such datasets, it is important to set a batch |
|
102 |
+## variable. If not specified, the scan date of the file is used |
|
103 |
+## as the batch variable. |
|
104 |
+ batch(cnSet) |
|
105 |
+ protocolData(cnSet)$ScanDate |
|
63 | 106 |
} |
64 | 107 |
} |
65 | 108 |
\keyword{ classif } |