git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@54169 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: crlmm |
2 | 2 |
Type: Package |
3 | 3 |
Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays. |
4 |
-Version: 1.9.21 |
|
4 |
+Version: 1.9.22 |
|
5 | 5 |
Date: 2010-12-10 |
6 | 6 |
Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry |
7 | 7 |
Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU> |
... | ... |
@@ -35,6 +35,7 @@ Collate: AllGenerics.R |
35 | 35 |
methods-SnpSuperSet.R |
36 | 36 |
cnrma-functions.R |
37 | 37 |
crlmm-functions.R |
38 |
+ crlmmGT2.R |
|
38 | 39 |
crlmm-illumina.R |
39 | 40 |
snprma-functions.R |
40 | 41 |
plot-methods.R |
... | ... |
@@ -202,7 +202,7 @@ genotype <- function(filenames, |
202 | 202 |
sns=sampleNames(cnSet), |
203 | 203 |
seed=seed, |
204 | 204 |
verbose=verbose) |
205 |
- tmp <- rscrlmmGT2(A=calls(cnSet), |
|
205 |
+ tmp <- crlmmGT2(A=calls(cnSet), |
|
206 | 206 |
B=snpCallProbability(cnSet), |
207 | 207 |
SNR=SNR, |
208 | 208 |
mixtureParams=mixtureParams, |
... | ... |
@@ -2384,318 +2384,55 @@ posteriorMean.snp <- function(stratum, object, index.list, CN, |
2384 | 2384 |
|
2385 | 2385 |
|
2386 | 2386 |
|
2387 |
-rscrlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
2388 |
- col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
2389 |
- SNRMin=5, recallMin=10, recallRegMin=1000, |
|
2390 |
- gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
2391 |
- returnParams=FALSE, badSNP=.7, snp.names){ |
|
2392 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
2393 |
- stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
2394 |
- open(SNR) |
|
2395 |
- open(A) |
|
2396 |
- open(B) |
|
2397 |
- open(mixtureParams) |
|
2398 |
- ## expect objects to be ff |
|
2399 |
- keepIndex <- which( SNR[] > SNRMin) |
|
2400 |
- if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
2401 |
- if(is.null(rownames(A))){ |
|
2402 |
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
2403 |
- gns <- getVarInEnv("gns", .crlmmPkgEnv) |
|
2404 |
- stopifnot(nrow(A) == length(gns)) |
|
2405 |
- index <- seq(length=nrow(A)) |
|
2406 |
- } |
|
2407 |
- if(!missing(snp.names)){ |
|
2408 |
- stopifnot(!is.null(rownames(A))) |
|
2409 |
- ##verify that A has only snps. otherwise, calling function must pass rownames |
|
2410 |
- index <- match(snp.names, rownames(A)) |
|
2411 |
- } |
|
2412 |
- snpBatches <- splitIndicesByLength(index, ocProbesets()) |
|
2413 |
- NR <- length(unlist(snpBatches)) |
|
2414 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
2415 |
- NC <- ncol(A) |
|
2416 |
- ## |
|
2417 |
- if(verbose) message("Loading annotations.") |
|
2418 |
- obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
2419 |
- obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
2420 |
- ## this is toget rid of the 'no visible binding' notes |
|
2421 |
- ## variable definitions |
|
2422 |
- XIndex <- getVarInEnv("XIndex") |
|
2423 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
2424 |
- YIndex <- getVarInEnv("YIndex") |
|
2425 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
2426 |
- theKnots <- getVarInEnv("theKnots") |
|
2427 |
- regionInfo <- getVarInEnv("regionInfo") |
|
2428 |
- params <- getVarInEnv("params") |
|
2429 |
- rm(list=c(obj1, obj2), envir=.crlmmPkgEnv) |
|
2430 |
- rm(obj1, obj2) |
|
2431 |
- ## |
|
2432 |
- ## IF gender not provide, we predict |
|
2433 |
- ## FIXME: XIndex may be greater than ocProbesets() |
|
2434 |
- if(is.null(gender)){ |
|
2435 |
- if(verbose) message("Determining gender.") |
|
2436 |
- ## XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
2437 |
- XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm") |
|
2438 |
- XMedian <- unlist(XMedian) |
|
2439 |
- if(sum(SNR[] > SNRMin)==1){ |
|
2440 |
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
2441 |
- }else{ |
|
2442 |
- gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]] |
|
2443 |
- } |
|
2444 |
- } |
|
2445 |
- ## |
|
2446 |
- Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
2447 |
- cIndexes <- list(keepIndex, |
|
2448 |
- keepIndex[which(gender[keepIndex]==2)], |
|
2449 |
- keepIndex[which(gender[keepIndex]==1)]) |
|
2450 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
2451 |
- ## call C |
|
2452 |
- fIndex <- which(gender==2) |
|
2453 |
- mIndex <- which(gender==1) |
|
2454 |
- ## different here |
|
2455 |
- ## use gtypeCallerR in batches |
|
2456 |
- ##snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
2457 |
- newparamsBatch <- vector("list", length(snpBatches)) |
|
2458 |
- process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
2459 |
- YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
2460 |
- params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){ |
|
2461 |
- open(A) |
|
2462 |
- open(B) |
|
2463 |
- open(mixtureParams) |
|
2464 |
- snps <- snpBatches[[idxBatch]] |
|
2465 |
- rSnps <- range(snps) |
|
2466 |
- last <- (idxBatch-1)*batchSize |
|
2467 |
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
2468 |
- XIndex[XIndex %in% snps]-last, |
|
2469 |
- YIndex[YIndex %in% snps]-last) |
|
2470 |
- IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
2471 |
- tmpA <- as.matrix(A[snps,]) |
|
2472 |
- tmpB <- as.matrix(B[snps,]) |
|
2473 |
- ## newparamsBatch[[idxBatch]] |
|
2474 |
- tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
2475 |
- params[["centers"]][snps,], |
|
2476 |
- params[["scales"]][snps,], |
|
2477 |
- params[["N"]][snps,], |
|
2478 |
- IndexesBatch, cIndexes, |
|
2479 |
- sapply(IndexesBatch, length), |
|
2480 |
- sapply(cIndexes, length), SMEDIAN, |
|
2481 |
- theKnots, mixtureParams[], DF, probs, 0.025) |
|
2482 |
- rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last) |
|
2483 |
- gc(verbose=FALSE) |
|
2484 |
- close(A) |
|
2485 |
- close(B) |
|
2486 |
- close(mixtureParams) |
|
2487 |
- tmp |
|
2488 |
- } |
|
2489 |
- ## |
|
2490 |
- newparamsBatch <- ocLapply(seq(along=snpBatches), process1, |
|
2491 |
- snpBatches=snpBatches, |
|
2492 |
- autosomeIndex=autosomeIndex, XIndex=XIndex, |
|
2493 |
- YIndex=YIndex, A=A, B=B, |
|
2494 |
- mixtureParams=mixtureParams, fIndex=fIndex, |
|
2495 |
- mIndex=mIndex, params=params, |
|
2496 |
- cIndexes=cIndexes, SMEDIAN=SMEDIAN, |
|
2497 |
- theKnots=theKnots, DF=DF, probs=probs, |
|
2498 |
- batchSize=ocProbesets()) |
|
2499 |
- newparams <- vector("list", 3) |
|
2500 |
- names(newparams) <- c("centers", "scales", "N") |
|
2501 |
- newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1)) |
|
2502 |
- newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2)) |
|
2503 |
- newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3)) |
|
2504 |
- rm(newparamsBatch); gc(verbose=FALSE) |
|
2505 |
- if(verbose) message("Done.") |
|
2506 |
- if(verbose) message("Estimating recalibration parameters.") |
|
2507 |
- d <- newparams[["centers"]] - params$centers |
|
2508 |
- ## |
|
2509 |
- ##regression |
|
2510 |
- Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
2511 |
- newparams[["N"]][, 2], |
|
2512 |
- newparams[["N"]][, 3]) > recallMin & |
|
2513 |
- !apply(regionInfo, 1, any)), |
|
2514 |
- autosomeIndex) |
|
2515 |
- if(length(Index) < recallRegMin){ |
|
2516 |
- warning("Recalibration not possible. Possible cause: small sample size.") |
|
2517 |
- newparams <- params |
|
2518 |
- dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
2519 |
- SS <- matrix(Inf, 3, 3) |
|
2520 |
- DD <- 0 |
|
2521 |
- }else{ |
|
2522 |
- data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
2523 |
- names(data4reg) <- c("AA", "AB", "BB") |
|
2524 |
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
2525 |
- c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
2526 |
- coef(lm(BB~AA*AB, data=data4reg))) |
|
2527 |
- rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
2528 |
- rm(data4reg) |
|
2529 |
- ## |
|
2530 |
- minN <- 3 |
|
2531 |
- newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
2532 |
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
2533 |
- if(verbose) message("Filling out empty centers", appendLF=FALSE) |
|
2534 |
- for(i in Index){ |
|
2535 |
- if(verbose) if(i%%10000==0) message(".", appendLF=FALSE) |
|
2536 |
- mu <- newparams[["centers"]][i, ] |
|
2537 |
- j <- which(is.na(mu)) |
|
2538 |
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
2539 |
- rm(mu, j) |
|
2540 |
- } |
|
2541 |
- ## |
|
2542 |
- ##remaing NAs are made like originals |
|
2543 |
- if(length(YIndex)>0){ |
|
2544 |
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
2545 |
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
2546 |
- } |
|
2547 |
- snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0) |
|
2548 |
- snps2keep <- setdiff(autosomeIndex, snps2ignore) |
|
2549 |
- rm(snps2ignore) |
|
2550 |
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
2551 |
- if(verbose) cat("\n") |
|
2552 |
- ## |
|
2553 |
- if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE) |
|
2554 |
- GG <- DD <- newparams[["centers"]] - params[["centers"]] |
|
2555 |
- DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
2556 |
- SS <- cov(DD[autosomeIndex, ]) |
|
2557 |
- SSI <- solve(SS) |
|
2558 |
- dev <- vector("numeric", nrow(DD)) |
|
2559 |
- if(length(YIndex)){ |
|
2560 |
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
2561 |
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
2562 |
- ##Now Y (only two params) |
|
2563 |
- SSY <- SS[c(1, 3), c(1, 3)] |
|
2564 |
- SSI <- solve(SSY) |
|
2565 |
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
2566 |
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
2567 |
- } else { |
|
2568 |
- dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
2569 |
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
2570 |
- } |
|
2571 |
- } |
|
2572 |
- if (verbose) message("OK") |
|
2573 |
- ## |
|
2574 |
- ## BC: must keep SD |
|
2575 |
- params[-2] <- newparams[-2] |
|
2576 |
- rm(newparams) |
|
2577 |
- gc(verbose=FALSE) |
|
2578 |
- ## |
|
2579 |
- if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE) |
|
2580 |
- ## |
|
2581 |
- ## ################### |
|
2582 |
- ## ## MOVE TO C####### |
|
2583 |
- ## |
|
2584 |
- ## running in batches |
|
2585 |
- process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
2586 |
- YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
2587 |
- params, cIndexes, SMEDIAN, theKnots, DF, probs, |
|
2588 |
- regionInfo, batchSize){ |
|
2589 |
- open(A) |
|
2590 |
- open(B) |
|
2591 |
- open(mixtureParams) |
|
2592 |
- snps <- snpBatches[[idxBatch]] |
|
2593 |
- tmpA <- as.matrix(A[snps,]) |
|
2594 |
- tmpB <- as.matrix(B[snps,]) |
|
2595 |
- rSnps <- range(snps) |
|
2596 |
- last <- (idxBatch-1)*batchSize |
|
2597 |
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
2598 |
- XIndex[XIndex %in% snps]-last, |
|
2599 |
- YIndex[YIndex %in% snps]-last) |
|
2600 |
- IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
2601 |
- ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
2602 |
- params[["centers"]][snps,], |
|
2603 |
- params[["scales"]][snps,], |
|
2604 |
- params[["N"]][snps,], |
|
2605 |
- IndexesBatch, cIndexes, |
|
2606 |
- sapply(IndexesBatch, length), |
|
2607 |
- sapply(cIndexes, length), |
|
2608 |
- SMEDIAN, theKnots, mixtureParams[], |
|
2609 |
- DF, probs, 0.025, |
|
2610 |
- which(regionInfo[snps, 2]), |
|
2611 |
- which(regionInfo[snps, 1])) |
|
2612 |
- A[snps,] <- tmpA |
|
2613 |
- B[snps,] <- tmpB |
|
2614 |
- rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last) |
|
2615 |
- gc(verbose=FALSE) |
|
2616 |
- close(A) |
|
2617 |
- close(B) |
|
2618 |
- close(mixtureParams) |
|
2619 |
- } |
|
2620 |
- ## |
|
2621 |
- ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches, |
|
2622 |
- autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex, |
|
2623 |
- A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex, |
|
2624 |
- mIndex=mIndex, params=params, cIndexes=cIndexes, |
|
2625 |
- SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs, |
|
2626 |
- regionInfo=regionInfo, batchSize=ocProbesets()) |
|
2627 |
- ## END MOVE TO C####### |
|
2628 |
- ## ################## |
|
2629 |
- ## |
|
2630 |
- dev <- dev/(dev+1/383) |
|
2631 |
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
2632 |
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
2633 |
- ## |
|
2634 |
- if(length(Index) >= recallRegMin){ |
|
2635 |
- tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
|
2636 |
- tmpSnpQc <- dev[autosomeIndex] |
|
2637 |
- SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
|
2638 |
- batchQC <- mean(diag(SS)) |
|
2639 |
- }else{ |
|
2640 |
- SS <- matrix(0, 3, 3) |
|
2641 |
- batchQC <- Inf |
|
2642 |
- } |
|
2643 |
- ## |
|
2644 |
- if(verbose) message("Done.") |
|
2645 |
- if (returnParams){ |
|
2646 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
2647 |
- }else{ |
|
2648 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
2649 |
- } |
|
2650 |
-} |
|
2651 | 2387 |
|
2652 |
-crlmm2.2 <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
2653 |
- probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
2654 |
- save.it=FALSE, load.it=FALSE, intensityFile, |
|
2655 |
- mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
|
2656 |
- cdfName, sns, recallMin=10, recallRegMin=1000, |
|
2657 |
- returnParams=FALSE, badSNP=.7){ |
|
2658 |
- if ((load.it || save.it) && missing(intensityFile)) |
|
2659 |
- stop("'intensityFile' is missing, and you chose either load.it or save.it") |
|
2660 |
- if (missing(sns)) sns <- basename(filenames) |
|
2661 |
- if (!missing(intensityFile)) |
|
2662 |
- if (load.it & !file.exists(intensityFile)){ |
|
2663 |
- load.it <- FALSE |
|
2664 |
- message("File ", intensityFile, " does not exist.") |
|
2665 |
- message("Not loading it, but running SNPRMA from scratch.") |
|
2666 |
- } |
|
2667 |
- if (!load.it){ |
|
2668 |
- res <- snprma2(filenames, fitMixture=TRUE, |
|
2669 |
- mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
2670 |
- eps=eps, cdfName=cdfName, sns=sns) |
|
2671 |
- open(res[["A"]]) |
|
2672 |
- open(res[["B"]]) |
|
2673 |
- open(res[["SNR"]]) |
|
2674 |
- open(res[["mixtureParams"]]) |
|
2675 |
- if(save.it){ |
|
2676 |
- t0 <- proc.time() |
|
2677 |
- save(res, file=intensityFile) |
|
2678 |
- t0 <- proc.time()-t0 |
|
2679 |
- if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".") |
|
2680 |
- } |
|
2681 |
- }else{ |
|
2682 |
- if (verbose) message("Loading ", intensityFile, ".") |
|
2683 |
- obj <- load(intensityFile) |
|
2684 |
- if (verbose) message("Done.") |
|
2685 |
- if (obj != "res") |
|
2686 |
- stop("Object in ", intensityFile, " seems to be invalid.") |
|
2687 |
- } |
|
2688 |
- if(row.names) row.names=res$gns else row.names=NULL |
|
2689 |
- if(col.names) col.names=res$sns else col.names=NULL |
|
2690 |
- res2 <- rscrlmmGT2(res[["A"]], res[["B"]], res[["SNR"]], |
|
2691 |
- res[["mixtureParams"]], res[["cdfName"]], |
|
2692 |
- gender=gender, row.names=row.names, |
|
2693 |
- col.names=col.names, recallMin=recallMin, |
|
2694 |
- recallRegMin=1000, SNRMin=SNRMin, |
|
2695 |
- returnParams=returnParams, badSNP=badSNP, |
|
2696 |
- verbose=verbose) |
|
2697 |
- |
|
2698 |
- res2[["SNR"]] <- res[["SNR"]] |
|
2699 |
- res2[["SKW"]] <- res[["SKW"]] |
|
2700 |
- return(list2SnpSet(res2, returnParams=returnParams)) |
|
2701 |
-} |
|
2388 |
+## used for testing |
|
2389 |
+##crlmm2.2 <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
2390 |
+## probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
2391 |
+## save.it=FALSE, load.it=FALSE, intensityFile, |
|
2392 |
+## mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, |
|
2393 |
+## cdfName, sns, recallMin=10, recallRegMin=1000, |
|
2394 |
+## returnParams=FALSE, badSNP=.7){ |
|
2395 |
+## if ((load.it || save.it) && missing(intensityFile)) |
|
2396 |
+## stop("'intensityFile' is missing, and you chose either load.it or save.it") |
|
2397 |
+## if (missing(sns)) sns <- basename(filenames) |
|
2398 |
+## if (!missing(intensityFile)) |
|
2399 |
+## if (load.it & !file.exists(intensityFile)){ |
|
2400 |
+## load.it <- FALSE |
|
2401 |
+## message("File ", intensityFile, " does not exist.") |
|
2402 |
+## message("Not loading it, but running SNPRMA from scratch.") |
|
2403 |
+## } |
|
2404 |
+## if (!load.it){ |
|
2405 |
+## res <- snprma2(filenames, fitMixture=TRUE, |
|
2406 |
+## mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
2407 |
+## eps=eps, cdfName=cdfName, sns=sns) |
|
2408 |
+## open(res[["A"]]) |
|
2409 |
+## open(res[["B"]]) |
|
2410 |
+## open(res[["SNR"]]) |
|
2411 |
+## open(res[["mixtureParams"]]) |
|
2412 |
+## if(save.it){ |
|
2413 |
+## t0 <- proc.time() |
|
2414 |
+## save(res, file=intensityFile) |
|
2415 |
+## t0 <- proc.time()-t0 |
|
2416 |
+## if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".") |
|
2417 |
+## } |
|
2418 |
+## }else{ |
|
2419 |
+## if (verbose) message("Loading ", intensityFile, ".") |
|
2420 |
+## obj <- load(intensityFile) |
|
2421 |
+## if (verbose) message("Done.") |
|
2422 |
+## if (obj != "res") |
|
2423 |
+## stop("Object in ", intensityFile, " seems to be invalid.") |
|
2424 |
+## } |
|
2425 |
+## if(row.names) row.names=res$gns else row.names=NULL |
|
2426 |
+## if(col.names) col.names=res$sns else col.names=NULL |
|
2427 |
+## res2 <- crlmmGT2(res[["A"]], res[["B"]], res[["SNR"]], |
|
2428 |
+## res[["mixtureParams"]], res[["cdfName"]], |
|
2429 |
+## gender=gender, row.names=row.names, |
|
2430 |
+## col.names=col.names, recallMin=recallMin, |
|
2431 |
+## recallRegMin=1000, SNRMin=SNRMin, |
|
2432 |
+## returnParams=returnParams, badSNP=badSNP, |
|
2433 |
+## verbose=verbose) |
|
2434 |
+## |
|
2435 |
+## res2[["SNR"]] <- res[["SNR"]] |
|
2436 |
+## res2[["SKW"]] <- res[["SKW"]] |
|
2437 |
+## return(list2SnpSet(res2, returnParams=returnParams)) |
|
2438 |
+##} |
... | ... |
@@ -339,262 +339,263 @@ predictGender <- function(cols, theA, theB, XIndex){ |
339 | 339 |
med |
340 | 340 |
} |
341 | 341 |
|
342 |
-crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
343 |
- col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
344 |
- SNRMin=5, recallMin=10, recallRegMin=1000, |
|
345 |
- gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
346 |
- returnParams=FALSE, badSNP=.7){ |
|
347 |
- open(SNR) |
|
348 |
- open(A) |
|
349 |
- open(B) |
|
350 |
- open(mixtureParams) |
|
351 |
- ## expect objects to be ff |
|
352 |
- |
|
353 |
- keepIndex <- which( SNR[] > SNRMin) |
|
354 |
- if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
355 |
- |
|
356 |
- NC <- ncol(A) |
|
357 |
- NR <- nrow(A) |
|
358 |
- |
|
359 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
360 |
- stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
361 |
- |
|
362 |
- if(verbose) message("Loading annotations.") |
|
363 |
- obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
364 |
- obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
365 |
- ## this is toget rid of the 'no visible binding' notes |
|
366 |
- ## variable definitions |
|
367 |
- XIndex <- getVarInEnv("XIndex") |
|
368 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
369 |
- YIndex <- getVarInEnv("YIndex") |
|
370 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
371 |
- theKnots <- getVarInEnv("theKnots") |
|
372 |
- regionInfo <- getVarInEnv("regionInfo") |
|
373 |
- params <- getVarInEnv("params") |
|
374 |
- rm(list=c(obj1, obj2), envir=.crlmmPkgEnv) |
|
375 |
- rm(obj1, obj2) |
|
376 |
- |
|
377 |
- ## IF gender not provide, we predict |
|
378 |
- ## FIXME: XIndex may be greater than ocProbesets() |
|
379 |
- if(is.null(gender)){ |
|
380 |
- if(verbose) message("Determining gender.") |
|
381 |
-## XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
382 |
- XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm") |
|
383 |
- XMedian <- unlist(XMedian) |
|
384 |
- if(sum(SNR[] > SNRMin)==1){ |
|
385 |
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
386 |
- }else{ |
|
387 |
- gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]] |
|
388 |
- } |
|
389 |
- } |
|
390 |
- |
|
391 |
- Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
392 |
- cIndexes <- list(keepIndex, |
|
393 |
- keepIndex[which(gender[keepIndex]==2)], |
|
394 |
- keepIndex[which(gender[keepIndex]==1)]) |
|
395 |
- |
|
396 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
397 |
- |
|
398 |
- ## call C |
|
399 |
- fIndex <- which(gender==2) |
|
400 |
- mIndex <- which(gender==1) |
|
401 |
- |
|
402 |
- ## different here |
|
403 |
- ## use gtypeCallerR in batches |
|
404 |
- snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
405 |
- newparamsBatch <- vector("list", length(snpBatches)) |
|
406 |
- |
|
407 |
- process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
408 |
- YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
409 |
- params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){ |
|
410 |
- open(A) |
|
411 |
- open(B) |
|
412 |
- open(mixtureParams) |
|
413 |
- snps <- snpBatches[[idxBatch]] |
|
414 |
- rSnps <- range(snps) |
|
415 |
- last <- (idxBatch-1)*batchSize |
|
416 |
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
417 |
- XIndex[XIndex %in% snps]-last, |
|
418 |
- YIndex[YIndex %in% snps]-last) |
|
419 |
- IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
420 |
- tmpA <- as.matrix(A[snps,]) |
|
421 |
- tmpB <- as.matrix(B[snps,]) |
|
422 |
- ## newparamsBatch[[idxBatch]] |
|
423 |
- |
|
424 |
- tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
425 |
- params[["centers"]][snps,], |
|
426 |
- params[["scales"]][snps,], |
|
427 |
- params[["N"]][snps,], |
|
428 |
- IndexesBatch, cIndexes, |
|
429 |
- sapply(IndexesBatch, length), |
|
430 |
- sapply(cIndexes, length), SMEDIAN, |
|
431 |
- theKnots, mixtureParams[], DF, probs, 0.025) |
|
432 |
- rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last) |
|
433 |
- gc(verbose=FALSE) |
|
434 |
- close(A) |
|
435 |
- close(B) |
|
436 |
- close(mixtureParams) |
|
437 |
- tmp |
|
438 |
- } |
|
439 |
- |
|
440 |
- newparamsBatch <- ocLapply(seq(along=snpBatches), process1, |
|
441 |
- snpBatches=snpBatches, |
|
442 |
- autosomeIndex=autosomeIndex, XIndex=XIndex, |
|
443 |
- YIndex=YIndex, A=A, B=B, |
|
444 |
- mixtureParams=mixtureParams, fIndex=fIndex, |
|
445 |
- mIndex=mIndex, params=params, |
|
446 |
- cIndexes=cIndexes, SMEDIAN=SMEDIAN, |
|
447 |
- theKnots=theKnots, DF=DF, probs=probs, |
|
448 |
- batchSize=ocProbesets()) |
|
449 |
- newparams <- vector("list", 3) |
|
450 |
- names(newparams) <- c("centers", "scales", "N") |
|
451 |
- newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1)) |
|
452 |
- newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2)) |
|
453 |
- newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3)) |
|
454 |
- rm(newparamsBatch); gc(verbose=FALSE) |
|
455 |
- if(verbose) message("Done.") |
|
456 |
- if(verbose) message("Estimating recalibration parameters.") |
|
457 |
- d <- newparams[["centers"]] - params$centers |
|
458 |
- |
|
459 |
- ##regression |
|
460 |
- Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
461 |
- newparams[["N"]][, 2], |
|
462 |
- newparams[["N"]][, 3]) > recallMin & |
|
463 |
- !apply(regionInfo, 1, any)), |
|
464 |
- autosomeIndex) |
|
465 |
- if(length(Index) < recallRegMin){ |
|
466 |
- warning("Recalibration not possible. Possible cause: small sample size.") |
|
467 |
- newparams <- params |
|
468 |
- dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
469 |
- SS <- matrix(Inf, 3, 3) |
|
470 |
- DD <- 0 |
|
471 |
- }else{ |
|
472 |
- data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
473 |
- names(data4reg) <- c("AA", "AB", "BB") |
|
474 |
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
475 |
- c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
476 |
- coef(lm(BB~AA*AB, data=data4reg))) |
|
477 |
- rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
478 |
- rm(data4reg) |
|
479 |
- |
|
480 |
- minN <- 3 |
|
481 |
- newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
482 |
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
483 |
- if(verbose) message("Filling out empty centers", appendLF=FALSE) |
|
484 |
- for(i in Index){ |
|
485 |
- if(verbose) if(i%%10000==0) message(".", appendLF=FALSE) |
|
486 |
- mu <- newparams[["centers"]][i, ] |
|
487 |
- j <- which(is.na(mu)) |
|
488 |
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
489 |
- rm(mu, j) |
|
490 |
- } |
|
491 |
- |
|
492 |
- ##remaing NAs are made like originals |
|
493 |
- if(length(YIndex)>0){ |
|
494 |
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
495 |
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
496 |
- } |
|
497 |
- snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0) |
|
498 |
- snps2keep <- setdiff(autosomeIndex, snps2ignore) |
|
499 |
- rm(snps2ignore) |
|
500 |
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
501 |
- if(verbose) cat("\n") |
|
502 |
- |
|
503 |
- if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE) |
|
504 |
- GG <- DD <- newparams[["centers"]] - params[["centers"]] |
|
505 |
- DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
506 |
- SS <- cov(DD[autosomeIndex, ]) |
|
507 |
- SSI <- solve(SS) |
|
508 |
- dev <- vector("numeric", nrow(DD)) |
|
509 |
- if(length(YIndex)){ |
|
510 |
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
511 |
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
512 |
- ##Now Y (only two params) |
|
513 |
- SSY <- SS[c(1, 3), c(1, 3)] |
|
514 |
- SSI <- solve(SSY) |
|
515 |
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
516 |
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
517 |
- } else { |
|
518 |
- dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
519 |
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
520 |
- } |
|
521 |
- } |
|
522 |
- if (verbose) message("OK") |
|
523 |
- |
|
524 |
- ## BC: must keep SD |
|
525 |
- params[-2] <- newparams[-2] |
|
526 |
- rm(newparams) |
|
527 |
- gc(verbose=FALSE) |
|
528 |
- |
|
529 |
- if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE) |
|
530 |
- |
|
531 |
- ## ################### |
|
532 |
- ## ## MOVE TO C####### |
|
533 |
- |
|
534 |
- ## running in batches |
|
535 |
- process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
536 |
- YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
537 |
- params, cIndexes, SMEDIAN, theKnots, DF, probs, |
|
538 |
- regionInfo, batchSize){ |
|
539 |
- open(A) |
|
540 |
- open(B) |
|
541 |
- open(mixtureParams) |
|
542 |
- snps <- snpBatches[[idxBatch]] |
|
543 |
- tmpA <- as.matrix(A[snps,]) |
|
544 |
- tmpB <- as.matrix(B[snps,]) |
|
545 |
- rSnps <- range(snps) |
|
546 |
- last <- (idxBatch-1)*batchSize |
|
547 |
- IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
548 |
- XIndex[XIndex %in% snps]-last, |
|
549 |
- YIndex[YIndex %in% snps]-last) |
|
550 |
- IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
551 |
- ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
552 |
- params[["centers"]][snps,], |
|
553 |
- params[["scales"]][snps,], |
|
554 |
- params[["N"]][snps,], |
|
555 |
- IndexesBatch, cIndexes, |
|
556 |
- sapply(IndexesBatch, length), |
|
557 |
- sapply(cIndexes, length), |
|
558 |
- SMEDIAN, theKnots, mixtureParams[], |
|
559 |
- DF, probs, 0.025, |
|
560 |
- which(regionInfo[snps, 2]), |
|
561 |
- which(regionInfo[snps, 1])) |
|
562 |
- A[snps,] <- tmpA |
|
563 |
- B[snps,] <- tmpB |
|
564 |
- rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last) |
|
565 |
- gc(verbose=FALSE) |
|
566 |
- close(A) |
|
567 |
- close(B) |
|
568 |
- close(mixtureParams) |
|
569 |
- } |
|
570 |
- |
|
571 |
- ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches, |
|
572 |
- autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex, |
|
573 |
- A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex, |
|
574 |
- mIndex=mIndex, params=params, cIndexes=cIndexes, |
|
575 |
- SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs, |
|
576 |
- regionInfo=regionInfo, batchSize=ocProbesets()) |
|
577 |
- ## END MOVE TO C####### |
|
578 |
- ## ################## |
|
579 |
- |
|
580 |
- dev <- dev/(dev+1/383) |
|
581 |
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
582 |
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
583 |
- |
|
584 |
- if(length(Index) >= recallRegMin){ |
|
585 |
- tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
|
586 |
- tmpSnpQc <- dev[autosomeIndex] |
|
587 |
- SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
|
588 |
- batchQC <- mean(diag(SS)) |
|
589 |
- }else{ |
|
590 |
- SS <- matrix(0, 3, 3) |
|
591 |
- batchQC <- Inf |
|
592 |
- } |
|
593 |
- |
|
594 |
- if(verbose) message("Done.") |
|
595 |
- if (returnParams){ |
|
596 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
597 |
- }else{ |
|
598 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
599 |
- } |
|
600 |
-} |
|
342 |
+## RS: commented |
|
343 |
+##crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
344 |
+## col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
345 |
+## SNRMin=5, recallMin=10, recallRegMin=1000, |
|
346 |
+## gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
347 |
+## returnParams=FALSE, badSNP=.7){ |
|
348 |
+## open(SNR) |
|
349 |
+## open(A) |
|
350 |
+## open(B) |
|
351 |
+## open(mixtureParams) |
|
352 |
+## ## expect objects to be ff |
|
353 |
+## |
|
354 |
+## keepIndex <- which( SNR[] > SNRMin) |
|
355 |
+## if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
356 |
+## |
|
357 |
+## NC <- ncol(A) |
|
358 |
+## NR <- nrow(A) |
|
359 |
+## |
|
360 |
+## pkgname <- getCrlmmAnnotationName(cdfName) |
|
361 |
+## stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose)) |
|
362 |
+## |
|
363 |
+## if(verbose) message("Loading annotations.") |
|
364 |
+## obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
365 |
+## obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
366 |
+## ## this is toget rid of the 'no visible binding' notes |
|
367 |
+## ## variable definitions |
|
368 |
+## XIndex <- getVarInEnv("XIndex") |
|
369 |
+## autosomeIndex <- getVarInEnv("autosomeIndex") |
|
370 |
+## YIndex <- getVarInEnv("YIndex") |
|
371 |
+## SMEDIAN <- getVarInEnv("SMEDIAN") |
|
372 |
+## theKnots <- getVarInEnv("theKnots") |
|
373 |
+## regionInfo <- getVarInEnv("regionInfo") |
|
374 |
+## params <- getVarInEnv("params") |
|
375 |
+## rm(list=c(obj1, obj2), envir=.crlmmPkgEnv) |
|
376 |
+## rm(obj1, obj2) |
|
377 |
+## |
|
378 |
+## ## IF gender not provide, we predict |
|
379 |
+## ## FIXME: XIndex may be greater than ocProbesets() |
|
380 |
+## if(is.null(gender)){ |
|
381 |
+## if(verbose) message("Determining gender.") |
|
382 |
+#### XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
383 |
+## XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm") |
|
384 |
+## XMedian <- unlist(XMedian) |
|
385 |
+## if(sum(SNR[] > SNRMin)==1){ |
|
386 |
+## gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
387 |
+## }else{ |
|
388 |
+## gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]] |
|
389 |
+## } |
|
390 |
+## } |
|
391 |
+## |
|
392 |
+## Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
393 |
+## cIndexes <- list(keepIndex, |
|
394 |
+## keepIndex[which(gender[keepIndex]==2)], |
|
395 |
+## keepIndex[which(gender[keepIndex]==1)]) |
|
396 |
+## |
|
397 |
+## if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
398 |
+## |
|
399 |
+## ## call C |
|
400 |
+## fIndex <- which(gender==2) |
|
401 |
+## mIndex <- which(gender==1) |
|
402 |
+## |
|
403 |
+## ## different here |
|
404 |
+## ## use gtypeCallerR in batches |
|
405 |
+## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets()) |
|
406 |
+## newparamsBatch <- vector("list", length(snpBatches)) |
|
407 |
+## |
|
408 |
+## process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
409 |
+## YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
410 |
+## params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){ |
|
411 |
+## open(A) |
|
412 |
+## open(B) |
|
413 |
+## open(mixtureParams) |
|
414 |
+## snps <- snpBatches[[idxBatch]] |
|
415 |
+## rSnps <- range(snps) |
|
416 |
+## last <- (idxBatch-1)*batchSize |
|
417 |
+## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
418 |
+## XIndex[XIndex %in% snps]-last, |
|
419 |
+## YIndex[YIndex %in% snps]-last) |
|
420 |
+## IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
421 |
+## tmpA <- as.matrix(A[snps,]) |
|
422 |
+## tmpB <- as.matrix(B[snps,]) |
|
423 |
+## ## newparamsBatch[[idxBatch]] |
|
424 |
+## |
|
425 |
+## tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex, |
|
426 |
+## params[["centers"]][snps,], |
|
427 |
+## params[["scales"]][snps,], |
|
428 |
+## params[["N"]][snps,], |
|
429 |
+## IndexesBatch, cIndexes, |
|
430 |
+## sapply(IndexesBatch, length), |
|
431 |
+## sapply(cIndexes, length), SMEDIAN, |
|
432 |
+## theKnots, mixtureParams[], DF, probs, 0.025) |
|
433 |
+## rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last) |
|
434 |
+## gc(verbose=FALSE) |
|
435 |
+## close(A) |
|
436 |
+## close(B) |
|
437 |
+## close(mixtureParams) |
|
438 |
+## tmp |
|
439 |
+## } |
|
440 |
+## |
|
441 |
+## newparamsBatch <- ocLapply(seq(along=snpBatches), process1, |
|
442 |
+## snpBatches=snpBatches, |
|
443 |
+## autosomeIndex=autosomeIndex, XIndex=XIndex, |
|
444 |
+## YIndex=YIndex, A=A, B=B, |
|
445 |
+## mixtureParams=mixtureParams, fIndex=fIndex, |
|
446 |
+## mIndex=mIndex, params=params, |
|
447 |
+## cIndexes=cIndexes, SMEDIAN=SMEDIAN, |
|
448 |
+## theKnots=theKnots, DF=DF, probs=probs, |
|
449 |
+## batchSize=ocProbesets()) |
|
450 |
+## newparams <- vector("list", 3) |
|
451 |
+## names(newparams) <- c("centers", "scales", "N") |
|
452 |
+## newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1)) |
|
453 |
+## newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2)) |
|
454 |
+## newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3)) |
|
455 |
+## rm(newparamsBatch); gc(verbose=FALSE) |
|
456 |
+## if(verbose) message("Done.") |
|
457 |
+## if(verbose) message("Estimating recalibration parameters.") |
|
458 |
+## d <- newparams[["centers"]] - params$centers |
|
459 |
+## |
|
460 |
+## ##regression |
|
461 |
+## Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
462 |
+## newparams[["N"]][, 2], |
|
463 |
+## newparams[["N"]][, 3]) > recallMin & |
|
464 |
+## !apply(regionInfo, 1, any)), |
|
465 |
+## autosomeIndex) |
|
466 |
+## if(length(Index) < recallRegMin){ |
|
467 |
+## warning("Recalibration not possible. Possible cause: small sample size.") |
|
468 |
+## newparams <- params |
|
469 |
+## dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
470 |
+## SS <- matrix(Inf, 3, 3) |
|
471 |
+## DD <- 0 |
|
472 |
+## }else{ |
|
473 |
+## data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
474 |
+## names(data4reg) <- c("AA", "AB", "BB") |
|
475 |
+## regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
476 |
+## c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
477 |
+## coef(lm(BB~AA*AB, data=data4reg))) |
|
478 |
+## rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
479 |
+## rm(data4reg) |
|
480 |
+## |
|
481 |
+## minN <- 3 |
|
482 |
+## newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
483 |
+## Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
484 |
+## if(verbose) message("Filling out empty centers", appendLF=FALSE) |
|
485 |
+## for(i in Index){ |
|
486 |
+## if(verbose) if(i%%10000==0) message(".", appendLF=FALSE) |
|
487 |
+## mu <- newparams[["centers"]][i, ] |
|
488 |
+## j <- which(is.na(mu)) |
|
489 |
+## newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
490 |
+## rm(mu, j) |
|
491 |
+## } |
|
492 |
+## |
|
493 |
+## ##remaing NAs are made like originals |
|
494 |
+## if(length(YIndex)>0){ |
|
495 |
+## noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
496 |
+## YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
497 |
+## } |
|
498 |
+## snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0) |
|
499 |
+## snps2keep <- setdiff(autosomeIndex, snps2ignore) |
|
500 |
+## rm(snps2ignore) |
|
501 |
+## newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
502 |
+## if(verbose) cat("\n") |
|
503 |
+## |
|
504 |
+## if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE) |
|
505 |
+## GG <- DD <- newparams[["centers"]] - params[["centers"]] |
|
506 |
+## DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
507 |
+## SS <- cov(DD[autosomeIndex, ]) |
|
508 |
+## SSI <- solve(SS) |
|
509 |
+## dev <- vector("numeric", nrow(DD)) |
|
510 |
+## if(length(YIndex)){ |
|
511 |
+## dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
512 |
+## dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
513 |
+## ##Now Y (only two params) |
|
514 |
+## SSY <- SS[c(1, 3), c(1, 3)] |
|
515 |
+## SSI <- solve(SSY) |
|
516 |
+## dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
517 |
+## dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
518 |
+## } else { |
|
519 |
+## dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
520 |
+## dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
521 |
+## } |
|
522 |
+## } |
|
523 |
+## if (verbose) message("OK") |
|
524 |
+## |
|
525 |
+## ## BC: must keep SD |
|
526 |
+## params[-2] <- newparams[-2] |
|
527 |
+## rm(newparams) |
|
528 |
+## gc(verbose=FALSE) |
|
529 |
+## |
|
530 |
+## if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE) |
|
531 |
+## |
|
532 |
+## ## ################### |
|
533 |
+## ## ## MOVE TO C####### |
|
534 |
+## |
|
535 |
+## ## running in batches |
|
536 |
+## process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex, |
|
537 |
+## YIndex, A, B, mixtureParams, fIndex, mIndex, |
|
538 |
+## params, cIndexes, SMEDIAN, theKnots, DF, probs, |
|
539 |
+## regionInfo, batchSize){ |
|
540 |
+## open(A) |
|
541 |
+## open(B) |
|
542 |
+## open(mixtureParams) |
|
543 |
+## snps <- snpBatches[[idxBatch]] |
|
544 |
+## tmpA <- as.matrix(A[snps,]) |
|
545 |
+## tmpB <- as.matrix(B[snps,]) |
|
546 |
+## rSnps <- range(snps) |
|
547 |
+## last <- (idxBatch-1)*batchSize |
|
548 |
+## IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last, |
|
549 |
+## XIndex[XIndex %in% snps]-last, |
|
550 |
+## YIndex[YIndex %in% snps]-last) |
|
551 |
+## IndexesBatch <- lapply(IndexesBatch, as.integer) |
|
552 |
+## ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex, |
|
553 |
+## params[["centers"]][snps,], |
|
554 |
+## params[["scales"]][snps,], |
|
555 |
+## params[["N"]][snps,], |
|
556 |
+## IndexesBatch, cIndexes, |
|
557 |
+## sapply(IndexesBatch, length), |
|
558 |
+## sapply(cIndexes, length), |
|
559 |
+## SMEDIAN, theKnots, mixtureParams[], |
|
560 |
+## DF, probs, 0.025, |
|
561 |
+## which(regionInfo[snps, 2]), |
|
562 |
+## which(regionInfo[snps, 1])) |
|
563 |
+## A[snps,] <- tmpA |
|
564 |
+## B[snps,] <- tmpB |
|
565 |
+## rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last) |
|
566 |
+## gc(verbose=FALSE) |
|
567 |
+## close(A) |
|
568 |
+## close(B) |
|
569 |
+## close(mixtureParams) |
|
570 |
+## } |
|
571 |
+## |
|
572 |
+## ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches, |
|
573 |
+## autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex, |
|
574 |
+## A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex, |
|
575 |
+## mIndex=mIndex, params=params, cIndexes=cIndexes, |
|
576 |
+## SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs, |
|
577 |
+## regionInfo=regionInfo, batchSize=ocProbesets()) |
|
578 |
+## ## END MOVE TO C####### |
|
579 |
+## ## ################## |
|
580 |
+## |
|
581 |
+## dev <- dev/(dev+1/383) |
|
582 |
+## if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
583 |
+## if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
584 |
+## |
|
585 |
+## if(length(Index) >= recallRegMin){ |
|
586 |
+## tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
|
587 |
+## tmpSnpQc <- dev[autosomeIndex] |
|
588 |
+## SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
|
589 |
+## batchQC <- mean(diag(SS)) |
|
590 |
+## }else{ |
|
591 |
+## SS <- matrix(0, 3, 3) |
|
592 |
+## batchQC <- Inf |
|
593 |
+## } |
|
594 |
+## |
|
595 |
+## if(verbose) message("Done.") |
|
596 |
+## if (returnParams){ |
|
597 |
+## return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
598 |
+## }else{ |
|
599 |
+## return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname)) |
|
600 |
+## } |
|
601 |
+##} |
... | ... |
@@ -1196,7 +1196,7 @@ genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3), |
1196 | 1196 |
## message("Writing complete. Begin genotyping...") |
1197 | 1197 |
## close(A(cnSet)) |
1198 | 1198 |
## close(B(cnSet)) |
1199 |
- tmp <- rscrlmmGT2(A=snpCall(cnSet), |
|
1199 |
+ tmp <- crlmmGT2(A=snpCall(cnSet), |
|
1200 | 1200 |
B=snpCallProbability(cnSet), |
1201 | 1201 |
SNR=cnSet$SNR, |
1202 | 1202 |
mixtureParams=mixtureParams, |