git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@42999 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -71,7 +71,6 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
71 | 71 |
if(verbose) message("Loading annotations.") |
72 | 72 |
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
73 | 73 |
loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
74 |
-## data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
75 | 74 |
|
76 | 75 |
## this is toget rid of the 'no visible binding' notes |
77 | 76 |
## variable definitions |
... | ... |
@@ -85,13 +84,13 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
85 | 84 |
|
86 | 85 |
##IF gender not provide, we predict |
87 | 86 |
if(is.null(gender)){ |
88 |
- if(verbose) message("Determining gender.") |
|
89 |
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
90 |
- if(sum(SNR>SNRMin)==1){ |
|
91 |
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
92 |
- }else{ |
|
93 |
- gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
94 |
- } |
|
87 |
+ if(verbose) message("Determining gender.") |
|
88 |
+ XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
89 |
+ if(sum(SNR>SNRMin)==1){ |
|
90 |
+ gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
91 |
+ }else{ |
|
92 |
+ gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
93 |
+ } |
|
95 | 94 |
} |
96 | 95 |
|
97 | 96 |
Indexes <- list(autosomeIndex, XIndex, YIndex) |
... | ... |
@@ -206,15 +205,7 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
206 | 205 |
tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1) |
207 | 206 |
tmpSnpQc <- dev[autosomeIndex] |
208 | 207 |
SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,]) |
209 |
-#### tmp4batchQC <- DD[snps2keep,]/(params[["N"]][snps2keep,]+1) |
|
210 |
-#### tmpSnpQc <- dev[snps2keep] |
|
211 |
-#### SS <- cov(DD[snps2keep,]) |
|
212 |
- |
|
213 |
-## DD <- sweep(GG[snps2keep, ], 2, colMeans(DD[snps2keep, ])) |
|
214 |
-## tmpSnpQc <- dev[snps2keep] |
|
215 |
-## SS <- cov(DD[tmpSnpQc < badSNP, ]) |
|
216 |
-#### batchQC <- sqrt(sum(diag(cov(tmp4batchQC[tmpSnpQc < badSNP,]))))*sum(params[["N"]][snps2keep[1],]) |
|
217 |
- batchQC <- mean(diag(SS)) |
|
208 |
+ batchQC <- mean(diag(SS)) |
|
218 | 209 |
}else{ |
219 | 210 |
SS <- matrix(0, 3, 3) |
220 | 211 |
batchQC <- Inf |
... | ... |
@@ -271,663 +262,3 @@ gtypeCallerR2 <- function(A, B, fIndex, mIndex, theCenters, theScales, |
271 | 262 |
as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm") |
272 | 263 |
|
273 | 264 |
} |
274 |
- |
|
275 |
- |
|
276 |
- |
|
277 |
-############################### |
|
278 |
-####### THIS IS TEMPORARY NOT OFFICIALLY USED |
|
279 |
-##################################### |
|
280 |
-crlmmNM <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
281 |
- probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
282 |
- save.it=FALSE, load.it=FALSE, |
|
283 |
- intensityFile="tmpcrlmmintensities.rda", |
|
284 |
- desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
285 |
- verbose=TRUE, cdfName, sns, recallMin=10, |
|
286 |
- recallRegMin=1000, returnParams=FALSE){ |
|
287 |
- |
|
288 |
- if (missing(sns)) sns <- basename(filenames) |
|
289 |
- if (load.it & !file.exists(intensityFile)){ |
|
290 |
- load.it <- FALSE |
|
291 |
- message("File ", intensityFile, " does not exist.") |
|
292 |
- message("Not loading it, but running SNPRMA from scratch.") |
|
293 |
- } |
|
294 |
- if (!load.it){ |
|
295 |
- res <- snprma(filenames, fitMixture=TRUE, |
|
296 |
- mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
297 |
- eps=eps, cdfName=cdfName, sns=sns) |
|
298 |
- if(save.it){ |
|
299 |
- t0 <- proc.time() |
|
300 |
- save(res, file=intensityFile) |
|
301 |
- t0 <- proc.time()-t0 |
|
302 |
- message("Used ", t0[3], " seconds to save ", intensityFile, ".") |
|
303 |
- } |
|
304 |
- }else{ |
|
305 |
- message("Loading ", intensityFile, ".") |
|
306 |
- obj <- load(intensityFile) |
|
307 |
- message("Done.") |
|
308 |
- if (obj != "res") |
|
309 |
- stop("Object in ", intensityFile, " seems to be invalid.") |
|
310 |
- } |
|
311 |
- if(row.names) row.names=res$gns else row.names=NULL |
|
312 |
- if(col.names) col.names=res$sns else col.names=NULL |
|
313 |
- |
|
314 |
- res2 <- crlmmGTnm(res[["A"]], res[["B"]], res[["SNR"]], |
|
315 |
- res[["mixtureParams"]], res[["cdfName"]], |
|
316 |
- gender=gender, row.names=row.names, |
|
317 |
- col.names=col.names, recallMin=recallMin, |
|
318 |
- recallRegMin=1000, SNRMin=SNRMin, |
|
319 |
- returnParams=returnParams) |
|
320 |
- |
|
321 |
- res2[["SNR"]] <- res[["SNR"]] |
|
322 |
- |
|
323 |
- return(res2) |
|
324 |
-} |
|
325 |
- |
|
326 |
-crlmmTNoN <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
327 |
- probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
328 |
- save.it=FALSE, load.it=FALSE, |
|
329 |
- intensityFile="tmpcrlmmintensities.rda", |
|
330 |
- desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
331 |
- verbose=TRUE){ |
|
332 |
- if (load.it & !file.exists(intensityFile)){ |
|
333 |
- load.it <- FALSE |
|
334 |
- message("File ", intensityFile, " does not exist.") |
|
335 |
- message("Not loading it, but running SNPRMA from scratch.") |
|
336 |
- } |
|
337 |
- if (!load.it){ |
|
338 |
- res <- snprma(filenames, fitMixture=TRUE, |
|
339 |
- mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
340 |
- eps=eps) |
|
341 |
- if(save.it) save(res, file=intensityFile) |
|
342 |
- }else{ |
|
343 |
- message("Loading ", intensityFile, ".") |
|
344 |
- obj <- load(intensityFile) |
|
345 |
- message("Done.") |
|
346 |
- if (obj != "res") |
|
347 |
- stop("Object in ", intensityFile, " seems to be invalid.") |
|
348 |
- } |
|
349 |
- if(row.names) row.names=res$gns else row.names=NULL |
|
350 |
- if(col.names) col.names=res$sns else col.names=NULL |
|
351 |
- res2 <- crlmmGTTNoN(res[["A"]], res[["B"]], res[["SNR"]], |
|
352 |
- res[["mixtureParams"]], res[["cdfName"]], |
|
353 |
- gender=gender, row.names=row.names, |
|
354 |
- col.names=col.names) |
|
355 |
- res2[["SNR"]] <- res[["SNR"]] |
|
356 |
- return(res2) |
|
357 |
-} |
|
358 |
- |
|
359 |
-crlmmNormalNoN <- function(filenames, row.names=TRUE, col.names=TRUE, |
|
360 |
- probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, |
|
361 |
- save.it=FALSE, load.it=FALSE, |
|
362 |
- intensityFile="tmpcrlmmintensities.rda", |
|
363 |
- desctrucitve=FALSE, mixtureSampleSize=10^5, eps=0.1, |
|
364 |
- verbose=TRUE){ |
|
365 |
- if (load.it & !file.exists(intensityFile)){ |
|
366 |
- load.it <- FALSE |
|
367 |
- message("File ", intensityFile, " does not exist.") |
|
368 |
- message("Not loading it, but running SNPRMA from scratch.") |
|
369 |
- } |
|
370 |
- if (!load.it){ |
|
371 |
- res <- snprma(filenames, fitMixture=TRUE, |
|
372 |
- mixtureSampleSize=mixtureSampleSize, verbose=verbose, |
|
373 |
- eps=eps) |
|
374 |
- if(save.it) save(res, file=intensityFile) |
|
375 |
- }else{ |
|
376 |
- message("Loading ", intensityFile, ".") |
|
377 |
- obj <- load(intensityFile) |
|
378 |
- message("Done.") |
|
379 |
- if (obj != "res") |
|
380 |
- stop("Object in ", intensityFile, " seems to be invalid.") |
|
381 |
- } |
|
382 |
- if(row.names) row.names=res$gns else row.names=NULL |
|
383 |
- if(col.names) col.names=res$sns else col.names=NULL |
|
384 |
- res2 <- crlmmGTNormalNoN(res[["A"]], res[["B"]], res[["SNR"]], |
|
385 |
- res[["mixtureParams"]], res[["cdfName"]], |
|
386 |
- gender=gender, row.names=row.names, |
|
387 |
- col.names=col.names) |
|
388 |
- res2[["SNR"]] <- res[["SNR"]] |
|
389 |
- return(res2) |
|
390 |
-} |
|
391 |
- |
|
392 |
-crlmmGTTNoN <- function(A, B, SNR, mixtureParams, cdfName, |
|
393 |
- row.names=NULL, col.names=NULL, probs=c(1/3, |
|
394 |
- 1/3, 1/3), DF=6, SNRMin=6, gender=NULL, |
|
395 |
- desctrucitve=FALSE, verbose=TRUE){ |
|
396 |
- keepIndex <- which(SNR>SNRMin) |
|
397 |
- if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
398 |
- |
|
399 |
- NC <- ncol(A) |
|
400 |
- NR <- nrow(A) |
|
401 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
402 |
- |
|
403 |
- if(verbose) message("Loading annotations.") |
|
404 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
405 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
406 |
-## data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
407 |
- |
|
408 |
- ## this is toget rid of the 'no visible binding' notes |
|
409 |
- ## variable definitions |
|
410 |
- XIndex <- getVarInEnv("XIndex") |
|
411 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
412 |
- YIndex <- getVarInEnv("YIndex") |
|
413 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
414 |
- theKnots <- getVarInEnv("theKnots") |
|
415 |
- regionInfo <- getVarInEnv("regionInfo") |
|
416 |
- |
|
417 |
- ##IF gender not provide, we predict |
|
418 |
- if(is.null(gender)){ |
|
419 |
- if(verbose) message("Determining gender.") |
|
420 |
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
421 |
- if(sum(SNR>SNRMin)==1) gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) else gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
422 |
- } |
|
423 |
- |
|
424 |
- Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
425 |
- cIndexes <- list(keepIndex, |
|
426 |
- keepIndex[which(gender[keepIndex]==2)], |
|
427 |
- keepIndex[which(gender[keepIndex]==1)]) |
|
428 |
- |
|
429 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
430 |
- |
|
431 |
- ## call C |
|
432 |
- fIndex <- which(gender==2) |
|
433 |
- mIndex <- which(gender==1) |
|
434 |
- t0 <- proc.time() |
|
435 |
- newparams <- gtypeCallerRTNoN(A, B, fIndex, mIndex, |
|
436 |
- params[["centers"]], params[["scales"]], params[["N"]], |
|
437 |
- Indexes, cIndexes, |
|
438 |
- sapply(Indexes, length), sapply(cIndexes, length), |
|
439 |
- SMEDIAN, theKnots, |
|
440 |
- mixtureParams, DF, probs, 0.025) |
|
441 |
- t0 <- proc.time()-t0 |
|
442 |
- message("Part 1 took ", t0[3], " seconds.") |
|
443 |
- names(newparams) <- c("centers", "scales", "N") |
|
444 |
- |
|
445 |
- if(verbose) message("Done.") |
|
446 |
- if(verbose) message("Estimating recalibration parameters.") |
|
447 |
- d <- newparams[["centers"]] - params$centers |
|
448 |
- |
|
449 |
- ##regression |
|
450 |
- MIN <- 10 |
|
451 |
- Index <- intersect(which(pmin(newparams[["N"]][, 1], newparams[["N"]][, 2], newparams[["N"]][, 3])>MIN & !apply(regionInfo, 1, any)), autosomeIndex) |
|
452 |
- data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
453 |
- names(data4reg) <- c("AA", "AB", "BB") |
|
454 |
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
455 |
- c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
456 |
- coef(lm(BB~AA*AB, data=data4reg))) |
|
457 |
- rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
458 |
- rm(data4reg) |
|
459 |
- |
|
460 |
- minN <- 3 |
|
461 |
- newparams[["centers"]][newparams[["N"]]<minN] <- NA |
|
462 |
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
463 |
- if(verbose) cat("Filling out empty centers") |
|
464 |
- for(i in Index){ |
|
465 |
- if(verbose) if(i%%10000==0)cat(".") |
|
466 |
- mu <- newparams[["centers"]][i, ] |
|
467 |
- j <- which(is.na(mu)) |
|
468 |
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
469 |
- } |
|
470 |
- ##remaing NAs are made like originals |
|
471 |
- if(length(YIndex)>0){ |
|
472 |
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
473 |
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
474 |
- } |
|
475 |
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
476 |
- if(verbose) cat("\n") |
|
477 |
- |
|
478 |
- if(verbose) message("Calculating and standardizing size of shift.") |
|
479 |
- DD <- newparams[["centers"]] - params[["centers"]] |
|
480 |
- MM <- colMeans(DD[autosomeIndex, ]) |
|
481 |
- DD <- sweep(DD, 2, MM) |
|
482 |
- SS <- cov(DD[autosomeIndex, ]) |
|
483 |
- SSI <- solve(SS) |
|
484 |
- dev <- vector("numeric", nrow(DD)) |
|
485 |
- if(length(YIndex)){ |
|
486 |
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
487 |
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
488 |
- ##Now Y (only two params) |
|
489 |
- SSY <- SS[c(1, 3), c(1, 3)] |
|
490 |
- SSI <- solve(SSY) |
|
491 |
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
492 |
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
493 |
- } else { |
|
494 |
- dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
495 |
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
496 |
- } |
|
497 |
- |
|
498 |
- ## BC: must keep SD |
|
499 |
- params[-2] <- newparams[-2] |
|
500 |
- rm(newparams);gc(verbose=FALSE) |
|
501 |
- if(verbose) cat("Calling", NR, "SNPs... ") |
|
502 |
- ## ################### |
|
503 |
- ## ## MOVE TO C####### |
|
504 |
- t0 <- proc.time() |
|
505 |
- ImNull <- gtypeCallerR2TNoN(A, B, fIndex, mIndex, params[["centers"]], |
|
506 |
- params[["scales"]], params[["N"]], Indexes, |
|
507 |
- cIndexes, sapply(Indexes, length), |
|
508 |
- sapply(cIndexes, length), SMEDIAN, theKnots, |
|
509 |
- mixtureParams, DF, probs, 0.025, |
|
510 |
- which(regionInfo[,2]), |
|
511 |
- which(regionInfo[,1])) |
|
512 |
- t0 <- proc.time()-t0 |
|
513 |
- message("Part 2 took ", t0[3], " seconds.") |
|
514 |
- ## END MOVE TO C####### |
|
515 |
- ## ################## |
|
516 |
- |
|
517 |
- dev <- dev/(dev+1/383) |
|
518 |
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
519 |
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
520 |
- |
|
521 |
- if(verbose) message("Done.") |
|
522 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)))) |
|
523 |
-} |
|
524 |
- |
|
525 |
- |
|
526 |
-gtypeCallerRTNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
527 |
- theNs, Indexes, cIndexes, nIndexes, |
|
528 |
- ncIndexes, SMEDIAN, knots, params, dft, |
|
529 |
- probs, trim){ |
|
530 |
- |
|
531 |
- stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
532 |
- nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
533 |
- nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
534 |
- ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
535 |
- |
|
536 |
- .Call("gtypeCallerPart1TNoN", A, B, fIndex, mIndex, theCenters, |
|
537 |
- theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
538 |
- SMEDIAN, knots, params, dft, probs, trim, PACKAGE="crlmm") |
|
539 |
- |
|
540 |
-} |
|
541 |
- |
|
542 |
-gtypeCallerR2TNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
543 |
- theNs, Indexes, cIndexes, nIndexes, |
|
544 |
- ncIndexes, SMEDIAN, knots, params, dft, |
|
545 |
- probs, trim, noTraining, noInfo){ |
|
546 |
- |
|
547 |
- stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
548 |
- nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
549 |
- nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
550 |
- ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
551 |
- |
|
552 |
- .Call("gtypeCallerPart2TNoN", A, B, fIndex, mIndex, theCenters, |
|
553 |
- theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
554 |
- SMEDIAN, knots, params, dft, probs, trim, noTraining, noInfo, PACKAGE="crlmm") |
|
555 |
- |
|
556 |
-} |
|
557 |
- |
|
558 |
-crlmmGTNormalNoN <- function(A, B, SNR, mixtureParams, cdfName, |
|
559 |
- row.names=NULL, col.names=NULL, probs=c(1/3, |
|
560 |
- 1/3, 1/3), DF=6, SNRMin=6, gender=NULL, |
|
561 |
- desctrucitve=FALSE, verbose=TRUE){ |
|
562 |
- keepIndex <- which(SNR>SNRMin) |
|
563 |
- if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
564 |
- |
|
565 |
- NC <- ncol(A) |
|
566 |
- NR <- nrow(A) |
|
567 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
568 |
- |
|
569 |
- if(verbose) message("Loading annotations.") |
|
570 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
571 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
572 |
-## data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
573 |
- |
|
574 |
- ## this is toget rid of the 'no visible binding' notes |
|
575 |
- ## variable definitions |
|
576 |
- XIndex <- getVarInEnv("XIndex") |
|
577 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
578 |
- YIndex <- getVarInEnv("YIndex") |
|
579 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
580 |
- theKnots <- getVarInEnv("theKnots") |
|
581 |
- regionInfo <- getVarInEnv("regionInfo") |
|
582 |
- |
|
583 |
- ##IF gender not provide, we predict |
|
584 |
- if(is.null(gender)){ |
|
585 |
- if(verbose) message("Determining gender.") |
|
586 |
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
587 |
- if(sum(SNR>SNRMin)==1) gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) else gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
588 |
- } |
|
589 |
- |
|
590 |
- Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
591 |
- cIndexes <- list(keepIndex, |
|
592 |
- keepIndex[which(gender[keepIndex]==2)], |
|
593 |
- keepIndex[which(gender[keepIndex]==1)]) |
|
594 |
- |
|
595 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
596 |
- |
|
597 |
- ## call C |
|
598 |
- fIndex <- which(gender==2) |
|
599 |
- mIndex <- which(gender==1) |
|
600 |
- t0 <- proc.time() |
|
601 |
- newparams <- gtypeCallerRNormalNoN(A, B, fIndex, mIndex, |
|
602 |
- params[["centers"]], params[["scales"]], params[["N"]], |
|
603 |
- Indexes, cIndexes, |
|
604 |
- sapply(Indexes, length), sapply(cIndexes, length), |
|
605 |
- SMEDIAN, theKnots, |
|
606 |
- mixtureParams, DF, probs, 0.025) |
|
607 |
- t0 <- proc.time()-t0 |
|
608 |
- message("Part 1 took ", t0[3], " seconds.") |
|
609 |
- names(newparams) <- c("centers", "scales", "N") |
|
610 |
- |
|
611 |
- if(verbose) message("Done.") |
|
612 |
- if(verbose) message("Estimating recalibration parameters.") |
|
613 |
- d <- newparams[["centers"]] - params$centers |
|
614 |
- |
|
615 |
- ##regression |
|
616 |
- MIN <- 10 |
|
617 |
- Index <- intersect(which(pmin(newparams[["N"]][, 1], newparams[["N"]][, 2], newparams[["N"]][, 3])>MIN & !apply(regionInfo, 1, any)), autosomeIndex) |
|
618 |
- data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
619 |
- names(data4reg) <- c("AA", "AB", "BB") |
|
620 |
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
621 |
- c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
622 |
- coef(lm(BB~AA*AB, data=data4reg))) |
|
623 |
- rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
624 |
- rm(data4reg) |
|
625 |
- |
|
626 |
- minN <- 3 |
|
627 |
- newparams[["centers"]][newparams[["N"]]<minN] <- NA |
|
628 |
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
629 |
- if(verbose) cat("Filling out empty centers") |
|
630 |
- for(i in Index){ |
|
631 |
- if(verbose) if(i%%10000==0)cat(".") |
|
632 |
- mu <- newparams[["centers"]][i, ] |
|
633 |
- j <- which(is.na(mu)) |
|
634 |
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
635 |
- } |
|
636 |
- ##remaing NAs are made like originals |
|
637 |
- if(length(YIndex)>0){ |
|
638 |
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
639 |
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
640 |
- } |
|
641 |
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
642 |
- if(verbose) cat("\n") |
|
643 |
- |
|
644 |
- if(verbose) message("Calculating and standardizing size of shift.") |
|
645 |
- DD <- newparams[["centers"]] - params[["centers"]] |
|
646 |
- MM <- colMeans(DD[autosomeIndex, ]) |
|
647 |
- DD <- sweep(DD, 2, MM) |
|
648 |
- SS <- cov(DD[autosomeIndex, ]) |
|
649 |
- SSI <- solve(SS) |
|
650 |
- dev <- vector("numeric", nrow(DD)) |
|
651 |
- if(length(YIndex)){ |
|
652 |
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
653 |
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
654 |
- ##Now Y (only two params) |
|
655 |
- SSY <- SS[c(1, 3), c(1, 3)] |
|
656 |
- SSI <- solve(SSY) |
|
657 |
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
658 |
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
659 |
- } else { |
|
660 |
- dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
661 |
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
662 |
- } |
|
663 |
- |
|
664 |
- ## BC: must keep SD |
|
665 |
- params[-2] <- newparams[-2] |
|
666 |
- rm(newparams);gc(verbose=FALSE) |
|
667 |
- if(verbose) cat("Calling", NR, "SNPs... ") |
|
668 |
- ## ################### |
|
669 |
- ## ## MOVE TO C####### |
|
670 |
- t0 <- proc.time() |
|
671 |
- ImNull <- gtypeCallerR2NormalNoN(A, B, fIndex, mIndex, params[["centers"]], |
|
672 |
- params[["scales"]], params[["N"]], Indexes, |
|
673 |
- cIndexes, sapply(Indexes, length), |
|
674 |
- sapply(cIndexes, length), SMEDIAN, theKnots, |
|
675 |
- mixtureParams, DF, probs, 0.025, |
|
676 |
- which(regionInfo[,2]), |
|
677 |
- which(regionInfo[,1])) |
|
678 |
- t0 <- proc.time()-t0 |
|
679 |
- message("Part 2 took ", t0[3], " seconds.") |
|
680 |
- ## END MOVE TO C####### |
|
681 |
- ## ################## |
|
682 |
- |
|
683 |
- dev <- dev/(dev+1/383) |
|
684 |
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
685 |
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
686 |
- |
|
687 |
- if(verbose) message("Done.") |
|
688 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)))) |
|
689 |
-} |
|
690 |
- |
|
691 |
- |
|
692 |
-gtypeCallerRNormalNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
693 |
- theNs, Indexes, cIndexes, nIndexes, |
|
694 |
- ncIndexes, SMEDIAN, knots, params, dft, |
|
695 |
- probs, trim){ |
|
696 |
- |
|
697 |
- stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
698 |
- nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
699 |
- nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
700 |
- ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
701 |
- |
|
702 |
- .Call("gtypeCallerPart1NormalNoN", A, B, fIndex, mIndex, theCenters, |
|
703 |
- theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
704 |
- SMEDIAN, knots, params, dft, probs, trim, PACKAGE="crlmm") |
|
705 |
- |
|
706 |
-} |
|
707 |
- |
|
708 |
-gtypeCallerR2NormalNoN <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
709 |
- theNs, Indexes, cIndexes, nIndexes, |
|
710 |
- ncIndexes, SMEDIAN, knots, params, dft, |
|
711 |
- probs, trim, noTraining, noInfo){ |
|
712 |
- |
|
713 |
- stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
714 |
- nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
715 |
- nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
716 |
- ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
717 |
- |
|
718 |
- .Call("gtypeCallerPart2NormalNoN", A, B, fIndex, mIndex, theCenters, |
|
719 |
- theScales, theNs, Indexes, cIndexes, nIndexes, ncIndexes, |
|
720 |
- SMEDIAN, knots, params, dft, probs, trim, noTraining, noInfo, PACKAGE="crlmm") |
|
721 |
- |
|
722 |
-} |
|
723 |
- |
|
724 |
- |
|
725 |
-crlmmGTnm <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL, |
|
726 |
- col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6, |
|
727 |
- SNRMin=5, recallMin=10, recallRegMin=1000, |
|
728 |
- gender=NULL, desctrucitve=FALSE, verbose=TRUE, |
|
729 |
- returnParams=FALSE){ |
|
730 |
- |
|
731 |
- keepIndex <- which(SNR>SNRMin) |
|
732 |
- if(length(keepIndex)==0) stop("No arrays above quality threshold!") |
|
733 |
- |
|
734 |
- NC <- ncol(A) |
|
735 |
- NR <- nrow(A) |
|
736 |
- |
|
737 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
738 |
- if(!require(pkgname, character.only=TRUE)){ |
|
739 |
- suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
740 |
- msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
741 |
- message(strwrap(msg)) |
|
742 |
- stop("Package ", pkgname, " could not be found.") |
|
743 |
- rm(suggCall, msg) |
|
744 |
- } |
|
745 |
- |
|
746 |
- if(verbose) message("Loading annotations.") |
|
747 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
748 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
749 |
-## data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv) |
|
750 |
- |
|
751 |
- ## this is toget rid of the 'no visible binding' notes |
|
752 |
- ## variable definitions |
|
753 |
- XIndex <- getVarInEnv("XIndex") |
|
754 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
755 |
- YIndex <- getVarInEnv("YIndex") |
|
756 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
757 |
- theKnots <- getVarInEnv("theKnots") |
|
758 |
- regionInfo <- getVarInEnv("regionInfo") |
|
759 |
- |
|
760 |
- ##IF gender not provide, we predict |
|
761 |
- if(is.null(gender)){ |
|
762 |
- if(verbose) message("Determining gender.") |
|
763 |
- XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2 |
|
764 |
- if(sum(SNR>SNRMin)==1){ |
|
765 |
- gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5))) |
|
766 |
- }else{ |
|
767 |
- gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]] |
|
768 |
- } |
|
769 |
- } |
|
770 |
- |
|
771 |
- Indexes <- list(autosomeIndex, XIndex, YIndex) |
|
772 |
- cIndexes <- list(keepIndex, |
|
773 |
- keepIndex[which(gender[keepIndex]==2)], |
|
774 |
- keepIndex[which(gender[keepIndex]==1)]) |
|
775 |
- |
|
776 |
- if(verbose) cat("Calling", NR, "SNPs for recalibration... ") |
|
777 |
- |
|
778 |
- ## call C |
|
779 |
- fIndex <- which(gender==2) |
|
780 |
- mIndex <- which(gender==1) |
|
781 |
- t0 <- proc.time() |
|
782 |
- newparams <- gtypeCallerRnm(A, B, fIndex, mIndex, |
|
783 |
- params[["centers"]], params[["scales"]], params[["N"]], |
|
784 |
- Indexes, cIndexes, |
|
785 |
- sapply(Indexes, length), sapply(cIndexes, length), |
|
786 |
- SMEDIAN, theKnots, |
|
787 |
- mixtureParams, DF, probs, 0.025) |
|
788 |
- t0 <- proc.time()-t0 |
|
789 |
- message("Part 1 took ", t0[3], " seconds.") |
|
790 |
- gc(verbose=FALSE) |
|
791 |
- names(newparams) <- c("centers", "scales", "N") |
|
792 |
- |
|
793 |
- if(verbose) message("Done.") |
|
794 |
- if(verbose) message("Estimating recalibration parameters.") |
|
795 |
- d <- newparams[["centers"]] - params$centers |
|
796 |
- |
|
797 |
- ##regression |
|
798 |
- Index <- intersect(which(pmin(newparams[["N"]][, 1], |
|
799 |
- newparams[["N"]][, 2], |
|
800 |
- newparams[["N"]][, 3]) > recallMin & |
|
801 |
- !apply(regionInfo, 1, any)), |
|
802 |
- autosomeIndex) |
|
803 |
- |
|
804 |
- if(length(Index) < recallRegMin){ |
|
805 |
- warning("Recallibration not possible.") |
|
806 |
- newparams <- params |
|
807 |
- dev <- vector("numeric", nrow(newparams[["centers"]])) |
|
808 |
- SS <- matrix(Inf, 3, 3) |
|
809 |
- }else{ |
|
810 |
- data4reg <- as.data.frame(newparams[["centers"]][Index,]) |
|
811 |
- names(data4reg) <- c("AA", "AB", "BB") |
|
812 |
- regParams <- cbind( coef(lm(AA~AB*BB, data=data4reg)), |
|
813 |
- c(coef(lm(AB~AA+BB, data=data4reg)), 0), |
|
814 |
- coef(lm(BB~AA*AB, data=data4reg))) |
|
815 |
- rownames(regParams) <- c("intercept", "X", "Y", "XY") |
|
816 |
- rm(data4reg) |
|
817 |
- |
|
818 |
- minN <- 3 |
|
819 |
- newparams[["centers"]][newparams[["N"]] < minN] <- NA |
|
820 |
- Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex) |
|
821 |
- if(verbose) cat("Filling out empty centers") |
|
822 |
- for(i in Index){ |
|
823 |
- if(verbose) if(i%%10000==0)cat(".") |
|
824 |
- mu <- newparams[["centers"]][i, ] |
|
825 |
- j <- which(is.na(mu)) |
|
826 |
- newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j] |
|
827 |
- } |
|
828 |
- |
|
829 |
- ##remaing NAs are made like originals |
|
830 |
- if(length(YIndex)>0){ |
|
831 |
- noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), |
|
832 |
- YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)]) |
|
833 |
- } |
|
834 |
- newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])] |
|
835 |
- if(verbose) cat("\n") |
|
836 |
- |
|
837 |
- if(verbose) message("Calculating and standardizing size of shift.") |
|
838 |
- DD <- newparams[["centers"]] - params[["centers"]] |
|
839 |
- DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ])) |
|
840 |
- SS <- cov(DD[autosomeIndex, ]) |
|
841 |
- SSI <- solve(SS) |
|
842 |
- dev <- vector("numeric", nrow(DD)) |
|
843 |
- if(length(YIndex)){ |
|
844 |
- dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x) |
|
845 |
- dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex]) |
|
846 |
- ##Now Y (only two params) |
|
847 |
- SSY <- SS[c(1, 3), c(1, 3)] |
|
848 |
- SSI <- solve(SSY) |
|
849 |
- dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x) |
|
850 |
- dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex]) |
|
851 |
- } else { |
|
852 |
- dev=apply(DD,1,function(x) x%*%SSI%*%x) |
|
853 |
- dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev) |
|
854 |
- } |
|
855 |
- } |
|
856 |
- |
|
857 |
- ## BC: must keep SD |
|
858 |
- params[-2] <- newparams[-2] |
|
859 |
- |
|
860 |
- rm(newparams);gc(verbose=FALSE) |
|
861 |
- if(verbose) cat("Calling", NR, "SNPs... ") |
|
862 |
- ## ################### |
|
863 |
- ## ## MOVE TO C####### |
|
864 |
- t0 <- proc.time() |
|
865 |
- ImNull <- gtypeCallerR2nm(A, B, fIndex, mIndex, params[["centers"]], |
|
866 |
- params[["scales"]], params[["N"]], Indexes, |
|
867 |
- cIndexes, sapply(Indexes, length), |
|
868 |
- sapply(cIndexes, length), SMEDIAN, theKnots, |
|
869 |
- mixtureParams, DF, probs, 0.025, |
|
870 |
- which(regionInfo[,2]), |
|
871 |
- which(regionInfo[,1])) |
|
872 |
- t0 <- proc.time()-t0 |
|
873 |
- gc(verbose=FALSE) |
|
874 |
- message("\n Part 2 took ", t0[3], " seconds.") |
|
875 |
- ## END MOVE TO C####### |
|
876 |
- ## ################## |
|
877 |
- |
|
878 |
- dev <- dev/(dev+1/383) |
|
879 |
- if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names} |
|
880 |
- if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names} |
|
881 |
- |
|
882 |
- if(verbose) message("Done.") |
|
883 |
- if (returnParams){ |
|
884 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)), params=params)) |
|
885 |
- }else{ |
|
886 |
- return(list(calls=A, confs=B, SNPQC=dev, batchQC=mean(diag(SS)))) |
|
887 |
- } |
|
888 |
-} |
|
889 |
- |
|
890 |
- |
|
891 |
-gtypeCallerRnm <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
892 |
- theNs, Indexes, cIndexes, nIndexes, |
|
893 |
- ncIndexes, SMEDIAN, knots, params, dft, |
|
894 |
- probs, trim){ |
|
895 |
- |
|
896 |
- stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
897 |
- nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
898 |
- nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
899 |
- ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
900 |
- |
|
901 |
- ## make code robust |
|
902 |
- ## check types before passing to C |
|
903 |
- |
|
904 |
- .Call("gtypeCallerPart1nm", A, B, |
|
905 |
- as.integer(fIndex), as.integer(mIndex), |
|
906 |
- as.numeric(theCenters), as.numeric(theScales), |
|
907 |
- as.integer(theNs), lapply(Indexes, as.integer), lapply(cIndexes, as.integer), as.integer(nIndexes), as.integer(ncIndexes), |
|
908 |
- as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params), |
|
909 |
- as.integer(dft), as.numeric(probs), as.numeric(trim), |
|
910 |
- PACKAGE="crlmm") |
|
911 |
- |
|
912 |
-} |
|
913 |
- |
|
914 |
-gtypeCallerR2nm <- function(A, B, fIndex, mIndex, theCenters, theScales, |
|
915 |
- theNs, Indexes, cIndexes, nIndexes, |
|
916 |
- ncIndexes, SMEDIAN, knots, params, dft, |
|
917 |
- probs, trim, noTraining, noInfo){ |
|
918 |
- |
|
919 |
- stopifnot(!missing(A), !missing(B), dim(A)==dim(B), |
|
920 |
- nrow(A)==nrow(theCenters), nrow(A)==nrow(theScales), |
|
921 |
- nrow(A) == nrow(theNs), length(knots)==3, nrow(params)==4, |
|
922 |
- ncol(params)==ncol(A), length(trim)==1, length(probs)==3) |
|
923 |
- |
|
924 |
- .Call("gtypeCallerPart2nm", A, B, |
|
925 |
- as.integer(fIndex), as.integer(mIndex), |
|
926 |
- as.numeric(theCenters), as.numeric(theScales), |
|
927 |
- as.integer(theNs), Indexes, cIndexes, nIndexes, ncIndexes, |
|
928 |
- as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params), |
|
929 |
- as.integer(dft), as.numeric(probs), as.numeric(trim), |
|
930 |
- as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm") |
|
931 |
- |
|
932 |
-} |
|
933 |
- |
... | ... |
@@ -1,85 +1,85 @@ |
1 | 1 |
snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns){ |
2 |
- if (missing(sns)) sns <- basename(filenames) |
|
3 |
- ##ADD CHECK TO SEE IF LOADED |
|
4 |
- if (missing(cdfName)) |
|
5 |
- cdfName <- read.celfile.header(filenames[1])$cdfName |
|
6 |
- ## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
7 |
- pkgname <- getCrlmmAnnotationName(cdfName) |
|
8 |
- if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
9 |
- suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
10 |
- msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
11 |
- message(strwrap(msg)) |
|
12 |
- stop("Package ", pkgname, " could not be found.") |
|
13 |
- rm(suggCall, msg) |
|
14 |
- } |
|
15 |
- if(verbose) message("Loading annotations and mixture model parameters.") |
|
16 |
- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
17 |
- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
18 |
- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
19 |
- autosomeIndex <- getVarInEnv("autosomeIndex") |
|
20 |
- pnsa <- getVarInEnv("pnsa") |
|
21 |
- pnsb <- getVarInEnv("pnsb") |
|
22 |
- fid <- getVarInEnv("fid") |
|
23 |
- reference <- getVarInEnv("reference") |
|
24 |
- aIndex <- getVarInEnv("aIndex") |
|
25 |
- bIndex <- getVarInEnv("bIndex") |
|
26 |
- SMEDIAN <- getVarInEnv("SMEDIAN") |
|
27 |
- theKnots <- getVarInEnv("theKnots") |
|
28 |
- gns <- getVarInEnv("gns") |
|
29 |
- |
|
30 |
- ##We will read each cel file, summarize, and run EM one by one |
|
31 |
- ##We will save parameters of EM to use later |
|
32 |
- mixtureParams <- matrix(0, 4, length(filenames)) |
|
33 |
- SNR <- vector("numeric", length(filenames)) |
|
34 |
- SKW <- vector("numeric", length(filenames)) |
|
35 |
- |
|
36 |
- ## This is the sample for the fitting of splines |
|
37 |
- ## BC: I like better the idea of the user passing the seed, |
|
38 |
- ## because this might intefere with other analyses |
|
39 |
- ## (like what happened to GCRMA) |
|
40 |
- set.seed(seed) |
|
41 |
- idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
42 |
- ##S will hold (A+B)/2 and M will hold A-B |
|
43 |
- ##NOTE: We actually dont need to save S. Only for pics etc... |
|
44 |
- ##f is the correction. we save to avoid recomputing |
|
45 |
- A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
46 |
- B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
2 |
+ if (missing(sns)) sns <- basename(filenames) |
|
3 |
+ ##ADD CHECK TO SEE IF LOADED |
|
4 |
+ if (missing(cdfName)) |
|
5 |
+ cdfName <- read.celfile.header(filenames[1])$cdfName |
|
6 |
+ ## stuffDir <- changeToCrlmmAnnotationName(cdfName) |
|
7 |
+ pkgname <- getCrlmmAnnotationName(cdfName) |
|
8 |
+ if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ |
|
9 |
+ suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") |
|
10 |
+ msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) |
|
11 |
+ message(strwrap(msg)) |
|
12 |
+ stop("Package ", pkgname, " could not be found.") |
|
13 |
+ rm(suggCall, msg) |
|
14 |
+ } |
|
15 |
+ if(verbose) message("Loading annotations and mixture model parameters.") |
|
16 |
+ loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) |
|
17 |
+ loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) |
|
18 |
+ loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) |
|
19 |
+ autosomeIndex <- getVarInEnv("autosomeIndex") |
|
20 |
+ pnsa <- getVarInEnv("pnsa") |
|
21 |
+ pnsb <- getVarInEnv("pnsb") |
|
22 |
+ fid <- getVarInEnv("fid") |
|
23 |
+ reference <- getVarInEnv("reference") |
|
24 |
+ aIndex <- getVarInEnv("aIndex") |
|
25 |
+ bIndex <- getVarInEnv("bIndex") |
|
26 |
+ SMEDIAN <- getVarInEnv("SMEDIAN") |
|
27 |
+ theKnots <- getVarInEnv("theKnots") |
|
28 |
+ gns <- getVarInEnv("gns") |
|
29 |
+ |
|
30 |
+ ##We will read each cel file, summarize, and run EM one by one |
|
31 |
+ ##We will save parameters of EM to use later |
|
32 |
+ mixtureParams <- matrix(0, 4, length(filenames)) |
|
33 |
+ SNR <- vector("numeric", length(filenames)) |
|
34 |
+ SKW <- vector("numeric", length(filenames)) |
|
47 | 35 |
|
48 |
- if(verbose){ |
|
49 |
- message("Processing ", length(filenames), " files.") |
|
50 |
- if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
|
51 |
- } |
|
52 |
- ##We start looping throug cel files |
|
53 |
- idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
|
54 |
- for(i in seq(along=filenames)){ |
|
55 |
- y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
56 |
- x <- log2(y[idx2]) |
|
57 |
- SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
58 |
- rm(x) |
|
59 |
- y <- normalize.quantiles.use.target(y, target=reference) |
|
60 |
- A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
61 |
- B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
62 |
- ##Now to fit the EM |
|
63 |
- if(fitMixture){ |
|
64 |
- S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
|
65 |
- M <- log2(A[idx, i])-log2(B[idx, i]) |
|
36 |
+ ## This is the sample for the fitting of splines |
|
37 |
+ ## BC: I like better the idea of the user passing the seed, |
|
38 |
+ ## because this might intefere with other analyses |
|
39 |
+ ## (like what happened to GCRMA) |
|
40 |
+ set.seed(seed) |
|
41 |
+ idx <- sort(sample(autosomeIndex, mixtureSampleSize)) |
|
42 |
+ ##S will hold (A+B)/2 and M will hold A-B |
|
43 |
+ ##NOTE: We actually dont need to save S. Only for pics etc... |
|
44 |
+ ##f is the correction. we save to avoid recomputing |
|
45 |
+ A <- matrix(as.integer(0), length(pnsa), length(filenames)) |
|
46 |
+ B <- matrix(as.integer(0), length(pnsb), length(filenames)) |
|
47 |
+ |
|
48 |
+ if(verbose){ |
|
49 |
+ message("Processing ", length(filenames), " files.") |
|
50 |
+ if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3) |
|
51 |
+ } |
|
52 |
+ ##We start looping throug cel files |
|
53 |
+ idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything |
|
54 |
+ for(i in seq(along=filenames)){ |
|
55 |
+ y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid]) |
|
56 |
+ x <- log2(y[idx2]) |
|
57 |
+ SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3) |
|
58 |
+ rm(x) |
|
59 |
+ y <- normalize.quantiles.use.target(y, target=reference) |
|
60 |
+ A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa) |
|
61 |
+ B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb) |
|
62 |
+ ##Now to fit the EM |
|
63 |
+ if(fitMixture){ |
|
64 |
+ S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN |
|
65 |
+ M <- log2(A[idx, i])-log2(B[idx, i]) |
|
66 | 66 |
|
67 |
- ##we need to test the choice of eps.. it is not the max diff between funcs |
|
68 |
- tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
69 |
- |
|
70 |
- mixtureParams[, i] <- tmp[["coef"]] |
|
71 |
- SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
72 |
- } |
|
73 |
- if (verbose) |
|
74 |
- if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
75 |
- else cat(".") |
|
76 |
- } |
|
77 |
- if (verbose) |
|
78 |
- if (getRversion() > '2.7.0') close(pb) |
|
79 |
- else cat("\n") |
|
80 |
- if (!fitMixture) SNR <- mixtureParams <- NA |
|
81 |
- ## gns comes from preprocStuff.rda |
|
82 |
- list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
67 |
+ ##we need to test the choice of eps.. it is not the max diff between funcs |
|
68 |
+ tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps) |
|
69 |
+ |
|
70 |
+ mixtureParams[, i] <- tmp[["coef"]] |
|
71 |
+ SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) |
|
72 |
+ } |
|
73 |
+ if (verbose) |
|
74 |
+ if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) |
|
75 |
+ else cat(".") |
|
76 |
+ } |
|
77 |
+ if (verbose) |
|
78 |
+ if (getRversion() > '2.7.0') close(pb) |
|
79 |
+ else cat("\n") |
|
80 |
+ if (!fitMixture) SNR <- mixtureParams <- NA |
|
81 |
+ ## gns comes from preprocStuff.rda |
|
82 |
+ list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName) |
|
83 | 83 |
} |
84 | 84 |
|
85 | 85 |
fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=10, verbose=FALSE){ |
... | ... |
@@ -11,22 +11,18 @@ changeToCrlmmAnnotationName <- function(x){ |
11 | 11 |
} |
12 | 12 |
|
13 | 13 |
getCrlmmAnnotationName <- function(x){ |
14 |
- paste(tolower(gsub("_", "", x)), "Crlmm", sep="") |
|
14 |
+ paste(tolower(gsub("_", "", x)), "Crlmm", sep="") |
|
15 | 15 |
} |
16 | 16 |
|
17 | 17 |
medianSummaries <- function(mat, grps) |
18 | 18 |
.Call("R_subColSummarize_median", mat, grps, PACKAGE = "preprocessCore") |
19 | 19 |
|
20 | 20 |
intMedianSummaries <- function(mat, grps) |
21 |
- as.integer(medianSummaries(mat, grps)) |
|
22 |
- |
|
23 |
-testProb <- function(p) |
|
24 |
- .Call("test", p) |
|
25 |
- |
|
21 |
+ as.integer(medianSummaries(mat, grps)) |
|
26 | 22 |
|
27 | 23 |
list.celfiles <- function(...){ |
28 |
- files <- list.files(...) |
|
29 |
- return(files[grep("\\.[cC][eE][lL]$", files)]) |
|
24 |
+ files <- list.files(...) |
|
25 |
+ return(files[grep("\\.[cC][eE][lL]$", files)]) |
|
30 | 26 |
} |
31 | 27 |
|
32 | 28 |
## .crlmmPkgEnv is an enviroment that will |
... | ... |
@@ -36,11 +32,11 @@ list.celfiles <- function(...){ |
36 | 32 |
## R CMD check |
37 | 33 |
|
38 | 34 |
isLoaded <- function(dataset, environ=.crlmmPkgEnv) |
39 |
- exists(dataset, envir=environ) |
|
35 |
+ exists(dataset, envir=environ) |
|
40 | 36 |
getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){ |
41 |
- if (!isLoaded(dataset)) |
|
42 |
- stop("Variable ", dataset, " not found in .crlmmPkgEnv") |
|
43 |
- environ[[dataset]] |
|
37 |
+ if (!isLoaded(dataset)) |
|
38 |
+ stop("Variable ", dataset, " not found in .crlmmPkgEnv") |
|
39 |
+ environ[[dataset]] |
|
44 | 40 |
} |
45 | 41 |
|
46 | 42 |
list2SnpSet <- function(x, returnParams=FALSE){ |
... | ... |
@@ -139,11 +135,11 @@ list2SnpSet <- function(x, returnParams=FALSE){ |
139 | 135 |
} |
140 | 136 |
|
141 | 137 |
loader <- function(theFile, envir, pkgname){ |
142 |
- theFile <- file.path(system.file(package=pkgname), |
|
143 |
- "extdata", theFile) |
|
144 |
- if (!file.exists(theFile)) |
|
145 |
- stop("File ", theFile, " does not exist in ", pkgname) |
|
146 |
- load(theFile, envir=envir) |
|
138 |
+ theFile <- file.path(system.file(package=pkgname), |
|
139 |
+ "extdata", theFile) |
|
140 |
+ if (!file.exists(theFile)) |
|
141 |
+ stop("File ", theFile, " does not exist in ", pkgname) |
|
142 |
+ load(theFile, envir=envir) |
|
147 | 143 |
} |
148 | 144 |
|
149 | 145 |
|
... | ... |
@@ -1,30 +1,6 @@ |
1 | 1 |
#include <R.h> |
2 | 2 |
#include <Rinternals.h> |
3 | 3 |
|
4 |
- |
|
5 |
-SEXP gtypeCallerPart1nm(SEXP *, SEXP *, SEXP *, SEXP *, SEXP *,SEXP *, SEXP *, |
|
6 |
- SEXP *, SEXP *, SEXP *, SEXP *, SEXP *,SEXP *, SEXP *, |
|
7 |
- SEXP *, SEXP *, SEXP *); |
|
8 |
- |
|
9 |
-SEXP gtypeCallerPart2nm(SEXP *, SEXP *, SEXP *, SEXP *, |
|
10 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
11 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
12 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
13 |
- SEXP *, SEXP *, SEXP *); |
|
14 |
- |
|
15 |
-SEXP test (SEXP *); |
|
16 |
- |
|
17 |
-SEXP gtypeCallerPart1NormalNoN(SEXP *, SEXP *, SEXP *, SEXP *,SEXP *, SEXP *, |
|
18 |
- SEXP *, SEXP *, SEXP *, SEXP *,SEXP *, SEXP *, |
|
19 |
- SEXP *, SEXP *, SEXP *, SEXP *,SEXP *); |
|
20 |
- |
|
21 |
-SEXP gtypeCallerPart2NormalNoN(SEXP *, SEXP *, SEXP *, SEXP *, |
|
22 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
23 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
24 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
25 |
- SEXP *, SEXP *, SEXP *); |
|
26 |
- |
|
27 |
- |
|
28 | 4 |
SEXP gtypeCallerPart1(SEXP *, SEXP *, SEXP *, SEXP *,SEXP *, SEXP *, |
29 | 5 |
SEXP *, SEXP *, SEXP *, SEXP *,SEXP *, SEXP *, |
30 | 6 |
SEXP *, SEXP *, SEXP *, SEXP *,SEXP *); |
... | ... |
@@ -34,16 +10,3 @@ SEXP gtypeCallerPart2(SEXP *, SEXP *, SEXP *, SEXP *, |
34 | 10 |
SEXP *, SEXP *, SEXP *, SEXP *, |
35 | 11 |
SEXP *, SEXP *, SEXP *, SEXP *, |
36 | 12 |
SEXP *, SEXP *, SEXP *); |
37 |
- |
|
38 |
-SEXP gtypeCallerPart1TNoN(SEXP *, SEXP *, SEXP *, SEXP *, |
|
39 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
40 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
41 |
- SEXP *, SEXP *, SEXP *, |
|
42 |
- SEXP *, SEXP *); |
|
43 |
- |
|
44 |
- |
|
45 |
-SEXP gtypeCallerPart2TNoN(SEXP *, SEXP *, SEXP *, SEXP *, |
|
46 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
47 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
48 |
- SEXP *, SEXP *, SEXP *, SEXP *, |
|
49 |
- SEXP *, SEXP *, SEXP *); |
50 | 13 |
deleted file mode 100644 |
... | ... |
@@ -1,413 +0,0 @@ |
1 |
-#include <math.h> |
|
2 |
-#include <R.h> |
|
3 |
-#include <Rdefines.h> |
|
4 |
-#include <Rmath.h> |
|
5 |
-#include <Rinternals.h> |
|
6 |
- |
|
7 |
-#include "utils.h" |
|
8 |
- |
|
9 |
-static double mydt(double x, int df){ |
|
10 |
- return(pow(1.0+pow(x, 2.0)/ (double)df, -((double)df+1.0)/2.0)); |
|
11 |
-} |
|
12 |
- |
|
13 |
-SEXP gtypeCallerPart1nm(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex, |
|
14 |
- SEXP theCenters, SEXP theScales, SEXP theNs, |
|
15 |
- SEXP Indexes, SEXP cIndexes, SEXP nIndexes, |
|
16 |
- SEXP ncIndexes, SEXP SMEDIAN, |
|
17 |
- SEXP knots, SEXP mixtureParams, SEXP df, |
|
18 |
- SEXP probs, SEXP trim){ |
|
19 |
- /* |
|
20 |
- ARGUMENTS |
|
21 |
- --------- |
|
22 |
- A: intensity matrix for allele A |
|
23 |
- B: intensity matrix for allele B |
|
24 |
- fIndex: indexes for females (columns in A/B for females) |
|
25 |
- mIndex: indexes for males (columns in A/B for males) |
|
26 |
- theCenters: matrix with SNP-specific centers (3 columns: AA/AB/BB) |
|
27 |
- theScales: matrix with SNP-specific scales (3 columns: AA/AB/BB) |
|
28 |
- theNs: matrix with SNP-specific counts (3 columns: AA/AB/BB) |
|
29 |
- Indexes: list with 3 elements (autosomeIndex, XIndex, YIndex) for SNPs |
|
30 |
- cIndexes: list with 3 elements (keepIndex, keepIndexFemale, keepIndexMale) for arrays |
|
31 |
- SMEDIAN: scalar (median S) |
|
32 |
- knots: knots for mixture |
|
33 |
- mixtureParams: mixture parameters |
|
34 |
- probs: genotype priors (1/3) for *ALL* SNPs. It's a vector of length 3 |
|
35 |
- trim: drop rate to estimate means |
|
36 |
- |
|
37 |
- ASSUMPTIONS |
|
38 |
- ----------- |
|
39 |
- A and B have the same dimensions |
|
40 |
- fIndex and mIndex are in a valid range for A/B |
|
41 |
- Number of rows of theCenters, theScales, theNs match the number of rows in A/B |
|
42 |
- Length of Indexes and cIndexes is 3 |
|
43 |
- 3 knots |
|
44 |
- 4xNSAMPLES parameters |
|
45 |
- priors has length 3 and is the same for *ALL* SNPs |
|
46 |
- trim in (0, .5) |
|
47 |
- |
|
48 |
- INTERNAL VARIABLES |
|
49 |
- -------- --------- |
|
50 |
- likelihood: matrix (nsample rows x 3 columns) |
|
51 |
- rowsAB, colsAB: dimensions of A and B |
|
52 |
- lenLists = 3, length of Indexes and cIndexes |
|
53 |
- h, i: iteration |
|
54 |
- nIndex: length of Indexes[[h]] |
|
55 |
- ii: particular value of Indexes |
|
56 |
- M: log-ratio for a particular SNP |
|
57 |
- S: adjusted average log-intensity for a particular SNP |
|
58 |
- f: f for a particular SNP |
|
59 |
- |
|
60 |
- TODO |
|
61 |
- ---- |
|
62 |
- - Get length of a vector from a list within C (adding nIndexes and ncIndexes for the moment) |
|
63 |
- */ |
|
64 |
- |
|
65 |
- /* |
|
66 |
- ========================================== |
|
67 |
- VARIABLE DECLARATION |
|
68 |
- ========================================== |
|
69 |
- */ |
|
70 |
- // Organizing variables |
|
71 |
- int nSnpClasses, k; |
|
72 |
- nSnpClasses = GET_LENGTH(Indexes); |
|
73 |
- int nSnpsPerClass[nSnpClasses]; |
|
74 |
- for (k=0; k < nSnpClasses; k++) |
|
75 |
- nSnpsPerClass[k] = GET_LENGTH(VECTOR_ELT(Indexes, k)); |
|
76 |
- |
|
77 |
- // General Variables |
|
78 |
- int rowsAB, colsAB, h, i, ii, j, elem, nMales, nFemales; |
|
79 |
- rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0]; |
|
80 |
- colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1]; |
|
81 |
- double likelihood[colsAB*3], M[colsAB], S[colsAB], f[colsAB]; |
|
82 |
- |
|
83 |
- // Constants |
|
84 |
- //const int lenLists=3; |
|
85 |
- |
|
86 |
- // Buffers |
|
87 |
- int intbuffer, ibv1[colsAB], ib2; |
|
88 |
- double buffer; |
|
89 |
- |
|
90 |
- // All pointers appear here |
|
91 |
- int *ptr2nIndexes, *ptr2A, *ptr2B, *ptr2Ns, *ptr2df, *ptr2mIndex, *ptr2fIndex, *ptr2ncIndexes; |
|
92 |
- double *ptr2Smedian, *ptr2knots, *ptr2params, *ptr2Centers, *ptr2Scales, *ptr2probs, *ptr2trim; |
|
93 |
- ptr2nIndexes = INTEGER_POINTER(AS_INTEGER(nIndexes)); |
|
94 |
- ptr2A = INTEGER_POINTER(AS_INTEGER(A)); |
|
95 |
- ptr2B = INTEGER_POINTER(AS_INTEGER(B)); |
|
96 |
- ptr2Ns = INTEGER_POINTER(AS_INTEGER(theNs)); |
|
97 |
- ptr2df = INTEGER_POINTER(AS_INTEGER(df)); |
|
98 |
- ptr2mIndex = INTEGER_POINTER(AS_INTEGER(mIndex)); |
|
99 |
- ptr2fIndex = INTEGER_POINTER(AS_INTEGER(fIndex)); |
|
100 |
- ptr2ncIndexes = INTEGER_POINTER(AS_INTEGER(ncIndexes)); |
|
101 |
- ptr2Smedian = NUMERIC_POINTER(AS_NUMERIC(SMEDIAN)); |
|
102 |
- ptr2knots = NUMERIC_POINTER(AS_NUMERIC(knots)); |
|
103 |
- ptr2params = NUMERIC_POINTER(AS_NUMERIC(mixtureParams)); |
|
104 |
- ptr2Centers = NUMERIC_POINTER(AS_NUMERIC(theCenters)); |
|
105 |
- ptr2Scales = NUMERIC_POINTER(AS_NUMERIC(theScales)); |
|
106 |
- ptr2probs = NUMERIC_POINTER(AS_NUMERIC(probs)); |
|
107 |
- ptr2trim = NUMERIC_POINTER(AS_NUMERIC(trim)); |
|
108 |
- // End pointers |
|
109 |
- |
|
110 |
- // These will be returned to R |
|
111 |
- double *ptr2e1, *ptr2e2, *ptr2e3; |
|
112 |
- SEXP estimates1, estimates2, estimates3, output; |
|
113 |
- PROTECT(estimates1 = allocMatrix(REALSXP, rowsAB, 3)); |
|
114 |
- PROTECT(estimates2 = allocMatrix(REALSXP, rowsAB, 3)); |
|
115 |
- PROTECT(estimates3 = allocMatrix(REALSXP, rowsAB, 3)); |
|
116 |
- |
|
117 |
- ptr2e1 = NUMERIC_POINTER(estimates1); |
|
118 |
- ptr2e2 = NUMERIC_POINTER(estimates2); |
|
119 |
- ptr2e3 = NUMERIC_POINTER(estimates3); |
|
120 |
- /* |
|
121 |
- ========================================== |
|
122 |
- END VARIABLE DECLARATION |
|
123 |
- ========================================== |
|
124 |
- */ |
|
125 |
- nMales = GET_LENGTH(mIndex); |
|
126 |
- nFemales = GET_LENGTH(fIndex); |
|
127 |
- |
|
128 |
- for (h=0; h < nSnpClasses; h++){ |
|
129 |
- ib2 = GET_LENGTH(VECTOR_ELT(cIndexes, h)); |
|
130 |
- double dbv[ib2]; |
|
131 |
- int ibv[ib2]; |
|
132 |
- if (nSnpsPerClass[h] > 0) |
|
133 |
- for (i=0; i < ptr2nIndexes[h]; i++){ |
|
134 |
- if (i%100000 == 0) Rprintf("+"); |
|
135 |
- // Substract 1, as coming from R it is 1-based and C is 0-based. |
|
136 |
- ii=INTEGER(VECTOR_ELT(Indexes, h))[i] - 1; |
|
137 |
- for (j=0; j < colsAB; j++){ |
|
138 |
- // j is an index for vectors whose length is number of samples (SAMPLE) |
|
139 |
- // elem is an index for A and B **only** (or objs with SNP rows and SAMPLE columns) |
|
140 |
- //Rprintf("J %d I %d Rows %d\n", j, ii, rowsAB); |
|
141 |
- |
|
142 |
- elem = rowsAB * j + ii; |
|
143 |
- //Rprintf("\nElemt %d ", elem); |
|
144 |
- |
|
145 |
- M[j] = (log2(ptr2A[elem])-log2(ptr2B[elem])); |
|
146 |
- S[j] = (log2(ptr2A[elem])+log2(ptr2B[elem]))/2 - ptr2Smedian[0]; |
|
147 |
- buffer = fmax(fmin(S[j], ptr2knots[2]), ptr2knots[0]); |
|
148 |
- f[j] = ptr2params[j*4+0]+ptr2params[j*4+1]*buffer+ptr2params[j*4+2]*pow(buffer, 2.0)+ptr2params[j*4+3]*pow(fmax(0, buffer-ptr2knots[1]), 2.0); |
|
149 |
- //Rprintf("M %f S %f f %f ", M[j], S[j], f[j]); |
|
150 |
- |
|
151 |
- // buffer here is sigma |
|
152 |
- // likelihood for AA |
|
153 |
- // All likelihoods already multiplied by prior to save time |
|
154 |
- buffer = ptr2Scales[ii] * sdCorrection(&ptr2Ns[ii]); |
|
155 |
- likelihood[j] = mydt( ((M[j]-f[j])-ptr2Centers[ii])/buffer, ptr2df[0])*ptr2probs[0]; |
|
156 |
- |
|
157 |
- //Rprintf("L1 %2.4f ", likelihood[j]); |
|
158 |
- |
|
159 |
- // likelihood for AB |
|
160 |
- buffer = ptr2Scales[ii+rowsAB] * sdCorrection(&ptr2Ns[ii+rowsAB]); |
|
161 |
- likelihood[j+colsAB] = mydt( (M[j]-ptr2Centers[ii+rowsAB])/buffer, ptr2df[0])*ptr2probs[1]; |
|
162 |
- |
|
163 |
- // intbuffer (here) is the subject ID as in R (ie. 1-based) |
|
164 |
- intbuffer = j+1; |
|
165 |
- if (nMales > 0) |
|
166 |
- if (h > 0) |
|
167 |
- if (intInSet(&intbuffer, ptr2mIndex, &nMales) > 0) |
|
168 |
- likelihood[j+colsAB] = 0; |
|
169 |
- |
|
170 |
- //Rprintf("L2 %2.4f ", likelihood[j+colsAB]); |
|
171 |
- |
|
172 |
- |
|
173 |
- // likelihood for BB |
|
174 |
- buffer = ptr2Scales[ii+2*rowsAB] * sdCorrection(&ptr2Ns[ii+2*rowsAB]); |
|
175 |
- likelihood[j+2*colsAB] = mydt( ((M[j]+f[j])-ptr2Centers[ii+2*rowsAB])/buffer, ptr2df[0])*ptr2probs[2]; |
|
176 |
- |
|
177 |
- // Females on Y: 1 to avoid NAs. Later made 0 (RI) |
|
178 |
- // To save some time: 1*priors = priors |
|
179 |
- if (nFemales > 0) |
|
180 |
- if (h == 2) |
|
181 |
- if (intInSet(&intbuffer, ptr2fIndex, &nFemales) >0){ |
|
182 |
- likelihood[j] = ptr2probs[2]; |
|
183 |
- likelihood[j+colsAB] = ptr2probs[2]; |
|
184 |
- likelihood[j+2*colsAB] = ptr2probs[2]; |
|
185 |
- } |
|
186 |
- |
|
187 |
- //Rprintf("L3 %2.4f ", likelihood[j+2*colsAB]); |
|
188 |
- |
|
189 |
- |
|
190 |
- // Compute simple posterior |
|
191 |
- buffer = likelihood[j]+likelihood[j+colsAB]+likelihood[j+2*colsAB]; |
|
192 |
- likelihood[j]/=buffer; |
|
193 |
- likelihood[j+colsAB]/=buffer; |
|
194 |
- likelihood[j+2*colsAB]/=buffer; |
|
195 |
- |
|
196 |
- if (nFemales > 0) |
|
197 |
- if (h == 2) |
|
198 |
- if (intInSet(&intbuffer, ptr2fIndex, &nFemales) >0){ |
|
199 |
- likelihood[j] = 0; |
|
200 |
- likelihood[j+colsAB] = 0; |
|
201 |
- likelihood[j+2*colsAB] = 0; |
|
202 |
- } |
|
203 |
- |
|
204 |
- ibv1[j] = genotypeCall(&likelihood[j], &likelihood[j+colsAB], &likelihood[j+2*colsAB]); |
|
205 |
- } |
|
206 |
- |
|
207 |
- for (j=0; j < ib2; j++){ |
|
208 |
- intbuffer = INTEGER(VECTOR_ELT(cIndexes, h))[j]-1; |
|
209 |
- dbv[j]=M[intbuffer]-f[intbuffer]; |
|
210 |
- ibv[j]=ibv1[intbuffer]; |
|
211 |
- } |
|
212 |
- mad_median(dbv, ibv, 1, ptr2trim[0], GET_LENGTH(VECTOR_ELT(cIndexes, h)), rowsAB, ptr2e1, ptr2e2, ptr2e3, ii); |
|
213 |
- for (j=0; j < ib2; j++){ |
|
214 |
- intbuffer = INTEGER(VECTOR_ELT(cIndexes, h))[j]-1; |
|
215 |
- dbv[j]=M[intbuffer]; |
|
216 |
- } |
|
217 |
- mad_median(dbv, ibv, 2, ptr2trim[0], GET_LENGTH(VECTOR_ELT(cIndexes, h)), rowsAB, ptr2e1, ptr2e2, ptr2e3, ii); |
|
218 |
- for (j=0; j < ib2; j++){ |
|
219 |
- intbuffer = INTEGER(VECTOR_ELT(cIndexes, h))[j]-1; |
|
220 |
- dbv[j] = M[intbuffer]+f[intbuffer]; |
|
221 |
- } |
|
222 |
- mad_median(dbv, ibv, 3, ptr2trim[0], GET_LENGTH(VECTOR_ELT(cIndexes, h)), rowsAB, ptr2e1, ptr2e2, ptr2e3, ii); |
|
223 |
- } /* for Snp */ |
|
224 |
- } /* for SnpClass */ |
|
225 |
- PROTECT(output = allocVector(VECSXP,3)); |
|
226 |
- SET_VECTOR_ELT(output, 0, estimates1); |
|
227 |
- SET_VECTOR_ELT(output, 1, estimates2); |
|
228 |
- SET_VECTOR_ELT(output, 2, estimates3); |
|
229 |
- UNPROTECT(4); |
|
230 |
- return(output); |
|
231 |
-} |
|
232 |
- |
|
233 |
-SEXP gtypeCallerPart2nm(SEXP A, SEXP B, SEXP fIndex, SEXP mIndex, |
|
234 |
- SEXP theCenters, SEXP theScales, SEXP theNs, |
|
235 |
- SEXP Indexes, SEXP cIndexes, SEXP nIndexes, |
|
236 |
- SEXP ncIndexes, SEXP SMEDIAN, |
|
237 |
- SEXP knots, SEXP mixtureParams, SEXP df, |
|
238 |
- SEXP probs, SEXP trim, SEXP noTraining, SEXP noInfo){ |
|
239 |
- /* |
|
240 |
- WARNING!!! REMEMBER TO MODIFY MY TWIN TOO! |
|
241 |
- |
|
242 |
- ARGUMENTS |
|
243 |
- --------- |
|
244 |
- A: intensity matrix for allele A |
|
245 |
- B: intensity matrix for allele B |
|
246 |
- fIndex: indexes for females (columns in A/B for females) |
|
247 |
- mIndex: indexes for males (columns in A/B for males) |
|
248 |
- theCenters: matrix with SNP-specific centers (3 columns: AA/AB/BB) |
|
249 |
- theScales: matrix with SNP-specific scales (3 columns: AA/AB/BB) |
|
250 |
- theNs: matrix with SNP-specific counts (3 columns: AA/AB/BB) |
|
251 |
- Indexes: list with 3 elements (autosomeIndex, XIndex, YIndex) for SNPs |
|
252 |
- cIndexes: list with 3 elements (keepIndex, keepIndexFemale, keepIndexMale) for arrays |
|
253 |
- SMEDIAN: scalar (median S) |
|
254 |
- knots: knots for mixture |
|
255 |
- mixtureParams: mixture parameters |
|
256 |
- probs: genotype priors (1/3) for *ALL* SNPs. It's a vector of length 3 |
|
257 |
- trim: drop rate to estimate means |
|
258 |
- |
|
259 |
- ASSUMPTIONS |
|
260 |
- ----------- |
|
261 |
- A and B have the same dimensions |
|
262 |
- fIndex and mIndex are in a valid range for A/B |
|
263 |
- Number of rows of theCenters, theScales, theNs match the number of rows in A/B |
|
264 |
- Length of Indexes and cIndexes is 3 |
|
265 |
- 3 knots |
|
266 |
- 4xNSAMPLES parameters |
|
267 |
- priors has length 3 and is the same for *ALL* SNPs |
|
268 |
- trim in (0, .5) |
|
269 |
- Indexes and cIndexes have the same length. |
|
270 |
- |
|
271 |
- INTERNAL VARIABLES |
|
272 |
- -------- --------- |
|
273 |
- likelihood: matrix (nsample rows x 3 columns) |
|
274 |
- rowsAB, colsAB: dimensions of A and B |
|
275 |
- lenLists = 3, length of Indexes and cIndexes |
|
276 |
- h, i: iteration |
|
277 |
- nIndex: length of Indexes[[h]] |
|
278 |
- ii: particular value of Indexes |
|
279 |
- M: log-ratio for a particular SNP |
|
280 |
- S: adjusted average log-intensity for a particular SNP |
|
281 |
- f: f for a particular SNP |
|
282 |
- |
|
283 |
- TODO |
|
284 |
- ---- |
|
285 |
- - Get length of a vector from a list within C (adding nIndexes and ncIndexes for the moment) |
|
286 |
- */ |
|
287 |
- |
|
288 |
- /* |
|
289 |
- ========================================== |
|
290 |
- VARIABLE DECLARATION |
|
291 |
- ========================================== |
|
292 |
- */ |
|
293 |
- // Organizing variables |
|
294 |
- int nSnpClasses, k; |
|
295 |
- nSnpClasses = GET_LENGTH(Indexes); |
|
296 |
- int nSnpsPerClass[nSnpClasses]; |
|
297 |
- for (k=0; k < nSnpClasses; k++) |
|
298 |
- nSnpsPerClass[k] = GET_LENGTH(VECTOR_ELT(Indexes, k)); |
|
299 |
- |
|
300 |
- // General Variables |
|
301 |
- int rowsAB, colsAB, h, i, ii, j, elem, nMales, nFemales; |
|
302 |
- rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0]; |
|
303 |
- colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1]; |
|
304 |
- double likelihood[colsAB*3], M[colsAB], S[colsAB], f[colsAB]; |
|
305 |
- |
|
306 |
- // Constants |
|
307 |
- // const int lenLists=3; |
|
308 |
- |
|
309 |
- // Buffers |
|
310 |
- int intbuffer, ib2, ib3, ibSnpLevel1=0, ibSnpLevel2=0; |
|
311 |
- double buffer; |
|
312 |
- |
|
313 |
- ib2 = GET_LENGTH(noTraining); |
|
314 |
- ib3 = GET_LENGTH(noInfo); |
|
315 |
- |
|
316 |
- // All pointers appear here |
|
317 |
- int *ptr2nIndexes, *ptr2A, *ptr2B, *ptr2Ns, *ptr2df, *ptr2mIndex, *ptr2fIndex, *ptr2ncIndexes, *ptr2noTraining, *ptr2noInfo; |
|
318 |
- double *ptr2Smedian, *ptr2knots, *ptr2params, *ptr2Centers, *ptr2Scales, *ptr2probs, *ptr2trim; |
|
319 |
- ptr2nIndexes = INTEGER_POINTER(AS_INTEGER(nIndexes)); |
|
320 |
- ptr2A = INTEGER_POINTER(AS_INTEGER(A)); |
|
321 |
- ptr2B = INTEGER_POINTER(AS_INTEGER(B)); |
|
322 |
- ptr2Ns = INTEGER_POINTER(AS_INTEGER(theNs)); |
|
323 |
- ptr2df = INTEGER_POINTER(AS_INTEGER(df)); |
|
324 |
- ptr2mIndex = INTEGER_POINTER(AS_INTEGER(mIndex)); |
|
325 |
- ptr2fIndex = INTEGER_POINTER(AS_INTEGER(fIndex)); |
|
326 |
- ptr2ncIndexes = INTEGER_POINTER(AS_INTEGER(ncIndexes)); |
|
327 |
- ptr2noTraining = INTEGER_POINTER(AS_INTEGER(noTraining)); |
|
328 |
- ptr2noInfo = INTEGER_POINTER(AS_INTEGER(noInfo)); |
|
329 |
- |
|
330 |
- ptr2Smedian = NUMERIC_POINTER(AS_NUMERIC(SMEDIAN)); |
|
331 |
- ptr2knots = NUMERIC_POINTER(AS_NUMERIC(knots)); |
|
332 |
- ptr2params = NUMERIC_POINTER(AS_NUMERIC(mixtureParams)); |
|
333 |
- ptr2Centers = NUMERIC_POINTER(AS_NUMERIC(theCenters)); |
|
334 |
- ptr2Scales = NUMERIC_POINTER(AS_NUMERIC(theScales)); |
|
335 |
- ptr2probs = NUMERIC_POINTER(AS_NUMERIC(probs)); |
|
336 |
- ptr2trim = NUMERIC_POINTER(AS_NUMERIC(trim)); |
|
337 |
- |
|
338 |
- // End pointers |
|
339 |
- |
|
340 |
- /* |
|
341 |
- ========================================== |
|
342 |
- END VARIABLE DECLARATION |
|
343 |
- ========================================== |
|
344 |
- */ |
|
345 |
- nMales = GET_LENGTH(mIndex); |
|
346 |
- nFemales = GET_LENGTH(fIndex); |
|
347 |
- |
|
348 |
- for (h=0; h < nSnpClasses; h++){ |
|
349 |
- if (nSnpsPerClass[h] > 0) |
|
350 |
- for (i=0; i < ptr2nIndexes[h]; i++){ |
|
351 |
- if (i%100000 == 0) Rprintf("+"); |
|
352 |
- // Substract 1, as coming from R it is 1-based and C is 0-based. |
|
353 |
- ii=INTEGER(VECTOR_ELT(Indexes, h))[i] - 1; |
|
354 |
- intbuffer = ii+1; |
|
355 |
- if (intInSet(&intbuffer, ptr2noTraining, &ib2) > 0) ibSnpLevel1 = 1; |
|
356 |
- if (intInSet(&intbuffer, ptr2noInfo, &ib3) > 0) ibSnpLevel2 = 1; |
|
357 |
- for (j=0; j < colsAB; j++){ |
|
358 |
- // j is an index for vectors whose length is number of samples (SAMPLE) |
|
359 |
- // elem is an index for A and B **only** (or objs with SNP rows and SAMPLE columns) |
|
360 |
- elem = rowsAB * j + ii; |
|
361 |
- M[j] = (log2(ptr2A[elem])-log2(ptr2B[elem])); |
|
362 |
- S[j] = (log2(ptr2A[elem])+log2(ptr2B[elem]))/2 - ptr2Smedian[0]; |
|
363 |
- buffer = fmax(fmin(S[j], ptr2knots[2]), ptr2knots[0]); |
|
364 |
- f[j] = ptr2params[j*4+0]+ptr2params[j*4+1]*buffer+ptr2params[j*4+2]*pow(buffer, 2.0)+ptr2params[j*4+3]*pow(fmax(0, buffer-ptr2knots[1]), 2.0); |
|
365 |
- |
|
366 |
- // buffer here is sigma |
|
367 |
- // likelihood for AA |
|
368 |
- // All likelihoods already multiplied by prior to save time |
|
369 |
- buffer = ptr2Scales[ii] * sdCorrection(&ptr2Ns[ii]); |
|
370 |
- likelihood[j] = mydt( ((M[j]-f[j])-ptr2Centers[ii])/buffer, ptr2df[0])*ptr2probs[0]; |
|
371 |
- |
|
372 |
- // likelihood for AB |
|
373 |
- buffer = ptr2Scales[ii+rowsAB] * sdCorrection(&ptr2Ns[ii+rowsAB]); |
|
374 |
- likelihood[j+colsAB] = mydt( (M[j]-ptr2Centers[ii+rowsAB])/buffer, ptr2df[0])*ptr2probs[1]; |
|
375 |
- |
|
376 |
- // intbuffer (here) is the subject ID as in R (ie. 1-based) |
|
377 |
- // h > 0 is chr X or Y |
|
378 |
- intbuffer = j+1; |
|
379 |
- if (nMales > 0) |
|
380 |
- if (h > 0 && intInSet(&intbuffer, ptr2mIndex, &nMales) > 0) likelihood[j+colsAB] = 0; |
|
381 |
- |
|
382 |
- // likelihood for BB |
|
383 |
- buffer = ptr2Scales[ii+2*rowsAB] * sdCorrection(&ptr2Ns[ii+2*rowsAB]); |
|
384 |
- likelihood[j+2*colsAB] = mydt( ((M[j]+f[j])-ptr2Centers[ii+2*rowsAB])/buffer, ptr2df[0])*ptr2probs[2]; |
|
385 |
- |
|
386 |
- // Females on Y: 1 to avoid NAs. Later made 0 (RI) |
|
387 |
- // To save some time: 1*priors = priors |
|
388 |
- if (nFemales > 0) |
|
389 |
- if (h == 2 && intInSet(&intbuffer, ptr2fIndex, &nFemales) > 0) |
|
390 |
- likelihood[j] = likelihood[j+colsAB] = likelihood[j+2*colsAB] = ptr2probs[2]; |
|
391 |
- |
|
392 |
- // Compute simple posterior |
|
393 |
- buffer = likelihood[j]+likelihood[j+colsAB]+likelihood[j+2*colsAB]; |
|
394 |
- likelihood[j]/=buffer; |
|
395 |