Browse code

Remove predictGender. Add imputeGender. crlmmGT and crlmmGT2 call imputeGender

Clean commented code in crlmm-functions.R

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@58659 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 01/10/2011 04:49:56
Showing3 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.11.36
4
+Version: 1.11.37
5 5
 Date: 2010-12-10
6 6
 Author: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>, Ingo Ruczinski <iruczins@jhsph.edu>, Rafael A Irizarry
7 7
 Maintainer: Benilton S Carvalho <Benilton.Carvalho@cancer.org.uk>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -85,25 +85,7 @@ crlmmGT <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
85 85
   ##IF gender not provide, we predict
86 86
   if(is.null(gender)){
87 87
 	  if(verbose) message("Determining gender.")
88
-	  ##XMedian <- apply(log2(A[XIndex,,
89
-	  ##drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
90
-	  a <- log2(A[XIndex,,drop=FALSE])
91
-	  b <- log2(B[XIndex,,drop=FALSE])
92
-	  meds.X <- (apply(a+b, 2, median))/2
93
-	  ##YIndex <- which(isSnp(gtSet) & chromosome(gtSet)==24)
94
-	  a <- log2(A[YIndex,,drop=FALSE])
95
-	  b <- log2(B[YIndex,,drop=FALSE])
96
-	  meds.Y <- (apply(a+b, 2, median))/2
97
-	  R <- meds.X - meds.Y
98
-	  ##SNR <- gtSet$SNR[]
99
-	  ##SNRMin=5
100
-	  if(sum(SNR > SNRMin) == 1){
101
-		  ##gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
102
-		  gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L)
103
-	  } else{
104
-		  ##gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
105
-		  gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]]
106
-	  }
88
+	  gender <- imputeGender(A, B, XIndex, YIndex)
107 89
   }
108 90
 
109 91
   Indexes <- list(autosomeIndex, XIndex, YIndex)
... ...
@@ -329,286 +311,25 @@ crlmm2 <- function(filenames, row.names=TRUE, col.names=TRUE,
329 311
   return(list2SnpSet(res2, returnParams=returnParams))
330 312
 }
331 313
 
332
-predictGender <- function(cols, theA, theB, XIndex){
333
-  n <- length(cols)
334
-  med <- numeric(n)
335
-  if (n > 0){
336
-    open(theA)
337
-    open(theB)
338
-    for (i in 1:n){
339
-##      vA <- log2(theA[XIndex, cols[i]])
340
-##      vB <- log2(theB[XIndex, cols[i]])
341
-      vA <- log2(as.integer(theA[XIndex, cols[i]]))
342
-      vB <- log2(as.integer(theB[XIndex, cols[i]]))
343
-      med[i] <- median(vA+vB)/2
344
-      rm(vA, vB)
345
-    }
346
-    close(theA)
347
-    close(theB)
348
-    rm(theA, theB)
349
-  }
350
-  rm(n)
351
-  gc(verbose=FALSE)
352
-  med
314
+imputeGender <- function(A, B, XIndex, YIndex){
315
+	##XMedian <- apply(log2(A[XIndex,,
316
+	##drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
317
+	a <- log2(A[XIndex,,drop=FALSE])
318
+	b <- log2(B[XIndex,,drop=FALSE])
319
+	meds.X <- (apply(a+b, 2, median))/2
320
+	##YIndex <- which(isSnp(gtSet) & chromosome(gtSet)==24)
321
+	a <- log2(A[YIndex,,drop=FALSE])
322
+	b <- log2(B[YIndex,,drop=FALSE])
323
+	meds.Y <- (apply(a+b, 2, median))/2
324
+	R <- meds.X - meds.Y
325
+	##SNR <- gtSet$SNR[]
326
+	##SNRMin=5
327
+	if(sum(SNR > SNRMin) == 1){
328
+		##gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
329
+		gender <- ifelse(R[SNR[] > SNRMin] > 0.5, 2L, 1L)
330
+	} else{
331
+		##gender <- kmeans(XMedian, c(min(XMedian[SNR>SNRMin]), max(XMedian[SNR>SNRMin])))[["cluster"]]
332
+		gender <- kmeans(R, c(min(R[SNR[]>SNRMin]), max(R[SNR[]>SNRMin])))[["cluster"]]
333
+	}
334
+	return(gender)
353 335
 }
354
-
355
-## RS:  commented
356
-##crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
357
-##                     col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
358
-##                     SNRMin=5, recallMin=10, recallRegMin=1000,
359
-##                     gender=NULL, desctrucitve=FALSE, verbose=TRUE,
360
-##                     returnParams=FALSE, badSNP=.7){
361
-##  open(SNR)
362
-##  open(A)
363
-##  open(B)
364
-##  open(mixtureParams)
365
-##  ## expect objects to be ff
366
-##
367
-##  keepIndex <- which( SNR[] > SNRMin)
368
-##  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
369
-##
370
-##  NC <- ncol(A)
371
-##  NR <- nrow(A)
372
-##
373
-##  pkgname <- getCrlmmAnnotationName(cdfName)
374
-##  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
375
-##
376
-##  if(verbose) message("Loading annotations.")
377
-##  obj1 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
378
-##  obj2 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
379
-##  ## this is toget rid of the 'no visible binding' notes
380
-##  ## variable definitions
381
-##  XIndex <- getVarInEnv("XIndex")
382
-##  autosomeIndex <- getVarInEnv("autosomeIndex")
383
-##  YIndex <- getVarInEnv("YIndex")
384
-##  SMEDIAN <- getVarInEnv("SMEDIAN")
385
-##  theKnots <- getVarInEnv("theKnots")
386
-##  regionInfo <- getVarInEnv("regionInfo")
387
-##  params <- getVarInEnv("params")
388
-##  rm(list=c(obj1, obj2), envir=.crlmmPkgEnv)
389
-##  rm(obj1, obj2)
390
-##
391
-##  ## IF gender not provide, we predict
392
-##  ## FIXME: XIndex may be greater than ocProbesets()
393
-##  if(is.null(gender)){
394
-##    if(verbose) message("Determining gender.")
395
-####    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
396
-##    XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
397
-##    XMedian <- unlist(XMedian)
398
-##    if(sum(SNR[] > SNRMin)==1){
399
-##      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
400
-##    }else{
401
-##      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
402
-##    }
403
-##  }
404
-##
405
-##  Indexes <- list(autosomeIndex, XIndex, YIndex)
406
-##  cIndexes <- list(keepIndex,
407
-##                   keepIndex[which(gender[keepIndex]==2)],
408
-##                   keepIndex[which(gender[keepIndex]==1)])
409
-##
410
-##  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
411
-##
412
-##  ## call C
413
-##  fIndex <- which(gender==2)
414
-##  mIndex <- which(gender==1)
415
-##
416
-##  ## different here
417
-##  ## use gtypeCallerR in batches
418
-##  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
419
-##  newparamsBatch <- vector("list", length(snpBatches))
420
-##
421
-##  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
422
-##                       YIndex, A, B, mixtureParams, fIndex, mIndex,
423
-##                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
424
-##    open(A)
425
-##    open(B)
426
-##    open(mixtureParams)
427
-##    snps <- snpBatches[[idxBatch]]
428
-##    rSnps <- range(snps)
429
-##    last <- (idxBatch-1)*batchSize
430
-##    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
431
-##                         XIndex[XIndex %in% snps]-last,
432
-##                         YIndex[YIndex %in% snps]-last)
433
-##    IndexesBatch <- lapply(IndexesBatch, as.integer)
434
-##    tmpA <- as.matrix(A[snps,])
435
-##    tmpB <- as.matrix(B[snps,])
436
-##    ## newparamsBatch[[idxBatch]]
437
-##
438
-##    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
439
-##                        params[["centers"]][snps,],
440
-##                        params[["scales"]][snps,],
441
-##                        params[["N"]][snps,],
442
-##                        IndexesBatch, cIndexes,
443
-##                        sapply(IndexesBatch, length),
444
-##                        sapply(cIndexes, length), SMEDIAN,
445
-##                        theKnots, mixtureParams[], DF, probs, 0.025)
446
-##    rm(snps, rSnps, IndexesBatch, tmpA, tmpB, last)
447
-##    gc(verbose=FALSE)
448
-##    close(A)
449
-##    close(B)
450
-##    close(mixtureParams)
451
-##    tmp
452
-##  }
453
-##
454
-##  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
455
-##                             snpBatches=snpBatches,
456
-##                             autosomeIndex=autosomeIndex, XIndex=XIndex,
457
-##                             YIndex=YIndex, A=A, B=B,
458
-##                             mixtureParams=mixtureParams, fIndex=fIndex,
459
-##                             mIndex=mIndex, params=params,
460
-##                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
461
-##                             theKnots=theKnots, DF=DF, probs=probs,
462
-##                             batchSize=ocProbesets())
463
-##  newparams <- vector("list", 3)
464
-##  names(newparams) <- c("centers", "scales", "N")
465
-##  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
466
-##  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
467
-##  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
468
-##  rm(newparamsBatch); gc(verbose=FALSE)
469
-##  if(verbose) message("Done.")
470
-##  if(verbose) message("Estimating recalibration parameters.")
471
-##  d <- newparams[["centers"]] - params$centers
472
-##
473
-##  ##regression
474
-##  Index <- intersect(which(pmin(newparams[["N"]][, 1],
475
-##                                newparams[["N"]][, 2],
476
-##                                newparams[["N"]][, 3]) > recallMin &
477
-##                                !apply(regionInfo, 1, any)),
478
-##                                autosomeIndex)
479
-##  if(length(Index) < recallRegMin){
480
-##    warning("Recalibration not possible. Possible cause: small sample size.")
481
-##    newparams <- params
482
-##    dev <- vector("numeric", nrow(newparams[["centers"]]))
483
-##    SS <- matrix(Inf, 3, 3)
484
-##    DD <- 0
485
-##  }else{
486
-##    data4reg <- as.data.frame(newparams[["centers"]][Index,])
487
-##    names(data4reg) <- c("AA", "AB", "BB")
488
-##    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
489
-##                       c(coef(lm(AB~AA+BB, data=data4reg)), 0),
490
-##                         coef(lm(BB~AA*AB, data=data4reg)))
491
-##    rownames(regParams) <- c("intercept", "X", "Y", "XY")
492
-##    rm(data4reg)
493
-##
494
-##    minN <- 3
495
-##    newparams[["centers"]][newparams[["N"]] < minN] <- NA
496
-##    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
497
-##    if(verbose) message("Filling out empty centers", appendLF=FALSE)
498
-##    for(i in Index){
499
-##      if(verbose) if(i%%10000==0) message(".", appendLF=FALSE)
500
-##      mu <- newparams[["centers"]][i, ]
501
-##      j <- which(is.na(mu))
502
-##      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
503
-##      rm(mu, j)
504
-##    }
505
-##
506
-##    ##remaing NAs are made like originals
507
-##    if(length(YIndex)>0){
508
-##      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex),
509
-##                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
510
-##    }
511
-##    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
512
-##    snps2keep <- setdiff(autosomeIndex, snps2ignore)
513
-##    rm(snps2ignore)
514
-##    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
515
-##    if(verbose) cat("\n")
516
-##
517
-##    if(verbose) message("Calculating and standardizing size of shift... ", appendLF=FALSE)
518
-##    GG <- DD <- newparams[["centers"]] - params[["centers"]]
519
-##    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
520
-##    SS <- cov(DD[autosomeIndex, ])
521
-##    SSI <- solve(SS)
522
-##    dev <- vector("numeric", nrow(DD))
523
-##    if(length(YIndex)){
524
-##      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
525
-##      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
526
-##      ##Now Y (only two params)
527
-##      SSY <- SS[c(1, 3), c(1, 3)]
528
-##      SSI <- solve(SSY)
529
-##      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
530
-##      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
531
-##    } else {
532
-##      dev=apply(DD,1,function(x) x%*%SSI%*%x)
533
-##      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
534
-##    }
535
-##  }
536
-##  if (verbose) message("OK")
537
-##
538
-##  ## BC: must keep SD
539
-##  params[-2] <- newparams[-2]
540
-##  rm(newparams)
541
-##  gc(verbose=FALSE)
542
-##
543
-##  if(verbose) message("Calling ", NR, " SNPs... ", appendLF=FALSE)
544
-##
545
-##  ## ###################
546
-##  ## ## MOVE TO C#######
547
-##
548
-##  ## running in batches
549
-##  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
550
-##                       YIndex, A, B, mixtureParams, fIndex, mIndex,
551
-##                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
552
-##                       regionInfo, batchSize){
553
-##    open(A)
554
-##    open(B)
555
-##    open(mixtureParams)
556
-##    snps <- snpBatches[[idxBatch]]
557
-##    tmpA <- as.matrix(A[snps,])
558
-##    tmpB <- as.matrix(B[snps,])
559
-##    rSnps <- range(snps)
560
-##    last <- (idxBatch-1)*batchSize
561
-##    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
562
-##                         XIndex[XIndex %in% snps]-last,
563
-##                         YIndex[YIndex %in% snps]-last)
564
-##    IndexesBatch <- lapply(IndexesBatch, as.integer)
565
-##    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
566
-##                            params[["centers"]][snps,],
567
-##                            params[["scales"]][snps,],
568
-##                            params[["N"]][snps,],
569
-##                            IndexesBatch, cIndexes,
570
-##                            sapply(IndexesBatch, length),
571
-##                            sapply(cIndexes, length),
572
-##                            SMEDIAN, theKnots, mixtureParams[],
573
-##                            DF, probs, 0.025,
574
-##                            which(regionInfo[snps, 2]),
575
-##                            which(regionInfo[snps, 1]))
576
-##    A[snps,] <- tmpA
577
-##    B[snps,] <- tmpB
578
-##    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull, last)
579
-##    gc(verbose=FALSE)
580
-##    close(A)
581
-##    close(B)
582
-##    close(mixtureParams)
583
-##  }
584
-##
585
-##  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
586
-##           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
587
-##           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
588
-##           mIndex=mIndex, params=params, cIndexes=cIndexes,
589
-##           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
590
-##           regionInfo=regionInfo, batchSize=ocProbesets())
591
-##  ##  END MOVE TO C#######
592
-##  ## ##################
593
-##
594
-##  dev <- dev/(dev+1/383)
595
-##  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
596
-##  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
597
-##
598
-##  if(length(Index) >= recallRegMin){
599
-##   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
600
-##   tmpSnpQc <- dev[autosomeIndex]
601
-##   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
602
-##   batchQC <- mean(diag(SS))
603
-##  }else{
604
-##    SS <- matrix(0, 3, 3)
605
-##    batchQC <- Inf
606
-##  }
607
-##
608
-##  if(verbose) message("Done.")
609
-##  if (returnParams){
610
-##    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
611
-##  }else{
612
-##    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
613
-##  }
614
-##}
... ...
@@ -47,14 +47,7 @@ crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
47 47
 	## FIXME: XIndex may be greater than ocProbesets()
48 48
 	if(is.null(gender)){
49 49
 		if(verbose) message("Determining gender.")
50
-		##   XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
51
-		XMedian <- ocLapply(splitIndicesByNode(1:NC), predictGender, theA=A, theB=B, XIndex=XIndex, neededPkgs="crlmm")
52
-		XMedian <- unlist(XMedian)
53
-		if(sum(SNR[] > SNRMin)==1){
54
-			gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
55
-		}else{
56
-			gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
57
-		}
50
+		gender <- imputeGender(A, B, XIndex, YIndex)
58 51
 	}
59 52
 	##
60 53
 	Indexes <- list(autosomeIndex, XIndex, YIndex)