Browse code

HPC for snprma/crlmm on Affy

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@45498 bc3139a8-67e5-0310-9ffc-ced21a209358

Benilton Carvalho authored on 25/03/2010 13:14:54
Showing8 changed files

... ...
@@ -485,3 +485,7 @@ then readIDAT() should work. Thanks to Pierre Cherel who reported this error.
485 485
 ** updates to genotype, crlmmIlluminaRS, crlmmCopynumber
486 486
 ** class union of ff_matrix, matrix, and ffdf
487 487
 
488
+2010-03-18 B Carvalho committed version 1.5.42
489
+
490
+** added parallel/large dataset support to snprma/crlmm
491
+** merged changes on .41 with my local changes
... ...
@@ -1,7 +1,7 @@
1 1
 Package: crlmm
2 2
 Type: Package
3 3
 Title: Genotype Calling (CRLMM) and Copy Number Analysis tool for Affymetrix SNP 5.0 and 6.0 and Illumina arrays.
4
-Version: 1.5.41
4
+Version: 1.5.42
5 5
 Date: 2010-02-05
6 6
 Author: Rafael A Irizarry, Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.edu.au>
7 7
 Maintainer: Benilton S Carvalho <bcarvalh@jhsph.edu>, Robert Scharpf <rscharpf@jhsph.edu>, Matt Ritchie <mritchie@wehi.EDU.AU>
... ...
@@ -57,14 +57,16 @@ importFrom(ellipse, ellipse)
57 57
 exportClasses(CNSetLM)
58 58
 exportMethods(copyNumber, open, "[", show, lM, "lM<-")
59 59
 export(crlmm, 
60
-	      crlmmCopynumber, 
61
-	      crlmmIllumina, 
62
-	      crlmmIlluminaRS, 
63
-	      ellipse, 
64
-	      genotype, 
65
-	      getParam, 
66
-	      readIdatFiles, 
67
-	      snprma) 
60
+       crlmmCopynumber, 
61
+       crlmmIllumina, 
62
+       crlmmIlluminaRS, 
63
+       ellipse, 
64
+       genotype, 
65
+       getParam, 
66
+       readIdatFiles, 
67
+       snprma,
68
+       snprma2,
69
+       crlmm2) 
68 70
 
69 71
 export(initializeBigMatrix, initializeParamObject)
70 72
 
... ...
@@ -233,13 +233,14 @@ gtypeCallerR <- function(A, B, fIndex, mIndex, theCenters, theScales,
233 233
   ## make code robust
234 234
   ## check types before passing to C
235 235
   
236
-  .Call("gtypeCallerPart1", A, B,
237
-        as.integer(fIndex), as.integer(mIndex),
238
-        as.numeric(theCenters), as.numeric(theScales),
239
-        as.integer(theNs), lapply(Indexes, as.integer), lapply(cIndexes, as.integer), as.integer(nIndexes), as.integer(ncIndexes),
240
-        as.numeric(SMEDIAN), as.numeric(knots), as.numeric(params),
241
-        as.integer(dft), as.numeric(probs), as.numeric(trim),
242
-        PACKAGE="crlmm")
236
+  .Call("gtypeCallerPart1", A, B, as.integer(fIndex),
237
+        as.integer(mIndex), as.numeric(theCenters),
238
+        as.numeric(theScales), as.integer(theNs),
239
+        lapply(Indexes, as.integer), lapply(cIndexes, as.integer),
240
+        as.integer(nIndexes), as.integer(ncIndexes),
241
+        as.numeric(SMEDIAN), as.numeric(knots),
242
+        as.numeric(params), as.integer(dft), as.numeric(probs),
243
+        as.numeric(trim), PACKAGE="crlmm")
243 244
   
244 245
 }
245 246
 
... ...
@@ -262,3 +263,363 @@ gtypeCallerR2 <- function(A, B, fIndex, mIndex, theCenters, theScales,
262 263
         as.integer(noTraining), as.integer(noInfo), PACKAGE="crlmm")
263 264
   
264 265
 }
266
+
267
+### parallel version
268
+crlmm2 <- function(filenames, row.names=TRUE, col.names=TRUE,
269
+                   probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
270
+                   save.it=FALSE, load.it=FALSE, intensityFile,
271
+                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
272
+                   cdfName, sns, recallMin=10, recallRegMin=1000,
273
+                   returnParams=FALSE, badSNP=.7){
274
+  if ((load.it || save.it) && missing(intensityFile))
275
+    stop("'intensityFile' is missing, and you chose either load.it or save.it")
276
+  if (missing(sns)) sns <- basename(filenames)
277
+  if (!missing(intensityFile))
278
+    if (load.it & !file.exists(intensityFile)){
279
+      load.it <- FALSE
280
+      message("File ", intensityFile, " does not exist.")
281
+      message("Not loading it, but running SNPRMA from scratch.")
282
+    }
283
+  if (!load.it){
284
+    res <- snprma2(filenames, fitMixture=TRUE,
285
+                   mixtureSampleSize=mixtureSampleSize, verbose=verbose,
286
+                   eps=eps, cdfName=cdfName, sns=sns)
287
+    open(res[["A"]])
288
+    open(res[["B"]])
289
+    open(res[["SNR"]])
290
+    open(res[["mixtureParams"]])
291
+    if(save.it){
292
+      t0 <- proc.time()
293
+      save(res, file=intensityFile)
294
+      t0 <- proc.time()-t0
295
+      if (verbose) message("Used ", t0[3], " seconds to save ", intensityFile, ".")
296
+    }
297
+  }else{
298
+    if (verbose) message("Loading ", intensityFile, ".")
299
+    obj <- load(intensityFile)
300
+    if (verbose) message("Done.")
301
+    if (obj != "res")
302
+      stop("Object in ", intensityFile, " seems to be invalid.")
303
+  }
304
+  if(row.names) row.names=res$gns else row.names=NULL
305
+  if(col.names) col.names=res$sns else col.names=NULL
306
+  res2 <- crlmmGT2(res[["A"]], res[["B"]], res[["SNR"]],
307
+                   res[["mixtureParams"]], res[["cdfName"]],
308
+                   gender=gender, row.names=row.names,
309
+                   col.names=col.names, recallMin=recallMin,
310
+                   recallRegMin=1000, SNRMin=SNRMin,
311
+                   returnParams=returnParams, badSNP=badSNP,
312
+                   verbose=verbose)
313
+  
314
+  res2[["SNR"]] <- res[["SNR"]]
315
+  res2[["SKW"]] <- res[["SKW"]]
316
+  return(list2SnpSet(res2, returnParams=returnParams))
317
+}
318
+
319
+crlmmGT2 <- function(A, B, SNR, mixtureParams, cdfName, row.names=NULL,
320
+                     col.names=NULL, probs=c(1/3, 1/3, 1/3), DF=6,
321
+                     SNRMin=5, recallMin=10, recallRegMin=1000,
322
+                     gender=NULL, desctrucitve=FALSE, verbose=TRUE,
323
+                     returnParams=FALSE, badSNP=.7){
324
+  open(SNR)
325
+  open(A)
326
+  open(B)
327
+  open(mixtureParams)
328
+  ## expect objects to be ff
329
+  
330
+  keepIndex <- which( SNR[] > SNRMin)
331
+  if(length(keepIndex)==0) stop("No arrays above quality threshold!")
332
+  
333
+  NC <- ncol(A)
334
+  NR <- nrow(A)
335
+  
336
+  pkgname <- getCrlmmAnnotationName(cdfName)
337
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
338
+
339
+  if(verbose) message("Loading annotations.")
340
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
341
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
342
+
343
+  ## this is toget rid of the 'no visible binding' notes
344
+  ## variable definitions
345
+  XIndex <- getVarInEnv("XIndex")
346
+  autosomeIndex <- getVarInEnv("autosomeIndex")
347
+  YIndex <- getVarInEnv("YIndex")
348
+  SMEDIAN <- getVarInEnv("SMEDIAN")
349
+  theKnots <- getVarInEnv("theKnots")
350
+  regionInfo <- getVarInEnv("regionInfo")
351
+  params <- getVarInEnv("params")
352
+  
353
+  ##IF gender not provide, we predict
354
+  if(is.null(gender)){
355
+    if(verbose) message("Determining gender.")
356
+    XMedian <- apply(log2(A[XIndex,, drop=FALSE])+log2(B[XIndex,, drop=FALSE]), 2, median)/2
357
+    if(sum(SNR[] > SNRMin)==1){
358
+      gender <- which.min(c(abs(XMedian-8.9), abs(XMedian-9.5)))
359
+    }else{
360
+      gender <- kmeans(XMedian, c(min(XMedian[SNR[]>SNRMin]), max(XMedian[SNR[]>SNRMin])))[["cluster"]]
361
+    }
362
+  }
363
+  
364
+  Indexes <- list(autosomeIndex, XIndex, YIndex)
365
+  cIndexes <- list(keepIndex, 
366
+                   keepIndex[which(gender[keepIndex]==2)], 
367
+                   keepIndex[which(gender[keepIndex]==1)])
368
+  
369
+  if(verbose) cat("Calling", NR, "SNPs for recalibration... ")
370
+
371
+  ## call C
372
+  fIndex <- which(gender==2)
373
+  mIndex <- which(gender==1)
374
+
375
+  ## different here
376
+  ## use gtypeCallerR in batches
377
+  snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
378
+  newparamsBatch <- vector("list", length(snpBatches))
379
+
380
+  process1 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
381
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
382
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs, batchSize){
383
+    open(A)
384
+    open(B)
385
+    open(mixtureParams)
386
+    snps <- snpBatches[[idxBatch]]
387
+    rSnps <- range(snps)
388
+    last <- (idxBatch-1)*batchSize
389
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
390
+                         XIndex[XIndex %in% snps]-last,
391
+                         YIndex[YIndex %in% snps]-last)
392
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
393
+    tmpA <- as.matrix(A[snps,])
394
+    tmpB <- as.matrix(B[snps,])
395
+    ## newparamsBatch[[idxBatch]]
396
+
397
+    tmp <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
398
+                        params[["centers"]][snps,],
399
+                        params[["scales"]][snps,],
400
+                        params[["N"]][snps,],
401
+                        IndexesBatch, cIndexes,
402
+                        sapply(IndexesBatch, length),
403
+                        sapply(cIndexes, length), SMEDIAN,
404
+                        theKnots, mixtureParams[], DF, probs, 0.025)
405
+    
406
+    last <- rSnps[2]
407
+    rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
408
+    close(A)
409
+    close(B)
410
+    close(mixtureParams)
411
+    tmp
412
+  }
413
+
414
+  newparamsBatch <- ocLapply(seq(along=snpBatches), process1,
415
+                             snpBatches=snpBatches,
416
+                             autosomeIndex=autosomeIndex, XIndex=XIndex,
417
+                             YIndex=YIndex, A=A, B=B,
418
+                             mixtureParams=mixtureParams, fIndex=fIndex,
419
+                             mIndex=mIndex, params=params,
420
+                             cIndexes=cIndexes, SMEDIAN=SMEDIAN,
421
+                             theKnots=theKnots, DF=DF, probs=probs,
422
+                             batchSize=ocProbesets())
423
+##   last <- 0
424
+##   for (idxBatch in seq(along=snpBatches)){
425
+##     snps <- snpBatches[[idxBatch]]
426
+##     rSnps <- range(snps)
427
+##     IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
428
+##                          XIndex[XIndex %in% snps]-last,
429
+##                          YIndex[YIndex %in% snps]-last)
430
+##     IndexesBatch <- lapply(IndexesBatch, as.integer)
431
+##     tmpA <- A[snps,]
432
+##     tmpB <- B[snps,]
433
+##     newparamsBatch[[idxBatch]] <- gtypeCallerR(tmpA, tmpB, fIndex, mIndex,
434
+##                                                params[["centers"]][snps,],
435
+##                                                params[["scales"]][snps,],
436
+##                                                params[["N"]][snps,],
437
+##                                                IndexesBatch, cIndexes,
438
+##                                                sapply(IndexesBatch, length),
439
+##                                                sapply(cIndexes, length),
440
+##                                                SMEDIAN, theKnots,
441
+##                                                mixtureParams[], DF, probs, 0.025)
442
+##     last <- rSnps[2]
443
+##     rm(snps, rSnps, IndexesBatch, tmpA, tmpB)
444
+##   }
445
+##   rm(last)
446
+  
447
+  newparams <- vector("list", 3)
448
+  names(newparams) <- c("centers", "scales", "N")
449
+  newparams[["centers"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 1))
450
+  newparams[["scales"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 2))
451
+  newparams[["N"]] <- do.call("rbind", lapply(newparamsBatch, "[[", 3))
452
+  rm(newparamsBatch)
453
+  if(verbose) message("Done.")
454
+  if(verbose) message("Estimating recalibration parameters.")
455
+  d <- newparams[["centers"]] - params$centers
456
+
457
+  ##regression 
458
+  Index <- intersect(which(pmin(newparams[["N"]][, 1],
459
+                                newparams[["N"]][, 2],
460
+                                newparams[["N"]][, 3]) > recallMin &
461
+                                !apply(regionInfo, 1, any)),
462
+                                autosomeIndex)
463
+  if(length(Index) < recallRegMin){
464
+    warning("Recalibration not possible. Possible cause: small sample size.")
465
+    newparams <- params
466
+    dev <- vector("numeric", nrow(newparams[["centers"]]))
467
+    SS <- matrix(Inf, 3, 3)
468
+    DD <- 0
469
+  }else{
470
+    data4reg <- as.data.frame(newparams[["centers"]][Index,])
471
+    names(data4reg) <- c("AA", "AB", "BB")
472
+    regParams <- cbind(  coef(lm(AA~AB*BB, data=data4reg)),
473
+                       c(coef(lm(AB~AA+BB, data=data4reg)), 0), 
474
+                       coef(lm(BB~AA*AB, data=data4reg)))
475
+    rownames(regParams) <- c("intercept", "X", "Y", "XY")
476
+    rm(data4reg)
477
+  
478
+    minN <- 3
479
+    newparams[["centers"]][newparams[["N"]] < minN] <- NA
480
+    Index <- setdiff(which(rowSums(is.na(newparams[["centers"]]))==1), YIndex)
481
+    if(verbose) cat("Filling out empty centers")
482
+    for(i in Index){
483
+      if(verbose) if(i%%10000==0)cat(".")
484
+      mu <- newparams[["centers"]][i, ]
485
+      j <- which(is.na(mu))
486
+      newparams[["centers"]][i, j] <- c(1, mu[-j], prod(mu[-j]))%*%regParams[, j]
487
+    }
488
+    
489
+    ##remaing NAs are made like originals
490
+    if(length(YIndex)>0){
491
+      noMoveIndex <- union(setdiff(which(rowSums(is.na(newparams[["centers"]]))>0), YIndex), 
492
+                           YIndex[rowSums(is.na(newparams[["centers"]][YIndex, ])>1)])
493
+    }
494
+    snps2ignore <- which(rowSums(is.na(newparams[["centers"]])) > 0)
495
+    snps2keep <- setdiff(autosomeIndex, snps2ignore)
496
+    rm(snps2ignore)
497
+    newparams[["centers"]][is.na(newparams[["centers"]])] <- params[["centers"]][is.na(newparams[["centers"]])]
498
+    if(verbose) cat("\n")
499
+  
500
+    if(verbose) message("Calculating and standardizing size of shift.")
501
+    GG <- DD <- newparams[["centers"]] - params[["centers"]]
502
+    DD <- sweep(DD, 2, colMeans(DD[autosomeIndex, ]))
503
+    SS <- cov(DD[autosomeIndex, ])
504
+    SSI <- solve(SS)
505
+    dev <- vector("numeric", nrow(DD))
506
+    if(length(YIndex)){
507
+      dev[-YIndex] <- apply(DD[-YIndex, ], 1, function(x) x%*%SSI%*%x)
508
+      dev[-YIndex] <- 1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev[-YIndex])
509
+      ##Now Y (only two params)
510
+      SSY <- SS[c(1, 3), c(1, 3)]
511
+      SSI <- solve(SSY) 
512
+      dev[YIndex] <- apply(DD[YIndex, c(1, 3)], 1, function(x) x%*%SSI%*%x)
513
+      dev[YIndex] <- 1/sqrt( (2*pi)^2*det(SSY))*exp(-0.5*dev[YIndex])
514
+    } else {
515
+      dev=apply(DD,1,function(x) x%*%SSI%*%x)
516
+      dev=1/sqrt( (2*pi)^3*det(SS))*exp(-0.5*dev)
517
+    }
518
+  }
519
+    
520
+  ## BC: must keep SD
521
+  params[-2] <- newparams[-2]
522
+  
523
+  rm(newparams);gc(verbose=FALSE)  
524
+  if(verbose) cat("Calling", NR, "SNPs... ")
525
+  ## ###################
526
+  ## ## MOVE TO C#######
527
+
528
+  ## running in batches
529
+  ## snpBatches <- splitIndicesByLength(1:nrow(A), ocProbesets())
530
+
531
+  process2 <- function(idxBatch, snpBatches, autosomeIndex, XIndex,
532
+                       YIndex, A, B, mixtureParams, fIndex, mIndex,
533
+                       params, cIndexes, SMEDIAN, theKnots, DF, probs,
534
+                       regionInfo, batchSize){
535
+    open(A)
536
+    open(B)
537
+    open(mixtureParams)
538
+    snps <- snpBatches[[idxBatch]]
539
+    tmpA <- as.matrix(A[snps,])
540
+    tmpB <- as.matrix(B[snps,])
541
+    rSnps <- range(snps)
542
+    last <- (idxBatch-1)*batchSize
543
+    IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
544
+                         XIndex[XIndex %in% snps]-last,
545
+                         YIndex[YIndex %in% snps]-last)
546
+    IndexesBatch <- lapply(IndexesBatch, as.integer)
547
+    ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
548
+                            params[["centers"]][snps,],
549
+                            params[["scales"]][snps,],
550
+                            params[["N"]][snps,],
551
+                            IndexesBatch, cIndexes,
552
+                            sapply(IndexesBatch, length),
553
+                            sapply(cIndexes, length),
554
+                            SMEDIAN, theKnots, mixtureParams[],
555
+                            DF, probs, 0.025,
556
+                            which(regionInfo[snps, 2]),
557
+                            which(regionInfo[snps, 1]))
558
+    A[snps,] <- tmpA
559
+    B[snps,] <- tmpB
560
+    last <- rSnps[2]
561
+    rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
562
+    close(A)
563
+    close(B)
564
+    close(mixtureParams)
565
+  }
566
+
567
+  ocLapply(seq(along=snpBatches), process2, snpBatches=snpBatches,
568
+           autosomeIndex=autosomeIndex, XIndex=XIndex, YIndex=YIndex,
569
+           A=A, B=B, mixtureParams=mixtureParams, fIndex=fIndex,
570
+           mIndex=mIndex, params=params, cIndexes=cIndexes,
571
+           SMEDIAN=SMEDIAN, theKnots=theKnots, DF=DF, probs=probs,
572
+           regionInfo=regionInfo, batchSize=ocProbesets())
573
+  
574
+##   last <- 0
575
+##   for (idxBatch in seq(along=snpBatches)){
576
+##     snps <- snpBatches[[idxBatch]]
577
+##     tmpA <- A[snps,]
578
+##     tmpB <- B[snps,]
579
+##     rSnps <- range(snps)
580
+##     IndexesBatch <- list(autosomeIndex[autosomeIndex %in% snps]-last,
581
+##                          XIndex[XIndex %in% snps]-last,
582
+##                          YIndex[YIndex %in% snps]-last)
583
+##     IndexesBatch <- lapply(IndexesBatch, as.integer)
584
+##     ImNull <- gtypeCallerR2(tmpA, tmpB, fIndex, mIndex,
585
+##                             params[["centers"]][snps,],
586
+##                             params[["scales"]][snps,],
587
+##                             params[["N"]][snps,],
588
+##                             IndexesBatch, cIndexes,
589
+##                             sapply(IndexesBatch, length),
590
+##                             sapply(cIndexes, length),
591
+##                             SMEDIAN, theKnots, mixtureParams[],
592
+##                             DF, probs, 0.025,
593
+##                             which(regionInfo[snps, 2]),
594
+##                             which(regionInfo[snps, 1]))
595
+##     A[snps,] <- tmpA
596
+##     B[snps,] <- tmpB
597
+##     last <- rSnps[2]
598
+##     rm(tmpA, tmpB, snps, rSnps, IndexesBatch, ImNull)
599
+##   }
600
+##   
601
+##   gc(verbose=FALSE)
602
+  ##  END MOVE TO C#######
603
+  ## ##################
604
+  
605
+  dev <- dev/(dev+1/383)
606
+  if(!is.null(row.names)){ rownames(A) <- rownames(B) <- names(dev) <- row.names}
607
+  if(!is.null(col.names)){ colnames(A) <- colnames(B) <- col.names}
608
+
609
+  if(length(Index) >= recallRegMin){
610
+   tmp4batchQC <- DD[autosomeIndex,]/(params[["N"]][autosomeIndex,]+1)
611
+   tmpSnpQc <- dev[autosomeIndex]
612
+   SS <- cov(tmp4batchQC[tmpSnpQc < badSNP,])
613
+   batchQC <- mean(diag(SS))
614
+  }else{
615
+    SS <- matrix(0, 3, 3)
616
+    batchQC <- Inf
617
+  }
618
+  
619
+  if(verbose) message("Done.")
620
+  if (returnParams){
621
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, params=params, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
622
+  }else{
623
+    return(list(calls=A, confs=B, SNPQC=dev, batchQC=batchQC, DD=DD, covDD=SS, gender=gender, pkgname=pkgname))
624
+  }
625
+}
... ...
@@ -1,17 +1,21 @@
1
-snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
1
+snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
2
+                   eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
2 3
   if (missing(sns)) sns <- basename(filenames)
3
-  ##ADD CHECK TO SEE IF LOADED
4 4
   if (missing(cdfName))
5 5
     cdfName <- read.celfile.header(filenames[1])$cdfName
6
-  ##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
7 6
   pkgname <- getCrlmmAnnotationName(cdfName)
7
+
8 8
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
9
-    suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
10
-    msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
9
+    suggCall <- paste("library(", pkgname,
10
+                      ", lib.loc='/Altern/Lib/Loc')",
11
+                      sep="")
12
+    msg <- paste("If", pkgname,
13
+                 "is installed on an alternative location,",
14
+                 "please load it manually by using", suggCall)
11 15
     message(strwrap(msg))
12 16
     stop("Package ", pkgname, " could not be found.")
13
-    rm(suggCall, msg)
14 17
   }
18
+  
15 19
   if(verbose) message("Loading annotations and mixture model parameters.")
16 20
   loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
17 21
   loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
... ...
@@ -32,6 +36,9 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1,
32 36
   mixtureParams <- matrix(0, 4, length(filenames))
33 37
   SNR <- vector("numeric", length(filenames))
34 38
   SKW <- vector("numeric", length(filenames))
39
+##   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames))
40
+##   SNR <- initializeBigVector("crlmmSNR-", length(filenames), "numeric")
41
+##   SKW <- initializeBigVector("crlmmSKW-", length(filenames), "numeric")
35 42
   
36 43
   ## This is the sample for the fitting of splines
37 44
   ## BC: I like better the idea of the user passing the seed,
... ...
@@ -47,10 +54,13 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1,
47 54
   
48 55
   if(verbose){
49 56
     message("Processing ", length(filenames), " files.")
50
-    if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=length(filenames), style=3)
57
+    pb <- txtProgressBar(min=0, max=length(filenames), style=3)
51 58
   }
59
+  
60
+  ##for skewness. no need to do everything
61
+  idx2 <- sample(length(fid), 10^5)
62
+  
52 63
   ##We start looping throug cel files
53
-  idx2 <- sample(length(fid), 10^5) ##for skewness. no need to do everything
54 64
   for(i in seq(along=filenames)){
55 65
     y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
56 66
     x <- log2(y[idx2])
... ...
@@ -70,13 +80,9 @@ snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE, eps=0.1,
70 80
       mixtureParams[, i] <- tmp[["coef"]]
71 81
       SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
72 82
     }
73
-    if (verbose)
74
-      if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
75
-      else cat(".")
83
+    if (verbose) setTxtProgressBar(pb, i)
76 84
   }
77
-  if (verbose)
78
-    if (getRversion() > '2.7.0') close(pb)
79
-    else cat("\n")
85
+  if (verbose) close(pb)
80 86
   if (!fitMixture) SNR <- mixtureParams <- NA
81 87
   ## gns comes from preprocStuff.rda
82 88
   list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
... ...
@@ -127,5 +133,106 @@ fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=1
127 133
  return(list(coef= -fit1, medF1=medF1, sigma1=sigmas[1], sigma2=sigmas[2]))
128 134
 }
129 135
 
136
+snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
137
+                    eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
138
+  if (missing(sns)) sns <- basename(filenames)
139
+  if (missing(cdfName))
140
+    cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
141
+  pkgname <- getCrlmmAnnotationName(cdfName)
142
+  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))
143
+  
144
+  if(verbose) message("Loading annotations and mixture model parameters.")
145
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
146
+  pnsa <- getVarInEnv("pnsa")
147
+  pnsb <- getVarInEnv("pnsb")
148
+  gns <- getVarInEnv("gns")
149
+  
150
+  ##We will read each cel file, summarize, and run EM one by one
151
+  ##We will save parameters of EM to use later
152
+  if(verbose) message("Initializing objects.")
153
+  mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
154
+  SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
155
+  SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")
156
+  
157
+  ## This is the sample for the fitting of splines
158
+  ## BC: I like better the idea of the user passing the seed,
159
+  ##     because this might intefere with other analyses
160
+  ##     (like what happened to GCRMA)
161
+  ##S will hold (A+B)/2 and M will hold A-B
162
+  ##NOTE: We actually dont need to save S. Only for pics etc...
163
+  ##f is the correction. we save to avoid recomputing
164
+  A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
165
+  B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")
166
+
167
+  sampleBatches <- splitIndicesByNode(seq(along=filenames))
168
+
169
+  if(verbose) message("Processing ", length(filenames), " files.")
170
+
171
+  ocLapply(sampleBatches, processCEL, filenames=filenames,
172
+           fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR,
173
+           mixtureParams=mixtureParams, eps=eps, seed=seed,
174
+           mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
175
+           neededPkgs=c("crlmm", pkgname))
176
+  
177
+  list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
178
+}
179
+
130 180
 
181
+processCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
182
+                       mixtureParams, eps, seed, mixtureSampleSize,
183
+                       pkgname){
184
+  
185
+  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
186
+  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
187
+  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
188
+  autosomeIndex <- getVarInEnv("autosomeIndex")
189
+  pnsa <- getVarInEnv("pnsa")
190
+  pnsb <- getVarInEnv("pnsb")
191
+  fid <- getVarInEnv("fid")
192
+  reference <- getVarInEnv("reference")
193
+  aIndex <- getVarInEnv("aIndex")
194
+  bIndex <- getVarInEnv("bIndex")
195
+  SMEDIAN <- getVarInEnv("SMEDIAN")
196
+  theKnots <- getVarInEnv("theKnots")
197
+  gns <- getVarInEnv("gns")
198
+
199
+  ## for mixture
200
+  set.seed(seed)
201
+  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
202
+  ##for skewness. no need to do everything
203
+  idx2 <- sample(length(fid), 10^5)
131 204
 
205
+  open(A)
206
+  open(B)
207
+  open(SKW)
208
+  open(mixtureParams)
209
+  open(SNR)
210
+
211
+  for (k in i){
212
+    y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
213
+    x <- log2(y[idx2])
214
+    SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
215
+    rm(x)
216
+    y <- normalize.quantiles.use.target(y, target=reference)
217
+    A[, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
218
+    B[, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
219
+    rm(y)
220
+    
221
+    if(fitMixture){
222
+      S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
223
+      M <- log2(A[idx, k])-log2(B[idx, k])
224
+      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
225
+      mixtureParams[, k] <- tmp[["coef"]]
226
+      SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
227
+    } else {
228
+      mixtureParams[, k] <- NA
229
+      SNR[k] <- NA
230
+    }
231
+  }
232
+  close(A)
233
+  close(B)
234
+  close(SKW)
235
+  close(mixtureParams)
236
+  close(SNR)
237
+  TRUE
238
+}
... ...
@@ -37,7 +37,7 @@ getVarInEnv <- function(dataset, environ=.crlmmPkgEnv){
37 37
 }
38 38
 
39 39
 list2SnpSet <- function(x, returnParams=FALSE){
40
-  pd <- data.frame(SNR=x[["SNR"]], gender=x[["gender"]],
40
+  pd <- data.frame(SNR=x[["SNR"]][], gender=x[["gender"]],
41 41
                    batchQC=rep(x[["batchQC"]], ncol(x[["calls"]])),
42 42
                    row.names=colnames(x[["calls"]]))
43 43
   pdv <- data.frame(labelDescription=c("Signal-to-noise Ratio",
... ...
@@ -205,7 +205,7 @@ initializeParamObject <- function(dimnames){
205 205
 	return(ll)
206 206
 }
207 207
 
208
-
208
+## BC: how about moving initializeBigMatrix to oligoClasses?
209 209
 initializeBigMatrix <- function(name, nr, nc, vmode="integer"){
210 210
 	if(isPackageLoaded("ff")){
211 211
 		if(prod(nr, nc) > 2^31){
... ...
@@ -238,9 +238,32 @@ initializeBigMatrix <- function(name, nr, nc, vmode="integer"){
238 238
 					    vmode=vmode)
239 239
 			results[,] <- NA
240 240
 		}
241
-	}  else results <- matrix(NA, nr, nc)
241
+	}  else {
242
+          theNA <- switch(vmode,
243
+                          integer=NA_integer_,
244
+                          double=NA_real_,
245
+                          character=NA_character_,
246
+                          stop("Mode ", vmode, " not implemented for regular matrices"))
247
+          results <- matrix(theNA, nr, nc)
248
+        }
242 249
 	return(results)
243 250
 }
251
+
252
+initializeBigVector <- function(name, n, vmode="integer"){
253
+  if(isPackageLoaded("ff")){
254
+    results <- ff(vmode=vmode, length=n, pattern=file.path(ldPath(), basename(name)))
255
+  }  else {
256
+    theNA <- switch(vmode,
257
+                    integer=NA_integer_,
258
+                    double=NA_real_,
259
+                    character=NA_character_,
260
+                    stop("Mode ", vmode, " not implemented for regular matrices"))
261
+    results <- rep(theNA, n)
262
+  }
263
+  return(results)
264
+}
265
+
266
+
244 267
 setMethod("annotatedDataFrameFrom", "ff_matrix", Biobase:::annotatedDataFrameFromMatrix)
245 268
 setMethod("annotatedDataFrameFrom", "ffdf", Biobase:::annotatedDataFrameFromMatrix)
246 269
 
... ...
@@ -1,5 +1,6 @@
1 1
 \name{crlmm}
2 2
 \alias{crlmm}
3
+\alias{crlmm2}
3 4
 
4 5
 \title{Genotype oligonucleotide arrays with CRLMM}
5 6
 \description{
... ...
@@ -14,6 +15,12 @@ crlmm(filenames, row.names=TRUE, col.names=TRUE,
14 15
       intensityFile, mixtureSampleSize=10^5,
15 16
       eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10,
16 17
       recallRegMin=1000, returnParams=FALSE, badSNP=0.7)
18
+crlmm2(filenames, row.names=TRUE, col.names=TRUE,
19
+      probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5,
20
+      gender=NULL, save.it=FALSE, load.it=FALSE,
21
+      intensityFile, mixtureSampleSize=10^5,
22
+      eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10,
23
+      recallRegMin=1000, returnParams=FALSE, badSNP=0.7)
17 24
 }
18 25
 
19 26
 \arguments{
... ...
@@ -49,6 +56,10 @@ crlmm(filenames, row.names=TRUE, col.names=TRUE,
49 56
   \item{batchQC}{Batch Quality Score}
50 57
   \item{params}{Recalibrated parameters}
51 58
 }
59
+\details{
60
+  'crlmm2' allows one to genotype very large datasets (via ff package) and also permits
61
+  the use of clusters or multiple cores (via snow package) to speed up genotyping.
62
+  }
52 63
 \references{
53 64
   Carvalho B, Bengtsson H, Speed TP, Irizarry RA. Exploration,
54 65
   normalization, and genotype calls of high-density oligonucleotide SNP
... ...
@@ -69,6 +80,23 @@ if (require(genomewidesnp5Crlmm) & require(hapmapsnp5)){
69 80
   cels <- list.celfiles(path, full.names=TRUE)
70 81
   (crlmmOutput <- crlmm(cels))
71 82
 }
83
+
84
+\dontrun{
85
+## HPC Example
86
+library(ff)
87
+library(snow)
88
+library(crlmm)
89
+## genotype 50K SNPs at a time
90
+ocProbesets(50000)
91
+## setup cluster - 8 cores on the machine
92
+setCluster(8, "SOCK")
93
+
94
+path <- system.file("celFiles", package="hapmapsnp5")
95
+cels <- list.celfiles(path, full.names=TRUE)
96
+crlmmOutput <- crlmm2(cels)
97
+
98
+}
99
+
72 100
 }
73 101
 \keyword{classif}
74 102
 
... ...
@@ -1,6 +1,7 @@
1 1
 \name{snprma}
2 2
 \Rdversion{1.1}
3 3
 \alias{snprma}
4
+\alias{snprma2}
4 5
 
5 6
 \title{
6 7
   Preprocessing tool for SNP arrays.
... ...
@@ -12,6 +13,7 @@
12 13
 }
13 14
 \usage{
14 15
 snprma(filenames, mixtureSampleSize = 10^5, fitMixture = FALSE, eps = 0.1, verbose = TRUE, seed = 1, cdfName, sns)
16
+snprma2(filenames, mixtureSampleSize = 10^5, fitMixture = FALSE, eps = 0.1, verbose = TRUE, seed = 1, cdfName, sns)
15 17
 }
16 18
 \arguments{
17 19
   \item{filenames}{
... ...
@@ -49,6 +51,10 @@ snprma(filenames, mixtureSampleSize = 10^5, fitMixture = FALSE, eps = 0.1, verbo
49 51
   \item{mixtureParams}{Parameters from mixture model}
50 52
   \item{cdfName}{Name of the CDF}
51 53
 }
54
+\details{
55
+  'snprma2' allows one to genotype very large datasets (via ff package) and also permits
56
+  the use of clusters or multiple cores (via snow package) to speed up preprocessing.
57
+  }
52 58
 \examples{
53 59
 if (require(genomewidesnp5Crlmm) & require(hapmapsnp5) & require(oligoClasses)){
54 60
   path <- system.file("celFiles", package="hapmapsnp5")
... ...
@@ -60,6 +66,22 @@ if (require(genomewidesnp5Crlmm) & require(hapmapsnp5) & require(oligoClasses)){
60 66
   snprmaOutput[["A"]][1:10,]
61 67
   snprmaOutput[["B"]][1:10,]
62 68
 }
69
+\dontrun{
70
+## HPC Example
71
+library(ff)
72
+library(snow)
73
+library(crlmm)
74
+## genotype 50K SNPs at a time
75
+ocProbesets(50000)
76
+## setup cluster - 8 cores on the machine
77
+setCluster(8, "SOCK")
78
+
79
+path <- system.file("celFiles", package="hapmapsnp5")
80
+cels <- list.celfiles(path, full.names=TRUE)
81
+snprmaOutput <- snprma2(cels)
82
+
83
+}
84
+
63 85
 }
64 86
 \keyword{manip}
65 87
 \keyword{classif}