Browse code

Changes to support humanomniexpress12v1b illumina platform

- added platform to validCdfNames()
- added platform to chipList in RGtoXY and RGtoXY2

Other minor changes to copynumber vignette
- removed eol whitespace
- removed some commented lines towards the end
- added a message to indicate loading a saved object

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/branches/RELEASE_2_6/madman/Rpacks/crlmm@49102 bc3139a8-67e5-0310-9ffc-ced21a209358

Rob Scharp authored on 30/08/2010 15:45:15
Showing3 changed files

... ...
@@ -30,7 +30,7 @@ readIdatFiles <- function(sampleSheet=NULL,
30 30
 			       barcode = sampleSheet[,arrayInfoColNames$barcode]
31 31
 			       arrayNames=barcode
32 32
 		       }
33
-		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {  
33
+		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
34 34
 			       position = sampleSheet[,arrayInfoColNames$position]
35 35
 			       if(is.null(arrayNames))
36 36
 				       arrayNames=position
... ...
@@ -44,7 +44,7 @@ readIdatFiles <- function(sampleSheet=NULL,
44 44
 		       }
45 45
 	       }
46 46
 	       pd = new("AnnotatedDataFrame", data = sampleSheet)
47
-	       sampleNames(pd) <- basename(arrayNames)               
47
+	       sampleNames(pd) <- basename(arrayNames)
48 48
        }
49 49
        if(is.null(arrayNames)) {
50 50
                arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
... ...
@@ -54,7 +54,7 @@ readIdatFiles <- function(sampleSheet=NULL,
54 54
                }
55 55
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
56 56
        }
57
-       
57
+
58 58
        narrays = length(arrayNames)
59 59
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
60 60
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
... ...
@@ -71,7 +71,7 @@ readIdatFiles <- function(sampleSheet=NULL,
71 71
 	       redidats <- redfiles
72 72
        }
73 73
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
74
-       if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")       
74
+       if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
75 75
 ##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
76 76
 ##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
77 77
 ##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
... ...
@@ -146,12 +146,12 @@ readIdatFiles <- function(sampleSheet=NULL,
146 146
 	       }
147 147
 	       rm(G)
148 148
 	       gc()
149
-         
149
+
150 150
 	       cat(paste(sep, fileExt$red, sep=""), "\n")
151 151
 	       R = readIDAT(redidats[i])
152 152
 	       idsR = rownames(R$Quants)
153
-         
154
-	       if(length(ids)==length(idsG)) {   
153
+
154
+	       if(length(ids)==length(idsG)) {
155 155
 		       if(sum(ids==idsR)==nprobes) {
156 156
 			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
157 157
 				   zeroR = R$Quants[ ,"NBeads"]==0
... ...
@@ -204,7 +204,7 @@ readIdatFiles2 <- function(sampleSheet=NULL,
204 204
 			       barcode = sampleSheet[,arrayInfoColNames$barcode]
205 205
 			       arrayNames=barcode
206 206
 		       }
207
-		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {  
207
+		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
208 208
 			       position = sampleSheet[,arrayInfoColNames$position]
209 209
 			       if(is.null(arrayNames))
210 210
 				       arrayNames=position
... ...
@@ -218,7 +218,7 @@ readIdatFiles2 <- function(sampleSheet=NULL,
218 218
 		       }
219 219
 	       }
220 220
 	       pd = new("AnnotatedDataFrame", data = sampleSheet)
221
-	       sampleNames(pd) <- basename(arrayNames)               
221
+	       sampleNames(pd) <- basename(arrayNames)
222 222
        }
223 223
        if(is.null(arrayNames)) {
224 224
                arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
... ...
@@ -228,7 +228,7 @@ readIdatFiles2 <- function(sampleSheet=NULL,
228 228
                }
229 229
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
230 230
        }
231
-       
231
+
232 232
        narrays = length(arrayNames)
233 233
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
234 234
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
... ...
@@ -245,7 +245,7 @@ readIdatFiles2 <- function(sampleSheet=NULL,
245 245
 	       redidats <- redfiles
246 246
        }
247 247
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
248
-       if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")       
248
+       if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
249 249
 ##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
250 250
 ##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
251 251
 ##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
... ...
@@ -327,12 +327,12 @@ readIdatFiles2 <- function(sampleSheet=NULL,
327 327
 	       }
328 328
 	       rm(G)
329 329
 	       gc()
330
-         
330
+
331 331
 	       cat(paste(sep, fileExt$red, sep=""), "\n")
332 332
 	       R = readIDAT(redidats[i])
333 333
 	       idsR = rownames(R$Quants)
334
-         
335
-	       if(length(ids)==length(idsG)) {   
334
+
335
+	       if(length(ids)==length(idsG)) {
336 336
 		       if(sum(ids==idsR)==nprobes) {
337 337
 			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
338 338
 		               zeroR = R$Quants[ ,"NBeads"]==0
... ...
@@ -369,24 +369,24 @@ readIDAT <- function(idatFile){
369 369
 
370 370
   }
371 371
 
372
-  versionNumber <- readBin(tempCon, "integer", n=1, size=8, 
372
+  versionNumber <- readBin(tempCon, "integer", n=1, size=8,
373 373
                            endian="little", signed=FALSE)
374
-  
375
-  nFields <- readBin(tempCon, "integer", n=1, size=4, 
374
+
375
+  nFields <- readBin(tempCon, "integer", n=1, size=4,
376 376
                      endian="little", signed=FALSE)
377 377
 
378 378
   fields <- matrix(0,nFields,3);
379 379
   colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
380 380
   for(i1 in 1:nFields){
381
-    fields[i1,"Field Code"] <- 
381
+    fields[i1,"Field Code"] <-
382 382
       readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
383
-    fields[i1,"Byte Offset"] <- 
383
+    fields[i1,"Byte Offset"] <-
384 384
       readBin(tempCon, "integer", n=1, size=8, endian="little", signed=FALSE)
385 385
   }
386 386
 
387 387
   knownCodes <- c(1000, 102, 103, 104, 107, 200, 300, 400,
388 388
                   401, 402, 403, 404, 405, 406, 407, 408, 409)
389
-  names(knownCodes) <- 
389
+  names(knownCodes) <-
390 390
     c("nSNPsRead",  # 1000
391 391
       "IlluminaID", #  102
392 392
       "SD",         #  103
... ...
@@ -419,35 +419,35 @@ readIDAT <- function(idatFile){
419 419
   }
420 420
 
421 421
   seek(tempCon, fields["nSNPsRead", "Byte Offset"])
422
-  nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, 
422
+  nSNPsRead <- readBin(tempCon, "integer", n=1, size=4,
423 423
                        endian="little", signed=FALSE)
424 424
 
425 425
   seek(tempCon, fields["IlluminaID", "Byte Offset"])
426
-  IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, 
426
+  IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4,
427 427
                        endian="little", signed=FALSE)
428 428
 
429 429
   seek(tempCon, fields["SD", "Byte Offset"])
430
-  SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, 
430
+  SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2,
431 431
                 endian="little", signed=FALSE)
432 432
 
433 433
   seek(tempCon, fields["Mean", "Byte Offset"])
434
-  Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, 
434
+  Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2,
435 435
                   endian="little", signed=FALSE)
436 436
 
437 437
   seek(tempCon, fields["NBeads", "Byte Offset"])
438 438
   NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
439 439
 
440 440
   seek(tempCon, fields["MidBlock", "Byte Offset"])
441
-  nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, 
441
+  nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4,
442 442
                               endian="little", signed=FALSE)
443
-  MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4, 
443
+  MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4,
444 444
                       endian="little", signed=FALSE)
445 445
 
446 446
   seek(tempCon, fields["RunInfo", "Byte Offset"])
447
-  nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, 
447
+  nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4,
448 448
                             endian="little", signed=FALSE)
449 449
   RunInfo <- matrix(NA, nRunInfoBlocks, 5)
450
-  colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars", 
450
+  colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars",
451 451
                          "BlockCode", "CodeVersion")
452 452
   for(i1 in 1:2) { #nRunInfoBlocks){  ## MR edit
453 453
     for(i2 in 1:5){
... ...
@@ -457,50 +457,50 @@ readIDAT <- function(idatFile){
457 457
   }
458 458
 
459 459
   seek(tempCon, fields["RedGreen", "Byte Offset"])
460
-  RedGreen <- readBin(tempCon, "numeric", n=1, size=4, 
460
+  RedGreen <- readBin(tempCon, "numeric", n=1, size=4,
461 461
                       endian="little", signed=FALSE)
462
-  #RedGreen <- readBin(tempCon, "integer", n=4, size=1, 
462
+  #RedGreen <- readBin(tempCon, "integer", n=4, size=1,
463 463
   #                    endian="little", signed=FALSE)
464 464
 
465 465
   seek(tempCon, fields["MostlyNull", "Byte Offset"])
466 466
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
467
-  MostlyNull <- readChar(tempCon, nChars) 
467
+  MostlyNull <- readChar(tempCon, nChars)
468 468
 
469 469
   seek(tempCon, fields["Barcode", "Byte Offset"])
470 470
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
471
-  Barcode <- readChar(tempCon, nChars) 
471
+  Barcode <- readChar(tempCon, nChars)
472 472
 
473 473
   seek(tempCon, fields["ChipType", "Byte Offset"])
474 474
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
475
-  ChipType <- readChar(tempCon, nChars) 
475
+  ChipType <- readChar(tempCon, nChars)
476 476
 
477 477
   seek(tempCon, fields["MostlyA", "Byte Offset"])
478 478
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
479
-  MostlyA <- readChar(tempCon, nChars) 
479
+  MostlyA <- readChar(tempCon, nChars)
480 480
 
481 481
   seek(tempCon, fields["Unknown.1", "Byte Offset"])
482 482
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
483
-  Unknown.1 <- readChar(tempCon, nChars) 
483
+  Unknown.1 <- readChar(tempCon, nChars)
484 484
 
485 485
   seek(tempCon, fields["Unknown.2", "Byte Offset"])
486 486
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
487
-  Unknown.2 <- readChar(tempCon, nChars) 
487
+  Unknown.2 <- readChar(tempCon, nChars)
488 488
 
489 489
   seek(tempCon, fields["Unknown.3", "Byte Offset"])
490 490
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
491
-  Unknown.3 <- readChar(tempCon, nChars) 
491
+  Unknown.3 <- readChar(tempCon, nChars)
492 492
 
493 493
   seek(tempCon, fields["Unknown.4", "Byte Offset"])
494 494
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
495
-  Unknown.4 <- readChar(tempCon, nChars) 
495
+  Unknown.4 <- readChar(tempCon, nChars)
496 496
 
497 497
   seek(tempCon, fields["Unknown.5", "Byte Offset"])
498 498
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
499
-  Unknown.5 <- readChar(tempCon, nChars) 
499
+  Unknown.5 <- readChar(tempCon, nChars)
500 500
 
501 501
   close(tempCon)
502 502
 
503
-  Unknowns <- 
503
+  Unknowns <-
504 504
     list(MostlyNull=MostlyNull,
505 505
          MostlyA=MostlyA,
506 506
          Unknown.1=Unknown.1,
... ...
@@ -513,21 +513,21 @@ readIDAT <- function(idatFile){
513 513
   colnames(Quants) <- c("Mean", "SD", "NBeads")
514 514
   rownames(Quants) <- as.character(IlluminaID)
515 515
 
516
-  idatValues <- 
517
-    list(fileSize=fileSize, 
518
-         versionNumber=versionNumber, 
519
-         nFields=nFields, 
516
+  idatValues <-
517
+    list(fileSize=fileSize,
518
+         versionNumber=versionNumber,
519
+         nFields=nFields,
520 520
          fields=fields,
521
-         nSNPsRead=nSNPsRead, 
522
-         #IlluminaID=IlluminaID, 
523
-         #SD=SD, 
524
-         #Mean=Mean, 
521
+         nSNPsRead=nSNPsRead,
522
+         #IlluminaID=IlluminaID,
523
+         #SD=SD,
524
+         #Mean=Mean,
525 525
          #NBeads=NBeads,
526 526
          Quants=Quants,
527
-         MidBlock=MidBlock, 
528
-         RunInfo=RunInfo, 
529
-         RedGreen=RedGreen, 
530
-         Barcode=Barcode, 
527
+         MidBlock=MidBlock,
528
+         RunInfo=RunInfo,
529
+         RedGreen=RedGreen,
530
+         Barcode=Barcode,
531 531
          ChipType=ChipType,
532 532
          Unknowns=Unknowns)
533 533
 
... ...
@@ -538,20 +538,20 @@ readIDAT <- function(idatFile){
538 538
 readBPM <- function(bpmFile){
539 539
 
540 540
   ## Reads and parses Illumina BPM files
541
-  
541
+
542 542
   fileSize <- file.info(bpmFile)$size
543 543
 
544 544
   tempCon <- file(bpmFile,"rb")
545 545
 
546 546
   # The first few bytes of the egtFile are some type of
547
-  # header, but there's no related byte offset information. 
547
+  # header, but there's no related byte offset information.
548 548
 
549 549
   prefixCheck <- readChar(tempCon,3) ## should be "BPM"
550 550
 
551 551
   null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
552
-  ## should be 1  
553
-  
554
-  versionNumber <- 
552
+  ## should be 1
553
+
554
+  versionNumber <-
555 555
     readBin(tempCon, "integer", n=1, size=4, endian="little", signed=FALSE)
556 556
   ## should be 4
557 557
 
... ...
@@ -565,14 +565,14 @@ readBPM <- function(bpmFile){
565 565
   entriesByteOffset <- seek(tempCon);
566 566
   nEntries <- readBin(tempCon, "integer", n=1, size=4,
567 567
                       endian="little", signed=FALSE)
568
-    
568
+
569 569
   if(FALSE){
570
-    
570
+
571 571
     snpIndexByteOffset <- seek(tempCon)
572 572
     snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
573 573
                         endian="little", signed=FALSE)
574 574
     ## for the 1M array, these are simply in order from 1 to 1072820.
575
-    
575
+
576 576
     snpNamesByteOffset <- seek(tempCon)
577 577
     snpNames <- rep("A", nEntries)
578 578
     for(i1 in 1:nEntries){
... ...
@@ -583,21 +583,21 @@ readBPM <- function(bpmFile){
583 583
   }
584 584
 
585 585
   seek(tempCon, 15278138)
586
-    
586
+
587 587
   normIDByteOffset <- seek(tempCon)
588 588
   normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
589
-  
589
+
590 590
   newBlockByteOffset <- seek(tempCon)
591 591
   newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
592
-  
592
+
593 593
   close(tempCon)
594 594
 
595 595
   byteOffsets <- list(entriesByteOffset=entriesByteOffset,
596
-                      #snpIndexByteOffset=snpIndexByteOffset, 
596
+                      #snpIndexByteOffset=snpIndexByteOffset,
597 597
                       #snpNamesByteOffset=snpNamesByteOffset,
598 598
                       normIDByteOffset=normIDByteOffset,
599 599
                       newBlockByteOffset=newBlockByteOffset)
600
-  
600
+
601 601
   allStuff <- list(prefixCheck=prefixCheck,
602 602
                    null.1=null.1,
603 603
                    versionNumber=versionNumber,
... ...
@@ -611,7 +611,7 @@ readBPM <- function(bpmFile){
611 611
                    newBlock=newBlock,
612 612
                    byteOffsets=byteOffsets)
613 613
   allStuff
614
-  
614
+
615 615
 }
616 616
 
617 617
 RGtoXY = function(RG, chipType, verbose=TRUE) {
... ...
@@ -623,11 +623,12 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
623 623
                "human370quadv3c",        # 370CNV quad
624 624
                "human550v3b",            # 550K
625 625
                "human1mduov3b",          # 1M Duo
626
-               "humanomni1quadv1b")      # Omni1 quad
626
+               "humanomni1quadv1b",      # Omni1 quad
627
+	       "humanomniexpress12v1b")  # Omni express 12
627 628
   if(missing(chipType)){
628 629
 	  chipType = match.arg(annotation(RG), chipList)
629 630
   } else chipType = match.arg(chipType, chipList)
630
-  
631
+
631 632
   pkgname <- getCrlmmAnnotationName(chipType)
632 633
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
633 634
      suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
... ...
@@ -643,21 +644,21 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
643 644
   bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
644 645
   ids <- names(aids)
645 646
   snpbase <- getVarInEnv("base")
646
-  
647
+
647 648
   nsnps = length(aids)
648 649
   narrays = ncol(RG)
649
-  
650
+
650 651
 #  aidcol = match("AddressA_ID", colnames(annot))
651 652
 #  if(is.na(aidcol))
652 653
 #    aidcol = match("Address", colnames(annot))
653 654
 #  bidcol = match("AddressB_ID", colnames(annot))
654
-#  if(is.na(bidcol)) 
655
+#  if(is.na(bidcol))
655 656
 #    bidcol = match("Address2", colnames(annot))
656 657
 #  aids = annot[, aidcol]
657 658
 #  bids = annot[, bidcol]
658 659
 #  snpids = annot[,"Name"]
659
-#  snpbase = annot[,"SNP"] 
660
-  infI = !is.na(bids) & bids!=0  
660
+#  snpbase = annot[,"SNP"]
661
+  infI = !is.na(bids) & bids!=0
661 662
   aord = match(aids, featureNames(RG)) # NAs are possible here
662 663
   bord = match(bids, featureNames(RG)) # and here
663 664
 #  argrg = aids[rrgg]
... ...
@@ -670,7 +671,7 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
670 671
                  annotation=chipType, phenoData=RG@phenoData, protocolData=RG@protocolData, storage.mode="environment")
671 672
   rm(tmpmat)
672 673
   gc()
673
-  
674
+
674 675
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
675 676
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
676 677
   XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
... ...
@@ -680,19 +681,19 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
680 681
 #  XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
681 682
 #  XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
682 683
   gc()
683
-  
684
+
684 685
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
685
-  
686
+
686 687
   # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
687 688
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
688
-  
689
+
689 690
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
690 691
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
691 692
 #  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
692 693
 #  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
693 694
 #  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
694 695
 #  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
695
-  
696
+
696 697
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
697 698
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
698 699
 
... ...
@@ -702,11 +703,11 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
702 703
 #  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
703 704
 #  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
704 705
 #  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
705
-    
706
+
706 707
   #  For now zero out Infinium I probes
707 708
   XY@assayData$X[infI,] = 0
708 709
   XY@assayData$Y[infI,] = 0
709
-  XY@assayData$zero[infI,] = 0  
710
+  XY@assayData$zero[infI,] = 0
710 711
 #  XY@assayData$Xnb[infI,] = 0
711 712
 #  XY@assayData$Ynb[infI,] = 0
712 713
 #  XY@assayData$Xse[infI,] = 0
... ...
@@ -721,7 +722,7 @@ RGtoXY = function(RG, chipType, verbose=TRUE) {
721 722
 
722 723
 
723 724
 RGtoXY2 = function(RG, chipType, verbose=TRUE) {
724
-  chipList = c("human1mv1c",             # 1M
725
+	  chipList = c("human1mv1c",             # 1M
725 726
                "human370v1c",            # 370CNV
726 727
                "human650v3a",            # 650Y
727 728
                "human610quadv1b",        # 610 quad
... ...
@@ -729,11 +730,12 @@ RGtoXY2 = function(RG, chipType, verbose=TRUE) {
729 730
                "human370quadv3c",        # 370CNV quad
730 731
                "human550v3b",            # 550K
731 732
                "human1mduov3b",          # 1M Duo
732
-               "humanomni1quadv1b")      # Omni1 quad
733
+               "humanomni1quadv1b",      # Omni1 quad
734
+	       "humanomniexpress12v1b")  # Omni express 12
733 735
   if(missing(chipType)){
734 736
 	  chipType = match.arg(annotation(RG), chipList)
735 737
   } else chipType = match.arg(chipType, chipList)
736
-  
738
+
737 739
   pkgname <- getCrlmmAnnotationName(chipType)
738 740
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
739 741
      suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
... ...
@@ -749,21 +751,21 @@ RGtoXY2 = function(RG, chipType, verbose=TRUE) {
749 751
   bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
750 752
   ids <- names(aids)
751 753
   snpbase <- getVarInEnv("base")
752
-  
754
+
753 755
   nsnps = length(aids)
754 756
   narrays = ncol(RG)
755
-  
757
+
756 758
 #  aidcol = match("AddressA_ID", colnames(annot))
757 759
 #  if(is.na(aidcol))
758 760
 #    aidcol = match("Address", colnames(annot))
759 761
 #  bidcol = match("AddressB_ID", colnames(annot))
760
-#  if(is.na(bidcol)) 
762
+#  if(is.na(bidcol))
761 763
 #    bidcol = match("Address2", colnames(annot))
762 764
 #  aids = annot[, aidcol]
763 765
 #  bids = annot[, bidcol]
764 766
 #  snpids = annot[,"Name"]
765
-#  snpbase = annot[,"SNP"] 
766
-  infI = !is.na(bids) & bids!=0  
767
+#  snpbase = annot[,"SNP"]
768
+  infI = !is.na(bids) & bids!=0
767 769
   aord = match(aids, featureNames(RG)) # NAs are possible here
768 770
   bord = match(bids, featureNames(RG)) # and here
769 771
 #  argrg = aids[rrgg]
... ...
@@ -782,7 +784,7 @@ RGtoXY2 = function(RG, chipType, verbose=TRUE) {
782 784
   XY@assayData$X[1:nsnps,] = 0
783 785
   XY@assayData$Y[1:nsnps,] = 0
784 786
   XY@assayData$zero[1:nsnps,] = 0
785
-  
787
+
786 788
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
787 789
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
788 790
   XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
... ...
@@ -792,19 +794,19 @@ RGtoXY2 = function(RG, chipType, verbose=TRUE) {
792 794
 #  XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
793 795
 #  XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
794 796
   gc()
795
-  
797
+
796 798
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
797
-  
799
+
798 800
   # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
799 801
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
800
-  
802
+
801 803
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
802 804
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
803 805
 #  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
804 806
 #  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
805 807
 #  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
806 808
 #  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
807
-  
809
+
808 810
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
809 811
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
810 812
 
... ...
@@ -814,11 +816,11 @@ RGtoXY2 = function(RG, chipType, verbose=TRUE) {
814 816
 #  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
815 817
 #  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
816 818
 #  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
817
-    
819
+
818 820
   #  For now zero out Infinium I probes
819 821
   XY@assayData$X[infI,] = 0
820 822
   XY@assayData$Y[infI,] = 0
821
-  XY@assayData$zero[infI,] = 0  
823
+  XY@assayData$zero[infI,] = 0
822 824
 #  XY@assayData$Xnb[infI,] = 0
823 825
 #  XY@assayData$Ynb[infI,] = 0
824 826
 #  XY@assayData$Xse[infI,] = 0
... ...
@@ -846,10 +848,10 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
846 848
 #  data(preprocStuff, package=pkgname, envir=.crlmmPkgEnv)
847 849
 
848 850
   stripnum <- getVarInEnv("stripnum")
849
-  
851
+
850 852
   if(useTarget)
851 853
     targetdist = getVarInEnv("reference")
852
-  
854
+
853 855
 #  Xqws = Yqws = matrix(0, nrow(XY), ncol(XY))
854 856
 #  colnames(Xqws) = colnames(Yqws) = sampleNames(XY) #$X
855 857
 #  rownames(Xqws) = rownames(Yqws) = featureNames(XY)
... ...
@@ -857,7 +859,7 @@ stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
857 859
   if(verbose){
858 860
     message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
859 861
     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3)
860
-  } 
862
+  }
861 863
   for(s in 1:max(stripnum)) {
862 864
     if(verbose) {
863 865
       if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
... ...
@@ -937,7 +939,7 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
937 939
   SMEDIAN <- getVarInEnv("SMEDIAN")
938 940
   theKnots <- getVarInEnv("theKnots")
939 941
   gns <- getVarInEnv("gns")
940
-  
942
+
941 943
   # separate out copy number probes
942 944
   npIndex = getVarInEnv("npProbesFid")
943 945
   nprobes = length(npIndex)
... ...
@@ -946,24 +948,24 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
946 948
   B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
947 949
 
948 950
   # new lines below - useful to keep track of zeroed out probes
949
-  zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays) 
951
+  zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
950 952
 
951 953
   colnames(A) <- colnames(B) <- colnames(zero) <- sns
952 954
   rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
953
-  
955
+
954 956
   cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
955 957
   if(save.it & !missing(cnFile)) {
956
-    t0 <- proc.time() 
957
-    save(cnAB, file=cnFile) 
958
+    t0 <- proc.time()
959
+    save(cnAB, file=cnFile)
958 960
     t0 <- proc.time()-t0
959 961
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
960 962
   }
961 963
   rm(cnAB, B, zero)
962
-  
964
+
963 965
   # next process snp probes
964 966
   snpIndex = getVarInEnv("snpProbesFid")
965 967
   nprobes <- length(snpIndex)
966
-  
968
+
967 969
   ##We will read each cel file, summarize, and run EM one by one
968 970
   ##We will save parameters of EM to use later
969 971
   mixtureParams <- matrix(0, 4, narrays)
... ...
@@ -977,7 +979,7 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
977 979
   set.seed(seed)
978 980
   idx <- sort(sample(autosomeIndex, mixtureSampleSize))
979 981
   idx2 <- sample(nprobes, 10^5)
980
-  
982
+
981 983
   ##S will hold (A+B)/2 and M will hold A-B
982 984
   ##NOTE: We actually dont need to save S. Only for pics etc...
983 985
   ##f is the correction. we save to avoid recomputing
... ...
@@ -986,10 +988,10 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
986 988
 
987 989
   # new lines below - useful to keep track of zeroed out SNPs
988 990
   zero <- matrix(as.integer(exprs(channel(XY, "zero"))[snpIndex,]), nprobes, narrays)
989
-    
991
+
990 992
   colnames(A) <- colnames(B) <- colnames(zero) <- sns
991 993
   rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
992
-  
994
+
993 995
   if(verbose){
994 996
      message("Calibrating ", narrays, " arrays.")
995 997
      if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
... ...
@@ -1022,8 +1024,8 @@ preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
1022 1024
   res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
1023 1025
 
1024 1026
   if(save.it & !missing(snpFile)) {
1025
-    t0 <- proc.time() 
1026
-    save(res, file=snpFile) 
1027
+    t0 <- proc.time()
1028
+    save(res, file=snpFile)
1027 1029
     t0 <- proc.time()-t0
1028 1030
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
1029 1031
   }
... ...
@@ -1076,7 +1078,7 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1076 1078
   theKnots <- getVarInEnv("theKnots")
1077 1079
 #  gns <- featureNames(XY) # getVarInEnv("gns") # needs to include np probes - gns is only for snps
1078 1080
   narrays = ncol(XY)
1079
-  
1081
+
1080 1082
   if(save.it & !missing(cnFile)) {
1081 1083
     # separate out copy number probes
1082 1084
     npIndex = getVarInEnv("npProbesFid")
... ...
@@ -1085,29 +1087,29 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1085 1087
     B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
1086 1088
 
1087 1089
     # new lines below - useful to keep track of zeroed out probes
1088
-    zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays) 
1090
+    zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
1089 1091
 
1090 1092
     colnames(A) <- colnames(B) <- colnames(zero) <- sns
1091 1093
     rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
1092
-  
1094
+
1093 1095
     cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
1094
-    
1095
-    t0 <- proc.time() 
1096
-    save(cnAB, file=cnFile) 
1096
+
1097
+    t0 <- proc.time()
1098
+    save(cnAB, file=cnFile)
1097 1099
     t0 <- proc.time()-t0
1098 1100
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
1099 1101
      rm(cnAB, B, zero)
1100 1102
   }
1101
-  
1103
+
1102 1104
   # next process snp probes
1103 1105
   snpIndex = getVarInEnv("snpProbesFid")
1104 1106
   nprobes <- length(snpIndex)
1105
-  
1107
+
1106 1108
   ##We will read each cel file, summarize, and run EM one by one
1107 1109
   ##We will save parameters of EM to use later
1108 1110
   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
1109 1111
   SNR <- initializeBigVector("crlmmSNR-", narrays, "double")
1110
-  SKW <- initializeBigVector("crlmmSKW-", narrays, "double") 
1112
+  SKW <- initializeBigVector("crlmmSKW-", narrays, "double")
1111 1113
 #  mixtureParams <- matrix(0, 4, narrays)
1112 1114
 #  SNR <- vector("numeric", narrays)
1113 1115
 #  SKW <- vector("numeric", narrays)
... ...
@@ -1119,7 +1121,7 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1119 1121
   set.seed(seed)
1120 1122
   idx <- sort(sample(autosomeIndex, mixtureSampleSize))
1121 1123
   idx2 <- sample(nprobes, 10^5)
1122
-  
1124
+
1123 1125
   ##S will hold (A+B)/2 and M will hold A-B
1124 1126
   ##NOTE: We actually dont need to save S. Only for pics etc...
1125 1127
   ##f is the correction. we save to avoid recomputing
... ...
@@ -1138,15 +1140,15 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1138 1140
 #  if(!is.integer(A)) {
1139 1141
 #     A = matrix(as.integer(A), nrow(A), ncol(A))
1140 1142
 #     B = matrix(as.integer(B), nrow(B), ncol(B))
1141
-#  }  
1142
-    
1143
+#  }
1144
+
1143 1145
 #  colnames(A) <- colnames(B) <- colnames(zero) <- sns
1144 1146
 #  rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
1145 1147
 
1146 1148
   A <- initializeBigMatrix("crlmmA-", nprobes, narrays, "integer")
1147 1149
   B <- initializeBigMatrix("crlmmB-", nprobes, narrays, "integer")
1148 1150
   zero <- initializeBigMatrix("crlmmZero-", nprobes, narrays, "integer")
1149
-  
1151
+
1150 1152
   if(verbose){
1151 1153
      message("Calibrating ", narrays, " arrays.")
1152 1154
      if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
... ...
@@ -1155,7 +1157,7 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1155 1157
   for(i in 1:narrays){
1156 1158
      A[,i] = exprs(channel(XY, "X"))[snpIndex,i]
1157 1159
      B[,i] = exprs(channel(XY, "Y"))[snpIndex,i]
1158
-     zero[,i] = exprs(channel(XY, "zero"))[snpIndex,i] 
1160
+     zero[,i] = exprs(channel(XY, "zero"))[snpIndex,i]
1159 1161
 #    SKW[i] = mean((A[snpIndex,i][idx2]-mean(A[snpIndex,i][idx2]))^3)/(sd(A[snpIndex,i][idx2])^3)
1160 1162
      SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
1161 1163
     if(fitMixture){
... ...
@@ -1184,8 +1186,8 @@ preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
1184 1186
              mixtureParams=mixtureParams, cdfName=cdfName) # , snpIndex=snpIndex, npIndex=npIndex)
1185 1187
 
1186 1188
   if(save.it & !missing(snpFile)) {
1187
-    t0 <- proc.time() 
1188
-    save(res, file=snpFile) 
1189
+    t0 <- proc.time()
1190
+    save(res, file=snpFile)
1189 1191
     t0 <- proc.time()-t0
1190 1192
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
1191 1193
   }
... ...
@@ -1224,7 +1226,7 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1224 1226
         stop("Both RG and XY specified - please use one or the other")
1225 1227
     }
1226 1228
     if (missing(sns)) sns <- sampleNames(XY) #$X
1227
-    
1229
+
1228 1230
     res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1229 1231
                         seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
1230 1232
                         save.it=save.it, snpFile=snpFile, cnFile=cnFile)
... ...
@@ -1245,7 +1247,7 @@ crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1245 1247
                   recallRegMin=1000, SNRMin=SNRMin,
1246 1248
                   returnParams=returnParams, badSNP=badSNP,
1247 1249
                   verbose=verbose)
1248
-  
1250
+
1249 1251
   res2[["SNR"]] <- res[["SNR"]]
1250 1252
   res2[["SKW"]] <- res[["SKW"]]
1251 1253
   rm(res)
... ...
@@ -1279,7 +1281,7 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1279 1281
         stop("Both RG and XY specified - please use one or the other")
1280 1282
     }
1281 1283
     if (missing(sns)) sns <- sampleNames(XY) #$X
1282
-    
1284
+
1283 1285
     res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
1284 1286
                         seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #,
1285 1287
                       #  save.it=save.it, snpFile=snpFile, cnFile=cnFile)
... ...
@@ -1295,13 +1297,13 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1295 1297
 #                    alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
1296 1298
 #                    call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
1297 1299
 #                    callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
1298
-#  	            annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD) 
1300
+#  	            annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD)
1299 1301
 #    sampleNames(callSet) <- sns
1300 1302
 #    featureNames(callSet) <- res[["gns"]]
1301 1303
 #    pData(callSet)$SKW <- rep(NA, length(sns))
1302 1304
 #    pData(callSet)$SNR <- rep(NA, length(sns))
1303 1305
 #    pData(callSet)$gender <- rep(NA, length(sns))
1304
-					  
1306
+
1305 1307
   }else{
1306 1308
       if(verbose) message("Loading ", snpFile, ".")
1307 1309
         obj <- load(snpFile)
... ...
@@ -1311,8 +1313,8 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1311 1313
   }
1312 1314
 
1313 1315
  #   rm(phenD, protD , fD)
1314
-	
1315
-#    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))                
1316
+
1317
+#    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))
1316 1318
 #    suppressWarnings(A(callSet) <- res[["A"]])
1317 1319
 #    suppressWarnings(B(callSet) <- res[["B"]])
1318 1320
 #    pData(callSet)$SKW <- res$SKW
... ...
@@ -1321,7 +1323,7 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1321 1323
 #    rm(res); gc()
1322 1324
   if(row.names) row.names=res$gns else row.names=NULL
1323 1325
   if(col.names) col.names=res$sns else col.names=NULL
1324
-  
1326
+
1325 1327
   res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]),
1326 1328
                   B=res[["B"]], # as.matrix(B(callSet)[snp.index,]),  # j]),
1327 1329
                   SNR=res[["SNR"]], # callSet$SNR, # [j],
... ...
@@ -1338,7 +1340,7 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1338 1340
                   verbose=verbose,
1339 1341
                   returnParams=returnParams,
1340 1342
                   badSNP=badSNP)
1341
-#    rm(res); gc()  
1343
+#    rm(res); gc()
1342 1344
 #    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
1343 1345
 #    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
1344 1346
 #    callSet$gender[j] <- tmp$gender
... ...
@@ -1355,8 +1357,8 @@ crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
1355 1357
 }
1356 1358
 
1357 1359
 
1358
-## MR: Below is a more memory efficient version of crlmmIllumina() which 
1359
-## reads in the .idats and genotypes in the one function and removes objects 
1360
+## MR: Below is a more memory efficient version of crlmmIllumina() which
1361
+## reads in the .idats and genotypes in the one function and removes objects
1360 1362
 ## after they have been used
1361 1363
 crlmmIlluminaV2 = function(sampleSheet=NULL,
1362 1364
 			  arrayNames=NULL,
... ...
@@ -1371,7 +1373,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1371 1373
 #                          rgFile,
1372 1374
 			  stripNorm=TRUE,
1373 1375
 			  useTarget=TRUE,
1374
-          	          row.names=TRUE, 
1376
+          	          row.names=TRUE,
1375 1377
 			  col.names=TRUE,
1376 1378
 			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
1377 1379
                           seed=1, save.ab=FALSE, snpFile, cnFile,
... ...
@@ -1382,7 +1384,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1382 1384
   if(missing(cdfName)) stop("must specify cdfName")
1383 1385
   if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
1384 1386
 #  if(missing(sns)) sns <- basename(arrayNames)
1385
-			  
1387
+
1386 1388
 #  if (save.rg & missing(rgFile))
1387 1389
 #    stop("'rgFile' is missing, and you chose save.rg")
1388 1390
   if (save.ab & (missing(snpFile) | missing(cnFile)))
... ...
@@ -1437,7 +1439,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1437 1439
 #    featureNames(callSet) <- res[["gns"]]
1438 1440
 #    pData(callSet)$SKW <- rep(NA, length(sns))
1439 1441
 #    pData(callSet)$SNR <- rep(NA, length(sns))
1440
-#    pData(callSet)$gender <- rep(NA, length(sns))			
1442
+#    pData(callSet)$gender <- rep(NA, length(sns))
1441 1443
 #	}
1442 1444
 #	pData(callSet)[j,] <- phenD
1443 1445
 #	pData(protocolData(callSet))[j,] <- protD
... ...
@@ -1445,7 +1447,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1445 1447
 #	pData(protocolData(callSet)) <- protD
1446 1448
 
1447 1449
 #    rm(phenD, protD, fD)
1448
-	
1450
+
1449 1451
 #    if(k > 1 & nrow(res[[1]]) != nrow(callSet)){
1450 1452
         ##RS: I don't understand why the IDATS for the
1451 1453
         ##same platform potentially have different lengths
... ...
@@ -1453,7 +1455,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1453 1455
 #        res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ]
1454 1456
 #    }
1455 1457
 
1456
-#    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))                
1458
+#    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))
1457 1459
 #    suppressWarnings(A(callSet)[, j] <- res[["A"]])
1458 1460
 #    suppressWarnings(B(callSet)[, j] <- res[["B"]])
1459 1461
 #    suppressWarnings(A(callSet) <- res[["A"]])
... ...
@@ -1465,7 +1467,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1465 1467
 #    mixtureParams <- res$mixtureParams
1466 1468
 #    rm(res); gc()
1467 1469
   if(row.names) row.names=res$gns else row.names=NULL
1468
-  if(col.names) col.names=res$sns else col.names=NULL    
1470
+  if(col.names) col.names=res$sns else col.names=NULL
1469 1471
   res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]),
1470 1472
                   B=res[["B"]], # as.matrix(B(callSet)[snp.index,]),  # j]),
1471 1473
                   SNR=res[["SNR"]], # callSet$SNR, # [j],
... ...
@@ -1482,7 +1484,7 @@ crlmmIlluminaV2 = function(sampleSheet=NULL,
1482 1484
                   verbose=verbose,
1483 1485
                   returnParams=returnParams,
1484 1486
                   badSNP=badSNP)
1485
-#    rm(res); gc()  
1487
+#    rm(res); gc()
1486 1488
 #    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
1487 1489
 #    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
1488 1490
 #    callSet$gender[j] <- tmp$gender
... ...
@@ -165,8 +165,10 @@ validCdfNames <- function(){
165 165
 	  "human610quadv1b",
166 166
 	  "human660quadv1a",
167 167
 	  "human1mduov3b",
168
-	  "humanomni1quadv1b")
168
+	  "humanomni1quadv1b",
169
+          "humanomniexpress12v1b")
169 170
 }
171
+
170 172
 isValidCdfName <- function(cdfName){
171 173
 	chipList <- validCdfNames()
172 174
 	result <- cdfName %in% chipList	
... ...
@@ -24,18 +24,18 @@
24 24
 
25 25
 <<setup, echo=FALSE, results=hide>>=
26 26
 options(continue=" ", width=70)
27
-@ 
27
+@
28 28
 
29 29
 %\section{Estimating copy number}
30 30
 
31 31
 %At present, software for copy number estimation is provided only for the
32
-%Affymetrix 6.0 platform.  
32
+%Affymetrix 6.0 platform.
33 33
 
34 34
 \begin{abstract}
35 35
   This vignette estimates copy number for HapMap samples on the
36 36
   Affymetrix 6.0 platform.  See \citep{Scharpf2010} for the working
37 37
   paper.
38
-  
38
+
39 39
 \end{abstract}
40 40
 
41 41
 \section{Simple usage}
... ...
@@ -47,7 +47,7 @@ library(oligoClasses)
47 47
 library(crlmm)
48 48
 crlmm:::validCdfNames()
49 49
 cdfName <- "genomewidesnp6"
50
-@ 
50
+@
51 51
 
52 52
 \paragraph{Preprocess and genotype.}
53 53
 
... ...
@@ -57,7 +57,7 @@ intensities and genotype data, and define a 'batch' variable. The
57 57
 batch variable will later be useful when we estimate copy number.  We
58 58
 typically use the 96 well chemistry plate or the scan date of the
59 59
 array as a surrogate for batch. Adjusting by date or chemistry plate
60
-can be helpful for limiting the influence of batch effects.  
60
+can be helpful for limiting the influence of batch effects.
61 61
 
62 62
 <<celfiles>>=
63 63
 celFiles <- list.celfiles("/thumper/ctsa/snpmicroarray/hapmap/raw/affy/1m", full.names=TRUE, pattern=".CEL")
... ...
@@ -68,7 +68,7 @@ batch <- substr(basename(celFiles), 13, 13)
68 68
 celFiles <- celFiles[batch %in% c("C", "Y")]
69 69
 batch <- batch[batch %in% c("C", "Y")]
70 70
 stopifnot(length(batch) == length(celFiles))
71
-@ 
71
+@
72 72
 
73 73
 The preprocessing steps for copy number estimation includes quantile
74 74
 normalization of the polymorphic and nonpolymorphic markers and a
... ...
@@ -95,9 +95,12 @@ if(!exists("cnSet")){
95 95
 	if(!file.exists(file.path(outdir, "cnSet.rda"))){
96 96
 		cnSet <- genotype(celFiles[1:50], cdfName=cdfName, copynumber=TRUE, batch=batch[1:50])
97 97
 		save(cnSet, file=file.path(outdir, "cnSet.rda"))
98
-	} else load(file.path(outdir, "cnSet.rda"))
98
+	} else{
99
+		message("Loading cnSet container")
100
+		load(file.path(outdir, "cnSet.rda"))
101
+	}
99 102
 }
100
-@ 
103
+@
101 104
 
102 105
 A more memory efficient approach to preprocessing and genotyping is
103 106
 implemented in the \R{} function \Rfunction{genotype2}.  The arguments
... ...
@@ -114,13 +117,18 @@ library(ff)
114 117
 ld.path <- file.path(outdir, "ffobjs")
115 118
 if(!file.exists(ld.path)) dir.create(ld.path)
116 119
 ldPath(ld.path)
120
+ocProbesets(100000)
121
+ocSamples(300)
117 122
 if(!exists("cnSet2")){
118 123
 	if(!file.exists(file.path(outdir, "cnSet2.rda"))){
119 124
 		cnSet2 <- genotype2(celFiles, cdfName=cdfName, copynumber=TRUE, batch=batch)
120 125
 		save(cnSet2, file=file.path(outdir, "cnSet2.rda"))
121
-	} else load(file.path(outdir, "cnSet2.rda"))
126
+	} else{
127
+		message("Loading cnSet2 container")
128
+		load(file.path(outdir, "cnSet2.rda"))
129
+	}
122 130
 }
123
-@ 
131
+@
124 132
 
125 133
 The objects returned by the \Rfunction{genotype} and
126 134
 \Rfunction{genotype2} differ in size as the former returns an object
... ...
@@ -151,7 +159,7 @@ if(FALSE){
151 159
 gt1 <- calls(cnSet)[, 1:50]
152 160
 gt2 <- calls(cnSet2)[, 1:50]
153 161
 all.equal(gt1, gt2)
154
-@ 
162
+@
155 163
 
156 164
 \noindent With \Robject{ff} objects the additional I/O time required
157 165
 for extracting data is substantial and will increase for larger
... ...
@@ -194,7 +202,7 @@ open(snpCallProbability(cnSet2))
194 202
 posterior.prob2 <- integerScoreToProbability(snpCallProbability(cnSet2)[rows, cols])
195 203
 close(snpCallProbability(cnSet2))
196 204
 all.equal(posterior.prob1, posterior.prob2)
197
-@ 
205
+@
198 206
 
199 207
 Next, we obtain locus-level estimates of copy number by fitting a
200 208
 linear model to each SNP. A variable named 'batch' must be indicated
... ...
@@ -212,14 +220,16 @@ latter. As with the preprocessing and genotyping, the following code
212 220
 should be submitted as part of the batch job as it is too slow for
213 221
 interactive analysis.
214 222
 
223
+
224
+
215 225
 <<cn, echo=FALSE, eval=FALSE>>=
216 226
 ##Matrix format
217 227
 cnSet <- crlmmCopynumber(cnSet)
218
-@ 
228
+@
219 229
 
220 230
 <<save.cnset, echo=FALSE, eval=FALSE>>=
221 231
 save(cnSet, file=file.path(outdir, "cnSet.rda"))
222
-@ 
232
+@
223 233
 
224 234
 <<cn.ff, echo=FALSE, eval=FALSE>>=
225 235
 rm(cnSet); gc()
... ...
@@ -227,13 +237,13 @@ ocProbesets(75e3)
227 237
 ##ff objects
228 238
 system.time(cnSet2 <- crlmmCopynumber2(cnSet2))
229 239
 save(cnSet2, file=file.path(outdir, "cnSet2.rda"))
230
-@ 
240
+@
231 241
 
232 242
 <<timings, eval=FALSE>>=
233 243
 tryCatch(print(paste("Time for matrix version:", time1)), error=function(e) print("timing for CN estimation not available"))
234 244
 tryCatch(print(paste("Time for ff version:", time2)), error=function(e) print("timing for CN estimation not available"))
235 245
 rm(cnSet2); gc()
236
-@ 
246
+@
237 247
 
238 248
 
239 249
 \section{Accessors}
... ...
@@ -245,7 +255,7 @@ visualizations using the sample object provided in the
245 255
 <<sampleset>>=
246 256
 data(sample.CNSetLM)
247 257
 x <- sample.CNSetLM
248
-@ 
258
+@
249 259
 
250 260
 
251 261
 \subsection{Quantile-normalized intensities}
... ...
@@ -258,7 +268,7 @@ snp.index <- which(isSnp(x))
258 268
 np.index <- which(!isSnp(x))
259 269
 a <- (A(x))[snp.index, ]
260 270
 dim(a)
261
-@ 
271
+@
262 272
 
263 273
 The extra set of parentheses surrounding \Robject{A(cnSet2)} above is
264 274
 added to emphasize the appropriate order of operations. Subsetting the
... ...
@@ -267,54 +277,54 @@ large datasets.
267 277
 
268 278
 <<eval=FALSE>>=
269 279
 a <- A(x[snp.index, ])
270
-@ 
280
+@
271 281
 
272 282
 The quantile-normalized intensities for nonpolymorphic loci are obtained
273 283
 by:
274 284
 
275 285
 <<>>=
276 286
 npIntensities <- (A(x))[np.index, ]
277
-@ 
287
+@
278 288
 
279 289
 Quantile normalized intensities for the B allele at polymorphic loci:
280 290
 
281 291
 <<B.polymorphic>>=
282 292
 b.snps <- (B(x))[snp.index, ]
283
-@ 
293
+@
284 294
 
285 295
 Note that NAs are recorded in the 'B' assay data element for
286 296
 nonpolymorphic loci:
287 297
 
288 298
 <<B.NAs>>=
289 299
 all(is.na((B(x))[np.index, ]))
290
-@ 
300
+@
291 301
 
292 302
 <<clean, echo=FALSE>>=
293 303
 rm(b.snps, a, npIntensities); gc()
294
-@ 
304
+@
295 305
 
296 306
 \paragraph{\Robject{SnpSet}: Genotype calls and confidence scores}
297 307
 
298 308
 Genotype calls:
299 309
 <<genotypes>>=
300 310
 genotypes <- (snpCall(x))[snp.index, ]
301
-@ 
311
+@
302 312
 Confidence scores of the genotype calls:
303 313
 <<confidenceScores>>=
304 314
 genotypeConf <- integerScoreToProbability(snpCallProbability(x)[snp.index[1:10], ])
305
-@ 
315
+@
306 316
 
307 317
 \paragraph{\Robject{CopyNumberSet}: allele-specific copy number}
308 318
 
309 319
 Allele-specific copy number at polymorphic loci:
310 320
 <<ca>>=
311 321
 ca <- CA(x[snp.index, ])/100
312
-@ 
322
+@
313 323
 
314 324
 Total copy number at nonpolymorphic loci:
315 325
 <<ca>>=
316 326
 cn.nonpolymorphic <- CA(x[np.index, ])/100
317
-@ 
327
+@
318 328
 
319 329
 Total copy number at both polymorphic and nonpolymorphic loci:
320 330
 <<totalCopynumber>>=
... ...
@@ -322,7 +332,7 @@ path <- system.file("scripts", package="crlmm")
322 332
 source(file.path(path, "helperFunctions.R"))
323 333
 cn <- totalCopyNumber(x, i=c(snp.index, np.index))
324 334
 apply(cn, 2, median, na.rm=TRUE)
325
-@ 
335
+@
326 336
 
327 337
 (Note: the accessor totalCopyNumber method is sourced from a few
328 338
 helper functions. Documentation will be available in the devel version
... ...
@@ -335,15 +345,15 @@ Information on physical position and chromosome can be accessed as follows:
335 345
 <<positionChromosome>>=
336 346
 xx <- position(x)
337 347
 yy <- chromosome(x)
338
-@ 
348
+@
339 349
 
340 350
 Parameters from the linear model used to estimate copy number are
341
-stored in the slot \Robject{lM}. 
351
+stored in the slot \Robject{lM}.
342 352
 
343 353
 <<copynumberParameters>>=
344 354
 names(lM(x))
345 355
 dim(lM(x)[[1]])
346
-@ 
356
+@
347 357
 
348 358
 
349 359
 
... ...
@@ -355,7 +365,7 @@ A histogram of the signal to noise ratio for the HapMap samples:
355 365
 
356 366
 <<plotSnr, fig=TRUE, include=FALSE>>=
357 367
 hist(x$SNR, xlab="SNR", main="", breaks=25)
358
-@ 
368
+@
359 369
 
360 370
 \begin{figure}
361 371
   \centering
... ...
@@ -371,24 +381,24 @@ Figure \ref{fig:oneSample} plots physical position (horizontal axis)
371 381
 versus copy number (vertical axis) for the first sample.  There is less
372 382
 information to estimate copy number at nonpolymorphic loci; improvements
373 383
 to the univariate prediction regions at nonpolymorphic loci are a future
374
-area of research.  
384
+area of research.
375 385
 
376 386
 <<oneSample, fig=TRUE, width=8, height=4, include=FALSE>>=
377 387
 par(las=1, mar=c(4, 5, 4, 2))
378
-plot(position(x), copyNumber(x)[, 1]/100, pch=21, 
379
-     cex=0.4, xaxt="n", col="grey20", ylim=c(0,5), 
388
+plot(position(x), copyNumber(x)[, 1]/100, pch=21,
389
+     cex=0.4, xaxt="n", col="grey20", ylim=c(0,5),
380 390
      ylab="copy number", xlab="physical position (Mb)",
381 391
      main=paste(sampleNames(x)[1], ", CHR:", unique(chromosome(x))))
382 392
 points(position(x)[!isSnp(x)], copyNumber(x)[!isSnp(x), 1]/100,
383 393
        pch=21, cex=0.4, col="lightblue", bg="lightblue")
384 394
 axis(1, at=pretty(range(position(x))), labels=pretty(range(position(x)))/1e6)
385 395
 abline(h=2)
386
-@ 
396
+@
387 397
 
388 398
 <<idiogram, eval=FALSE, echo=FALSE>>=
389 399
 require(SNPchip)
390 400
 plotCytoband(22, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE)
391
-@ 
401
+@
392 402
 
393 403
 \begin{figure}
394 404
   \includegraphics[width=0.9\textwidth]{copynumber-oneSample}
... ...
@@ -408,14 +418,14 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE)
408 418
 %xlim <- c(10*1e6, max(position(cnSet2)))
409 419
 %cols <- brewer.pal(8, "Dark2")[1:4]
410 420
 %for(j in 1:3){
411
-%	plot(position(cnSet2), cnState[, j], pch=".", col=cols[2], xaxt="n", 
421
+%	plot(position(cnSet2), cnState[, j], pch=".", col=cols[2], xaxt="n",
412 422
 %	     ylab="copy number", xlab="Physical position (Mb)", type="s", lwd=2,
413 423
 %	     ylim=c(0,6), xlim=xlim)
414 424
 %	points(position(cnSet2), cns[, j], pch=".", col=cols[3])
415 425
 %	lines(position(cnSet2), cnState[,j], lwd=2, col=cols[2])
416
-%	axis(1, at=pretty(position(cnSet)), 
426
+%	axis(1, at=pretty(position(cnSet)),
417 427
 %	     labels=pretty(position(cnSet))/1e6)
418
-%	abline(h=c(1,3), lty=2, col=cols[1])	
428
+%	abline(h=c(1,3), lty=2, col=cols[1])
419 429
 %	legend("topright", bty="n", legend=sampleNames(cnSet)[j])
420 430
 %	legend("topleft", lty=1, col=cols[2], legend="copy number state",
421 431
 %	       bty="n", lwd=2)
... ...
@@ -423,7 +433,7 @@ plotCytoband(22, new=FALSE, cytoband.ycoords=c(3.8, 4), label.cytoband=FALSE)
423 433
 %		     label.cytoband=FALSE, xlim=xlim)
424 434
 %}
425 435
 %par(op)
426
-%@ 
436
+%@
427 437
 %
428 438
 %\begin{figure}
429 439
 %  \includegraphics[width=\textwidth]{copynumber-overlayHmmPredictions}
... ...
@@ -462,13 +472,13 @@ plot(i, x, copynumber=2)
462 472
 ##for(i in index){
463 473
 ##	gt <- calls(cnSet)[i, ]
464 474
 ##	if(i != 89){
465
-##		myScatter(cnSet[i, ], 
466
-##			  pch=pch, 
467
-##			  col=colors[snpCall(cnSet)[i, ]], 
475
+##		myScatter(cnSet[i, ],
476
+##			  pch=pch,
477
+##			  col=colors[snpCall(cnSet)[i, ]],
468 478
 ##			  bg=colors[snpCall(cnSet)[i, ]], cex=cex,
469 479
 ##			  xlim=xlim, ylim=ylim)
470 480
 ##		mtext("A", 1, outer=TRUE, line=1)
471
-##		mtext("B", 2, outer=TRUE, line=1)	
481
+##		mtext("B", 2, outer=TRUE, line=1)
472 482
 ##		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="C", lwd=2, col="black")
473 483
 ##		crlmm:::ellipse.CNSet(cnSet[i, ], copynumber=2, batch="Y", lwd=2, col="grey50")
474 484
 ##	} else {
... ...
@@ -493,10 +503,87 @@ plot(i, x, copynumber=2)
493 503
 %
494 504
 %See the technical report \citep{Scharpf2009}.
495 505
 
506
+\section{Trouble shooting}
507
+
508
+Suppose that you are only interested in estimating copy number at
509
+autosomal markers and you encountered an error using the
510
+crlmmCopynumber2 function for estimating copy number at nonpolymorphic
511
+markers on chromosome X.
512
+
513
+Making a subset of a cnSet2 without pulling all the data from disk to
514
+memory.
515
+<<>>=
516
+cnSet <- cnSet2
517
+@
518
+
519
+<<>>=
520
+cnSet <- crlmmCopynumber2(cnSet)
521
+@
522
+
523
+
524
+<<>>=
525
+marker.index <- which(chromosome(cnSet) < 23)
526
+nr <- length(marker.index)
527
+nc <- ncol(cnSet)
528
+@
529
+
530
+Set up a new directory for storing ff objects so that we can
531
+completely remove all the ff objects in the old directory when we're
532
+finished.
533
+
534
+<<>>=
535
+library(ff)
536
+ldPath("newDirectory")
537
+dir.create("newDirectory")
538
+@
539
+
540
+Next, initialize the container using utilities from the oligoClasses package.
541
+
542
+<<>>=
543
+CA <- initializeBigMatrix("CA", nr, nc)
544
+CB <- initializeBigMatrix("CB", nr, nc)
545
+A <- initializeBigMatrix("A", nr, nc)
546
+B <- initializeBigMatrix("B", nr, nc)
547
+GT <- initializeBigMatrix("GT", nr, nc)
548
+PP <- initializeBigMatrix("confs", nr, nc)
549
+cnSet.autosomes <- new("CNSetLM",
550
+		       alleleA=A,
551
+		       alleleB=B,
552
+		       call=GT,
553
+		       callProbability=PP,
554
+		       CA=CA,
555
+		       CB=CB,
556
+		       protocolData=protocolData(cnSet),
557
+		       featureData=featureData(cnSet)[marker.index, ],
558
+		       phenoData=phenoData(cnSet),
559
+		       annotation=annotation(cnSet))
560
+@
561
+
562
+Next, we need to populate the ff objects with data. The easiest way to
563
+do this without requiring too much RAM is to loop through the
564
+samples. Lets do 5 samples at a time.
565
+
566
+<<populateWithData>>=
567
+sample.index <- splitIndicesByLength(1:ncol(cnSet), 5)
568
+for(j in sample.index){
569
+	A(cnSet.autosomes)[, j] <- A(cnSet)[marker.index, j]
570
+	B(cnSet.autosomes)[, j] <- B(cnSet)[marker.index, j]
571
+	calls(cnSet.autosomes)[, j] <- calls(cnSet)[marker.index, j]
572
+	snpCallProbability(cnSet.autosomes)[, j] <- snpCallProbability(cnSet)[marker.index, j]
573
+}
574
+@
575
+
576
+Check to see that the data is there.
577
+
578
+The subset operation would pull all the data from disk and create a
579
+cnSet object with matrices.  If the dataset is very large, the subset
580
+operation would be very slow and potentially create a large amount of
581
+RAM.
582
+
496 583
 \section{Session information}
497 584
 <<sessionInfo, results=tex>>=
498 585
 toLatex(sessionInfo())
499
-@ 
586
+@
500 587
 
501 588
 \section*{References}
502 589