R/crlmm-illumina.R
cd8b85b6
 # function below works OK provided all .idat files are in the current working directory
 # - could add an option to allow files in Illumina directory structure to be handled
 # or to use the optional 'Path' column in sampleSheet
 # - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
 
 readIdatFiles = function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".",
                                 arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
cfdeb14b
                                 highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), saveDate=FALSE) {
cd8b85b6
        if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
          arrayNames=NULL
          if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
            barcode = sampleSheet[,arrayInfoColNames$barcode]
            arrayNames=barcode
          }
          if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {  
            position = sampleSheet[,arrayInfoColNames$position]
            if(is.null(arrayNames))
              arrayNames=position
            else
              arrayNames = paste(arrayNames, position, sep=sep)
            if(highDensity) {
              hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
              for(i in names(hdExt))
                arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
             }
          }
852cc5cf
          pd = new("AnnotatedDataFrame", data = sampleSheet)
cd8b85b6
        }
        if(is.null(arrayNames)) {
          arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
          if(!is.null(sampleSheet)) {
            sampleSheet=NULL
            cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
          }
852cc5cf
          pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
cd8b85b6
        }
cfdeb14b
        narrays = length(arrayNames)
cd8b85b6
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
        if(length(grnfiles)==0 || length(redfiles)==0)
          stop("Cannot find .idat files")
        if(length(grnfiles)!=length(redfiles))
          stop("Cannot find matching .idat files")
        if(!all(c(redfiles,grnfiles) %in% dir(path=path)))
          stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
                  paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
        grnidats = file.path(path, grnfiles)
        redidats = file.path(path, redfiles)
        
cfdeb14b
        headerInfo = list(nProbes = rep(NA, narrays),
                          Barcode = rep(NA, narrays),
                          ChipType = rep(NA, narrays),
                          Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
                          Position = rep(NA, narrays)) # this may also vary a bit
 
        dates = list(decode=rep(NA, narrays),
b938db26
                     scan=rep(NA, narrays))
cfdeb14b
 
cd8b85b6
        # read in the data
        for(i in seq(along=arrayNames)) {
          cat("reading", arrayNames[i], "\t")
          idsG = idsR = G = R = NULL
852cc5cf
 
cd8b85b6
          cat(paste(sep, fileExt$green, sep=""), "\t")
          G = readIDAT(grnidats[i])
          idsG = rownames(G$Quants)
          
          headerInfo$nProbes[i] = G$nSNPsRead
          headerInfo$Barcode[i] = G$Barcode
          headerInfo$ChipType[i] = G$ChipType
          headerInfo$Manifest[i] = G$Unknown$MostlyNull
          headerInfo$Position[i] = G$Unknowns$MostlyA
          
852cc5cf
          if(headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) {
cd8b85b6
                   # || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
                   # but differed by a few SNPs for some reason - most of the chip was the same though
852cc5cf
 #           stop("Chips are not of all of the same type - please check your data")
            warning("Chips are not of the same type.  Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
            next()
          }
cfdeb14b
 
          dates$decode[i] = G$RunInfo[1, 1]
          dates$scan[i] = G$RunInfo[2, 1]
 
cd8b85b6
          if(i==1) {
            if(is.null(ids) && !is.null(G))
              ids = idsG
            else
              stop("Could not find probe IDs")
            nprobes = length(ids)
            narrays = length(arrayNames)
852cc5cf
            
            tmpmat = matrix(NA, nprobes, narrays)
            rownames(tmpmat) = ids
cd8b85b6
            if(!is.null(sampleSheet))
852cc5cf
              colnames(tmpmat) = sampleSheet$Sample_ID
cd8b85b6
            else
852cc5cf
              colnames(tmpmat) = arrayNames
 
            RG = new("NChannelSet",
                      R=tmpmat, G=tmpmat, Rnb=tmpmat, Gnb=tmpmat,
                      Rse=tmpmat, Gse=tmpmat, annotation=headerInfo$Manifest[1],
                      phenoData=pd, storage.mode="environment")
            rm(tmpmat)
            gc()
cd8b85b6
          }
          
          if(length(ids)==length(idsG)) {
            if(sum(ids==idsG)==nprobes) {
852cc5cf
              RG@assayData$G[,i] = G$Quants[, "Mean"]
              RG@assayData$Gnb[,i] = G$Quants[, "NBeads"]
              RG@assayData$Gse[,i] = G$Quants[, "SD"]
cd8b85b6
            }
          }
          else {
            indG = match(ids, idsG)
852cc5cf
            RG@assayData$G[,i] = G$Quants[indG, "Mean"]
            RG@assayData$Gnb[,i] = G$Quants[indG, "NBeads"]
            RG@assayData$Gse[,i] = G$Quants[indG, "SD"]
cd8b85b6
          }
852cc5cf
          rm(G)
          gc()
          
          cat(paste(sep, fileExt$red, sep=""), "\n")
          R = readIDAT(redidats[i])
          idsR = rownames(R$Quants)
          
cd8b85b6
          if(length(ids)==length(idsG)) {   
            if(sum(ids==idsR)==nprobes) {
852cc5cf
              RG@assayData$R[,i] = R$Quants[ ,"Mean"]
              RG@assayData$Rnb[,i] = R$Quants[ ,"NBeads"]
              RG@assayData$Rse[,i] = R$Quants[ ,"SD"]
cd8b85b6
            }
          }
          else {
            indR = match(ids, idsR)
852cc5cf
            RG@assayData$R[,i] = R$Quants[indR, "Mean"]
            RG@assayData$Rnb[,i] = R$Quants[indR, "NBeads"]
            RG@assayData$Rse[,i] = R$Quants[indR, "SD"]
cd8b85b6
          }
852cc5cf
          rm(R)
          gc()
cd8b85b6
        }
852cc5cf
        if(saveDate)
          RG@scanDates = dates$scan
        storageMode(RG) = "lockedEnvironment"
        RG
cd8b85b6
 }
 
 
 # the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
 readIDAT <- function(idatFile){
 
   fileSize <- file.info(idatFile)$size
 
   tempCon <- file(idatFile,"rb")
   prefixCheck <- readChar(tempCon,4)
   if(prefixCheck != "IDAT"){
 
   }
 
   versionNumber <- readBin(tempCon, "integer", n=1, size=8, 
                            endian="little", signed=FALSE)
   
   nFields <- readBin(tempCon, "integer", n=1, size=4, 
                      endian="little", signed=FALSE)
 
   fields <- matrix(0,nFields,3);
   colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
   for(i1 in 1:nFields){
     fields[i1,"Field Code"] <- 
       readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
     fields[i1,"Byte Offset"] <- 
       readBin(tempCon, "integer", n=1, size=8, endian="little", signed=FALSE)
   }
 
   knownCodes <- c(1000, 102, 103, 104, 107, 200, 300, 400,
                   401, 402, 403, 404, 405, 406, 407, 408, 409)
   names(knownCodes) <- 
     c("nSNPsRead",  # 1000
       "IlluminaID", #  102
       "SD",         #  103
       "Mean",       #  104
       "NBeads",     #  107
       "MidBlock",   #  200
       "RunInfo",    #  300
       "RedGreen",   #  400
       "MostlyNull", #  401
       "Barcode",    #  402
       "ChipType",   #  403
       "MostlyA",    #  404
       "Unknown.1",  #  405
       "Unknown.2",  #  406
       "Unknown.3",  #  407
       "Unknown.4",  #  408
       "Unknown.5"   #  409
       )
 
   nNewFields <- 1
   rownames(fields) <- paste("Null", 1:nFields)
   for(i1 in 1:nFields){
     temp <- match(fields[i1,"Field Code"], knownCodes)
     if(!is.na(temp)){
       rownames(fields)[i1] <- names(knownCodes)[temp]
     }else{
       rownames(fields)[i1] <- paste("newField", nNewFields, sep=".")
       nNewFields <- nNewFields + 1
     }
   }
 
   seek(tempCon, fields["nSNPsRead", "Byte Offset"])
   nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, 
                        endian="little", signed=FALSE)
 
   seek(tempCon, fields["IlluminaID", "Byte Offset"])
   IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, 
                        endian="little", signed=FALSE)
 
   seek(tempCon, fields["SD", "Byte Offset"])
   SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, 
                 endian="little", signed=FALSE)
 
   seek(tempCon, fields["Mean", "Byte Offset"])
   Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, 
                   endian="little", signed=FALSE)
 
   seek(tempCon, fields["NBeads", "Byte Offset"])
   NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
 
   seek(tempCon, fields["MidBlock", "Byte Offset"])
   nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, 
                               endian="little", signed=FALSE)
   MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4, 
                       endian="little", signed=FALSE)
 
   seek(tempCon, fields["RunInfo", "Byte Offset"])
   nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, 
                             endian="little", signed=FALSE)
   RunInfo <- matrix(NA, nRunInfoBlocks, 5)
   colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars", 
                          "BlockCode", "CodeVersion")
b938db26
   for(i1 in 1:2) { #nRunInfoBlocks){  ## MR edit
cd8b85b6
     for(i2 in 1:5){
       nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
       RunInfo[i1,i2] <- readChar(tempCon, nChars)
     }
   }
 
   seek(tempCon, fields["RedGreen", "Byte Offset"])
   RedGreen <- readBin(tempCon, "numeric", n=1, size=4, 
                       endian="little", signed=FALSE)
   #RedGreen <- readBin(tempCon, "integer", n=4, size=1, 
   #                    endian="little", signed=FALSE)
 
   seek(tempCon, fields["MostlyNull", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   MostlyNull <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["Barcode", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   Barcode <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["ChipType", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   ChipType <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["MostlyA", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   MostlyA <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["Unknown.1", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   Unknown.1 <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["Unknown.2", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   Unknown.2 <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["Unknown.3", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   Unknown.3 <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["Unknown.4", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   Unknown.4 <- readChar(tempCon, nChars) 
 
   seek(tempCon, fields["Unknown.5", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   Unknown.5 <- readChar(tempCon, nChars) 
 
   close(tempCon)
 
   Unknowns <- 
     list(MostlyNull=MostlyNull,
          MostlyA=MostlyA,
          Unknown.1=Unknown.1,
          Unknown.2=Unknown.2,
          Unknown.3=Unknown.3,
          Unknown.4=Unknown.4,
          Unknown.5=Unknown.5)
 
   Quants <- cbind(Mean, SD, NBeads)
   colnames(Quants) <- c("Mean", "SD", "NBeads")
   rownames(Quants) <- as.character(IlluminaID)
 
   idatValues <- 
     list(fileSize=fileSize, 
          versionNumber=versionNumber, 
          nFields=nFields, 
          fields=fields,
          nSNPsRead=nSNPsRead, 
          #IlluminaID=IlluminaID, 
          #SD=SD, 
          #Mean=Mean, 
          #NBeads=NBeads,
          Quants=Quants,
          MidBlock=MidBlock, 
          RunInfo=RunInfo, 
          RedGreen=RedGreen, 
          Barcode=Barcode, 
          ChipType=ChipType,
          Unknowns=Unknowns)
 
   idatValues
 
 }
 
 readBPM <- function(bpmFile){
 
   ## Reads and parses Illumina BPM files
   
   fileSize <- file.info(bpmFile)$size
 
   tempCon <- file(bpmFile,"rb")
 
   # The first few bytes of the egtFile are some type of
   # header, but there's no related byte offset information. 
 
   prefixCheck <- readChar(tempCon,3) ## should be "BPM"
 
   null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   ## should be 1  
   
   versionNumber <- 
     readBin(tempCon, "integer", n=1, size=4, endian="little", signed=FALSE)
   ## should be 4
 
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   chipType <- readChar(tempCon, nChars)
 
   null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE)
 
   csvLines <- readLines(tempCon, 22)
 
   entriesByteOffset <- seek(tempCon);
   nEntries <- readBin(tempCon, "integer", n=1, size=4,
                       endian="little", signed=FALSE)
     
   if(FALSE){
     
     snpIndexByteOffset <- seek(tempCon)
     snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
                         endian="little", signed=FALSE)
     ## for the 1M array, these are simply in order from 1 to 1072820.
     
     snpNamesByteOffset <- seek(tempCon)
     snpNames <- rep("A", nEntries)
     for(i1 in 1:nEntries){
       nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
       snpNames[i1] <- readChar(tempCon, nChars)
     }
 
   }
 
   seek(tempCon, 15278138)
     
   normIDByteOffset <- seek(tempCon)
   normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
   
   newBlockByteOffset <- seek(tempCon)
   newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
   
   close(tempCon)
 
   byteOffsets <- list(entriesByteOffset=entriesByteOffset,
                       #snpIndexByteOffset=snpIndexByteOffset, 
                       #snpNamesByteOffset=snpNamesByteOffset,
                       normIDByteOffset=normIDByteOffset,
                       newBlockByteOffset=newBlockByteOffset)
   
   allStuff <- list(prefixCheck=prefixCheck,
                    null.1=null.1,
                    versionNumber=versionNumber,
                    chipType=chipType,
                    null.2=null.2,
                    csvLines=csvLines,
                    nEntries=nEntries,
                    #snpIndex=snpIndex,
                    #snpNames=snpNames,
                    normID=normID,
                    newBlock=newBlock,
                    byteOffsets=byteOffsets)
   allStuff
   
 }
 
 
cfdeb14b
 RGtoXY = function(RG, chipType, verbose=TRUE) {
cd8b85b6
   chipList = c("human1mv1c",             # 1M
                "human370v1c",            # 370CNV
                "human650v3a",            # 650Y
                "human610quadv1b",        # 610 quad
                "human660quadv1a",        # 660 quad
                "human370quadv3c",        # 370CNV quad
                "human550v3b",            # 550K
                "human1mduov3b")          # 1M Duo   
   if(missing(chipType))
      chipType = match.arg(annotation(RG), chipList)
   else
      chipType = match.arg(chipType, chipList)
   
   pkgname <- getCrlmmAnnotationName(chipType)
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
      suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
      msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
      message(strwrap(msg))
      stop("Package ", pkgname, " could not be found.")
      rm(suggCall, msg)
   }
   if(verbose) message("Loading chip annotation information.")
cfdeb14b
     loader("address.rda", .crlmmPkgEnv, pkgname)
cd8b85b6
 #    data(annotation, package=pkgname, envir=.crlmmPkgEnv)
cfdeb14b
   aids <- getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
   bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
   ids <- names(aids)
   snpbase <- getVarInEnv("base")
cd8b85b6
   
cfdeb14b
   nsnps = length(aids)
cd8b85b6
   narrays = ncol(RG)
cfdeb14b
   
 #  aidcol = match("AddressA_ID", colnames(annot))
 #  if(is.na(aidcol))
 #    aidcol = match("Address", colnames(annot))
 #  bidcol = match("AddressB_ID", colnames(annot))
 #  if(is.na(bidcol)) 
 #    bidcol = match("Address2", colnames(annot))
 #  aids = annot[, aidcol]
 #  bids = annot[, bidcol]
 #  snpids = annot[,"Name"]
 #  snpbase = annot[,"SNP"] 
   infI = !is.na(bids) & bids!=0  
cd8b85b6
   aord = match(aids, featureNames(RG)) # NAs are possible here
cfdeb14b
   bord = match(bids, featureNames(RG)) # and here
cd8b85b6
 #  argrg = aids[rrgg]
 #  brgrg = bids[rrgg]
cfdeb14b
 
852cc5cf
   tmpmat = matrix(0, nsnps, narrays)
   rownames(tmpmat) = ids
   colnames(tmpmat) = sampleNames(RG)
   XY = new("NChannelSet", X=tmpmat, Y=tmpmat, Xnb=tmpmat, Ynb=tmpmat, Xse=tmpmat, Yse=tmpmat, zero=tmpmat,
                  annotation=chipType, phenoData=RG@phenoData, scanDates=RG@scanDates, storage.mode="environment")
   rm(tmpmat)
   gc()
   
cfdeb14b
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
852cc5cf
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
   XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
   XY@assayData$Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
   XY@assayData$Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
   XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
   XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
   gc()
cd8b85b6
   
cfdeb14b
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
   
   # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
   
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
 #  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
 #  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
 #  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
 #  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
cd8b85b6
   
cfdeb14b
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
 
 #  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
 #  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
 #  Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
 #  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
 #  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
 #  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
     
   #  For now zero out Infinium I probes
852cc5cf
   XY@assayData$X[infI,] = 0
   XY@assayData$Y[infI,] = 0
   XY@assayData$Xnb[infI,] = 0
   XY@assayData$Ynb[infI,] = 0
   XY@assayData$Xse[infI,] = 0
   XY@assayData$Yse[infI,] = 0
 
   XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
cd8b85b6
   gc()
852cc5cf
 
 #  storageMode(XY) = "lockedEnvironment"
   XY
cd8b85b6
 }
 
 stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
   pkgname <- getCrlmmAnnotationName(annotation(XY))
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
     message(strwrap(msg))
     stop("Package ", pkgname, " could not be found.")
     rm(suggCall, msg)
   }
   if(verbose) message("Loading strip and reference normalization information.")
   loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
 #  data(preprocStuff, package=pkgname, envir=.crlmmPkgEnv)
 
   stripnum <- getVarInEnv("stripnum")
   
   if(useTarget)
     targetdist = getVarInEnv("reference")
   
852cc5cf
 #  Xqws = Yqws = matrix(0, nrow(XY), ncol(XY))
 #  colnames(Xqws) = colnames(Yqws) = sampleNames(XY) #$X
 #  rownames(Xqws) = rownames(Yqws) = featureNames(XY)
cd8b85b6
 
   if(verbose){
     message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3)
   } 
   for(s in 1:max(stripnum)) {
     if(verbose) {
       if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
       else cat(".")
     }
     sel = stripnum==s
     subX = exprs(channel(XY, "X"))[sel,]
     subY = exprs(channel(XY, "Y"))[sel,]
     if(useTarget)
       tmp = normalize.quantiles.use.target(as.matrix(cbind(subX, subY)), targetdist[[s]])
     else
       tmp = normalize.quantiles(as.matrix(cbind(subX, subY)))
852cc5cf
     XY@assayData$X[sel,] = tmp[,1:(ncol(tmp)/2)]
     XY@assayData$Y[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]
 #    Xqws[sel,] = tmp[,1:(ncol(tmp)/2)]
 #    Yqws[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]
cd8b85b6
     rm(subX, subY, tmp, sel)
     gc()
   }
   if(verbose)
     cat("\n")
852cc5cf
   XY
 #  XYNorm = new("NChannelSet",
 #                X=Xqws+16,
 #                Y=Yqws+16,
 #                Xnb=exprs(channel(XY, "Xnb")),
 #                Ynb=exprs(channel(XY, "Ynb")),
 #                Xse=exprs(channel(XY, "Xse")),
 #                Yse=exprs(channel(XY, "Yse")),
 #                zero=exprs(channel(XY, "zero")),
 #                annotation=annotation(XY),
 #                phenoData=XY@phenoData)
cd8b85b6
 }
 
 
 preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1,
cfdeb14b
                                cdfName, sns, stripNorm=TRUE, useTarget=TRUE, save.it=FALSE, intensityFile) {
cd8b85b6
   if(stripNorm)
     XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
cfdeb14b
 
cd8b85b6
 ## MR: the code below is mostly straight from snprma.R
9564b5c5
   if (missing(sns)) sns <- sampleNames(XY) #$X
cd8b85b6
   if(missing(cdfName))
     cdfName <- annotation(XY)
 ##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
   pkgname <- getCrlmmAnnotationName(cdfName)
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
     message(strwrap(msg))
     stop("Package ", pkgname, " could not be found.")
     rm(suggCall, msg)
   }
   if(verbose) message("Loading snp annotation and mixture model parameters.")
   loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
cfdeb14b
   loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
   loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
   loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
cd8b85b6
 #  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
   autosomeIndex <- getVarInEnv("autosomeIndex")
   #  pnsa <- getVarInEnv("pnsa")
   #  pnsb <- getVarInEnv("pnsb")
   #  fid <- getVarInEnv("fid")
   #  reference <- getVarInEnv("reference")
   #  aIndex <- getVarInEnv("aIndex")
   #  bIndex <- getVarInEnv("bIndex")
   SMEDIAN <- getVarInEnv("SMEDIAN")
   theKnots <- getVarInEnv("theKnots")
   gns <- getVarInEnv("gns")
cfdeb14b
   
   # separate out copy number probes
   npIndex = getVarInEnv("npProbesFid")
   nprobes = length(npIndex)
   narrays = ncol(XY)
   A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)
   B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
   cnAB = list(A=A, B=B, sns=sns, gns=names(npIndex), cdfName=cdfName)
   
   # next process snp probes
   snpIndex = getVarInEnv("snpProbesFid")
   nprobes <- length(snpIndex)
cd8b85b6
   
   ##We will read each cel file, summarize, and run EM one by one
   ##We will save parameters of EM to use later
   mixtureParams <- matrix(0, 4, narrays)
   SNR <- vector("numeric", narrays)
   SKW <- vector("numeric", narrays)
 
   ## This is the sample for the fitting of splines
   ## BC: I like better the idea of the user passing the seed,
   ##     because this might intefere with other analyses
   ##     (like what happened to GCRMA)
   set.seed(seed)
 
   idx <- sort(sample(autosomeIndex, mixtureSampleSize))
   
   ##S will hold (A+B)/2 and M will hold A-B
   ##NOTE: We actually dont need to save S. Only for pics etc...
   ##f is the correction. we save to avoid recomputing
cfdeb14b
   A <- matrix(as.integer(exprs(channel(XY, "X"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
   B <- matrix(as.integer(exprs(channel(XY, "Y"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
cd8b85b6
 
   # new lines below - useful to keep track of zeroed out SNPs
cfdeb14b
   zero <- matrix(as.integer(exprs(channel(XY, "zero"))[snpIndex,]), nprobes, narrays)
cd8b85b6
     
cfdeb14b
   colnames(A) <- colnames(B) <- colnames(zero) <- sns
   rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
cd8b85b6
   
   if(verbose){
      message("Calibrating ", narrays, " arrays.")
      if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
   }
 
   idx2 <- sample(nprobes, 10^5)
   for(i in 1:narrays){
     SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
     if(fitMixture){
       S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
       M <- log2(A[idx, i])-log2(B[idx, i])
 
       ##we need to test the choice of eps.. it is not the max diff between funcs
       tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
 
       mixtureParams[, i] <- tmp[["coef"]]
       SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
     }
     if(verbose) {
cfdeb14b
        if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
        else cat(".")
cd8b85b6
     }
   }
   if (verbose) {
     if (getRversion() > '2.7.0') close(pb)
     else cat("\n")
   }
   if (!fitMixture) SNR <- mixtureParams <- NA
   ## gns comes from preprocStuff.rda
cfdeb14b
   res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
 
   if(save.it & !missing(intensityFile)) {
     t0 <- proc.time() 
     save(cnAB, res, file=intensityFile)
     t0 <- proc.time()-t0
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", intensityFile, ".")
   }
   return(res)
cd8b85b6
 }
 
 
cfdeb14b
 ## MR: Could add arguments to allow this function to read in idat data as well,
 ## although this would add a further 7 arguments, which might over-complicate things
cd8b85b6
 crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
                   row.names=TRUE, col.names=TRUE,
                   probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
d6533a61
                   seed=1, save.it=FALSE, load.it=FALSE, intensityFile,
cd8b85b6
                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
                   cdfName, sns, recallMin=10, recallRegMin=1000,
                   returnParams=FALSE, badSNP=.7) {
cfdeb14b
 
d6533a61
   if ((load.it | save.it) & missing(intensityFile))
     stop("'intensityFile' is missing, and you chose either load.it or save.it")
   if (!missing(intensityFile))
     if (load.it & !file.exists(intensityFile)){
       load.it <- FALSE
       message("File ", intensityFile, " does not exist.")
       message("Not loading it, but running SNPRMA from scratch.")
   }
   if (!load.it){
cfdeb14b
     if(!missing(RG)) {
       if(missing(XY))
         XY = RGtoXY(RG, chipType=cdfName)
       else
         stop("Both RG and XY specified - please use one or the other")
d6533a61
     }
9564b5c5
     if (missing(sns)) sns <- sampleNames(XY) #$X
cfdeb14b
     
     res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
                         seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
                         save.it=save.it, intensityFile=intensityFile)
d6533a61
   }else{
       if(verbose) message("Loading ", intensityFile, ".")
         obj <- load(intensityFile)
         if(verbose) message("Done.")
cfdeb14b
         if(!any(obj == "res"))
           stop("Object in ", intensityFile, " seems to be invalid.")
d6533a61
   }
cd8b85b6
   if(row.names) row.names=res$gns else row.names=NULL
   if(col.names) col.names=res$sns else col.names=NULL
 
   res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
                   res[["mixtureParams"]], res[["cdfName"]],
                   gender=gender, row.names=row.names,
                   col.names=col.names, recallMin=recallMin,
                   recallRegMin=1000, SNRMin=SNRMin,
                   returnParams=returnParams, badSNP=badSNP,
                   verbose=verbose)
   
   res2[["SNR"]] <- res[["SNR"]]
   res2[["SKW"]] <- res[["SKW"]]
cfdeb14b
   return(list2SnpSet(res2, returnParams=returnParams)) # return(res2)
cd8b85b6
 }