R/crlmm-illumina.R
063b3d14
 					# function below works OK provided all .idat files are in the current working directory
cd8b85b6
 # - could add an option to allow files in Illumina directory structure to be handled
 # or to use the optional 'Path' column in sampleSheet
 # - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
 
450a6d8a
 readIdatFiles = function(sampleSheet=NULL,
5fcfed7d
 			  arrayNames=NULL,
 			  ids=NULL,
 			  path=".",
 			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			  highDensity=FALSE,
 			  sep="_",
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
99202387
 			  saveDate=FALSE, verbose=FALSE) {
453e688a
 	verbose <- FALSE
e609f78b
        if(!is.null(arrayNames)) {
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
        }
        if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
 	       if(is.null(arrayNames)){
 		       if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
 			       barcode = sampleSheet[,arrayInfoColNames$barcode]
 			       arrayNames=barcode
 		       }
f1e14c0e
 		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
e609f78b
 			       position = sampleSheet[,arrayInfoColNames$position]
 			       if(is.null(arrayNames))
 				       arrayNames=position
 			       else
 				       arrayNames = paste(arrayNames, position, sep=sep)
 			       if(highDensity) {
 				       hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
 				       for(i in names(hdExt))
 					       arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
 			       }
 		       }
 	       }
 	       pd = new("AnnotatedDataFrame", data = sampleSheet)
063b3d14
 	       sampleNames(pd) = make.unique(basename(arrayNames))
e609f78b
        }
        if(is.null(arrayNames)) {
                arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
                if(!is.null(sampleSheet)) {
                       sampleSheet=NULL
                       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
                }
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
        }
        narrays = length(arrayNames)
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
        if(length(grnfiles)==0 || length(redfiles)==0)
 	       stop("Cannot find .idat files")
        if(length(grnfiles)!=length(redfiles))
 	       stop("Cannot find matching .idat files")
d98bfc16
        if(path[1] != "." & path[1] != ""){
e609f78b
 	       grnidats = file.path(path, grnfiles)
 	       redidats = file.path(path, redfiles)
        }  else {
d98bfc16
 	       message("path arg not set.  Assuming files are in local directory, or that complete path is provided")
450a6d8a
 	       grnidats = grnfiles
 	       redidats = redfiles
e609f78b
        }
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
f1e14c0e
        if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
e609f78b
        headerInfo = list(nProbes = rep(NA, narrays),
                          Barcode = rep(NA, narrays),
                          ChipType = rep(NA, narrays),
                          Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
                          Position = rep(NA, narrays)) # this may also vary a bit
        dates = list(decode=rep(NA, narrays),
                     scan=rep(NA, narrays))
        ## read in the data
d98bfc16
        for(i in seq_along(arrayNames)) {
99202387
 	       if(verbose) {
453e688a
 	       ## RS
 		       ##cat("reading", arrayNames[i], "\t")
 		       cat("reading", basename(arrayNames[i]), "\t")
 		       cat(paste(sep, fileExt$green, sep=""), "\t")
99202387
 	       }
e609f78b
 	       idsG = idsR = G = R = NULL
adde216c
 	       G = readIDAT(grnidats[i])
e609f78b
 	       idsG = rownames(G$Quants)
 	       headerInfo$nProbes[i] = G$nSNPsRead
 	       headerInfo$Barcode[i] = G$Barcode
 	       headerInfo$ChipType[i] = G$ChipType
 	       headerInfo$Manifest[i] = G$Unknown$MostlyNull
 	       headerInfo$Position[i] = G$Unknowns$MostlyA
f7308027
                if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
e609f78b
 		       warning("Chips are not of the same type.  Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
 		       next()
 	       }
 	       dates$decode[i] = G$RunInfo[1, 1]
 	       dates$scan[i] = G$RunInfo[2, 1]
 	       if(i==1) {
 		       if(is.null(ids) && !is.null(G)){
 			       ids = idsG
063b3d14
 		       } # else stop("Could not find probe IDs")
e609f78b
 		       nprobes = length(ids)
 		       narrays = length(arrayNames)
450a6d8a
 		       RG = new("NChannelSet",
99202387
 		                 R=matrix(NA, nprobes, narrays),
 		                 G=matrix(NA, nprobes, narrays),
 		                 zero=matrix(NA, nprobes, narrays),
 				 annotation=headerInfo$Manifest[1],
 				 phenoData=pd, storage.mode="environment")
 		       featureNames(RG) = ids
e609f78b
 		       if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
063b3d14
 		            sampleNames(RG) = make.unique(sampleSheet$Sample_ID)
 		       } else  sampleNames(RG) = make.unique(basename(arrayNames))
a1394042
 		       gc(verbose=FALSE)
e609f78b
 	       }
 	       if(length(ids)==length(idsG)) {
 		       if(sum(ids==idsG)==nprobes) {
 			       RG@assayData$G[,i] = G$Quants[, "Mean"]
 			       zeroG = G$Quants[, "NBeads"]==0
 		       }
 	       } else {
 		       indG = match(ids, idsG)
 		       RG@assayData$G[,i] = G$Quants[indG, "Mean"]
 		       zeroG = G$Quants[indG, "NBeads"]==0
 	       }
 	       rm(G)
a1394042
 	       gc(verbose=FALSE)
e6e6981f
 	       if(verbose) {
99202387
                       cat(paste(sep, fileExt$red, sep=""), "\n")
 	       }
e609f78b
 	       R = readIDAT(redidats[i])
 	       idsR = rownames(R$Quants)
f1e14c0e
 
 	       if(length(ids)==length(idsG)) {
e609f78b
 		       if(sum(ids==idsR)==nprobes) {
 			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
99202387
 		               zeroR = R$Quants[ ,"NBeads"]==0
e609f78b
 		       }
 	       } else {
 		       indR = match(ids, idsR)
 		       RG@assayData$R[,i] = R$Quants[indR, "Mean"]
 		       zeroR = R$Quants[indR, "NBeads"]==0
 	       }
 	       RG@assayData$zero[,i] = zeroG | zeroR
 	       rm(R, zeroG, zeroR)
a1394042
 	       gc(verbose=FALSE)
e609f78b
        }
        if(saveDate) {
 	       protocolData(RG)[["ScanDate"]] = dates$scan
        }
        storageMode(RG) = "lockedEnvironment"
        RG
 }
 
 
64463948
 ## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
d773ad80
 ## edits provided by Kasper Daniel Hansen, 4/10/2011
cd8b85b6
 readIDAT <- function(idatFile){
   fileSize <- file.info(idatFile)$size
 
   tempCon <- file(idatFile,"rb")
   prefixCheck <- readChar(tempCon,4)
e3c2c3b9
 #  if(prefixCheck != "IDAT"){
 #       warning("May need to check format of ", idatFile)
 #  }
cd8b85b6
 
d773ad80
   versionNumber <- readBin(tempCon, "integer", n=1, size=8, endian="little")
b5d1d7e5
 
   if(versionNumber<3)
 	  stop("Older style IDAT files not supported:  consider updating your scanner settings")
f1e14c0e
 
d773ad80
   nFields <- readBin(tempCon, "integer", n=1, size=4, endian="little")
cd8b85b6
 
   fields <- matrix(0,nFields,3);
   colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
   for(i1 in 1:nFields){
f1e14c0e
     fields[i1,"Field Code"] <-
cd8b85b6
       readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
f1e14c0e
     fields[i1,"Byte Offset"] <-
82ef3a72
       readBin(tempCon, "integer", n=1, size=8, endian="little")
cd8b85b6
   }
 
d773ad80
   knownCodes <-
     c("nSNPsRead"  = 1000,
       "IlluminaID" =  102,
       "SD"         =  103,
       "Mean"       =  104,
       "NBeads"     =  107,
       "MidBlock"   =  200,
       "RunInfo"    =  300,
       "RedGreen"   =  400,
       "MostlyNull" =  401,
       "Barcode"    =  402,
       "ChipType"   =  403,
       "MostlyA"    =  404,
       "Unknown.1"  =  405,
       "Unknown.2"  =  406,
       "Unknown.3"  =  407,
       "Unknown.4"  =  408,
       "Unknown.5"  =  409,
       "Unknown.6"  =  410,
       "Unknown.7"  =  510
cd8b85b6
       )
 
   nNewFields <- 1
   rownames(fields) <- paste("Null", 1:nFields)
   for(i1 in 1:nFields){
     temp <- match(fields[i1,"Field Code"], knownCodes)
     if(!is.na(temp)){
       rownames(fields)[i1] <- names(knownCodes)[temp]
     }else{
       rownames(fields)[i1] <- paste("newField", nNewFields, sep=".")
       nNewFields <- nNewFields + 1
     }
   }
 
d773ad80
   fields <- fields[order(fields[, "Byte Offset"]),]
cd8b85b6
 
d773ad80
   seek(tempCon, fields["nSNPsRead", "Byte Offset"])
   nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, endian="little")
 
   readBlock <- function(nam) {
       switch(nam,
              "IlluminaID" = {
                  seek(tempCon, fields["IlluminaID", "Byte Offset"])
                  IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, endian="little")
                  IlluminaID
              },
              "SD" = {
                  seek(tempCon, fields["SD", "Byte Offset"])
                  SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, endian="little", signed=FALSE)
                  SD
              },
              "Mean" = {
                  seek(tempCon, fields["Mean", "Byte Offset"])
                  Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, endian="little", signed=FALSE)
                  Mean
              },
              "NBeads" = {
                  seek(tempCon, fields["NBeads", "Byte Offset"])
                  NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
                  NBeads
              },
              "MidBlock" = {
                  seek(tempCon, fields["MidBlock", "Byte Offset"])
                  nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, endian="little")
                  MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4,
                                      endian="little")
                  MidBlock
              },
              "RunInfo" = {
                  seek(tempCon, fields["RunInfo", "Byte Offset"])
                  nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, endian="little")
                  RunInfo <- matrix(NA, nRunInfoBlocks, 5)
                  colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars",
                                         "BlockCode", "CodeVersion")
                  for(i1 in 1:2) { #nRunInfoBlocks){  ## MR edit
                      for(i2 in 1:5){
                          nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                          RunInfo[i1,i2] <- readChar(tempCon, nChars)
                      }
                  }
                  RunInfo
              },
              "RedGreen" = {
                  seek(tempCon, fields["RedGreen", "Byte Offset"])
                  RedGreen <- readBin(tempCon, "numeric", n=1, size=4,
                                      endian="little")
                  RedGreen
              },
              "MostlyNull" = {
                  seek(tempCon, fields["MostlyNull", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  MostlyNull <- readChar(tempCon, nChars)
                  MostlyNull
              },
af08b5d6
              "Barcode" = {
d773ad80
                  seek(tempCon, fields["Barcode", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Barcode <- readChar(tempCon, nChars)
                  Barcode
              },
              "ChipType" = {
                  seek(tempCon, fields["ChipType", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  ChipType <- readChar(tempCon, nChars)
                  ChipType
              },
              "MostlyA" = {
                  seek(tempCon, fields["MostlyA", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  MostlyA <- readChar(tempCon, nChars)
              },
              "Unknown.1" = {
                  seek(tempCon, fields["Unknown.1", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.1 <- readChar(tempCon, nChars)
                  Unknown.1
              },
              "Unknown.2" = {
                  seek(tempCon, fields["Unknown.2", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.2 <- readChar(tempCon, nChars)
                  Unknown.2
              },
              "Unknown.3" = {
                  seek(tempCon, fields["Unknown.3", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.3 <- readChar(tempCon, nChars)
                  Unknown.3
              },
              "Unknown.4" = {
                  seek(tempCon, fields["Unknown.4", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.4 <- readChar(tempCon, nChars)
                  Unknown.4
              },
              "Unknown.5" = {
                  seek(tempCon, fields["Unknown.5", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.5 <- readChar(tempCon, nChars)
                  Unknown.5
              },
              "Unknown.6" = {
                  seek(tempCon, fields["Unknown.6", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.6 <- readChar(tempCon, nChars)
                  Unknown.6
              },
              "Unknown.7" = {
                  seek(tempCon, fields["Unknown.7", "Byte Offset"])
                  nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
                  Unknown.7 <- readChar(tempCon, nChars)
                  Unknown.7
              })
cd8b85b6
   }
 
d773ad80
   readFields <- setdiff(rownames(fields), "nSNPsRead")
   names(readFields) <- readFields
af08b5d6
 
d773ad80
   allFields <- lapply(readFields, readBlock)
cd8b85b6
 
   close(tempCon)
 
d773ad80
   UnknownNames <- c("MostlyNull", "MostlyA", "Unknown.1",
                     "Unknown.2", "Unknown.3", "Unknown.4",
                     "Unknown.5", "Unknown.6", "Unknown.7")
   Unknowns <- allFields[intersect(names(allFields), UnknownNames)]
cd8b85b6
 
d773ad80
   Quants <- cbind(allFields$Mean, allFields$SD, allFields$NBeads)
cd8b85b6
   colnames(Quants) <- c("Mean", "SD", "NBeads")
d773ad80
   rownames(Quants) <- as.character(allFields$IlluminaID)
cd8b85b6
 
d773ad80
   InfoNames <- c("MidBlock", "RunInfo", "RedGreen", "Barcode", "ChipType")
   Info <- allFields[intersect(names(allFields), InfoNames)]
af08b5d6
 
f1e14c0e
   idatValues <-
     list(fileSize=fileSize,
          versionNumber=versionNumber,
          nFields=nFields,
cd8b85b6
          fields=fields,
f1e14c0e
          nSNPsRead=nSNPsRead,
d773ad80
          Quants=Quants)
   idatValues <- c(idatValues, Info, list(Unknowns = Unknowns))
cd8b85b6
   idatValues
 }
 
d773ad80
 
cd8b85b6
 readBPM <- function(bpmFile){
 
   ## Reads and parses Illumina BPM files
f1e14c0e
 
cd8b85b6
   fileSize <- file.info(bpmFile)$size
 
   tempCon <- file(bpmFile,"rb")
 
   # The first few bytes of the egtFile are some type of
f1e14c0e
   # header, but there's no related byte offset information.
cd8b85b6
 
   prefixCheck <- readChar(tempCon,3) ## should be "BPM"
 
   null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
f1e14c0e
   ## should be 1
 
   versionNumber <-
82ef3a72
     readBin(tempCon, "integer", n=1, size=4, endian="little")
cd8b85b6
   ## should be 4
 
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   chipType <- readChar(tempCon, nChars)
 
   null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE)
 
   csvLines <- readLines(tempCon, 22)
 
   entriesByteOffset <- seek(tempCon);
   nEntries <- readBin(tempCon, "integer", n=1, size=4,
82ef3a72
                       endian="little")
f1e14c0e
 
cd8b85b6
   if(FALSE){
f1e14c0e
 
cd8b85b6
     snpIndexByteOffset <- seek(tempCon)
     snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
82ef3a72
                         endian="little")
cd8b85b6
     ## for the 1M array, these are simply in order from 1 to 1072820.
f1e14c0e
 
cd8b85b6
     snpNamesByteOffset <- seek(tempCon)
     snpNames <- rep("A", nEntries)
     for(i1 in 1:nEntries){
       nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
       snpNames[i1] <- readChar(tempCon, nChars)
     }
 
   }
 
   seek(tempCon, 15278138)
f1e14c0e
 
cd8b85b6
   normIDByteOffset <- seek(tempCon)
   normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
f1e14c0e
 
cd8b85b6
   newBlockByteOffset <- seek(tempCon)
   newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
f1e14c0e
 
cd8b85b6
   close(tempCon)
 
   byteOffsets <- list(entriesByteOffset=entriesByteOffset,
f1e14c0e
                       #snpIndexByteOffset=snpIndexByteOffset,
cd8b85b6
                       #snpNamesByteOffset=snpNamesByteOffset,
                       normIDByteOffset=normIDByteOffset,
                       newBlockByteOffset=newBlockByteOffset)
f1e14c0e
 
cd8b85b6
   allStuff <- list(prefixCheck=prefixCheck,
                    null.1=null.1,
                    versionNumber=versionNumber,
                    chipType=chipType,
                    null.2=null.2,
                    csvLines=csvLines,
                    nEntries=nEntries,
                    #snpIndex=snpIndex,
                    #snpNames=snpNames,
                    normID=normID,
                    newBlock=newBlock,
                    byteOffsets=byteOffsets)
   allStuff
f1e14c0e
 
cd8b85b6
 }
 
5fcfed7d
 
d3668036
 readGenCallOutput = function(file, path=".", cdfName,
7c0c9ac5
     colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"),
d3668036
     type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) {
 
     if(!identical(names(type), names(colnames)))
        stop("The arguments 'colnames' and 'type' must have consistent names")
5cdc0c76
     if(verbose)
d3668036
   	cat("Reading", file, "\n")
     tmp=readLines(file.path(path,file),n=15)
     s=c("\t",",")
5cdc0c76
     a=unlist(strsplit(tmp[10][1],s[1]))
d3668036
     if(length(a)!=1){
 	sepp=s[1]
5cdc0c76
       	a1=unlist(strsplit(tmp[10][1],s[1]))
     }
d3668036
     if(length(a)==1){
 	sepp=s[2]
 	a1=unlist(strsplit(tmp[10][1],s[2]))
5cdc0c76
     }
7c0c9ac5
 
d3668036
     if(sum(is.na(match(colnames,a1)))!=0)
7c0c9ac5
 	stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.")
d3668036
 
     m1=m=match(a1,colnames)
     m[is.na(m1)==FALSE] =type[m1[!is.na(m1)]]
     m[is.na(m)==TRUE] = list(NULL)
     names(m) = names(colnames)[m1]
 
5cdc0c76
     fc = file(file.path(path, file), open="r")
7c0c9ac5
 
d3668036
     dat = scan(fc, what=m, skip=10,sep=sepp)
     close(fc)
7c0c9ac5
 
d3668036
     samples = unique(dat$"SampleID")
     nsamples = length(samples)
     snps = unique(dat$"SNPID")
     nsnps = length(snps)
     if(verbose)
         cat("Check ordering for samples","\n")
7c0c9ac5
 
d3668036
     X = Y = zeroes = matrix(0, nsnps, nsamples)
7c0c9ac5
 
d3668036
     for(i in 1:length(samples)) {
         ind = dat$"SampleID"==samples[i]
 	    if(sum(dat$"SNPID"[ind]==snps)==nsnps) {
 #           if(verbose)
 #	            cat(paste("Correct ordering for sample", samples[i], "\n"))
 			X[,i] = dat$"XRaw"[ind]
 	        Y[,i] = dat$"YRaw"[ind]
 	    }
         if(sum(dat$"SNPID"[ind]==snps)!=nsnps) {
 	        if(verbose)
                 cat("Reordering sample ", samples[i],"\n")
             m=match(snps,dat$"SNPID"[ind])
             X[,i]= dat$"XRaw"[ind][m]
             Y[,i]= dat$"YRaw"[ind][m]
         }
a1394042
         gc(verbose=FALSE)
d3668036
     }
7c0c9ac5
 
5cdc0c76
     zeroes=(X=="0"|Y=="0")
d3668036
     colnames(X) = colnames(Y) =  colnames(zeroes) = samples
     rownames(X) = rownames(Y) = snps
7c0c9ac5
 
     if(verbose)
d3668036
         cat("Creating NChannelSet object\n")
7c0c9ac5
 
5cdc0c76
     XY = new("NChannelSet", X = initializeBigMatrix(name = "X", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=X),
 			 Y = initializeBigMatrix(name = "Y", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=Y),
 			 zero = initializeBigMatrix(name = "zero", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=zeroes),
 			 annotation = cdfName, storage.mode = "environment")
d3668036
     sampleNames(XY)=colnames(X)
7c0c9ac5
 
5cdc0c76
     if(verbose)
       cat("Done\n")
7c0c9ac5
 
5cdc0c76
     XY
 }
 
 
9a4a5148
 RGtoXY = function(RG, chipType, verbose=TRUE) {
453e688a
   needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded))
   if(needToLoad){
01aea5e1
 	  chipList = c("human1mv1c",# 1M
453e688a
 	  "human370v1c",            # 370CNV
 	  "human650v3a",            # 650Y
 	  "human610quadv1b",        # 610 quad
 	  "human660quadv1a",        # 660 quad
 	  "human370quadv3c",        # 370CNV quad
 	  "human550v3b",            # 550K
 	  "human1mduov3b",          # 1M Duo
 	  "humanomni1quadv1b",      # Omni1 quad
 	  "humanomni25quadv1b",     # Omni2.5 quad
063b3d14
 	  "humanomni258v1a",        # Omni2.5 8 sample
453e688a
 	  "humanomniexpress12v1b",  # Omni express 12
01aea5e1
 	  "humanimmuno12v1b",       # Immuno chip 12
           "humancytosnp12v2p1h")    # CytoSNP 12
063b3d14
 	  ## RS: added cleancdfname()
453e688a
 	  if(missing(chipType)){
063b3d14
 		  chipType = match.arg(cleancdfname(annotation(RG)), chipList)
 	  } else chipType = match.arg(cleancdfname(chipType), chipList)
453e688a
 	  pkgname = getCrlmmAnnotationName(chipType)
 	  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
 		  suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
 		  msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
 		  message(strwrap(msg))
 		  stop("Package ", pkgname, " could not be found.")
 		  rm(suggCall, msg)
 	  }
 	  if(verbose) message("Loading chip annotation information.")
 	  loader("address.rda", .crlmmPkgEnv, pkgname)
e609f78b
   }
450a6d8a
   aids = getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
   bids = getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
   snpbase = getVarInEnv("base")
453e688a
 ##  loader(‘file.rda’)
 ##  x = getVarInEnv(‘x’)
 ##  y = getVarInEnv(‘y’)
 ##
 ##  I’d consider using something like:
 ##
 ##	  needToLoad = !all(sapply(c(‘x’, ‘y’), isLoaded))
 ##  if (needToLoad){
 ##	  loader(‘file.rda’)
 ##	  x = getVarInEnv(‘x’)
 ##	  y = getVarInEnv(‘y’)
 ##  }
   ids = names(aids)
e609f78b
   nsnps = length(aids)
   narrays = ncol(RG)
 #  aidcol = match("AddressA_ID", colnames(annot))
 #  if(is.na(aidcol))
 #    aidcol = match("Address", colnames(annot))
 #  bidcol = match("AddressB_ID", colnames(annot))
f1e14c0e
 #  if(is.na(bidcol))
e609f78b
 #    bidcol = match("Address2", colnames(annot))
 #  aids = annot[, aidcol]
 #  bids = annot[, bidcol]
 #  snpids = annot[,"Name"]
f1e14c0e
 #  snpbase = annot[,"SNP"]
   infI = !is.na(bids) & bids!=0
e609f78b
   aord = match(aids, featureNames(RG)) # NAs are possible here
   bord = match(bids, featureNames(RG)) # and here
 #  argrg = aids[rrgg]
 #  brgrg = bids[rrgg]
453e688a
   XY <- new("NChannelSet",
 	    X=matrix(0, nsnps, narrays),
 	    Y=matrix(0, nsnps, narrays),
 	    zero=matrix(0, nsnps, narrays),
 	    annotation=chipType, phenoData=RG@phenoData,
 	    protocolData=RG@protocolData, storage.mode="environment")
3f404581
   featureNames(XY) = ids
e972ec4a
   sampleNames(XY) = sampleNames(RG)
453e688a
   ## RS
   rm(list=c("bord", "infI", "aids", "bids", "ids", "snpbase"))
a1394042
 #  print(gc())
   gc(verbose=FALSE)
a11efc9c
   # Need to initialize - matrices filled with NAs to begin with
137e5619
 #  XY@assayData$X[1:nsnps,] = 0
 #  XY@assayData$Y[1:nsnps,] = 0
 #  XY@assayData$zero[1:nsnps,] = 0
f1e14c0e
 
e609f78b
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
453e688a
   ## RS added
   not.na <- !is.na(aord)
   not.na.aord <- aord[not.na]
   ## RS substitution  !is.na(aord) -> not.na
   ##r <- as.matrix(exprs(channel(RG, "R"))[not.na.aord,]) # mostly red
   r <- as.matrix(assayData(RG)[["R"]][not.na.aord, ])
   XY@assayData$X[not.na,] <- r
a1394042
   rm(r);gc(verbose=FALSE)
453e688a
   g <- as.matrix(assayData(RG)[["G"]][not.na.aord,]) # mostly green
   XY@assayData$Y[not.na,] <- g
a1394042
   rm(g); gc(verbose=FALSE)
453e688a
   ##z <- as.matrix(exprs(channel(RG, "zero"))[not.na.aord,]) # mostly green
   z <- as.matrix(assayData(RG)[["zero"]][not.na.aord, ])
   XY@assayData$zero[not.na,] <- z
a1394042
   rm(z); gc(verbose=FALSE)
453e688a
   ##RS added
   rm(RG)
a1394042
 #  print(gc())
   gc(verbose=FALSE)
e609f78b
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
f1e14c0e
 
e609f78b
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
f1e14c0e
 
e609f78b
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
 
 #  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
 #  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
f1e14c0e
 
e609f78b
   #  For now zero out Infinium I probes
99202387
 #  XY@assayData$X[infI,] = 0
 #  XY@assayData$Y[infI,] = 0
 #  XY@assayData$zero[infI,] = 0
a1394042
 #  gc(verbose=FALSE)
e609f78b
   XY
 }
 
 
cd8b85b6
 stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
453e688a
 	if(useTarget){
 		objectsNeeded <- c("stripnum", "reference")
 	} else objectsNeeded <- "stripnum"
 	needToLoad <- !all(sapply(objectsNeeded, isLoaded))
 	if(needToLoad){
 		pkgname = getCrlmmAnnotationName(annotation(XY))
 		if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
 			suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
 			msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
 			message(strwrap(msg))
 			stop("Package ", pkgname, " could not be found.")
 			rm(suggCall, msg)
 		}
 		if(verbose) message("Loading strip and reference normalization information.")
 		loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
 	}
 	stripnum = getVarInEnv("stripnum")
 	if(useTarget)
 		targetdist = getVarInEnv("reference")
 	if(verbose){
 		message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
 		if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=max(stripnum), style=3)
 	}
5fcfed7d
   for(s in 1:max(stripnum)) {
     if(verbose) {
       if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
       else cat(".")
     }
     sel = stripnum==s
453e688a
     ##RS: faster to access data directly
     ##subX = as.matrix(exprs(channel(XY, "X"))[sel,])
     ##subY = as.matrix(exprs(channel(XY, "Y"))[sel,])
     subX <- as.matrix(assayData(XY)[["X"]][sel, ])
     subY <- as.matrix(assayData(XY)[["Y"]][sel, ])
5fcfed7d
     if(useTarget)
453e688a
       tmp = normalize.quantiles.use.target(cbind(subX, subY), targetdist[[s]])
5fcfed7d
     else
453e688a
       tmp = normalize.quantiles(cbind(subX, subY))
a11efc9c
     XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2)
     XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2)
5fcfed7d
     rm(subX, subY, tmp, sel)
a1394042
     gc(verbose=FALSE)
5fcfed7d
   }
   if(verbose)
     cat("\n")
   XY
cd8b85b6
 }
 
 
450a6d8a
 preprocessInfinium2 = function(XY, mixtureSampleSize=10^5,
e609f78b
 				fitMixture=TRUE,
 				eps=0.1,
 				verbose=TRUE,
 				seed=1,
 				cdfName,
 				sns,
 				stripNorm=TRUE,
99202387
 				useTarget=TRUE) { #,
 #               outdir=".") {
e972ec4a
 #				save.it=FALSE,
 #				snpFile,
 #				cnFile) {
453e688a
 	if(stripNorm)
 		XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
 	## MR: the code below is mostly straight from snprma.R
 	if (missing(sns)) sns = sampleNames(XY) #$X
 	if(missing(cdfName))
 		cdfName = annotation(XY)
 	needToLoad <- !all(sapply(c("autosomeIndex",
 				    "SMEDIAN",
 				    "theKnots",
 				    "npProbesFid"), isLoaded))
 	if(needToLoad){
 		pkgname = getCrlmmAnnotationName(cdfName)
 		if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
 			suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
 			msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
 			message(strwrap(msg))
 			stop("Package ", pkgname, " could not be found.")
 			rm(suggCall, msg)
 		}
 		if(verbose) message("Loading snp annotation and mixture model parameters.")
 		loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
 		loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
 		loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
 		loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
 	}
 	autosomeIndex = getVarInEnv("autosomeIndex")
 	SMEDIAN = getVarInEnv("SMEDIAN")
 	theKnots = getVarInEnv("theKnots")
 	npIndex = getVarInEnv("npProbesFid")
 	snpIndex = getVarInEnv("snpProbesFid")
 	narrays = ncol(XY)
 	nprobes = length(npIndex)
 	if(length(nprobes)>0) {
 		## RS: channel creates an expression set.  This is much slower than directly accessing the data
 		A <- matrix(as.integer(assayData(XY)[["X"]][npIndex, ]), nprobes, narrays)
 		##system.time(A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays))
 		B <- matrix(as.integer(assayData(XY)[["Y"]][npIndex, ]), nprobes, narrays)
 		##B = matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
 		## new lines below - useful to keep track of zeroed out probes
 		zero <- matrix(as.integer(assayData(XY)[["zero"]][npIndex, ]), nprobes, narrays)
 		##zero = matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
 		colnames(A) = colnames(B) = colnames(zero) = sns
 		rownames(A) = rownames(B) = rownames(zero) = names(npIndex)
 		cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
 		##      t0 <- proc.time()
 		##      save(cnAB, file=cnFile)
 		##      t0 <- proc.time()-t0
 		##      if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
 		rm(A, B, zero)
a1394042
 #		print(gc())
 		gc(verbose=FALSE)
453e688a
 	}
e609f78b
   # next process snp probes
450a6d8a
   nprobes = length(snpIndex)
e609f78b
   ##We will read each cel file, summarize, and run EM one by one
   ##We will save parameters of EM to use later
99202387
   mixtureParams = matrix(NA, 4, narrays)
   SNR = rep(NA, narrays)
   SKW = rep(NA, narrays)
e609f78b
   ## This is the sample for the fitting of splines
   ## BC: I like better the idea of the user passing the seed,
   ##     because this might intefere with other analyses
   ##     (like what happened to GCRMA)
   set.seed(seed)
450a6d8a
   idx = sort(sample(autosomeIndex, mixtureSampleSize))
   idx2 = sample(nprobes, 10^5)
e609f78b
   ##S will hold (A+B)/2 and M will hold A-B
   ##NOTE: We actually dont need to save S. Only for pics etc...
   ##f is the correction. we save to avoid recomputing
a11efc9c
 
99202387
   A = matrix(NA, nprobes, narrays)
   B = matrix(NA, nprobes, narrays)
   zero = matrix(NA, nprobes, narrays)
f1e14c0e
 
e609f78b
   if(verbose){
      message("Calibrating ", narrays, " arrays.")
450a6d8a
      if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=narrays, style=3)
e609f78b
   }
   for(i in 1:narrays){
453e688a
 	  ## RS: faster to access data directly without using channel method
 	  ##A[,i] = as.integer(exprs(channel(XY, "X"))[snpIndex,i])
 	  A[, i] <- as.integer(assayData(XY)[["X"]][snpIndex, i])
 	  B[, i] <- as.integer(assayData(XY)[["Y"]][snpIndex, i])
 	  zero[, i] <- as.integer(assayData(XY)[["zero"]][snpIndex, i])
 	  ##B[,i] = as.integer(exprs(channel(XY, "Y"))[snpIndex,i])
 	  ##zero[,i] = as.integer(exprs(channel(XY, "zero"))[snpIndex,i])
 	  SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
 	  if(fitMixture){
 		  S = (log2(A[idx,i])+log2(B[idx,i]))/2 - SMEDIAN
 		  M = log2(A[idx,i])-log2(B[idx,i])
 		  ##we need to test the choice of eps.. it is not the max diff between funcs
 		  tmp = fitAffySnpMixture56(S, M, theKnots, eps=eps)
 		  mixtureParams[, i] = tmp[["coef"]]
 		  SNR[i] = tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
 	  }
 	  if(verbose) {
 		  if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
 		  else cat(".")
 	  }
 	  ## run garbage collection every now and then
a1394042
 	  if(i %% 100 == 0) gc(verbose=FALSE);
e609f78b
   }
   if (verbose) {
453e688a
 	  if (getRversion() > '2.7.0') close(pb)
 	  else cat("\n")
e609f78b
   }
450a6d8a
   if (!fitMixture) SNR = mixtureParams = NA
e609f78b
   ## gns comes from preprocStuff.rda
8d6ca965
 
e972ec4a
 #  if(save.it & !missing(snpFile)) {
 #    t0 <- proc.time()
 #    save(res, file=snpFile)
 #    t0 <- proc.time()-t0
 #    if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
 #  }
f1e14c0e
 
33753706
   res = list(A=A, B=B,
              zero=zero, sns=sns, gns=names(snpIndex), SNR=SNR, SKW=SKW,
e972ec4a
              mixtureParams=mixtureParams, cdfName=cdfName, cnAB=cnAB)
3f404581
 
e972ec4a
 #  open(res[["A"]])
 #  open(res[["B"]])
 #  open(res[["zero"]])
 #  open(res[["SNR"]])
 #  open(res[["mixtureParams"]])
 
 #  if(save.it & !missing(snpFile)) {
 #    t0 <- proc.time()
 #    save(res, file=snpFile)
 #    t0 <- proc.time()-t0
 #    if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
 #  }
8d6ca965
 
e972ec4a
 #  close(res[["A"]])
 #  close(res[["B"]])
 #  close(res[["zero"]])
 #  close(res[["SNR"]])
 #  close(res[["mixtureParams"]])
a1a4312d
 
e609f78b
   return(res)
 }
 
 
3be95c2b
 crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
 			  row.names=TRUE, col.names=TRUE,
 			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
 			  seed=1,
 			  mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
 			  cdfName, sns, recallMin=10, recallRegMin=1000,
 			  returnParams=FALSE, badSNP=.7) {
 	if(missing(cdfName)) {
 		if(!missing(RG))
 			cdfName = annotation(RG)
 		if(!missing(XY))
 			cdfName = annotation(XY)
 	}
 	if(!isValidCdfName(cdfName))
 		stop("cdfName not valid.  see validCdfNames")
 	if(!missing(RG)) {
 		if(missing(XY))
 			XY = RGtoXY(RG, chipType=cdfName)
 		else
 			stop("Both RG and XY specified - please use one or the other")
 	}
 	if (missing(sns)) sns = sampleNames(XY)
 	res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize,
 				   fitMixture=TRUE, verbose=verbose,
 				   seed=seed, eps=eps, cdfName=cdfName, sns=sns,
 				   stripNorm=stripNorm, useTarget=useTarget) #,
 	if(row.names) row.names=res$gns else row.names=NULL
 	if(col.names) col.names=res$sns else col.names=NULL
 	res2 <- crlmmGT(A=res[["A"]],
 			B=res[["B"]],
 			SNR=res[["SNR"]],
 			mixtureParams=res[["mixtureParams"]],
 			cdfName=cdfName,
 			row.names=row.names,
 			col.names=col.names,
 			probs=probs,
 			DF=DF,
 			SNRMin=SNRMin,
 			recallMin=recallMin,
 			recallRegMin=recallRegMin,
 			gender=gender,
 			verbose=verbose,
 			returnParams=returnParams,
 			badSNP=badSNP)
 	res2[["SNR"]] = res[["SNR"]]
 	res2[["SKW"]] = res[["SKW"]]
 	rm(res); gc(verbose=FALSE)
 	return(list2SnpSet(res2, returnParams=returnParams))
e609f78b
 }
 
 
f1e14c0e
 ## MR: Below is a more memory efficient version of crlmmIllumina() which
 ## reads in the .idats and genotypes in the one function and removes objects
4e4601af
 ## after they have been used
5fcfed7d
 crlmmIlluminaV2 = function(sampleSheet=NULL,
e609f78b
 			  arrayNames=NULL,
5fcfed7d
 			  ids=NULL,
 			  path=".",
 			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			  highDensity=FALSE,
 			  sep="_",
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
 			  saveDate=FALSE,
 			  stripNorm=TRUE,
 			  useTarget=TRUE,
99202387
  #                         outdir=".",
8e2d9355
 			  row.names=TRUE,
5fcfed7d
 			  col.names=TRUE,
 			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
e972ec4a
                           seed=1,  # save.it=FALSE, snpFile, cnFile,
e609f78b
                           mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
                           cdfName, sns, recallMin=10, recallRegMin=1000,
                           returnParams=FALSE, badSNP=.7) {
8e2d9355
 
450a6d8a
     if(missing(cdfName)) stop("must specify cdfName")
     if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
3f404581
 
99202387
 #    is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
     is.lds=FALSE
450a6d8a
     RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames,
4e4601af
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
 
 
9a4a5148
     XY = RGtoXY(RG, chipType=cdfName)
99202387
 #    if(is.lds) {
 #      open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero)
 #      delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero)
 #    }
a1394042
     rm(RG); gc(verbose=FALSE)
3f404581
 
450a6d8a
     if (missing(sns)) { sns = sampleNames(XY)
3f404581
     }
9a4a5148
     res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
99202387
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) #,
e972ec4a
 #                               save.it=save.it, snpFile=snpFile, cnFile=cnFile)
 
99202387
 #    if(is.lds) {
 #      open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero)
 #      delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero)
 #    }
a1394042
     rm(XY); gc(verbose=FALSE)
e609f78b
 
450a6d8a
     if(row.names) row.names=res$gns else row.names=NULL
     if(col.names) col.names=res$sns else col.names=NULL
3be95c2b
 ##
 ##  if(is.lds){
 ##	  res2 <- crlmmGT2(A=res[["A"]],
 ##			   B=res[["B"]],
 ##			   SNR=res[["SNR"]],
 ##			   mixtureParams=res[["mixtureParams"]],
 ##			   cdfName=cdfName,
 ##			   row.names=row.names,
 ##			   col.names=col.names,
 ##			   probs=probs,
 ##			   DF=DF,
 ##			   SNRMin=SNRMin,
 ##			   recallMin=recallMin,
 ##			   recallRegMin=recallRegMin,
 ##			   gender=gender,
 ##			   verbose=verbose,
 ##			   returnParams=returnParams,
 ##			   badSNP=badSNP)
 ##  } else {
 	  res2 <- crlmmGT(A=res[["A"]],
 			   B=res[["B"]],
 			   SNR=res[["SNR"]],
 			   mixtureParams=res[["mixtureParams"]],
 			   cdfName=cdfName,
 			   row.names=row.names,
 			   col.names=col.names,
 			   probs=probs,
 			   DF=DF,
 			   SNRMin=SNRMin,
 			   recallMin=recallMin,
 			   recallRegMin=recallRegMin,
 			   gender=gender,
 			   verbose=verbose,
 			   returnParams=returnParams,
 			   badSNP=badSNP)
 ##  }
3f404581
 
3be95c2b
 ##    FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT")
 ##    ## genotyping
 ##    crlmmGTfxn = function(FUN,...){
 ##		switch(FUN,
 ##		       crlmmGT2=crlmmGT2(...),
 ##		       crlmmGT=crlmmGT(...))
 ##              }
 ##    res2 = crlmmGTfxn(FUN,
 ##                     A=res[["A"]],
 ##                     B=res[["B"]],
 ##                     SNR=res[["SNR"]],
 ##                     mixtureParams=res[["mixtureParams"]],
 ##                     cdfName=cdfName,
 ##                     row.names=row.names,
 ##                     col.names=col.names,
 ##                     probs=probs,
 ##                     DF=DF,
 ##                     SNRMin=SNRMin,
 ##                     recallMin=recallMin,
 ##                     recallRegMin=recallRegMin,
 ##                     gender=gender,
 ##                     verbose=verbose,
 ##                     returnParams=returnParams,
 ##                     badSNP=badSNP)
3f404581
 
99202387
 #    if(is.lds) {
 #      open(res[["SNR"]]); open(res[["SKW"]])
 #    }
450a6d8a
     res2[["SNR"]] = res[["SNR"]]
     res2[["SKW"]] = res[["SKW"]]
  #  if(is.lds) {
  #    delete(res[["A"]]); delete(res[["B"]])
  #    delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]])
  #  }
a1394042
     rm(res); gc(verbose=FALSE)
450a6d8a
     return(list2SnpSet(res2, returnParams=returnParams))
4e4601af
 }
e972ec4a
 
450a6d8a
 # Functions analogous to Rob's Affy functions to set up container
99202387
 getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verbose=FALSE) {
e972ec4a
        narrays = length(filenames)
 
        headerInfo = list(nProbes = rep(NA, narrays),
                          Barcode = rep(NA, narrays),
                          ChipType = rep(NA, narrays),
450a6d8a
                          Manifest = rep(NA, narrays),
                          Position = rep(NA, narrays))
e972ec4a
 
        scanDates = data.frame(ScanDate=rep(NA, narrays), DecodeDate=rep(NA, narrays))
063b3d14
        rownames(scanDates) <- make.unique(gsub(paste(sep, fileExt, sep=""), "", filenames))
e972ec4a
        ## read in the data
        for(i in seq_along(filenames)) {
99202387
                if(verbose)
 	               cat("reading", filenames[i], "\n")
e972ec4a
 	       idsG = G = NULL
 	       G = readIDAT(filenames[i])
 	       idsG = rownames(G$Quants)
 	       headerInfo$nProbes[i] = G$nSNPsRead
 	       headerInfo$Barcode[i] = G$Barcode
 	       headerInfo$ChipType[i] = G$ChipType
 	       headerInfo$Manifest[i] = G$Unknown$MostlyNull
 	       headerInfo$Position[i] = G$Unknowns$MostlyA
                if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
 		       warning("Chips are not of the same type.  Skipping ", basename(filenames[i]))
 		       next()
 	       }
 	       scanDates$ScanDate[i] = G$RunInfo[1, 1]
 	       scanDates$DecodeDate[i] = G$RunInfo[2, 1]
 	       rm(G)
a1394042
 	       gc(verbose=FALSE)
e972ec4a
        }
        protocoldata = new("AnnotatedDataFrame",
 			    data=scanDates,
 			    varMetadata=data.frame(labelDescription=colnames(scanDates),
 			                           row.names=colnames(scanDates)))
063b3d14
        return(protocoldata)
e972ec4a
 }
 
3be95c2b
 getAvailableIlluminaGenomeBuild <- function(path){
 	snp.file <- list.files(path, pattern="snpProbes_hg")
 	if(length(snp.file) > 1){
 		## use hg19
 		message("genome build not specified. Using build hg19 for annotation.")
 		snp.file <- snp.file[1]
 	}
a3b625d4
 	if(length(snp.file) == 1)
 		genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]])
 	if(length(snp.file)==0) genome <- ""
 	genome
3be95c2b
 }
 
e972ec4a
 
f0f8ba16
 constructInf <- function(sampleSheet=NULL,
415a3b06
 			 arrayNames=NULL,
 			 path=".",
 			 arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			 highDensity=FALSE,
 			 sep="_",
 			 fileExt=list(green="Grn.idat", red="Red.idat"),
 			 cdfName,
 			 verbose=FALSE,
 			 batch, #fns,
 			 saveDate=TRUE) { #, outdir="."){
453e688a
 	verbose <- FALSE
 	if(!is.null(arrayNames)) {
 		pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
 	}
 	if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
 		if(is.null(arrayNames)) {
 			if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
 				barcode = sampleSheet[,arrayInfoColNames$barcode]
 				arrayNames=barcode
 			}
 			if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
 				position = sampleSheet[,arrayInfoColNames$position]
 				if(is.null(arrayNames))
 					arrayNames=position
 				else
 					arrayNames = paste(arrayNames, position, sep=sep)
 				if(highDensity) {
 					hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
 					for(i in names(hdExt))
 						arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
 				}
 			}
 		}
 		pd = new("AnnotatedDataFrame", data = sampleSheet)
063b3d14
 		sampleNames(pd) = make.unique(basename(arrayNames))
453e688a
 	}
 	if(is.null(arrayNames)) {
 		arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
 		if(!is.null(sampleSheet)) {
 			sampleSheet=NULL
 			cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
 		}
 		pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
 	}
 	narrays = length(arrayNames)
3f404581
 
453e688a
 	if(!missing(batch)) {
e972ec4a
 		stopifnot(length(batch) == narrays)
453e688a
 	}
 	if(missing(batch)) {
137e5619
                 stop("Must specify 'batch'") # batch = as.factor(rep(1, narrays))
453e688a
 	}
e972ec4a
 
453e688a
 	grnfiles = paste(arrayNames, fileExt$green, sep=sep)
 	redfiles = paste(arrayNames, fileExt$red, sep=sep)
 	if(length(grnfiles)==0 || length(redfiles)==0)
 		stop("Cannot find .idat files")
 	if(length(grnfiles)!=length(redfiles))
 		stop("Cannot find matching .idat files")
 	if(path[1] != "." & path[1] != ""){
 		grnidats = file.path(path, grnfiles)
 		redidats = file.path(path, redfiles)
 	}  else {
 		message("path arg not set.  Assuming files are in local directory, or that complete path is provided")
 		grnidats = grnfiles
 		redidats = redfiles
 	}
 	if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
 	if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
e972ec4a
 
d3320c21
 	if(verbose) message("Initializing container for genotyping and copy number estimation")
3be95c2b
 	pkgname <- getCrlmmAnnotationName(cdfName)
 	path <- system.file("extdata", package=pkgname)
 	genome <- getAvailableIlluminaGenomeBuild(path)
 	featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome)
450a6d8a
 	nr = nrow(featureData); nc = narrays
063b3d14
 	sns <-  if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
 		make.unique(sampleSheet$Sample_ID)
         } else{
 		make.unique(basename(arrayNames))
 	}
 	biga <- initializeBigMatrix(name="A", nr, nc)
 	bigb <- initializeBigMatrix(name="B", nr, nc)
 	bigc <- initializeBigMatrix(name="call", nr, nc)
 	bigd <- initializeBigMatrix(name="callPr", nr,nc)
 	colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns
453e688a
 	cnSet <- new("CNSet",
063b3d14
 		     alleleA=biga,
 		     alleleB=bigb,
 		     call=bigc,
 		     callProbability=bigd,
e972ec4a
 		     annotation=cdfName,
7c0c9ac5
 		     featureData=featureData,
3be95c2b
 		     batch=batch,
 		     genome=genome)
063b3d14
 ##        if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) {
 ##		sampleNames(cnSet) = make.unique(sampleSheet$Sample_ID)
 ##        } else{
 ##		sampleNames(cnSet) <- make.unique(basename(arrayNames))
 ##	}
e972ec4a
 	if(saveDate){
99202387
 		protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose)
e972ec4a
 	} else{
450a6d8a
 		protocolData = annotatedDataFrameFrom(A(cnSet), byrow=FALSE)
e972ec4a
 	}
450a6d8a
 	rownames(pData(protocolData)) = sampleNames(cnSet)
 	protocolData(cnSet) = protocolData
7c0c9ac5
 	##featureData(cnSet) = featureData
450a6d8a
 	featureNames(cnSet) = featureNames(featureData)
cfa9fbbc
 	cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer")
 	cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double")
 	cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double")
063b3d14
 	##sampleNames(cnSet) <- basename(sampleNames(cnSet))
e972ec4a
 	return(cnSet)
 }
f0f8ba16
 construct.Illumina <- constructInf
e972ec4a
 
f0f8ba16
 preprocessInf <- function(cnSet,
 			  sampleSheet=NULL,
cfa9fbbc
 		       arrayNames=NULL,
 		       ids=NULL,
 		       path=".",
 		       arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 		       highDensity=TRUE,
 		       sep="_",
 		       fileExt=list(green="Grn.idat", red="Red.idat"),
 		       saveDate=TRUE,
 		       stripNorm=TRUE,
 		       useTarget=TRUE,
 		       mixtureSampleSize=10^5,
7c0c9ac5
 			  fitMixture=TRUE,
cfa9fbbc
 		       eps=0.1,
 		       verbose=TRUE,
7c0c9ac5
 		       seed=1, cdfName){
cfa9fbbc
 	stopifnot(all(c("gender", "SNR", "SKW") %in% varLabels(cnSet)))
 	open(A(cnSet))
 	open(B(cnSet))
 	sns <- sampleNames(cnSet)
 	pkgname = getCrlmmAnnotationName(annotation(cnSet))
 	is.snp = isSnp(cnSet)
 	snp.index = which(is.snp)
 	narrays = ncol(cnSet)
063b3d14
 	sampleBatches <- splitIndicesByLength(seq_len(ncol(cnSet)), ocSamples()/getDoParWorkers())
cfa9fbbc
 	mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
 	ocLapply(seq_along(sampleBatches),
3d159620
 		 crlmm:::processIDAT,
cfa9fbbc
 		 sampleBatches=sampleBatches,
 		 sampleSheet=sampleSheet,
 		 arrayNames=arrayNames,
 		 ids=ids,
 		 path=path,
 		 arrayInfoColNames=arrayInfoColNames,
 		 highDensity=highDensity,
 		 sep=sep,
 		 fileExt=fileExt,
 		 saveDate=saveDate,
 		 verbose=verbose,
 		 mixtureSampleSize=mixtureSampleSize,
 		 fitMixture=fitMixture,
 		 eps=eps,
 		 seed=seed,
 		 cdfName=cdfName,
 		 sns=sns,
 		 stripNorm=stripNorm,
 		 useTarget=useTarget,
 		 A=A(cnSet),
 		 B=B(cnSet),
5c9f60e9
 		 GT=calls(cnSet),
 		 GTP=snpCallProbability(cnSet),
cfa9fbbc
 		 SKW=cnSet$SKW,
 		 SNR=cnSet$SNR,
 		 mixtureParams=mixtureParams,
 		 is.snp=is.snp,
 		 neededPkgs=c("crlmm", pkgname)) # outdir=outdir,
 	return(mixtureParams)
 }
f0f8ba16
 preprocess <- preprocessInf
cfa9fbbc
 
f0f8ba16
 genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3),
 			SNRMin=5,
 			recallMin=10,
 			recallRegMin=1000,
 			verbose=TRUE,
 			returnParams=TRUE,
 			badSNP=0.7,
 			gender=NULL,
7c0c9ac5
 			DF=6,
 			cdfName){
284565dd
 	is.snp = isSnp(cnSet)
 	snp.index = which(is.snp)
5c9f60e9
 ##	narrays = ncol(cnSet)
 ##	open(A(cnSet))
 ##	open(B(cnSet))
 ##	open(snpCall(cnSet))
 ##	open(snpCallProbability(cnSet))
 ##	## crlmmGT2 overwrites the normalized intensities with calls and confidenceScores
 ##	message("Writing to call and callProbability slots")
 ##	for(j in 1:ncol(cnSet)){
 ##		snpCall(cnSet)[snp.index, j] <- as.integer(A(cnSet)[snp.index, j])
 ##		snpCallProbability(cnSet)[snp.index, j] <- as.integer(B(cnSet)[snp.index, j])
 ##	}
 ##	message("Writing complete.  Begin genotyping...")
 ##	close(A(cnSet))
 ##	close(B(cnSet))
3be95c2b
 	tmp <- crlmmGT2(A=A(cnSet),
 			B=B(cnSet),
 			SNR=cnSet$SNR,
 			mixtureParams=mixtureParams,
 			cdfName=cdfName,
 			col.names=sampleNames(cnSet),
 			probs=probs,
 			DF=DF,
 			SNRMin=SNRMin,
 			recallMin=recallMin,
 			recallRegMin=recallRegMin,
 			gender=gender,
 			verbose=verbose,
 			returnParams=returnParams,
 			badSNP=badSNP,
 			callsGt=calls(cnSet),
 			callsPr=snpCallProbability(cnSet))#,
1e0ed16d
 	##RS:  I changed the API for crlmmGT2 to be consistent with crlmmGT
 	## snp.names=featureNames(cnSet)[snp.index])
cfa9fbbc
 	if(verbose) message("Genotyping finished.  Updating container with genotype calls and confidence scores.")
252da926
 	open(cnSet$gender)
cfa9fbbc
 	cnSet$gender[,] = tmp$gender
252da926
 	close(cnSet$gender)
cfa9fbbc
 	TRUE
 }
 
e972ec4a
 
453e688a
 genotype.Illumina <- function(sampleSheet=NULL,
 			      arrayNames=NULL,
 			      ids=NULL,
 			      path=".",
 			      arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			      highDensity=FALSE,
 			      sep="_",
 			      fileExt=list(green="Grn.idat", red="Red.idat"),
 			      cdfName,
 			      copynumber=TRUE,
 			      batch,
 			      saveDate=TRUE,
 			      stripNorm=TRUE,
 			      useTarget=TRUE,
 			      mixtureSampleSize=10^5,
 			      fitMixture=TRUE,
 			      eps=0.1,
 			      verbose=TRUE,
 			      seed=1,
 			      sns,
 			      probs=rep(1/3, 3),
 			      DF=6,
 			      SNRMin=5,
 			      recallMin=10,
 			      recallRegMin=1000,
 			      gender=NULL,
 			      returnParams=TRUE,
ac7e0c8f
 			      badSNP=0.7) {
450a6d8a
 	is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE)
d878c182
 	stopifnot(is.lds)
d3320c21
 	if(missing(cdfName)) stop("must specify cdfName")
 	if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
f0f8ba16
 	stopifnot(!missing(batch))
7c0c9ac5
 	if(is(batch, "factor")) batch <- as.character(batch)
f0f8ba16
         pkgname <- getCrlmmAnnotationName(cdfName)
 	message("Instantiate CNSet container.")
 	cnSet <- constructInf(sampleSheet=sampleSheet,
 				    arrayNames=arrayNames,
 				    path=path,
 				    arrayInfoColNames=arrayInfoColNames,
 				    highDensity=highDensity,
 				    sep=sep,
 				    fileExt=fileExt,
 				    cdfName=cdfName,
 				    verbose=verbose,
 				    batch=batch,
 				    saveDate=saveDate)
 	mixtureParams <- preprocessInf(cnSet=cnSet,
 				    sampleSheet=sampleSheet,
 				    arrayNames=arrayNames,
 				    ids=ids,
 				    path=path,
 				    arrayInfoColNames=arrayInfoColNames,
 				    highDensity=highDensity,
 				    sep=sep,
 				    fileExt=fileExt,
 				    saveDate=saveDate,
 				    stripNorm=stripNorm,
 				    useTarget=useTarget,
 				    mixtureSampleSize=mixtureSampleSize,
 				    fitMixture=fitMixture,
 				    eps=eps,
 				    verbose=verbose,
7c0c9ac5
 				    seed=seed,
 				       cdfName=cdfName)
f0f8ba16
 	message("Preprocessing complete.  Begin genotyping...")
 	genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams,
3be95c2b
 		    probs=probs,
f0f8ba16
 		   SNRMin=SNRMin,
 		   recallMin=recallMin,
 		   recallRegMin=recallRegMin,
 		   verbose=verbose,
 		   returnParams=returnParams,
 		   badSNP=badSNP,
 		   gender=gender,
7c0c9ac5
 		   DF=DF,
 		    cdfName=cdfName)
f0f8ba16
 	return(cnSet)
d3320c21
 }
 
 
453e688a
 
 
edc69ca1
 processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL,
453e688a
 			arrayNames=NULL,
802ca707
 			ids=NULL,
 			path=".",
 			arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			highDensity=FALSE,
 			sep="_",
 			fileExt=list(green="Grn.idat", red="Red.idat"),
 			saveDate=FALSE,
 			verbose=TRUE,
453e688a
 			mixtureSampleSize=10^5,
802ca707
 			fitMixture=TRUE,
 			eps=0.1,
 			seed=1,
 			cdfName,
 			sns,
 			stripNorm=TRUE,
 			useTarget=TRUE,
5c9f60e9
 			A, B,
 			GT,
 			GTP,
 			SKW, SNR, mixtureParams, is.snp) { #, outdir=".") {
edc69ca1
 	message("Processing sample stratum ", stratum, " of ", length(sampleBatches))
 	sel <- sampleBatches[[stratum]]
d3320c21
         if(length(path)>= length(sel)) path = path[sel]
         RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel],
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
99202387
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose)
d3320c21
         XY = RGtoXY(RG, chipType=cdfName)
99202387
         rm(RG)
a1394042
         gc(verbose=FALSE)
9774abc3
         if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY)
d3320c21
         res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
99202387
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir)
         rm(XY)
a1394042
         gc(verbose=FALSE)
5c9f60e9
 	if(verbose) message("Finished preprocessing.")
d3320c21
         snp.index = which(is.snp)
e6e6981f
 	np.index = which(!is.snp)
453e688a
 	open(A); open(B);
5c9f60e9
 	open(GT); open(GTP)
802ca707
 	Amatrix <- res[["A"]]
 	Bmatrix <- res[["B"]]
5c9f60e9
 
 	## Amatrix and Bmatrix are ordinary matrices--not ff objects.
 	## By writing columns of a ordinary matrix to GT and GTP, we
 	## save one read step later on.  GT and GTP will be updated to
 	## calls and call probabilities by the crlmmGT2 function. The A
 	## and B slots will retain the normalized intensities for the
 	## A and B alleles
802ca707
 	for(j in seq_along(sel)){
 		jj <- sel[j]
 		A[snp.index, jj] <- Amatrix[, j]
5c9f60e9
 		GT[snp.index, jj] <- Amatrix[, j]
802ca707
 		B[snp.index, jj] <- Bmatrix[, j]
5c9f60e9
 		GTP[snp.index, jj] <- Bmatrix[, j]
802ca707
 	}
5c9f60e9
 	if(length(np.index)>0) {
 		cnAmatrix <- res[["cnAB"]]$A
 		cnBmatrix <- res[["cnAB"]]$B
 		for(j in seq_along(sel)){
 			jj <- sel[j]
 			A[np.index, jj] <- cnAmatrix[, j]
 			GT[np.index, jj] <- cnAmatrix[, j]
 			B[np.index, jj] <- cnBmatrix[, j]
 			GTP[np.index, jj] <- cnBmatrix[, j]
 		}
450a6d8a
         }
453e688a
 	open(SKW); open(SNR); open(mixtureParams)
802ca707
 	SKW[sel] = res[["SKW"]][]
 	SNR[sel] = res[["SNR"]][]
 	mixtureParams[, sel] = res[["mixtureParams"]][]
5c9f60e9
         close(A); close(B)
 	close(GT); close(GTP)
         close(SNR); close(SKW)
d3320c21
         close(mixtureParams)
         rm(res)
a1394042
 	gc(verbose=FALSE)
d3320c21
         TRUE
2564dd8b
       }