R/crlmm-illumina.R
cd8b85b6
 # function below works OK provided all .idat files are in the current working directory
 # - could add an option to allow files in Illumina directory structure to be handled
 # or to use the optional 'Path' column in sampleSheet
 # - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet
 
5fcfed7d
 readIdatFiles <- function(sampleSheet=NULL,
 			  arrayNames=NULL,
 			  ids=NULL,
 			  path=".",
 			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			  highDensity=FALSE,
 			  sep="_",
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
 			  saveDate=FALSE) {
 #       if(!is.null(arrayNames)) {
 #	       arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
 #	       if(!is.null(sampleSheet)) {
 #		       sampleSheet=NULL
 #		       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
 #	       }
 #	       pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
 #       }
        if(!is.null(arrayNames)) {
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
        }
        if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
 	       if(is.null(arrayNames)){
 		       ##arrayNames=NULL
 		       if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
 			       barcode = sampleSheet[,arrayInfoColNames$barcode]
 			       arrayNames=barcode
 		       }
56816837
 		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
5fcfed7d
 			       position = sampleSheet[,arrayInfoColNames$position]
 			       if(is.null(arrayNames))
 				       arrayNames=position
 			       else
 				       arrayNames = paste(arrayNames, position, sep=sep)
 			       if(highDensity) {
 				       hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
 				       for(i in names(hdExt))
 					       arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
 			       }
 		       }
 	       }
 	       pd = new("AnnotatedDataFrame", data = sampleSheet)
56816837
 	       sampleNames(pd) <- basename(arrayNames)
5fcfed7d
        }
        if(is.null(arrayNames)) {
                arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
                if(!is.null(sampleSheet)) {
                       sampleSheet=NULL
                       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
                }
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
        }
56816837
 
cfdeb14b
        narrays = length(arrayNames)
cd8b85b6
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
        if(length(grnfiles)==0 || length(redfiles)==0)
64463948
 	       stop("Cannot find .idat files")
cd8b85b6
        if(length(grnfiles)!=length(redfiles))
64463948
 	       stop("Cannot find matching .idat files")
bc16b510
        if(path[1] != "."){
e4e0b214
 	       grnidats = file.path(path, grnfiles)
 	       redidats = file.path(path, redfiles)
        }  else {
5fcfed7d
 	       message("path arg not set.  Assuming files are in local directory")
e4e0b214
 	       grnidats <- grnfiles
 	       redidats <- redfiles
2ae7850e
        }
d3a7556e
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
56816837
        if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
e4e0b214
 ##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
 ##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
 ##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
 ##       }
cfdeb14b
        headerInfo = list(nProbes = rep(NA, narrays),
                          Barcode = rep(NA, narrays),
                          ChipType = rep(NA, narrays),
                          Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
                          Position = rep(NA, narrays)) # this may also vary a bit
5fcfed7d
        dates = list(decode=rep(NA, narrays),
                     scan=rep(NA, narrays))
        ## read in the data
cd8b85b6
        for(i in seq(along=arrayNames)) {
64463948
 	       cat("reading", arrayNames[i], "\t")
 	       idsG = idsR = G = R = NULL
 	       cat(paste(sep, fileExt$green, sep=""), "\t")
 	       G = readIDAT(grnidats[i])
 	       idsG = rownames(G$Quants)
 	       headerInfo$nProbes[i] = G$nSNPsRead
 	       headerInfo$Barcode[i] = G$Barcode
 	       headerInfo$ChipType[i] = G$ChipType
 	       headerInfo$Manifest[i] = G$Unknown$MostlyNull
 	       headerInfo$Position[i] = G$Unknowns$MostlyA
8499df94
                if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
                        ## headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) {
64463948
 		       ## || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
 		       ## but differed by a few SNPs for some reason - most of the chip was the same though
 		       ##           stop("Chips are not of all of the same type - please check your data")
 		       warning("Chips are not of the same type.  Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
 		       next()
 	       }
 	       dates$decode[i] = G$RunInfo[1, 1]
 	       dates$scan[i] = G$RunInfo[2, 1]
 	       if(i==1) {
fe3e3f61
 		       if(is.null(ids) && !is.null(G)){
64463948
 			       ids = idsG
fe3e3f61
 		       }else stop("Could not find probe IDs")
64463948
 		       nprobes = length(ids)
 		       narrays = length(arrayNames)
5fcfed7d
 		       tmpmat = matrix(NA, nprobes, narrays)
 		       rownames(tmpmat) = ids
 		       if(!is.null(sampleSheet)){
 			       colnames(tmpmat) = sampleSheet$Sample_ID
 		       }else  colnames(tmpmat) = arrayNames
 		       RG <- new("NChannelSet",
 				 R=tmpmat,
 				 G=tmpmat,
 				 zero=tmpmat,
 #				 Rnb=tmpmat,
 #				 Gnb=tmpmat,
 #				 Rse=tmpmat,
 #				 Gse=tmpmat,
 				 annotation=headerInfo$Manifest[1],
 				 phenoData=pd,
 				 storage.mode="environment")
 		       rm(tmpmat)
64463948
 		       gc()
 	       }
 	       if(length(ids)==length(idsG)) {
 		       if(sum(ids==idsG)==nprobes) {
 			       RG@assayData$G[,i] = G$Quants[, "Mean"]
5fcfed7d
 				   zeroG = G$Quants[, "NBeads"]==0
4e4601af
 #			       RG@assayData$Gnb[,i] = G$Quants[, "NBeads"]
 #			       RG@assayData$Gse[,i] = G$Quants[, "SD"]
64463948
 		       }
fe3e3f61
 	       } else {
64463948
 		       indG = match(ids, idsG)
 		       RG@assayData$G[,i] = G$Quants[indG, "Mean"]
5fcfed7d
 			   zeroG = G$Quants[indG, "NBeads"]==0
4e4601af
 #		       RG@assayData$Gnb[,i] = G$Quants[indG, "NBeads"]
5fcfed7d
 #		       RG@assayData$Gse[,i] = G$Quants[indG, "SD"]
64463948
 	       }
 	       rm(G)
 	       gc()
56816837
 
64463948
 	       cat(paste(sep, fileExt$red, sep=""), "\n")
 	       R = readIDAT(redidats[i])
 	       idsR = rownames(R$Quants)
56816837
 
 	       if(length(ids)==length(idsG)) {
64463948
 		       if(sum(ids==idsR)==nprobes) {
 			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
5fcfed7d
 				   zeroR = R$Quants[ ,"NBeads"]==0
4e4601af
 #			       RG@assayData$Rnb[,i] = R$Quants[ ,"NBeads"]
 #			       RG@assayData$Rse[,i] = R$Quants[ ,"SD"]
64463948
 		       }
fe3e3f61
 	       } else {
64463948
 		       indR = match(ids, idsR)
 		       RG@assayData$R[,i] = R$Quants[indR, "Mean"]
5fcfed7d
 			   zeroR = R$Quants[indR, "NBeads"]==0
4e4601af
 #		       RG@assayData$Rnb[,i] = R$Quants[indR, "NBeads"]
 #		       RG@assayData$Rse[,i] = R$Quants[indR, "SD"]
64463948
 	       }
5fcfed7d
 		   RG@assayData$zero[,i] = zeroG | zeroR
4e4601af
 	       rm(R, zeroG, zeroR)
64463948
 	       gc()
cd8b85b6
        }
5fcfed7d
        if(saveDate) {
 	       protocolData(RG)[["ScanDate"]] = dates$scan
        }
        storageMode(RG) = "lockedEnvironment"
        RG
cd8b85b6
 }
 
 
e609f78b
 readIdatFiles2 <- function(sampleSheet=NULL,
 			  arrayNames=NULL,
 			  ids=NULL,
 			  path=".",
 			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			  highDensity=FALSE,
 			  sep="_",
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
 			  saveDate=FALSE) {
 #       if(!is.null(arrayNames)) {
 #	       arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
 #	       if(!is.null(sampleSheet)) {
 #		       sampleSheet=NULL
 #		       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
 #	       }
 #	       pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
 #       }
        if(!is.null(arrayNames)) {
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
        }
        if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet
 	       if(is.null(arrayNames)){
 		       ##arrayNames=NULL
 		       if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) {
 			       barcode = sampleSheet[,arrayInfoColNames$barcode]
 			       arrayNames=barcode
 		       }
56816837
 		       if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) {
e609f78b
 			       position = sampleSheet[,arrayInfoColNames$position]
 			       if(is.null(arrayNames))
 				       arrayNames=position
 			       else
 				       arrayNames = paste(arrayNames, position, sep=sep)
 			       if(highDensity) {
 				       hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02")
 				       for(i in names(hdExt))
 					       arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames)
 			       }
 		       }
 	       }
 	       pd = new("AnnotatedDataFrame", data = sampleSheet)
56816837
 	       sampleNames(pd) <- basename(arrayNames)
e609f78b
        }
        if(is.null(arrayNames)) {
                arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path))
                if(!is.null(sampleSheet)) {
                       sampleSheet=NULL
                       cat("Could not find required info in \'sampleSheet\' - ignoring.  Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n")
                }
                pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames))
        }
56816837
 
e609f78b
        narrays = length(arrayNames)
        grnfiles = paste(arrayNames, fileExt$green, sep=sep)
        redfiles = paste(arrayNames, fileExt$red, sep=sep)
        if(length(grnfiles)==0 || length(redfiles)==0)
 	       stop("Cannot find .idat files")
        if(length(grnfiles)!=length(redfiles))
 	       stop("Cannot find matching .idat files")
6ec3ec41
        if(path[1] != "." & path[1] != ""){
e609f78b
 	       grnidats = file.path(path, grnfiles)
 	       redidats = file.path(path, redfiles)
        }  else {
 	       message("path arg not set.  Assuming files are in local directory")
 	       grnidats <- grnfiles
 	       redidats <- redfiles
        }
        if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files")
56816837
        if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files")
e609f78b
 ##       if(!all(c(redfiles,grnfiles) %in% dir(path=path))){
 ##	       stop("Missing .idat files: red\n", paste(redfiles[!(redfiles %in% dir(path=path))], sep=" "), "\n green\n",
 ##		    paste(grnfiles[!(grnfiles %in% dir(path=path))], sep=" "))
 ##       }
        headerInfo = list(nProbes = rep(NA, narrays),
                          Barcode = rep(NA, narrays),
                          ChipType = rep(NA, narrays),
                          Manifest = rep(NA, narrays), # not sure about this one - sometimes blank
                          Position = rep(NA, narrays)) # this may also vary a bit
        dates = list(decode=rep(NA, narrays),
                     scan=rep(NA, narrays))
        ## read in the data
        for(i in seq(along=arrayNames)) {
 	       cat("reading", arrayNames[i], "\t")
 	       idsG = idsR = G = R = NULL
 	       cat(paste(sep, fileExt$green, sep=""), "\t")
 	       G = readIDAT(grnidats[i])
 	       idsG = rownames(G$Quants)
 	       headerInfo$nProbes[i] = G$nSNPsRead
 	       headerInfo$Barcode[i] = G$Barcode
 	       headerInfo$ChipType[i] = G$ChipType
 	       headerInfo$Manifest[i] = G$Unknown$MostlyNull
 	       headerInfo$Position[i] = G$Unknowns$MostlyA
                if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) {
                        ## headerInfo$ChipType[i]!=headerInfo$ChipType[1] || headerInfo$Manifest[i]!=headerInfo$Manifest[1]) {
 		       ## || headerInfo$nProbes[i]!=headerInfo$nProbes[1] ## removed this condition as some arrays used the same manifest
 		       ## but differed by a few SNPs for some reason - most of the chip was the same though
 		       ##           stop("Chips are not of all of the same type - please check your data")
 		       warning("Chips are not of the same type.  Skipping ", basename(grnidats[i]), " and ", basename(redidats[i]))
 		       next()
 	       }
 	       dates$decode[i] = G$RunInfo[1, 1]
 	       dates$scan[i] = G$RunInfo[2, 1]
 	       if(i==1) {
 		       if(is.null(ids) && !is.null(G)){
 			       ids = idsG
 		       } else stop("Could not find probe IDs")
 		       nprobes = length(ids)
 		       narrays = length(arrayNames)
 #		        tmpmat = matrix(NA, nprobes, narrays)
 #		        rownames(tmpmat) = ids
 #		       fD <- new("AnnotatedDataFrame", data=data.frame(row.names=ids))##, varMetadata=data.frame(labelDescript
 		       RG <- new("NChannelSet",
 		                 R=initializeBigMatrix(name="R", nr=nprobes, nc=narrays, vmode="integer"),
 		                 G=initializeBigMatrix(name="G", nr=nprobes, nc=narrays, vmode="integer"),
 		                 zero=initializeBigMatrix(name="zero", nr=nprobes, nc=narrays, vmode="integer"),
 #		                 featureData=fD,
 #		                 annotation=cdfName)
 #				 R=tmpmat,
 #				 G=tmpmat,
 #				 zero=tmpmat,
 #				 Rnb=tmpmat,
 #				 Gnb=tmpmat,
 #				 Rse=tmpmat,
 #				 Gse=tmpmat,
 				 annotation=headerInfo$Manifest[1],
 				 phenoData=pd,
 				 storage.mode="environment")
                        featureNames(RG) = ids
 #		       rm(tmpmat)
 		       if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){
 		            sampleNames(RG) = sampleSheet$Sample_ID
 		       } else  sampleNames(RG) = arrayNames
 		       gc()
 	       }
 	       if(length(ids)==length(idsG)) {
 		       if(sum(ids==idsG)==nprobes) {
 			       RG@assayData$G[,i] = G$Quants[, "Mean"]
 			       zeroG = G$Quants[, "NBeads"]==0
 #			       RG@assayData$Gnb[,i] = G$Quants[, "NBeads"]
 #			       RG@assayData$Gse[,i] = G$Quants[, "SD"]
 		       }
 	       } else {
 		       indG = match(ids, idsG)
 		       RG@assayData$G[,i] = G$Quants[indG, "Mean"]
 		       zeroG = G$Quants[indG, "NBeads"]==0
 #		       RG@assayData$Gnb[,i] = G$Quants[indG, "NBeads"]
 #		       RG@assayData$Gse[,i] = G$Quants[indG, "SD"]
 	       }
 	       rm(G)
 	       gc()
56816837
 
e609f78b
 	       cat(paste(sep, fileExt$red, sep=""), "\n")
 	       R = readIDAT(redidats[i])
 	       idsR = rownames(R$Quants)
56816837
 
 	       if(length(ids)==length(idsG)) {
e609f78b
 		       if(sum(ids==idsR)==nprobes) {
 			       RG@assayData$R[,i] = R$Quants[ ,"Mean"]
 		               zeroR = R$Quants[ ,"NBeads"]==0
 #			       RG@assayData$Rnb[,i] = R$Quants[ ,"NBeads"]
 #			       RG@assayData$Rse[,i] = R$Quants[ ,"SD"]
 		       }
 	       } else {
 		       indR = match(ids, idsR)
 		       RG@assayData$R[,i] = R$Quants[indR, "Mean"]
 		       zeroR = R$Quants[indR, "NBeads"]==0
 #		       RG@assayData$Rnb[,i] = R$Quants[indR, "NBeads"]
 #		       RG@assayData$Rse[,i] = R$Quants[indR, "SD"]
 	       }
 	       RG@assayData$zero[,i] = zeroG | zeroR
 	       rm(R, zeroG, zeroR)
 	       gc()
        }
        if(saveDate) {
 	       protocolData(RG)[["ScanDate"]] = dates$scan
        }
        storageMode(RG) = "lockedEnvironment"
        RG
 }
 
 
64463948
 ## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008
cd8b85b6
 readIDAT <- function(idatFile){
 
   fileSize <- file.info(idatFile)$size
 
   tempCon <- file(idatFile,"rb")
   prefixCheck <- readChar(tempCon,4)
   if(prefixCheck != "IDAT"){
 
   }
 
56816837
   versionNumber <- readBin(tempCon, "integer", n=1, size=8,
cd8b85b6
                            endian="little", signed=FALSE)
56816837
 
   nFields <- readBin(tempCon, "integer", n=1, size=4,
cd8b85b6
                      endian="little", signed=FALSE)
 
   fields <- matrix(0,nFields,3);
   colnames(fields) <- c("Field Code", "Byte Offset", "Bytes")
   for(i1 in 1:nFields){
56816837
     fields[i1,"Field Code"] <-
cd8b85b6
       readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE)
56816837
     fields[i1,"Byte Offset"] <-
cd8b85b6
       readBin(tempCon, "integer", n=1, size=8, endian="little", signed=FALSE)
   }
 
   knownCodes <- c(1000, 102, 103, 104, 107, 200, 300, 400,
                   401, 402, 403, 404, 405, 406, 407, 408, 409)
56816837
   names(knownCodes) <-
cd8b85b6
     c("nSNPsRead",  # 1000
       "IlluminaID", #  102
       "SD",         #  103
       "Mean",       #  104
       "NBeads",     #  107
       "MidBlock",   #  200
       "RunInfo",    #  300
       "RedGreen",   #  400
       "MostlyNull", #  401
       "Barcode",    #  402
       "ChipType",   #  403
       "MostlyA",    #  404
       "Unknown.1",  #  405
       "Unknown.2",  #  406
       "Unknown.3",  #  407
       "Unknown.4",  #  408
       "Unknown.5"   #  409
       )
 
   nNewFields <- 1
   rownames(fields) <- paste("Null", 1:nFields)
   for(i1 in 1:nFields){
     temp <- match(fields[i1,"Field Code"], knownCodes)
     if(!is.na(temp)){
       rownames(fields)[i1] <- names(knownCodes)[temp]
     }else{
       rownames(fields)[i1] <- paste("newField", nNewFields, sep=".")
       nNewFields <- nNewFields + 1
     }
   }
 
   seek(tempCon, fields["nSNPsRead", "Byte Offset"])
56816837
   nSNPsRead <- readBin(tempCon, "integer", n=1, size=4,
cd8b85b6
                        endian="little", signed=FALSE)
 
   seek(tempCon, fields["IlluminaID", "Byte Offset"])
56816837
   IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4,
cd8b85b6
                        endian="little", signed=FALSE)
 
   seek(tempCon, fields["SD", "Byte Offset"])
56816837
   SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2,
cd8b85b6
                 endian="little", signed=FALSE)
 
   seek(tempCon, fields["Mean", "Byte Offset"])
56816837
   Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2,
cd8b85b6
                   endian="little", signed=FALSE)
 
   seek(tempCon, fields["NBeads", "Byte Offset"])
   NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE)
 
   seek(tempCon, fields["MidBlock", "Byte Offset"])
56816837
   nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4,
cd8b85b6
                               endian="little", signed=FALSE)
56816837
   MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4,
cd8b85b6
                       endian="little", signed=FALSE)
 
   seek(tempCon, fields["RunInfo", "Byte Offset"])
56816837
   nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4,
cd8b85b6
                             endian="little", signed=FALSE)
   RunInfo <- matrix(NA, nRunInfoBlocks, 5)
56816837
   colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars",
cd8b85b6
                          "BlockCode", "CodeVersion")
b938db26
   for(i1 in 1:2) { #nRunInfoBlocks){  ## MR edit
cd8b85b6
     for(i2 in 1:5){
       nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
       RunInfo[i1,i2] <- readChar(tempCon, nChars)
     }
   }
 
   seek(tempCon, fields["RedGreen", "Byte Offset"])
56816837
   RedGreen <- readBin(tempCon, "numeric", n=1, size=4,
cd8b85b6
                       endian="little", signed=FALSE)
56816837
   #RedGreen <- readBin(tempCon, "integer", n=4, size=1,
cd8b85b6
   #                    endian="little", signed=FALSE)
 
   seek(tempCon, fields["MostlyNull", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   MostlyNull <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["Barcode", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   Barcode <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["ChipType", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   ChipType <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["MostlyA", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   MostlyA <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["Unknown.1", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   Unknown.1 <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["Unknown.2", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   Unknown.2 <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["Unknown.3", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   Unknown.3 <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["Unknown.4", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   Unknown.4 <- readChar(tempCon, nChars)
cd8b85b6
 
   seek(tempCon, fields["Unknown.5", "Byte Offset"])
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   Unknown.5 <- readChar(tempCon, nChars)
cd8b85b6
 
   close(tempCon)
 
56816837
   Unknowns <-
cd8b85b6
     list(MostlyNull=MostlyNull,
          MostlyA=MostlyA,
          Unknown.1=Unknown.1,
          Unknown.2=Unknown.2,
          Unknown.3=Unknown.3,
          Unknown.4=Unknown.4,
          Unknown.5=Unknown.5)
 
   Quants <- cbind(Mean, SD, NBeads)
   colnames(Quants) <- c("Mean", "SD", "NBeads")
   rownames(Quants) <- as.character(IlluminaID)
 
56816837
   idatValues <-
     list(fileSize=fileSize,
          versionNumber=versionNumber,
          nFields=nFields,
cd8b85b6
          fields=fields,
56816837
          nSNPsRead=nSNPsRead,
          #IlluminaID=IlluminaID,
          #SD=SD,
          #Mean=Mean,
cd8b85b6
          #NBeads=NBeads,
          Quants=Quants,
56816837
          MidBlock=MidBlock,
          RunInfo=RunInfo,
          RedGreen=RedGreen,
          Barcode=Barcode,
cd8b85b6
          ChipType=ChipType,
          Unknowns=Unknowns)
 
   idatValues
 
 }
 
 readBPM <- function(bpmFile){
 
   ## Reads and parses Illumina BPM files
56816837
 
cd8b85b6
   fileSize <- file.info(bpmFile)$size
 
   tempCon <- file(bpmFile,"rb")
 
   # The first few bytes of the egtFile are some type of
56816837
   # header, but there's no related byte offset information.
cd8b85b6
 
   prefixCheck <- readChar(tempCon,3) ## should be "BPM"
 
   null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
56816837
   ## should be 1
 
   versionNumber <-
cd8b85b6
     readBin(tempCon, "integer", n=1, size=4, endian="little", signed=FALSE)
   ## should be 4
 
   nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
   chipType <- readChar(tempCon, nChars)
 
   null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE)
 
   csvLines <- readLines(tempCon, 22)
 
   entriesByteOffset <- seek(tempCon);
   nEntries <- readBin(tempCon, "integer", n=1, size=4,
                       endian="little", signed=FALSE)
56816837
 
cd8b85b6
   if(FALSE){
56816837
 
cd8b85b6
     snpIndexByteOffset <- seek(tempCon)
     snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4,
                         endian="little", signed=FALSE)
     ## for the 1M array, these are simply in order from 1 to 1072820.
56816837
 
cd8b85b6
     snpNamesByteOffset <- seek(tempCon)
     snpNames <- rep("A", nEntries)
     for(i1 in 1:nEntries){
       nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE)
       snpNames[i1] <- readChar(tempCon, nChars)
     }
 
   }
 
   seek(tempCon, 15278138)
56816837
 
cd8b85b6
   normIDByteOffset <- seek(tempCon)
   normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1
56816837
 
cd8b85b6
   newBlockByteOffset <- seek(tempCon)
   newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE)
56816837
 
cd8b85b6
   close(tempCon)
 
   byteOffsets <- list(entriesByteOffset=entriesByteOffset,
56816837
                       #snpIndexByteOffset=snpIndexByteOffset,
cd8b85b6
                       #snpNamesByteOffset=snpNamesByteOffset,
                       normIDByteOffset=normIDByteOffset,
                       newBlockByteOffset=newBlockByteOffset)
56816837
 
cd8b85b6
   allStuff <- list(prefixCheck=prefixCheck,
                    null.1=null.1,
                    versionNumber=versionNumber,
                    chipType=chipType,
                    null.2=null.2,
                    csvLines=csvLines,
                    nEntries=nEntries,
                    #snpIndex=snpIndex,
                    #snpNames=snpNames,
                    normID=normID,
                    newBlock=newBlock,
                    byteOffsets=byteOffsets)
   allStuff
56816837
 
cd8b85b6
 }
 
5fcfed7d
 RGtoXY = function(RG, chipType, verbose=TRUE) {
   chipList = c("human1mv1c",             # 1M
                "human370v1c",            # 370CNV
                "human650v3a",            # 650Y
                "human610quadv1b",        # 610 quad
                "human660quadv1a",        # 660 quad
                "human370quadv3c",        # 370CNV quad
                "human550v3b",            # 550K
                "human1mduov3b",          # 1M Duo
56816837
                "humanomni1quadv1b",      # Omni1 quad
 	       "humanomniexpress12v1b")  # Omni express 12
5fcfed7d
   if(missing(chipType)){
 	  chipType = match.arg(annotation(RG), chipList)
   } else chipType = match.arg(chipType, chipList)
56816837
 
5fcfed7d
   pkgname <- getCrlmmAnnotationName(chipType)
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
      suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
      msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
      message(strwrap(msg))
      stop("Package ", pkgname, " could not be found.")
      rm(suggCall, msg)
   }
   if(verbose) message("Loading chip annotation information.")
     loader("address.rda", .crlmmPkgEnv, pkgname)
 #    data(annotation, package=pkgname, envir=.crlmmPkgEnv)
   aids <- getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
   bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
   ids <- names(aids)
   snpbase <- getVarInEnv("base")
56816837
 
5fcfed7d
   nsnps = length(aids)
   narrays = ncol(RG)
56816837
 
5fcfed7d
 #  aidcol = match("AddressA_ID", colnames(annot))
 #  if(is.na(aidcol))
 #    aidcol = match("Address", colnames(annot))
 #  bidcol = match("AddressB_ID", colnames(annot))
56816837
 #  if(is.na(bidcol))
5fcfed7d
 #    bidcol = match("Address2", colnames(annot))
 #  aids = annot[, aidcol]
 #  bids = annot[, bidcol]
 #  snpids = annot[,"Name"]
56816837
 #  snpbase = annot[,"SNP"]
   infI = !is.na(bids) & bids!=0
5fcfed7d
   aord = match(aids, featureNames(RG)) # NAs are possible here
   bord = match(bids, featureNames(RG)) # and here
 #  argrg = aids[rrgg]
 #  brgrg = bids[rrgg]
 
a11efc9c
   tmpmat = matrix(as.integer(0), nsnps, narrays)
5fcfed7d
   rownames(tmpmat) = ids
   colnames(tmpmat) = sampleNames(RG)
   XY = new("NChannelSet", X=tmpmat, Y=tmpmat, zero=tmpmat, # Xnb=tmpmat, Ynb=tmpmat, Xse=tmpmat, Yse=tmpmat, zero=tmpmat,
                  annotation=chipType, phenoData=RG@phenoData, protocolData=RG@protocolData, storage.mode="environment")
   rm(tmpmat)
   gc()
56816837
 
5fcfed7d
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
   XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
   XY@assayData$zero[!is.na(aord),] = exprs(channel(RG, "zero"))[aord[!is.na(aord)],] # mostly green
 #  XY@assayData$Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
 #  XY@assayData$Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
 #  XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
 #  XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
   gc()
56816837
 
5fcfed7d
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
56816837
 
5fcfed7d
   # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
56816837
 
5fcfed7d
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
 #  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
 #  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
 #  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
 #  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
56816837
 
5fcfed7d
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
 
 #  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
 #  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
 #  Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
 #  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
 #  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
 #  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
56816837
 
5fcfed7d
   #  For now zero out Infinium I probes
   XY@assayData$X[infI,] = 0
   XY@assayData$Y[infI,] = 0
56816837
   XY@assayData$zero[infI,] = 0
5fcfed7d
 #  XY@assayData$Xnb[infI,] = 0
 #  XY@assayData$Ynb[infI,] = 0
 #  XY@assayData$Xse[infI,] = 0
 #  XY@assayData$Yse[infI,] = 0
 
 #  XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
   gc()
 
 #  storageMode(XY) = "lockedEnvironment"
   XY
cd8b85b6
 }
 
e609f78b
 
 RGtoXY2 = function(RG, chipType, verbose=TRUE) {
56816837
 	  chipList = c("human1mv1c",             # 1M
e609f78b
                "human370v1c",            # 370CNV
                "human650v3a",            # 650Y
                "human610quadv1b",        # 610 quad
                "human660quadv1a",        # 660 quad
                "human370quadv3c",        # 370CNV quad
                "human550v3b",            # 550K
                "human1mduov3b",          # 1M Duo
56816837
                "humanomni1quadv1b",      # Omni1 quad
 	       "humanomniexpress12v1b")  # Omni express 12
e609f78b
   if(missing(chipType)){
 	  chipType = match.arg(annotation(RG), chipList)
   } else chipType = match.arg(chipType, chipList)
56816837
 
e609f78b
   pkgname <- getCrlmmAnnotationName(chipType)
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
      suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
      msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
      message(strwrap(msg))
      stop("Package ", pkgname, " could not be found.")
      rm(suggCall, msg)
   }
   if(verbose) message("Loading chip annotation information.")
     loader("address.rda", .crlmmPkgEnv, pkgname)
 #    data(annotation, package=pkgname, envir=.crlmmPkgEnv)
   aids <- getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest
   bids <- getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest
   ids <- names(aids)
   snpbase <- getVarInEnv("base")
56816837
 
e609f78b
   nsnps = length(aids)
   narrays = ncol(RG)
56816837
 
e609f78b
 #  aidcol = match("AddressA_ID", colnames(annot))
 #  if(is.na(aidcol))
 #    aidcol = match("Address", colnames(annot))
 #  bidcol = match("AddressB_ID", colnames(annot))
56816837
 #  if(is.na(bidcol))
e609f78b
 #    bidcol = match("Address2", colnames(annot))
 #  aids = annot[, aidcol]
 #  bids = annot[, bidcol]
 #  snpids = annot[,"Name"]
56816837
 #  snpbase = annot[,"SNP"]
   infI = !is.na(bids) & bids!=0
e609f78b
   aord = match(aids, featureNames(RG)) # NAs are possible here
   bord = match(bids, featureNames(RG)) # and here
 #  argrg = aids[rrgg]
 #  brgrg = bids[rrgg]
 
 #  fD <- new("AnnotatedDataFrame", data=data.frame(row.names=ids))##, varMetadata=data.frame(labelDescript
   XY <- new("NChannelSet",
 	     X=initializeBigMatrix(name="X", nr=nsnps, nc=narrays, vmode="integer"),
 	     Y=initializeBigMatrix(name="Y", nr=nsnps, nc=narrays, vmode="integer"),
 	     zero=initializeBigMatrix(name="zero", nr=nsnps, nc=narrays, vmode="integer"),
 	     annotation=chipType, phenoData=RG@phenoData, # featureData=fD
 	     protocolData=RG@protocolData, storage.mode="environment")
   featureNames(XY) = ids # featureNames(RG)
   gc()
a11efc9c
   # Need to initialize - matrices filled with NAs to begin with
   XY@assayData$X[1:nsnps,] = 0
   XY@assayData$Y[1:nsnps,] = 0
   XY@assayData$zero[1:nsnps,] = 0
56816837
 
e609f78b
   # First sort out Infinium II SNPs, X -> R (allele A)  and Y -> G (allele B) from the same probe
   XY@assayData$X[!is.na(aord),] = exprs(channel(RG, "R"))[aord[!is.na(aord)],] # mostly red
   XY@assayData$Y[!is.na(aord),] = exprs(channel(RG, "G"))[aord[!is.na(aord)],] # mostly green
   XY@assayData$zero[!is.na(aord),] = exprs(channel(RG, "zero"))[aord[!is.na(aord)],] # mostly green
 #  XY@assayData$Xnb[!is.na(aord),] = exprs(channel(RG, "Rnb"))[aord[!is.na(aord)],]
 #  XY@assayData$Ynb[!is.na(aord),] = exprs(channel(RG, "Gnb"))[aord[!is.na(aord)],]
 #  XY@assayData$Xse[!is.na(aord),] = exprs(channel(RG, "Rse"))[aord[!is.na(aord)],]
 #  XY@assayData$Yse[!is.na(aord),] = exprs(channel(RG, "Gse"))[aord[!is.na(aord)],]
   gc()
56816837
 
e609f78b
   ## Warning - not 100% sure that the code below is correct - could be more complicated than this
56816837
 
e609f78b
   # Next Infinium I where X -> R from allele A probe and Y -> R from allele B probe
 #  infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]")
56816837
 
e609f78b
 #  X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red
 #  Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green
 #  Xnb[infIRR,] = exprs(channel(RG, "Rnb"))[aord[infIRR],]
 #  Ynb[infIRR,] = exprs(channel(RG, "Rnb"))[bord[infIRR],]
 #  Xse[infIRR,] = exprs(channel(RG, "Rse"))[aord[infIRR],]
 #  Yse[infIRR,] = exprs(channel(RG, "Rse"))[bord[infIRR],]
56816837
 
e609f78b
   # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe
 #  infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]")
 
 #  X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red
 #  Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green
 #  Xnb[infIGG,] = exprs(channel(RG, "Gnb"))[aord[infIGG],]
 #  Ynb[infIGG,] = exprs(channel(RG, "Gnb"))[bord[infIGG],]
 #  Xse[infIGG,] = exprs(channel(RG, "Gse"))[aord[infIGG],]
 #  Yse[infIGG,] = exprs(channel(RG, "Gse"))[bord[infIGG],]
56816837
 
e609f78b
   #  For now zero out Infinium I probes
   XY@assayData$X[infI,] = 0
   XY@assayData$Y[infI,] = 0
56816837
   XY@assayData$zero[infI,] = 0
e609f78b
 #  XY@assayData$Xnb[infI,] = 0
 #  XY@assayData$Ynb[infI,] = 0
 #  XY@assayData$Xse[infI,] = 0
 #  XY@assayData$Yse[infI,] = 0
 
 #  XY@assayData$zero[XY@assayData$X==0 | XY@assayData$Y==0] = 1
   gc()
 
 #  storageMode(XY) = "lockedEnvironment"
   XY
 }
 
 
cd8b85b6
 stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) {
5fcfed7d
   pkgname <- getCrlmmAnnotationName(annotation(XY))
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
     message(strwrap(msg))
     stop("Package ", pkgname, " could not be found.")
     rm(suggCall, msg)
   }
   if(verbose) message("Loading strip and reference normalization information.")
   loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
 #  data(preprocStuff, package=pkgname, envir=.crlmmPkgEnv)
 
   stripnum <- getVarInEnv("stripnum")
56816837
 
5fcfed7d
   if(useTarget)
     targetdist = getVarInEnv("reference")
56816837
 
5fcfed7d
 #  Xqws = Yqws = matrix(0, nrow(XY), ncol(XY))
 #  colnames(Xqws) = colnames(Yqws) = sampleNames(XY) #$X
 #  rownames(Xqws) = rownames(Yqws) = featureNames(XY)
 
   if(verbose){
     message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.")
     if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=max(stripnum), style=3)
56816837
   }
5fcfed7d
   for(s in 1:max(stripnum)) {
     if(verbose) {
       if (getRversion() > '2.7.0') setTxtProgressBar(pb, s)
       else cat(".")
     }
     sel = stripnum==s
     subX = exprs(channel(XY, "X"))[sel,]
     subY = exprs(channel(XY, "Y"))[sel,]
     if(useTarget)
       tmp = normalize.quantiles.use.target(as.matrix(cbind(subX, subY)), targetdist[[s]])
     else
       tmp = normalize.quantiles(as.matrix(cbind(subX, subY)))
a11efc9c
     XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2)
     XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2)
5fcfed7d
 #    Xqws[sel,] = tmp[,1:(ncol(tmp)/2)]
 #    Yqws[sel,] = tmp[,(ncol(tmp)/2+1):ncol(tmp)]
     rm(subX, subY, tmp, sel)
     gc()
   }
   if(verbose)
     cat("\n")
   XY
 #  XYNorm = new("NChannelSet",
 #                X=Xqws+16,
 #                Y=Yqws+16,
 #                Xnb=exprs(channel(XY, "Xnb")),
 #                Ynb=exprs(channel(XY, "Ynb")),
 #                Xse=exprs(channel(XY, "Xse")),
 #                Yse=exprs(channel(XY, "Yse")),
 #                zero=exprs(channel(XY, "zero")),
 #                annotation=annotation(XY),
 #                phenoData=XY@phenoData)
cd8b85b6
 }
 
 
5fcfed7d
 preprocessInfinium2 <- function(XY, mixtureSampleSize=10^5,
 				fitMixture=TRUE,
 				eps=0.1,
 				verbose=TRUE,
 				seed=1,
 				cdfName,
 				sns,
 				stripNorm=TRUE,
 				useTarget=TRUE,
 				save.it=FALSE,
 				snpFile,
 				cnFile) {
   if(stripNorm)
     XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
 
 ## MR: the code below is mostly straight from snprma.R
   if (missing(sns)) sns <- sampleNames(XY) #$X
   if(missing(cdfName))
     cdfName <- annotation(XY)
 ##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
   pkgname <- getCrlmmAnnotationName(cdfName)
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
     message(strwrap(msg))
     stop("Package ", pkgname, " could not be found.")
     rm(suggCall, msg)
   }
   if(verbose) message("Loading snp annotation and mixture model parameters.")
   loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
   loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
   loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
   loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
 #  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
   autosomeIndex <- getVarInEnv("autosomeIndex")
   #  pnsa <- getVarInEnv("pnsa")
cd8b85b6
   #  pnsb <- getVarInEnv("pnsb")
   #  fid <- getVarInEnv("fid")
   #  reference <- getVarInEnv("reference")
   #  aIndex <- getVarInEnv("aIndex")
   #  bIndex <- getVarInEnv("bIndex")
5fcfed7d
   SMEDIAN <- getVarInEnv("SMEDIAN")
   theKnots <- getVarInEnv("theKnots")
   gns <- getVarInEnv("gns")
   narrays = ncol(XY)
56816837
 
5fcfed7d
   if(save.it & !missing(cnFile)) {
d6564036
 	  ## separate out copy number probes
 	  npIndex = getVarInEnv("npProbesFid")
 	  nprobes = length(npIndex)
 	  if(length(nprobes > 0)){
348dfe40
 		  is.ff <- is(XY@assayData$X, "ff") | is(XY@assayData$X, "ffdf")
 		  if(is.ff){
 			  open(XY@assayData$X)
 			  open(XY@assayData$Y)
 			  open(XY@assayData$zero)
 		  }
d6564036
 
 		  A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)
 		  B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
 
 		  ## new lines below - useful to keep track of zeroed out probes
 		  zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
 
 		  colnames(A) <- colnames(B) <- colnames(zero) <- sns
 		  rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
 
 		  cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
 
 		  t0 <- proc.time()
 		  save(cnAB, file=cnFile)
 		  t0 <- proc.time()-t0
 		  if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
 		  rm(cnAB, B, zero)
 	  }
5fcfed7d
   }
56816837
 
5fcfed7d
   # next process snp probes
   snpIndex = getVarInEnv("snpProbesFid")
   nprobes <- length(snpIndex)
56816837
 
5fcfed7d
   ##We will read each cel file, summarize, and run EM one by one
   ##We will save parameters of EM to use later
   mixtureParams <- matrix(0, 4, narrays)
   SNR <- vector("numeric", narrays)
   SKW <- vector("numeric", narrays)
 
   ## This is the sample for the fitting of splines
   ## BC: I like better the idea of the user passing the seed,
   ##     because this might intefere with other analyses
   ##     (like what happened to GCRMA)
   set.seed(seed)
   idx <- sort(sample(autosomeIndex, mixtureSampleSize))
a11efc9c
   idx2 <- sample(nprobes, 10^5)
56816837
 
5fcfed7d
   ##S will hold (A+B)/2 and M will hold A-B
   ##NOTE: We actually dont need to save S. Only for pics etc...
   ##f is the correction. we save to avoid recomputing
   A <- matrix(as.integer(exprs(channel(XY, "X"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
   B <- matrix(as.integer(exprs(channel(XY, "Y"))[snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
 
   # new lines below - useful to keep track of zeroed out SNPs
   zero <- matrix(as.integer(exprs(channel(XY, "zero"))[snpIndex,]), nprobes, narrays)
56816837
 
5fcfed7d
   colnames(A) <- colnames(B) <- colnames(zero) <- sns
   rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
56816837
 
5fcfed7d
   if(verbose){
      message("Calibrating ", narrays, " arrays.")
      if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
   }
 
a11efc9c
 
5fcfed7d
   for(i in 1:narrays){
     SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
     if(fitMixture){
       S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
       M <- log2(A[idx, i])-log2(B[idx, i])
 
       ##we need to test the choice of eps.. it is not the max diff between funcs
       tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
 
       mixtureParams[, i] <- tmp[["coef"]]
       SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
     }
     if(verbose) {
        if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
        else cat(".")
     }
   }
   if (verbose) {
     if (getRversion() > '2.7.0') close(pb)
     else cat("\n")
   }
   if (!fitMixture) SNR <- mixtureParams <- NA
   ## gns comes from preprocStuff.rda
   res = list(A=A, B=B, zero=zero, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
 
   if(save.it & !missing(snpFile)) {
56816837
     t0 <- proc.time()
     save(res, file=snpFile)
5fcfed7d
     t0 <- proc.time()-t0
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
   }
   return(res)
cd8b85b6
 }
 
 
e609f78b
 preprocessInfinium2v2 <- function(XY, mixtureSampleSize=10^5,
 				fitMixture=TRUE,
 				eps=0.1,
 				verbose=TRUE,
 				seed=1,
 				cdfName,
 				sns,
 				stripNorm=TRUE,
a11efc9c
 				useTarget=TRUE, # ) { #,
 				save.it=FALSE,
 				snpFile,
 				cnFile) {
e609f78b
   if(stripNorm)
     XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose)
 
 ## MR: the code below is mostly straight from snprma.R
   if (missing(sns)) sns <- sampleNames(XY) #$X
   if(missing(cdfName))
     cdfName <- annotation(XY)
 ##  stuffDir <- changeToCrlmmAnnotationName(cdfName)
   pkgname <- getCrlmmAnnotationName(cdfName)
   if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
     suggCall <- paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="")
     msg <- paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall)
     message(strwrap(msg))
     stop("Package ", pkgname, " could not be found.")
     rm(suggCall, msg)
   }
   if(verbose) message("Loading snp annotation and mixture model parameters.")
   loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
   loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
   loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname)
   loader("npProbesFid.rda", .crlmmPkgEnv, pkgname)
 #  data(genotypeStuff, mixtureStuff, package=pkgname, envir=.crlmmPkgEnv)
   autosomeIndex <- getVarInEnv("autosomeIndex")
   #  pnsa <- getVarInEnv("pnsa")
   #  pnsb <- getVarInEnv("pnsb")
   #  fid <- getVarInEnv("fid")
   #  reference <- getVarInEnv("reference")
   #  aIndex <- getVarInEnv("aIndex")
   #  bIndex <- getVarInEnv("bIndex")
   SMEDIAN <- getVarInEnv("SMEDIAN")
   theKnots <- getVarInEnv("theKnots")
a11efc9c
 #  gns <- featureNames(XY) # getVarInEnv("gns") # needs to include np probes - gns is only for snps
e609f78b
   narrays = ncol(XY)
56816837
 
a11efc9c
   if(save.it & !missing(cnFile)) {
     # separate out copy number probes
     npIndex = getVarInEnv("npProbesFid")
     nprobes = length(npIndex)
     A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)
     B <- matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays)
e609f78b
 
a11efc9c
     # new lines below - useful to keep track of zeroed out probes
56816837
     zero <- matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays)
e609f78b
 
a11efc9c
     colnames(A) <- colnames(B) <- colnames(zero) <- sns
     rownames(A) <- rownames(B) <- rownames(zero) <- names(npIndex)
56816837
 
a11efc9c
     cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName)
56816837
 
     t0 <- proc.time()
     save(cnAB, file=cnFile)
a11efc9c
     t0 <- proc.time()-t0
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".")
      rm(cnAB, B, zero)
   }
56816837
 
e609f78b
   # next process snp probes
   snpIndex = getVarInEnv("snpProbesFid")
   nprobes <- length(snpIndex)
56816837
 
e609f78b
   ##We will read each cel file, summarize, and run EM one by one
   ##We will save parameters of EM to use later
a11efc9c
   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, narrays, "double")
   SNR <- initializeBigVector("crlmmSNR-", narrays, "double")
56816837
   SKW <- initializeBigVector("crlmmSKW-", narrays, "double")
a11efc9c
 #  mixtureParams <- matrix(0, 4, narrays)
 #  SNR <- vector("numeric", narrays)
 #  SKW <- vector("numeric", narrays)
e609f78b
 
   ## This is the sample for the fitting of splines
   ## BC: I like better the idea of the user passing the seed,
   ##     because this might intefere with other analyses
   ##     (like what happened to GCRMA)
   set.seed(seed)
   idx <- sort(sample(autosomeIndex, mixtureSampleSize))
a11efc9c
   idx2 <- sample(nprobes, 10^5)
56816837
 
e609f78b
   ##S will hold (A+B)/2 and M will hold A-B
   ##NOTE: We actually dont need to save S. Only for pics etc...
   ##f is the correction. we save to avoid recomputing
a11efc9c
 #  A <- exprs(channel(XY, "X"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsa), length(filenames))
 #  B <- exprs(channel(XY, "Y"))[,] # ), nrow(XY), narrays) # [snpIndex,]), nprobes, narrays) # matrix(as.integer(0), length(pnsb), length(filenames))
e609f78b
 
   # new lines below - useful to keep track of zeroed out SNPs
a11efc9c
 #  zero <- exprs(channel(XY, "zero"))[,] # )) #[snpIndex,]), nprobes, narrays)
e609f78b
 
 #  if(!is.matrix(A)) {
 #     A = A[,]
 #     B = B[,]
 #     zero = zero[,]
 #  }
 
a11efc9c
 #  if(!is.integer(A)) {
 #     A = matrix(as.integer(A), nrow(A), ncol(A))
 #     B = matrix(as.integer(B), nrow(B), ncol(B))
56816837
 #  }
 
e609f78b
 #  colnames(A) <- colnames(B) <- colnames(zero) <- sns
 #  rownames(A) <- rownames(B) <- rownames(zero) <- names(snpIndex) # gns # featureNames(XY)
a11efc9c
 
   A <- initializeBigMatrix("crlmmA-", nprobes, narrays, "integer")
   B <- initializeBigMatrix("crlmmB-", nprobes, narrays, "integer")
   zero <- initializeBigMatrix("crlmmZero-", nprobes, narrays, "integer")
56816837
 
e609f78b
   if(verbose){
      message("Calibrating ", narrays, " arrays.")
      if (getRversion() > '2.7.0') pb <- txtProgressBar(min=0, max=narrays, style=3)
   }
 
   for(i in 1:narrays){
a11efc9c
      A[,i] = exprs(channel(XY, "X"))[snpIndex,i]
      B[,i] = exprs(channel(XY, "Y"))[snpIndex,i]
56816837
      zero[,i] = exprs(channel(XY, "zero"))[snpIndex,i]
a11efc9c
 #    SKW[i] = mean((A[snpIndex,i][idx2]-mean(A[snpIndex,i][idx2]))^3)/(sd(A[snpIndex,i][idx2])^3)
      SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3)
e609f78b
     if(fitMixture){
a11efc9c
       S <- (log2(A[idx,i])+log2(B[idx,i]))/2 - SMEDIAN
       M <- log2(A[idx,i])-log2(B[idx,i])
e609f78b
 
       ##we need to test the choice of eps.. it is not the max diff between funcs
       tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
 
       mixtureParams[, i] <- tmp[["coef"]]
       SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
     }
     if(verbose) {
        if (getRversion() > '2.7.0') setTxtProgressBar(pb, i)
        else cat(".")
     }
   }
   if (verbose) {
     if (getRversion() > '2.7.0') close(pb)
     else cat("\n")
   }
   if (!fitMixture) SNR <- mixtureParams <- NA
   ## gns comes from preprocStuff.rda
a11efc9c
   res = list(A=A, B=B,
              zero=zero, sns=sns, gns=names(snpIndex), SNR=SNR, SKW=SKW,
              mixtureParams=mixtureParams, cdfName=cdfName) # , snpIndex=snpIndex, npIndex=npIndex)
e609f78b
 
a11efc9c
   if(save.it & !missing(snpFile)) {
56816837
     t0 <- proc.time()
     save(res, file=snpFile)
a11efc9c
     t0 <- proc.time()-t0
     if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".")
   }
   close(A)
   close(B)
   close(zero)
   close(SKW)
   close(mixtureParams)
   close(SNR)
e609f78b
   return(res)
 }
 
 
5fcfed7d
 crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
                   row.names=TRUE, col.names=TRUE,
                   probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
                   seed=1, save.it=FALSE, load.it=FALSE, snpFile, cnFile,
                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
                   cdfName, sns, recallMin=10, recallRegMin=1000,
                   returnParams=FALSE, badSNP=.7) {
   if (save.it & (missing(snpFile) | missing(cnFile)))
     stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
   if (load.it & missing(snpFile))
     stop("'snpFile' is missing and you chose to load.it")
   if (!missing(snpFile))
     if (load.it & !file.exists(snpFile)){
       load.it <- FALSE
       message("File ", snpFile, " does not exist.")
       stop("Cannot load SNP data.")
   }
   if (!load.it){
     if(!missing(RG)) {
       if(missing(XY))
         XY = RGtoXY(RG, chipType=cdfName)
       else
         stop("Both RG and XY specified - please use one or the other")
     }
     if (missing(sns)) sns <- sampleNames(XY) #$X
56816837
 
5fcfed7d
     res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
                         seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget,
                         save.it=save.it, snpFile=snpFile, cnFile=cnFile)
   }else{
       if(verbose) message("Loading ", snpFile, ".")
         obj <- load(snpFile)
         if(verbose) message("Done.")
         if(!any(obj == "res"))
           stop("Object in ", snpFile, " seems to be invalid.")
   }
a11efc9c
   if(row.names) row.names=res[["gns"]] else row.names=NULL
   if(col.names) col.names=res[["sns"]] else col.names=NULL
5fcfed7d
 
   res2 <- crlmmGT(res[["A"]], res[["B"]], res[["SNR"]],
                   res[["mixtureParams"]], res[["cdfName"]],
                   gender=gender, row.names=row.names,
                   col.names=col.names, recallMin=recallMin,
                   recallRegMin=1000, SNRMin=SNRMin,
                   returnParams=returnParams, badSNP=badSNP,
                   verbose=verbose)
56816837
 
5fcfed7d
   res2[["SNR"]] <- res[["SNR"]]
   res2[["SKW"]] <- res[["SKW"]]
   rm(res)
   gc()
   return(list2SnpSet(res2, returnParams=returnParams)) # return(res2)
 }
 
e609f78b
 
 crlmmIllumina2 <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE,
                   row.names=TRUE, col.names=TRUE,
                   probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
a11efc9c
                   seed=1, save.it=FALSE, load.it=FALSE, snpFile, cnFile,
e609f78b
                   mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
                   cdfName, sns, recallMin=10, recallRegMin=1000,
                   returnParams=FALSE, badSNP=.7) {
a11efc9c
   if (save.it & (missing(snpFile) | missing(cnFile)))
     stop("'snpFile' and/or 'cnFile' is missing and you chose to save.it")
   if (load.it & missing(snpFile))
     stop("'snpFile' is missing and you chose to load.it")
   if (!missing(snpFile))
     if (load.it & !file.exists(snpFile)){
       load.it <- FALSE
       message("File ", snpFile, " does not exist.")
       stop("Cannot load SNP data.")
   }
   if (!load.it){
e609f78b
     if(!missing(RG)) {
       if(missing(XY))
         XY = RGtoXY2(RG, chipType=cdfName)
       else
         stop("Both RG and XY specified - please use one or the other")
     }
     if (missing(sns)) sns <- sampleNames(XY) #$X
56816837
 
e609f78b
     res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
                         seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #,
                       #  save.it=save.it, snpFile=snpFile, cnFile=cnFile)
 
a11efc9c
 #    fD = featureData(XY)
 #    phenD = XY@phenoData
 #    protD = XY@protocolData
 #    rm(XY)
 #    gc()
 #    if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
 #    callSet <- new("SnpSuperSet",
 #                    alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
 #                    alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
 #                    call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
 #                    callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
56816837
 #  	            annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD)
a11efc9c
 #    sampleNames(callSet) <- sns
 #    featureNames(callSet) <- res[["gns"]]
 #    pData(callSet)$SKW <- rep(NA, length(sns))
 #    pData(callSet)$SNR <- rep(NA, length(sns))
 #    pData(callSet)$gender <- rep(NA, length(sns))
56816837
 
a11efc9c
   }else{
       if(verbose) message("Loading ", snpFile, ".")
         obj <- load(snpFile)
         if(verbose) message("Done.")
         if(!any(obj == "res"))
           stop("Object in ", snpFile, " seems to be invalid.")
   }
e609f78b
 
a11efc9c
  #   rm(phenD, protD , fD)
56816837
 
 #    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))
a11efc9c
 #    suppressWarnings(A(callSet) <- res[["A"]])
 #    suppressWarnings(B(callSet) <- res[["B"]])
 #    pData(callSet)$SKW <- res$SKW
 #    pData(callSet)$SNR <- res$SNR
 #    mixtureParams <- res$mixtureParams
 #    rm(res); gc()
   if(row.names) row.names=res$gns else row.names=NULL
   if(col.names) col.names=res$sns else col.names=NULL
56816837
 
a11efc9c
   res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]),
                   B=res[["B"]], # as.matrix(B(callSet)[snp.index,]),  # j]),
                   SNR=res[["SNR"]], # callSet$SNR, # [j],
                   mixtureParams=res[["mixtureParams"]], #,
                   cdfName=res[["cdfName"]], # annotation(callSet),
                   row.names=row.names, # featureNames(callSet)[snp.index],
                   col.names=col.names, # sampleNames(callSet), #[j],
e609f78b
                   probs=probs,
                   DF=DF,
                   SNRMin=SNRMin,
                   recallMin=recallMin,
                   recallRegMin=recallRegMin,
                   gender=gender,
                   verbose=verbose,
                   returnParams=returnParams,
                   badSNP=badSNP)
56816837
 #    rm(res); gc()
e609f78b
 #    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
 #    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
 #    callSet$gender[j] <- tmp$gender
a11efc9c
 #    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
 #    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
 #    callSet$gender <- tmp$gender
 #    rm(tmp); gc()
 #    return(callSet)
 
   res2[["SNR"]] <- res[["SNR"]]
   res2[["SKW"]] <- res[["SKW"]]
   rm(res); gc()
   return(list2SnpSet(res2, returnParams=returnParams))
e609f78b
 }
 
 
56816837
 ## MR: Below is a more memory efficient version of crlmmIllumina() which
 ## reads in the .idats and genotypes in the one function and removes objects
4e4601af
 ## after they have been used
5fcfed7d
 crlmmIlluminaV2 = function(sampleSheet=NULL,
e609f78b
 			  arrayNames=NULL,
5fcfed7d
 			  ids=NULL,
 			  path=".",
 			  arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"),
 			  highDensity=FALSE,
 			  sep="_",
 			  fileExt=list(green="Grn.idat", red="Red.idat"),
 			  saveDate=FALSE,
e609f78b
 #                          save.rg=FALSE,
 #                          rgFile,
5fcfed7d
 			  stripNorm=TRUE,
 			  useTarget=TRUE,
56816837
           	          row.names=TRUE,
5fcfed7d
 			  col.names=TRUE,
 			  probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL,
a11efc9c
                           seed=1, save.ab=FALSE, snpFile, cnFile,
e609f78b
                           mixtureSampleSize=10^5, eps=0.1, verbose=TRUE,
                           cdfName, sns, recallMin=10, recallRegMin=1000,
                           returnParams=FALSE, badSNP=.7) {
 
   if(missing(cdfName)) stop("must specify cdfName")
   if(!isValidCdfName(cdfName)) stop("cdfName not valid.  see validCdfNames")
 #  if(missing(sns)) sns <- basename(arrayNames)
56816837
 
e609f78b
 #  if (save.rg & missing(rgFile))
 #    stop("'rgFile' is missing, and you chose save.rg")
a11efc9c
   if (save.ab & (missing(snpFile) | missing(cnFile)))
     stop("'snpFile' or 'cnFile' is missing and you chose save.ab")
e609f78b
 #  batches = NULL
 #  if(!is.null(arrayNames))
 #    batches <- rep(1, length(arrayNames)) # problem here if arrayNames not specified! # splitIndicesByLength(seq(along=arrayNames), ocSamples())
 #  if(!is.null(sampleSheet))
 #    batches <- rep(1, nrow(sampleSheet))
 #  if(is.null(batches))
 #    batches=1
 #  k <- 1
 #  for(j in batches){
 #     if(verbose) message("Batch ", k, " of ", length(batches))
 #     RG = readIdatFiles(sampleSheet=sampleSheet[j,], arrayNames=arrayNames[j],
 #                       ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
 #                       highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
      RG = readIdatFiles2(sampleSheet=sampleSheet, arrayNames=arrayNames,
4e4601af
                        ids=ids, path=path, arrayInfoColNames=arrayInfoColNames,
                        highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate)
 
e609f78b
 #  if(save.rg)
 #	save(RG, file=rgFile)
4e4601af
 
e609f78b
     XY = RGtoXY2(RG, chipType=cdfName)
a11efc9c
     rm(RG); gc()
e609f78b
     if (missing(sns)) { sns = sampleNames(XY) #subsns = sampleNames(XY)
a11efc9c
     } # else subsns = sns[j]
e609f78b
     res = preprocessInfinium2v2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose,
a11efc9c
                                seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget, #) # sns=subsns
                                save.it=save.ab, snpFile=snpFile, cnFile=cnFile)
 #    fD = featureData(XY)
 #    phenD = XY@phenoData
 #    protD = XY@protocolData
     rm(XY); gc()
e609f78b
 #    if(k == 1){
a11efc9c
 #    if(verbose) message("Initializing container for alleleA, alleleB, call, callProbability")
 #    callSet <- new("SnpSuperSet",
 #                    alleleA=initializeBigMatrix(name="A", nr=nrow(res[[1]]), nc=length(sns)),
 #                    alleleB=initializeBigMatrix(name="B", nr=nrow(res[[1]]), nc=length(sns)),
 #                    call=initializeBigMatrix(name="call", nr=nrow(res[[1]]), nc=length(sns)),
 #                    callProbability=initializeBigMatrix(name="callPr", nr=nrow(res[[1]]), nc=length(sns)),
 # 		    annotation=cdfName, protocolData=protD, phenoData=phenD, featureData=fD)
 #    sampleNames(callSet) <- sns
e609f78b
 #            phenoData(callSet) <- getPhenoData(sampleSheet=sampleSheet,
 #                                    arrayNames=sns,
 #								    arrayInfoColNames=arrayInfoColNames)
 #            pD <- data.frame(matrix(NA, length(sns), 1), row.names=sns)
 #            colnames(pD) <- "ScanDate"
 #            protocolData(callSet) <- pData(protD) # new("AnnotatedDataFrame", data=pD)
 #            pData(protocolData(callSet))[j, ] <- pData(protocolData)
a11efc9c
 #    featureNames(callSet) <- res[["gns"]]
 #    pData(callSet)$SKW <- rep(NA, length(sns))
 #    pData(callSet)$SNR <- rep(NA, length(sns))
56816837
 #    pData(callSet)$gender <- rep(NA, length(sns))
e609f78b
 #	}
 #	pData(callSet)[j,] <- phenD
 #	pData(protocolData(callSet))[j,] <- protD
 #	pData(callSet) <- phenD
 #	pData(protocolData(callSet)) <- protD
 
a11efc9c
 #    rm(phenD, protD, fD)
56816837
 
e609f78b
 #    if(k > 1 & nrow(res[[1]]) != nrow(callSet)){
         ##RS: I don't understand why the IDATS for the
         ##same platform potentially have different lengths
 #        res[["A"]] <- res[["A"]][res$gns %in% featureNames(callSet), ]
 #        res[["B"]] <- res[["B"]][res$gns %in% featureNames(callSet), ]
 #    }
 
56816837
 #    snp.index <- res$snpIndex #match(res$gns, featureNames(callSet))
e609f78b
 #    suppressWarnings(A(callSet)[, j] <- res[["A"]])
 #    suppressWarnings(B(callSet)[, j] <- res[["B"]])
a11efc9c
 #    suppressWarnings(A(callSet) <- res[["A"]])
 #    suppressWarnings(B(callSet) <- res[["B"]])
e609f78b
 #    pData(callSet)$SKW[j] <- res$SKW
 #    pData(callSet)$SNR[j] <- res$SNR
a11efc9c
 #    pData(callSet)$SKW <- res$SKW
 #    pData(callSet)$SNR <- res$SNR
 #    mixtureParams <- res$mixtureParams
 #    rm(res); gc()
   if(row.names) row.names=res$gns else row.names=NULL
56816837
   if(col.names) col.names=res$sns else col.names=NULL
a11efc9c
   res2 <- crlmmGT2(A=res[["A"]], #as.matrix(A(callSet)[snp.index,]), # j]),
                   B=res[["B"]], # as.matrix(B(callSet)[snp.index,]),  # j]),
                   SNR=res[["SNR"]], # callSet$SNR, # [j],
                   mixtureParams=res[["mixtureParams"]], #,
                   cdfName=res[["cdfName"]], # annotation(callSet),
                   row.names=row.names, # featureNames(callSet)[snp.index],
                   col.names=col.names, # sampleNames(callSet), #[j],
e609f78b
                   probs=probs,
                   DF=DF,
                   SNRMin=SNRMin,
                   recallMin=recallMin,
                   recallRegMin=recallRegMin,
                   gender=gender,
                   verbose=verbose,
                   returnParams=returnParams,
                   badSNP=badSNP)
56816837
 #    rm(res); gc()
a11efc9c
 #    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
 #    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
 #    callSet$gender[j] <- tmp$gender
 #    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
 #    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
 #    callSet$gender <- tmp$gender
 #    rm(tmp); gc()
 #    return(callSet)
 
   res2[["SNR"]] <- res[["SNR"]]
   res2[["SKW"]] <- res[["SKW"]]
   rm(res); gc()
   return(list2SnpSet(res2, returnParams=returnParams))
 #    tmp <- crlmmGT(A=as.matrix(A(callSet)[snp.index,]), # j]),
 #                  B=as.matrix(B(callSet)[snp.index,]),  # j]),
 #                  SNR=callSet$SNR, # [j],
 #                  mixtureParams=mixtureParams,
 #                  cdfName=annotation(callSet),
 #                  row.names=featureNames(callSet)[snp.index],
 #                  col.names=sampleNames(callSet), #[j],
 #                  probs=probs,
 #                  DF=DF,
 #                  SNRMin=SNRMin,
 #                  recallMin=recallMin,
 #                  recallRegMin=recallRegMin,
 #                  gender=gender,
 #                  verbose=verbose,
 #                  returnParams=returnParams,
 #                  badSNP=badSNP)
e609f78b
 #    suppressWarnings(snpCall(callSet)[snp.index, j] <- tmp[["calls"]])
 #    suppressWarnings(snpCallProbability(callSet)[snp.index, j] <- tmp[["confs"]])
 #    callSet$gender[j] <- tmp$gender
a11efc9c
 #    suppressWarnings(snpCall(callSet)[snp.index,] <- tmp[["calls"]])
 #    suppressWarnings(snpCallProbability(callSet)[snp.index,] <- tmp[["confs"]])
 #    callSet$gender <- tmp$gender
 #    rm(tmp); gc()
e609f78b
 #    k <- k+1
 #  }
a11efc9c
 #    return(callSet)
4e4601af
 }