# function below works OK provided all .idat files are in the current working directory # - could add an option to allow files in Illumina directory structure to be handled # or to use the optional 'Path' column in sampleSheet # - there is a lot of header information that is currently discarded - could try and store this somewhere in the resulting NChannelSet readIdatFiles = function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), saveDate=FALSE, verbose=FALSE) { verbose <- FALSE if(!is.null(arrayNames)) { pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) } if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet if(is.null(arrayNames)){ if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) { barcode = sampleSheet[,arrayInfoColNames$barcode] arrayNames=barcode } if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) { position = sampleSheet[,arrayInfoColNames$position] if(is.null(arrayNames)) arrayNames=position else arrayNames = paste(arrayNames, position, sep=sep) if(highDensity) { hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02") for(i in names(hdExt)) arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames) } } } pd = new("AnnotatedDataFrame", data = sampleSheet) sampleNames(pd) = make.unique(basename(arrayNames)) } if(is.null(arrayNames)) { arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) if(!is.null(sampleSheet)) { sampleSheet=NULL cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n") } pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) } narrays = length(arrayNames) grnfiles = paste(arrayNames, fileExt$green, sep=sep) redfiles = paste(arrayNames, fileExt$red, sep=sep) if(length(grnfiles)==0 || length(redfiles)==0) stop("Cannot find .idat files") if(length(grnfiles)!=length(redfiles)) stop("Cannot find matching .idat files") if(path[1] != "." & path[1] != ""){ grnidats = file.path(path, grnfiles) redidats = file.path(path, redfiles) } else { message("path arg not set. Assuming files are in local directory, or that complete path is provided") grnidats = grnfiles redidats = redfiles } if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files") if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files") headerInfo = list(nProbes = rep(NA, narrays), Barcode = rep(NA, narrays), ChipType = rep(NA, narrays), Manifest = rep(NA, narrays), # not sure about this one - sometimes blank Position = rep(NA, narrays)) # this may also vary a bit dates = list(decode=rep(NA, narrays), scan=rep(NA, narrays)) ## read in the data for(i in seq_along(arrayNames)) { if(verbose) { ## RS ##cat("reading", arrayNames[i], "\t") cat("reading", basename(arrayNames[i]), "\t") cat(paste(sep, fileExt$green, sep=""), "\t") } idsG = idsR = G = R = NULL G = readIDAT(grnidats[i]) idsG = rownames(G$Quants) headerInfo$nProbes[i] = G$nSNPsRead headerInfo$Barcode[i] = G$Barcode headerInfo$ChipType[i] = G$ChipType headerInfo$Manifest[i] = G$Unknown$MostlyNull headerInfo$Position[i] = G$Unknowns$MostlyA if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) { warning("Chips are not of the same type. Skipping ", basename(grnidats[i]), " and ", basename(redidats[i])) next() } dates$decode[i] = G$RunInfo[1, 1] dates$scan[i] = G$RunInfo[2, 1] if(i==1) { if(is.null(ids) && !is.null(G)){ ids = idsG } # else stop("Could not find probe IDs") nprobes = length(ids) narrays = length(arrayNames) RG = new("NChannelSet", R=matrix(NA, nprobes, narrays), G=matrix(NA, nprobes, narrays), zero=matrix(NA, nprobes, narrays), annotation=headerInfo$Manifest[1], phenoData=pd, storage.mode="environment") featureNames(RG) = ids if(!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)){ sampleNames(RG) = make.unique(sampleSheet$Sample_ID) } else sampleNames(RG) = make.unique(basename(arrayNames)) gc(verbose=FALSE) } if(length(ids)==length(idsG)) { if(sum(ids==idsG)==nprobes) { RG@assayData$G[,i] = G$Quants[, "Mean"] zeroG = G$Quants[, "NBeads"]==0 } } else { indG = match(ids, idsG) RG@assayData$G[,i] = G$Quants[indG, "Mean"] zeroG = G$Quants[indG, "NBeads"]==0 } rm(G) gc(verbose=FALSE) if(verbose) { cat(paste(sep, fileExt$red, sep=""), "\n") } R = readIDAT(redidats[i]) idsR = rownames(R$Quants) if(length(ids)==length(idsG)) { if(sum(ids==idsR)==nprobes) { RG@assayData$R[,i] = R$Quants[ ,"Mean"] zeroR = R$Quants[ ,"NBeads"]==0 } } else { indR = match(ids, idsR) RG@assayData$R[,i] = R$Quants[indR, "Mean"] zeroR = R$Quants[indR, "NBeads"]==0 } RG@assayData$zero[,i] = zeroG | zeroR rm(R, zeroG, zeroR) gc(verbose=FALSE) } if(saveDate) { protocolData(RG)[["ScanDate"]] = dates$scan } storageMode(RG) = "lockedEnvironment" RG } ## the readIDAT() and readBPM() functions below were provided by Keith Baggerly, 27/8/2008 ## edits provided by Kasper Daniel Hansen, 4/10/2011 readIDAT <- function(idatFile){ fileSize <- file.info(idatFile)$size tempCon <- file(idatFile,"rb") prefixCheck <- readChar(tempCon,4) # if(prefixCheck != "IDAT"){ # warning("May need to check format of ", idatFile) # } versionNumber <- readBin(tempCon, "integer", n=1, size=8, endian="little") if(versionNumber<3) stop("Older style IDAT files not supported: consider updating your scanner settings") nFields <- readBin(tempCon, "integer", n=1, size=4, endian="little") fields <- matrix(0,nFields,3); colnames(fields) <- c("Field Code", "Byte Offset", "Bytes") for(i1 in 1:nFields){ fields[i1,"Field Code"] <- readBin(tempCon, "integer", n=1, size=2, endian="little", signed=FALSE) fields[i1,"Byte Offset"] <- readBin(tempCon, "integer", n=1, size=8, endian="little") } knownCodes <- c("nSNPsRead" = 1000, "IlluminaID" = 102, "SD" = 103, "Mean" = 104, "NBeads" = 107, "MidBlock" = 200, "RunInfo" = 300, "RedGreen" = 400, "MostlyNull" = 401, "Barcode" = 402, "ChipType" = 403, "MostlyA" = 404, "Unknown.1" = 405, "Unknown.2" = 406, "Unknown.3" = 407, "Unknown.4" = 408, "Unknown.5" = 409, "Unknown.6" = 410, "Unknown.7" = 510 ) nNewFields <- 1 rownames(fields) <- paste("Null", 1:nFields) for(i1 in 1:nFields){ temp <- match(fields[i1,"Field Code"], knownCodes) if(!is.na(temp)){ rownames(fields)[i1] <- names(knownCodes)[temp] }else{ rownames(fields)[i1] <- paste("newField", nNewFields, sep=".") nNewFields <- nNewFields + 1 } } fields <- fields[order(fields[, "Byte Offset"]),] seek(tempCon, fields["nSNPsRead", "Byte Offset"]) nSNPsRead <- readBin(tempCon, "integer", n=1, size=4, endian="little") readBlock <- function(nam) { switch(nam, "IlluminaID" = { seek(tempCon, fields["IlluminaID", "Byte Offset"]) IlluminaID <- readBin(tempCon, "integer", n=nSNPsRead, size=4, endian="little") IlluminaID }, "SD" = { seek(tempCon, fields["SD", "Byte Offset"]) SD <- readBin(tempCon, "integer", n=nSNPsRead, size=2, endian="little", signed=FALSE) SD }, "Mean" = { seek(tempCon, fields["Mean", "Byte Offset"]) Mean <- readBin(tempCon, "integer", n=nSNPsRead, size=2, endian="little", signed=FALSE) Mean }, "NBeads" = { seek(tempCon, fields["NBeads", "Byte Offset"]) NBeads <- readBin(tempCon, "integer", n=nSNPsRead, size=1, signed=FALSE) NBeads }, "MidBlock" = { seek(tempCon, fields["MidBlock", "Byte Offset"]) nMidBlockEntries <- readBin(tempCon, "integer", n=1, size=4, endian="little") MidBlock <- readBin(tempCon, "integer", n=nMidBlockEntries, size=4, endian="little") MidBlock }, "RunInfo" = { seek(tempCon, fields["RunInfo", "Byte Offset"]) nRunInfoBlocks <- readBin(tempCon, "integer", n=1, size=4, endian="little") RunInfo <- matrix(NA, nRunInfoBlocks, 5) colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars", "BlockCode", "CodeVersion") for(i1 in 1:2) { #nRunInfoBlocks){ ## MR edit for(i2 in 1:5){ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) RunInfo[i1,i2] <- readChar(tempCon, nChars) } } RunInfo }, "RedGreen" = { seek(tempCon, fields["RedGreen", "Byte Offset"]) RedGreen <- readBin(tempCon, "numeric", n=1, size=4, endian="little") RedGreen }, "MostlyNull" = { seek(tempCon, fields["MostlyNull", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) MostlyNull <- readChar(tempCon, nChars) MostlyNull }, "Barcode" = { seek(tempCon, fields["Barcode", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Barcode <- readChar(tempCon, nChars) Barcode }, "ChipType" = { seek(tempCon, fields["ChipType", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) ChipType <- readChar(tempCon, nChars) ChipType }, "MostlyA" = { seek(tempCon, fields["MostlyA", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) MostlyA <- readChar(tempCon, nChars) }, "Unknown.1" = { seek(tempCon, fields["Unknown.1", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.1 <- readChar(tempCon, nChars) Unknown.1 }, "Unknown.2" = { seek(tempCon, fields["Unknown.2", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.2 <- readChar(tempCon, nChars) Unknown.2 }, "Unknown.3" = { seek(tempCon, fields["Unknown.3", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.3 <- readChar(tempCon, nChars) Unknown.3 }, "Unknown.4" = { seek(tempCon, fields["Unknown.4", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.4 <- readChar(tempCon, nChars) Unknown.4 }, "Unknown.5" = { seek(tempCon, fields["Unknown.5", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.5 <- readChar(tempCon, nChars) Unknown.5 }, "Unknown.6" = { seek(tempCon, fields["Unknown.6", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.6 <- readChar(tempCon, nChars) Unknown.6 }, "Unknown.7" = { seek(tempCon, fields["Unknown.7", "Byte Offset"]) nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) Unknown.7 <- readChar(tempCon, nChars) Unknown.7 }) } readFields <- setdiff(rownames(fields), "nSNPsRead") names(readFields) <- readFields allFields <- lapply(readFields, readBlock) close(tempCon) UnknownNames <- c("MostlyNull", "MostlyA", "Unknown.1", "Unknown.2", "Unknown.3", "Unknown.4", "Unknown.5", "Unknown.6", "Unknown.7") Unknowns <- allFields[intersect(names(allFields), UnknownNames)] Quants <- cbind(allFields$Mean, allFields$SD, allFields$NBeads) colnames(Quants) <- c("Mean", "SD", "NBeads") rownames(Quants) <- as.character(allFields$IlluminaID) InfoNames <- c("MidBlock", "RunInfo", "RedGreen", "Barcode", "ChipType") Info <- allFields[intersect(names(allFields), InfoNames)] idatValues <- list(fileSize=fileSize, versionNumber=versionNumber, nFields=nFields, fields=fields, nSNPsRead=nSNPsRead, Quants=Quants) idatValues <- c(idatValues, Info, list(Unknowns = Unknowns)) idatValues } readBPM <- function(bpmFile){ ## Reads and parses Illumina BPM files fileSize <- file.info(bpmFile)$size tempCon <- file(bpmFile,"rb") # The first few bytes of the egtFile are some type of # header, but there's no related byte offset information. prefixCheck <- readChar(tempCon,3) ## should be "BPM" null.1 <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) ## should be 1 versionNumber <- readBin(tempCon, "integer", n=1, size=4, endian="little") ## should be 4 nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) chipType <- readChar(tempCon, nChars) null.2 <- readBin(tempCon, "integer", n=2, size=1, signed=FALSE) csvLines <- readLines(tempCon, 22) entriesByteOffset <- seek(tempCon); nEntries <- readBin(tempCon, "integer", n=1, size=4, endian="little") if(FALSE){ snpIndexByteOffset <- seek(tempCon) snpIndex <- readBin(tempCon, "integer", n=nEntries, size=4, endian="little") ## for the 1M array, these are simply in order from 1 to 1072820. snpNamesByteOffset <- seek(tempCon) snpNames <- rep("A", nEntries) for(i1 in 1:nEntries){ nChars <- readBin(tempCon, "integer", n=1, size=1, signed=FALSE) snpNames[i1] <- readChar(tempCon, nChars) } } seek(tempCon, 15278138) normIDByteOffset <- seek(tempCon) normID <- readBin(tempCon, "integer", n=nEntries, size=1, signed=FALSE) + 1 newBlockByteOffset <- seek(tempCon) newBlock <- readBin(tempCon, "integer", n=10000, size=1, signed=FALSE) close(tempCon) byteOffsets <- list(entriesByteOffset=entriesByteOffset, #snpIndexByteOffset=snpIndexByteOffset, #snpNamesByteOffset=snpNamesByteOffset, normIDByteOffset=normIDByteOffset, newBlockByteOffset=newBlockByteOffset) allStuff <- list(prefixCheck=prefixCheck, null.1=null.1, versionNumber=versionNumber, chipType=chipType, null.2=null.2, csvLines=csvLines, nEntries=nEntries, #snpIndex=snpIndex, #snpNames=snpNames, normID=normID, newBlock=newBlock, byteOffsets=byteOffsets) allStuff } readGenCallOutput = function(file, path=".", cdfName, colnames=list("SampleID"="Sample ID", "SNPID"="SNP Name", "XRaw"="X Raw", "YRaw"="Y Raw"), type=list("SampleID"="character", "SNPID"="character", "XRaw"="integer", "YRaw"="integer"), verbose=FALSE) { if(!identical(names(type), names(colnames))) stop("The arguments 'colnames' and 'type' must have consistent names") if(verbose) cat("Reading", file, "\n") tmp=readLines(file.path(path,file),n=15) s=c("\t",",") a=unlist(strsplit(tmp[10][1],s[1])) if(length(a)!=1){ sepp=s[1] a1=unlist(strsplit(tmp[10][1],s[1])) } if(length(a)==1){ sepp=s[2] a1=unlist(strsplit(tmp[10][1],s[2])) } if(sum(is.na(match(colnames,a1)))!=0) stop("Cannot find required columns: ", colnames[is.na(match(colnames,a1))], " in ", file, "\nPlease check whether this data was exported.") m1=m=match(a1,colnames) m[is.na(m1)==FALSE] =type[m1[!is.na(m1)]] m[is.na(m)==TRUE] = list(NULL) names(m) = names(colnames)[m1] fc = file(file.path(path, file), open="r") dat = scan(fc, what=m, skip=10,sep=sepp) close(fc) samples = unique(dat$"SampleID") nsamples = length(samples) snps = unique(dat$"SNPID") nsnps = length(snps) if(verbose) cat("Check ordering for samples","\n") X = Y = zeroes = matrix(0, nsnps, nsamples) for(i in 1:length(samples)) { ind = dat$"SampleID"==samples[i] if(sum(dat$"SNPID"[ind]==snps)==nsnps) { # if(verbose) # cat(paste("Correct ordering for sample", samples[i], "\n")) X[,i] = dat$"XRaw"[ind] Y[,i] = dat$"YRaw"[ind] } if(sum(dat$"SNPID"[ind]==snps)!=nsnps) { if(verbose) cat("Reordering sample ", samples[i],"\n") m=match(snps,dat$"SNPID"[ind]) X[,i]= dat$"XRaw"[ind][m] Y[,i]= dat$"YRaw"[ind][m] } gc(verbose=FALSE) } zeroes=(X=="0"|Y=="0") colnames(X) = colnames(Y) = colnames(zeroes) = samples rownames(X) = rownames(Y) = snps if(verbose) cat("Creating NChannelSet object\n") XY = new("NChannelSet", X = initializeBigMatrix(name = "X", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=X), Y = initializeBigMatrix(name = "Y", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=Y), zero = initializeBigMatrix(name = "zero", nr = nrow(X), nc = ncol(X), vmode = "integer", initdata=zeroes), annotation = cdfName, storage.mode = "environment") sampleNames(XY)=colnames(X) if(verbose) cat("Done\n") XY } RGtoXY = function(RG, chipType, verbose=TRUE) { needToLoad <- !all(sapply(c('addressA', 'addressB', 'base'), isLoaded)) if(needToLoad){ chipList = c("human1mv1c",# 1M "human370v1c", # 370CNV "human650v3a", # 650Y "human610quadv1b", # 610 quad "human660quadv1a", # 660 quad "human370quadv3c", # 370CNV quad "human550v3b", # 550K "human1mduov3b", # 1M Duo "humanomni1quadv1b", # Omni1 quad "humanomni25quadv1b", # Omni2.5 quad "humanomni258v1a", # Omni2.5 8 sample "humanomniexpress12v1b", # Omni express 12 "humanimmuno12v1b", # Immuno chip 12 "humancytosnp12v2p1h") # CytoSNP 12 ## RS: added cleancdfname() if(missing(chipType)){ chipType = match.arg(cleancdfname(annotation(RG)), chipList) } else chipType = match.arg(cleancdfname(chipType), chipList) pkgname = getCrlmmAnnotationName(chipType) if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) message(strwrap(msg)) stop("Package ", pkgname, " could not be found.") rm(suggCall, msg) } if(verbose) message("Loading chip annotation information.") loader("address.rda", .crlmmPkgEnv, pkgname) } aids = getVarInEnv("addressA") # comes from AddressA_ID or Address column in manifest bids = getVarInEnv("addressB") # comes from AddressB_ID or Address2 column in manifest snpbase = getVarInEnv("base") ## loader(‘file.rda’) ## x = getVarInEnv(‘x’) ## y = getVarInEnv(‘y’) ## ## I’d consider using something like: ## ## needToLoad = !all(sapply(c(‘x’, ‘y’), isLoaded)) ## if (needToLoad){ ## loader(‘file.rda’) ## x = getVarInEnv(‘x’) ## y = getVarInEnv(‘y’) ## } ids = names(aids) nsnps = length(aids) narrays = ncol(RG) # aidcol = match("AddressA_ID", colnames(annot)) # if(is.na(aidcol)) # aidcol = match("Address", colnames(annot)) # bidcol = match("AddressB_ID", colnames(annot)) # if(is.na(bidcol)) # bidcol = match("Address2", colnames(annot)) # aids = annot[, aidcol] # bids = annot[, bidcol] # snpids = annot[,"Name"] # snpbase = annot[,"SNP"] infI = !is.na(bids) & bids!=0 aord = match(aids, featureNames(RG)) # NAs are possible here bord = match(bids, featureNames(RG)) # and here # argrg = aids[rrgg] # brgrg = bids[rrgg] XY <- new("NChannelSet", X=matrix(0, nsnps, narrays), Y=matrix(0, nsnps, narrays), zero=matrix(0, nsnps, narrays), annotation=chipType, phenoData=RG@phenoData, protocolData=RG@protocolData, storage.mode="environment") featureNames(XY) = ids sampleNames(XY) = sampleNames(RG) ## RS rm(list=c("bord", "infI", "aids", "bids", "ids", "snpbase")) # print(gc()) gc(verbose=FALSE) # Need to initialize - matrices filled with NAs to begin with # XY@assayData$X[1:nsnps,] = 0 # XY@assayData$Y[1:nsnps,] = 0 # XY@assayData$zero[1:nsnps,] = 0 # First sort out Infinium II SNPs, X -> R (allele A) and Y -> G (allele B) from the same probe ## RS added not.na <- !is.na(aord) not.na.aord <- aord[not.na] ## RS substitution !is.na(aord) -> not.na ##r <- as.matrix(exprs(channel(RG, "R"))[not.na.aord,]) # mostly red r <- as.matrix(assayData(RG)[["R"]][not.na.aord, ]) XY@assayData$X[not.na,] <- r rm(r);gc(verbose=FALSE) g <- as.matrix(assayData(RG)[["G"]][not.na.aord,]) # mostly green XY@assayData$Y[not.na,] <- g rm(g); gc(verbose=FALSE) ##z <- as.matrix(exprs(channel(RG, "zero"))[not.na.aord,]) # mostly green z <- as.matrix(assayData(RG)[["zero"]][not.na.aord, ]) XY@assayData$zero[not.na,] <- z rm(z); gc(verbose=FALSE) ##RS added rm(RG) # print(gc()) gc(verbose=FALSE) ## Warning - not 100% sure that the code below is correct - could be more complicated than this # infIRR = infI & (snpbase=="[A/T]" | snpbase=="[T/A]" | snpbase=="[a/t]" | snpbase=="[t/a]") # X[infIRR,] = exprs(channel(RG, "R"))[aord[infIRR],] # mostly red # Y[infIRR,] = exprs(channel(RG, "R"))[bord[infIRR],] # mostly green # Finally Infinium I where X -> G from allele A probe and Y -> G from allele B probe # infIGG = infI & (snpbase=="[C/G]" | snpbase=="[G/C]" | snpbase=="[g/c]" | snpbase=="[c/g]") # X[infIGG,] = exprs(channel(RG, "G"))[aord[infIGG],] # mostly red # Y[infIGG,] = exprs(channel(RG, "G"))[bord[infIGG],] # mostly green # For now zero out Infinium I probes # XY@assayData$X[infI,] = 0 # XY@assayData$Y[infI,] = 0 # XY@assayData$zero[infI,] = 0 # gc(verbose=FALSE) XY } stripNormalize = function(XY, useTarget=TRUE, verbose=TRUE) { if(useTarget){ objectsNeeded <- c("stripnum", "reference") } else objectsNeeded <- "stripnum" needToLoad <- !all(sapply(objectsNeeded, isLoaded)) if(needToLoad){ pkgname = getCrlmmAnnotationName(annotation(XY)) if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) message(strwrap(msg)) stop("Package ", pkgname, " could not be found.") rm(suggCall, msg) } if(verbose) message("Loading strip and reference normalization information.") loader("preprocStuff.rda", .crlmmPkgEnv, pkgname) } stripnum = getVarInEnv("stripnum") if(useTarget) targetdist = getVarInEnv("reference") if(verbose){ message("Quantile normalizing ", ncol(XY), " arrays by ", max(stripnum), " strips.") if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=max(stripnum), style=3) } for(s in 1:max(stripnum)) { if(verbose) { if (getRversion() > '2.7.0') setTxtProgressBar(pb, s) else cat(".") } sel = stripnum==s ##RS: faster to access data directly ##subX = as.matrix(exprs(channel(XY, "X"))[sel,]) ##subY = as.matrix(exprs(channel(XY, "Y"))[sel,]) subX <- as.matrix(assayData(XY)[["X"]][sel, ]) subY <- as.matrix(assayData(XY)[["Y"]][sel, ]) if(useTarget) tmp = normalize.quantiles.use.target(cbind(subX, subY), targetdist[[s]]) else tmp = normalize.quantiles(cbind(subX, subY)) XY@assayData$X[sel,] = matrix(as.integer(tmp[,1:(ncol(tmp)/2)]+16), nrow(tmp), ncol(tmp)/2) XY@assayData$Y[sel,] = matrix(as.integer(tmp[,(ncol(tmp)/2+1):ncol(tmp)]+16), nrow(tmp), ncol(tmp)/2) rm(subX, subY, tmp, sel) gc(verbose=FALSE) } if(verbose) cat("\n") XY } preprocessInfinium2 = function(XY, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1, cdfName, sns, stripNorm=TRUE, useTarget=TRUE) { #, # outdir=".") { # save.it=FALSE, # snpFile, # cnFile) { if(stripNorm) XY = stripNormalize(XY, useTarget=useTarget, verbose=verbose) ## MR: the code below is mostly straight from snprma.R if (missing(sns)) sns = sampleNames(XY) #$X if(missing(cdfName)) cdfName = annotation(XY) needToLoad <- !all(sapply(c("autosomeIndex", "SMEDIAN", "theKnots", "npProbesFid"), isLoaded)) if(needToLoad){ pkgname = getCrlmmAnnotationName(cdfName) if(!require(pkgname, character.only=TRUE, quietly=!verbose)){ suggCall = paste("library(", pkgname, ", lib.loc='/Altern/Lib/Loc')", sep="") msg = paste("If", pkgname, "is installed on an alternative location, please load it manually by using", suggCall) message(strwrap(msg)) stop("Package ", pkgname, " could not be found.") rm(suggCall, msg) } if(verbose) message("Loading snp annotation and mixture model parameters.") loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname) loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname) loader("snpProbesFid.rda", .crlmmPkgEnv, pkgname) loader("npProbesFid.rda", .crlmmPkgEnv, pkgname) } autosomeIndex = getVarInEnv("autosomeIndex") SMEDIAN = getVarInEnv("SMEDIAN") theKnots = getVarInEnv("theKnots") npIndex = getVarInEnv("npProbesFid") snpIndex = getVarInEnv("snpProbesFid") narrays = ncol(XY) nprobes = length(npIndex) if(length(nprobes)>0) { ## RS: channel creates an expression set. This is much slower than directly accessing the data A <- matrix(as.integer(assayData(XY)[["X"]][npIndex, ]), nprobes, narrays) ##system.time(A <- matrix(as.integer(exprs(channel(XY, "X"))[npIndex,]), nprobes, narrays)) B <- matrix(as.integer(assayData(XY)[["Y"]][npIndex, ]), nprobes, narrays) ##B = matrix(as.integer(exprs(channel(XY, "Y"))[npIndex,]), nprobes, narrays) ## new lines below - useful to keep track of zeroed out probes zero <- matrix(as.integer(assayData(XY)[["zero"]][npIndex, ]), nprobes, narrays) ##zero = matrix(as.integer(exprs(channel(XY, "zero"))[npIndex,]), nprobes, narrays) colnames(A) = colnames(B) = colnames(zero) = sns rownames(A) = rownames(B) = rownames(zero) = names(npIndex) cnAB = list(A=A, B=B, zero=zero, sns=sns, gns=names(npIndex), cdfName=cdfName) ## t0 <- proc.time() ## save(cnAB, file=cnFile) ## t0 <- proc.time()-t0 ## if(verbose) message("Used ", round(t0[3],1), " seconds to save ", cnFile, ".") rm(A, B, zero) # print(gc()) gc(verbose=FALSE) } # next process snp probes nprobes = length(snpIndex) ##We will read each cel file, summarize, and run EM one by one ##We will save parameters of EM to use later mixtureParams = matrix(NA, 4, narrays) SNR = rep(NA, narrays) SKW = rep(NA, narrays) ## This is the sample for the fitting of splines ## BC: I like better the idea of the user passing the seed, ## because this might intefere with other analyses ## (like what happened to GCRMA) set.seed(seed) idx = sort(sample(autosomeIndex, mixtureSampleSize)) idx2 = sample(nprobes, 10^5) ##S will hold (A+B)/2 and M will hold A-B ##NOTE: We actually dont need to save S. Only for pics etc... ##f is the correction. we save to avoid recomputing A = matrix(NA, nprobes, narrays) B = matrix(NA, nprobes, narrays) zero = matrix(NA, nprobes, narrays) if(verbose){ message("Calibrating ", narrays, " arrays.") if (getRversion() > '2.7.0') pb = txtProgressBar(min=0, max=narrays, style=3) } for(i in 1:narrays){ ## RS: faster to access data directly without using channel method ##A[,i] = as.integer(exprs(channel(XY, "X"))[snpIndex,i]) A[, i] <- as.integer(assayData(XY)[["X"]][snpIndex, i]) B[, i] <- as.integer(assayData(XY)[["Y"]][snpIndex, i]) zero[, i] <- as.integer(assayData(XY)[["zero"]][snpIndex, i]) ##B[,i] = as.integer(exprs(channel(XY, "Y"))[snpIndex,i]) ##zero[,i] = as.integer(exprs(channel(XY, "zero"))[snpIndex,i]) SKW[i] = mean((A[idx2,i]-mean(A[idx2,i]))^3)/(sd(A[idx2,i])^3) if(fitMixture){ S = (log2(A[idx,i])+log2(B[idx,i]))/2 - SMEDIAN M = log2(A[idx,i])-log2(B[idx,i]) ##we need to test the choice of eps.. it is not the max diff between funcs tmp = fitAffySnpMixture56(S, M, theKnots, eps=eps) mixtureParams[, i] = tmp[["coef"]] SNR[i] = tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2) } if(verbose) { if (getRversion() > '2.7.0') setTxtProgressBar(pb, i) else cat(".") } ## run garbage collection every now and then if(i %% 100 == 0) gc(verbose=FALSE); } if (verbose) { if (getRversion() > '2.7.0') close(pb) else cat("\n") } if (!fitMixture) SNR = mixtureParams = NA ## gns comes from preprocStuff.rda # if(save.it & !missing(snpFile)) { # t0 <- proc.time() # save(res, file=snpFile) # t0 <- proc.time()-t0 # if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".") # } res = list(A=A, B=B, zero=zero, sns=sns, gns=names(snpIndex), SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName, cnAB=cnAB) # open(res[["A"]]) # open(res[["B"]]) # open(res[["zero"]]) # open(res[["SNR"]]) # open(res[["mixtureParams"]]) # if(save.it & !missing(snpFile)) { # t0 <- proc.time() # save(res, file=snpFile) # t0 <- proc.time()-t0 # if(verbose) message("Used ", round(t0[3],1), " seconds to save ", snpFile, ".") # } # close(res[["A"]]) # close(res[["B"]]) # close(res[["zero"]]) # close(res[["SNR"]]) # close(res[["mixtureParams"]]) return(res) } crlmmIllumina <- function(RG, XY, stripNorm=TRUE, useTarget=TRUE, row.names=TRUE, col.names=TRUE, probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, seed=1, mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10, recallRegMin=1000, returnParams=FALSE, badSNP=.7) { if(missing(cdfName)) { if(!missing(RG)) cdfName = annotation(RG) if(!missing(XY)) cdfName = annotation(XY) } if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") if(!missing(RG)) { if(missing(XY)) XY = RGtoXY(RG, chipType=cdfName) else stop("Both RG and XY specified - please use one or the other") } if (missing(sns)) sns = sampleNames(XY) res <- preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, if(row.names) row.names=res$gns else row.names=NULL if(col.names) col.names=res$sns else col.names=NULL res2 <- crlmmGT(A=res[["A"]], B=res[["B"]], SNR=res[["SNR"]], mixtureParams=res[["mixtureParams"]], cdfName=cdfName, row.names=row.names, col.names=col.names, probs=probs, DF=DF, SNRMin=SNRMin, recallMin=recallMin, recallRegMin=recallRegMin, gender=gender, verbose=verbose, returnParams=returnParams, badSNP=badSNP) res2[["SNR"]] = res[["SNR"]] res2[["SKW"]] = res[["SKW"]] rm(res); gc(verbose=FALSE) return(list2SnpSet(res2, returnParams=returnParams)) } ## MR: Below is a more memory efficient version of crlmmIllumina() which ## reads in the .idats and genotypes in the one function and removes objects ## after they have been used crlmmIlluminaV2 = function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), saveDate=FALSE, stripNorm=TRUE, useTarget=TRUE, # outdir=".", row.names=TRUE, col.names=TRUE, probs=c(1/3, 1/3, 1/3), DF=6, SNRMin=5, gender=NULL, seed=1, # save.it=FALSE, snpFile, cnFile, mixtureSampleSize=10^5, eps=0.1, verbose=TRUE, cdfName, sns, recallMin=10, recallRegMin=1000, returnParams=FALSE, badSNP=.7) { if(missing(cdfName)) stop("must specify cdfName") if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") # is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE) is.lds=FALSE RG = readIdatFiles(sampleSheet=sampleSheet, arrayNames=arrayNames, ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate) XY = RGtoXY(RG, chipType=cdfName) # if(is.lds) { # open(RG@assayData$R); open(RG@assayData$G); open(RG@assayData$zero) # delete(RG@assayData$R); delete(RG@assayData$G); delete(RG@assayData$zero) # } rm(RG); gc(verbose=FALSE) if (missing(sns)) { sns = sampleNames(XY) } res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) #, # save.it=save.it, snpFile=snpFile, cnFile=cnFile) # if(is.lds) { # open(XY@assayData$X); open(XY@assayData$Y); open(XY@assayData$zero) # delete(XY@assayData$X); delete(XY@assayData$Y); delete(XY@assayData$zero) # } rm(XY); gc(verbose=FALSE) if(row.names) row.names=res$gns else row.names=NULL if(col.names) col.names=res$sns else col.names=NULL ## ## if(is.lds){ ## res2 <- crlmmGT2(A=res[["A"]], ## B=res[["B"]], ## SNR=res[["SNR"]], ## mixtureParams=res[["mixtureParams"]], ## cdfName=cdfName, ## row.names=row.names, ## col.names=col.names, ## probs=probs, ## DF=DF, ## SNRMin=SNRMin, ## recallMin=recallMin, ## recallRegMin=recallRegMin, ## gender=gender, ## verbose=verbose, ## returnParams=returnParams, ## badSNP=badSNP) ## } else { res2 <- crlmmGT(A=res[["A"]], B=res[["B"]], SNR=res[["SNR"]], mixtureParams=res[["mixtureParams"]], cdfName=cdfName, row.names=row.names, col.names=col.names, probs=probs, DF=DF, SNRMin=SNRMin, recallMin=recallMin, recallRegMin=recallRegMin, gender=gender, verbose=verbose, returnParams=returnParams, badSNP=badSNP) ## } ## FUN = ifelse(is.lds, "crlmmGT2", "crlmmGT") ## ## genotyping ## crlmmGTfxn = function(FUN,...){ ## switch(FUN, ## crlmmGT2=crlmmGT2(...), ## crlmmGT=crlmmGT(...)) ## } ## res2 = crlmmGTfxn(FUN, ## A=res[["A"]], ## B=res[["B"]], ## SNR=res[["SNR"]], ## mixtureParams=res[["mixtureParams"]], ## cdfName=cdfName, ## row.names=row.names, ## col.names=col.names, ## probs=probs, ## DF=DF, ## SNRMin=SNRMin, ## recallMin=recallMin, ## recallRegMin=recallRegMin, ## gender=gender, ## verbose=verbose, ## returnParams=returnParams, ## badSNP=badSNP) # if(is.lds) { # open(res[["SNR"]]); open(res[["SKW"]]) # } res2[["SNR"]] = res[["SNR"]] res2[["SKW"]] = res[["SKW"]] # if(is.lds) { # delete(res[["A"]]); delete(res[["B"]]) # delete(res[["SKW"]]); delete(res[["SNR"]]); delete(res[["mixtureParams"]]) # } rm(res); gc(verbose=FALSE) return(list2SnpSet(res2, returnParams=returnParams)) } # Functions analogous to Rob's Affy functions to set up container getProtocolData.Illumina = function(filenames, sep="_", fileExt="Grn.idat", verbose=FALSE) { narrays = length(filenames) headerInfo = list(nProbes = rep(NA, narrays), Barcode = rep(NA, narrays), ChipType = rep(NA, narrays), Manifest = rep(NA, narrays), Position = rep(NA, narrays)) scanDates = data.frame(ScanDate=rep(NA, narrays), DecodeDate=rep(NA, narrays)) rownames(scanDates) <- make.unique(gsub(paste(sep, fileExt, sep=""), "", filenames)) ## read in the data for(i in seq_along(filenames)) { if(verbose) cat("reading", filenames[i], "\n") idsG = G = NULL G = readIDAT(filenames[i]) idsG = rownames(G$Quants) headerInfo$nProbes[i] = G$nSNPsRead headerInfo$Barcode[i] = G$Barcode headerInfo$ChipType[i] = G$ChipType headerInfo$Manifest[i] = G$Unknown$MostlyNull headerInfo$Position[i] = G$Unknowns$MostlyA if(headerInfo$nProbes[i]>(headerInfo$nProbes[1]+10000) || headerInfo$nProbes[i]<(headerInfo$nProbes[1]-10000)) { warning("Chips are not of the same type. Skipping ", basename(filenames[i])) next() } scanDates$ScanDate[i] = G$RunInfo[1, 1] scanDates$DecodeDate[i] = G$RunInfo[2, 1] rm(G) gc(verbose=FALSE) } protocoldata = new("AnnotatedDataFrame", data=scanDates, varMetadata=data.frame(labelDescription=colnames(scanDates), row.names=colnames(scanDates))) return(protocoldata) } ##getAvailableIlluminaGenomeBuild <- function(path){ ## snp.file <- list.files(path, pattern="snpProbes_hg") ## if(length(snp.file) > 1){ ## ## use hg19 ## message("genome build not specified. Using build hg19 for annotation.") ## snp.file <- snp.file[1] ## } ## genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]]) ##} getAvailableIlluminaGenomeBuild <- function(path){ snp.file <- list.files(path, pattern="snpProbes_hg") if(length(snp.file) > 1){ ## use hg19 message("genome build not specified. Using build hg19 for annotation.") snp.file <- snp.file[1] } if(length(snp.file) == 1) genome <- gsub(".rda", "", strsplit(snp.file, "snpProbes_")[[1]][[2]]) if(length(snp.file)==0) genome <- "" genome } constructInf <- function(sampleSheet=NULL, arrayNames=NULL, path=".", arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), cdfName, verbose=FALSE, batch, #fns, saveDate=TRUE) { #, outdir="."){ verbose <- FALSE if(!is.null(arrayNames)) { pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) } if(!is.null(sampleSheet)) { # get array info from Illumina's sample sheet if(is.null(arrayNames)) { if(!is.null(arrayInfoColNames$barcode) && (arrayInfoColNames$barcode %in% colnames(sampleSheet))) { barcode = sampleSheet[,arrayInfoColNames$barcode] arrayNames=barcode } if(!is.null(arrayInfoColNames$position) && (arrayInfoColNames$position %in% colnames(sampleSheet))) { position = sampleSheet[,arrayInfoColNames$position] if(is.null(arrayNames)) arrayNames=position else arrayNames = paste(arrayNames, position, sep=sep) if(highDensity) { hdExt = list(A="R01C01", B="R01C02", C="R02C01", D="R02C02") for(i in names(hdExt)) arrayNames = sub(paste(sep, i, sep=""), paste(sep, hdExt[[i]], sep=""), arrayNames) } } } pd = new("AnnotatedDataFrame", data = sampleSheet) sampleNames(pd) = make.unique(basename(arrayNames)) } if(is.null(arrayNames)) { arrayNames = gsub(paste(sep, fileExt$green, sep=""), "", dir(pattern=fileExt$green, path=path)) if(!is.null(sampleSheet)) { sampleSheet=NULL cat("Could not find required info in \'sampleSheet\' - ignoring. Check \'sampleSheet\' and/or \'arrayInfoColNames\'\n") } pd = new("AnnotatedDataFrame", data = data.frame(Sample_ID=arrayNames)) } narrays = length(arrayNames) if(!missing(batch)) { stopifnot(length(batch) == narrays) } if(missing(batch)) { stop("Must specify 'batch'") # batch = as.factor(rep(1, narrays)) } grnfiles = paste(arrayNames, fileExt$green, sep=sep) redfiles = paste(arrayNames, fileExt$red, sep=sep) if(length(grnfiles)==0 || length(redfiles)==0) stop("Cannot find .idat files") if(length(grnfiles)!=length(redfiles)) stop("Cannot find matching .idat files") if(path[1] != "." & path[1] != ""){ grnidats = file.path(path, grnfiles) redidats = file.path(path, redfiles) } else { message("path arg not set. Assuming files are in local directory, or that complete path is provided") grnidats = grnfiles redidats = redfiles } if(!all(file.exists(grnidats))) stop("Missing some of the *Grn.idat files") if(!all(file.exists(redidats))) stop("Missing some of the *Red.idat files") if(verbose) message("Initializing container for genotyping and copy number estimation") pkgname <- getCrlmmAnnotationName(cdfName) path <- system.file("extdata", package=pkgname) genome <- getAvailableIlluminaGenomeBuild(path) featureData = getFeatureData(cdfName, copynumber=TRUE, genome=genome) nr = nrow(featureData); nc = narrays sns <- if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { make.unique(sampleSheet$Sample_ID) } else{ make.unique(basename(arrayNames)) } biga <- initializeBigMatrix(name="A", nr, nc) bigb <- initializeBigMatrix(name="B", nr, nc) bigc <- initializeBigMatrix(name="call", nr, nc) bigd <- initializeBigMatrix(name="callPr", nr,nc) colnames(biga) <- colnames(bigb) <- colnames(bigc) <- colnames(bigd) <- sns cnSet <- new("CNSet", alleleA=biga, alleleB=bigb, call=bigc, callProbability=bigd, annotation=cdfName, featureData=featureData, batch=batch, genome=genome) ## if (!is.null(sampleSheet) && !is.null(sampleSheet$Sample_ID)) { ## sampleNames(cnSet) = make.unique(sampleSheet$Sample_ID) ## } else{ ## sampleNames(cnSet) <- make.unique(basename(arrayNames)) ## } if(saveDate){ protocolData = getProtocolData.Illumina(grnidats, sep=sep, fileExt=fileExt$green, verbose=verbose) } else{ protocolData = annotatedDataFrameFrom(A(cnSet), byrow=FALSE) } rownames(pData(protocolData)) = sampleNames(cnSet) protocolData(cnSet) = protocolData ##featureData(cnSet) = featureData featureNames(cnSet) = featureNames(featureData) cnSet$gender <- initializeBigVector("gender", ncol(cnSet), vmode="integer") cnSet$SNR = initializeBigVector("crlmmSNR-", ncol(cnSet), "double") cnSet$SKW = initializeBigVector("crlmmSKW-", ncol(cnSet), "double") ##sampleNames(cnSet) <- basename(sampleNames(cnSet)) return(cnSet) } construct.Illumina <- constructInf preprocessInf <- function(cnSet, sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), highDensity=TRUE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1, cdfName){ stopifnot(all(c("gender", "SNR", "SKW") %in% varLabels(cnSet))) open(A(cnSet)) open(B(cnSet)) sns <- sampleNames(cnSet) pkgname = getCrlmmAnnotationName(annotation(cnSet)) is.snp = isSnp(cnSet) snp.index = which(is.snp) narrays = ncol(cnSet) sampleBatches <- splitIndicesByLength(seq_len(ncol(cnSet)), ocSamples()/getDoParWorkers()) mixtureParams = initializeBigMatrix("crlmmMixt-", 4, narrays, "double") ocLapply(seq_along(sampleBatches), crlmm:::processIDAT, sampleBatches=sampleBatches, sampleSheet=sampleSheet, arrayNames=arrayNames, ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose, mixtureSampleSize=mixtureSampleSize, fitMixture=fitMixture, eps=eps, seed=seed, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget, A=A(cnSet), B=B(cnSet), GT=calls(cnSet), GTP=snpCallProbability(cnSet), SKW=cnSet$SKW, SNR=cnSet$SNR, mixtureParams=mixtureParams, is.snp=is.snp, neededPkgs=c("crlmm", pkgname)) # outdir=outdir, return(mixtureParams) } preprocess <- preprocessInf genotypeInf <- function(cnSet, mixtureParams, probs=rep(1/3,3), SNRMin=5, recallMin=10, recallRegMin=1000, verbose=TRUE, returnParams=TRUE, badSNP=0.7, gender=NULL, DF=6, cdfName){ is.snp = isSnp(cnSet) snp.index = which(is.snp) ## narrays = ncol(cnSet) ## open(A(cnSet)) ## open(B(cnSet)) ## open(snpCall(cnSet)) ## open(snpCallProbability(cnSet)) ## ## crlmmGT2 overwrites the normalized intensities with calls and confidenceScores ## message("Writing to call and callProbability slots") ## for(j in 1:ncol(cnSet)){ ## snpCall(cnSet)[snp.index, j] <- as.integer(A(cnSet)[snp.index, j]) ## snpCallProbability(cnSet)[snp.index, j] <- as.integer(B(cnSet)[snp.index, j]) ## } ## message("Writing complete. Begin genotyping...") ## close(A(cnSet)) ## close(B(cnSet)) tmp <- crlmmGT2(A=A(cnSet), B=B(cnSet), SNR=cnSet$SNR, mixtureParams=mixtureParams, cdfName=cdfName, col.names=sampleNames(cnSet), probs=probs, DF=DF, SNRMin=SNRMin, recallMin=recallMin, recallRegMin=recallRegMin, gender=gender, verbose=verbose, returnParams=returnParams, badSNP=badSNP, callsGt=calls(cnSet), callsPr=snpCallProbability(cnSet))#, ##RS: I changed the API for crlmmGT2 to be consistent with crlmmGT ## snp.names=featureNames(cnSet)[snp.index]) if(verbose) message("Genotyping finished. Updating container with genotype calls and confidence scores.") open(cnSet$gender) cnSet$gender[,] = tmp$gender close(cnSet$gender) TRUE } genotype.Illumina <- function(sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), cdfName, copynumber=TRUE, batch, saveDate=TRUE, stripNorm=TRUE, useTarget=TRUE, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, verbose=TRUE, seed=1, sns, probs=rep(1/3, 3), DF=6, SNRMin=5, recallMin=10, recallRegMin=1000, gender=NULL, returnParams=TRUE, badSNP=0.7) { is.lds = ifelse(isPackageLoaded("ff"), TRUE, FALSE) stopifnot(is.lds) if(missing(cdfName)) stop("must specify cdfName") if(!isValidCdfName(cdfName)) stop("cdfName not valid. see validCdfNames") stopifnot(!missing(batch)) if(is(batch, "factor")) batch <- as.character(batch) pkgname <- getCrlmmAnnotationName(cdfName) message("Instantiate CNSet container.") cnSet <- constructInf(sampleSheet=sampleSheet, arrayNames=arrayNames, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, sep=sep, fileExt=fileExt, cdfName=cdfName, verbose=verbose, batch=batch, saveDate=saveDate) mixtureParams <- preprocessInf(cnSet=cnSet, sampleSheet=sampleSheet, arrayNames=arrayNames, ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, stripNorm=stripNorm, useTarget=useTarget, mixtureSampleSize=mixtureSampleSize, fitMixture=fitMixture, eps=eps, verbose=verbose, seed=seed, cdfName=cdfName) message("Preprocessing complete. Begin genotyping...") genotypeInf(cnSet=cnSet, mixtureParams=mixtureParams, probs=probs, SNRMin=SNRMin, recallMin=recallMin, recallRegMin=recallRegMin, verbose=verbose, returnParams=returnParams, badSNP=badSNP, gender=gender, DF=DF, cdfName=cdfName) return(cnSet) } processIDAT <- function(stratum, sampleBatches, sampleSheet=NULL, arrayNames=NULL, ids=NULL, path=".", arrayInfoColNames=list(barcode="SentrixBarcode_A", position="SentrixPosition_A"), highDensity=FALSE, sep="_", fileExt=list(green="Grn.idat", red="Red.idat"), saveDate=FALSE, verbose=TRUE, mixtureSampleSize=10^5, fitMixture=TRUE, eps=0.1, seed=1, cdfName, sns, stripNorm=TRUE, useTarget=TRUE, A, B, GT, GTP, SKW, SNR, mixtureParams, is.snp) { #, outdir=".") { message("Processing sample stratum ", stratum, " of ", length(sampleBatches)) sel <- sampleBatches[[stratum]] if(length(path)>= length(sel)) path = path[sel] RG = readIdatFiles(sampleSheet=sampleSheet[sel,], arrayNames=arrayNames[sel], ids=ids, path=path, arrayInfoColNames=arrayInfoColNames, highDensity=highDensity, sep=sep, fileExt=fileExt, saveDate=saveDate, verbose=verbose) XY = RGtoXY(RG, chipType=cdfName) rm(RG) gc(verbose=FALSE) if (missing(sns) || length(sns)!=ncol(XY)) sns = sampleNames(XY) res = preprocessInfinium2(XY, mixtureSampleSize=mixtureSampleSize, fitMixture=TRUE, verbose=verbose, seed=seed, eps=eps, cdfName=cdfName, sns=sns, stripNorm=stripNorm, useTarget=useTarget) #, outdir=outdir) rm(XY) gc(verbose=FALSE) if(verbose) message("Finished preprocessing.") snp.index = which(is.snp) np.index = which(!is.snp) open(A); open(B); open(GT); open(GTP) Amatrix <- res[["A"]] Bmatrix <- res[["B"]] ## Amatrix and Bmatrix are ordinary matrices--not ff objects. ## By writing columns of a ordinary matrix to GT and GTP, we ## save one read step later on. GT and GTP will be updated to ## calls and call probabilities by the crlmmGT2 function. The A ## and B slots will retain the normalized intensities for the ## A and B alleles for(j in seq_along(sel)){ jj <- sel[j] A[snp.index, jj] <- Amatrix[, j] GT[snp.index, jj] <- Amatrix[, j] B[snp.index, jj] <- Bmatrix[, j] GTP[snp.index, jj] <- Bmatrix[, j] } if(length(np.index)>0) { cnAmatrix <- res[["cnAB"]]$A cnBmatrix <- res[["cnAB"]]$B for(j in seq_along(sel)){ jj <- sel[j] A[np.index, jj] <- cnAmatrix[, j] GT[np.index, jj] <- cnAmatrix[, j] B[np.index, jj] <- cnBmatrix[, j] GTP[np.index, jj] <- cnBmatrix[, j] } } open(SKW); open(SNR); open(mixtureParams) SKW[sel] = res[["SKW"]][] SNR[sel] = res[["SNR"]][] mixtureParams[, sel] = res[["mixtureParams"]][] close(A); close(B) close(GT); close(GTP) close(SNR); close(SKW) close(mixtureParams) rm(res) gc(verbose=FALSE) TRUE }