.initializeCdf <- function(con, nRows = 1, nCols = 1, nUnits = 1, nQcUnits = 0, refSeq = "", unitnames = rep("", nUnits), qcUnitLengths = rep(0, nQcUnits), unitLengths = rep(0, nUnits), ...) { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Validate arguments # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if(length(qcUnitLengths) != nQcUnits) { stop("Number of elements in argument 'qcUnitLengths' does not match 'nQcUnits'"); } if(length(unitLengths) != nUnits) { stop("Number of elements in argument 'unitLengths' does not match 'nUnits'"); } if(length(refSeq) != 1) { stop("Argument 'refSeq' should be a single character."); } lrefSeq <- nchar(refSeq); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # CDF header # # 1 Magic number. Always set to 67. [integer] # 2 Version number. [integer] # 3 The number of columns of cells on the array. [unsigned short] # 4 The number of rows of cells on the array. [unsigned short] # 5 The number of units in the array not including QC units. The term # unit is an internal term which means probe set. [integer] # 6 The number of QC units. [integer] # 7 The length of the resequencing reference sequence. [integer] # 8 The resequencing reference sequence. [char[len]] # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - offset <- 0; ## Magic number and version number writeBin(object = as.integer(c(67, 1)), con = con, size = 4, endian = "little") ## NCols, NRows writeBin(object = as.integer(c(nCols, nRows)), con = con, size = 2, endian = "little") ## NumberUnits, NumberQCUnits writeBin(object = as.integer(c(nUnits, nQcUnits)), con = con, size = 4, endian = "little") ## Length of refSeqsequence writeBin(object = as.integer(lrefSeq), con = con, size = 4, endian = "little") offset <- 24; fOffset <- seek(con=con, origin="start", rw="write"); if (offset != fOffset) { stop("File format write error (step 1): File offset is not the excepted one: ", fOffset, " != ", offset); } ## RefSeqsequece if(lrefSeq > 0) writeChar(as.character(refSeq), con=con, eos=NULL); # Current offset offset <- offset + lrefSeq; fOffset <- seek(con=con, origin="start", rw="write"); if (offset != fOffset) { stop("File format write error (step 2): File offset is not the excepted one: ", fOffset, " != ", offset); } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Unit names # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Write to raw vector (2*10^6 units => 122Mb; should be ok for now) # Since we can't create strings with '\0':s, we use '\xFF', # write to raw and then replace '\xFF' with '\0'. Thus, unit names with # '\xFF' are invalid, but this should not be a real problem. pads <- sapply(0:64, FUN=function(x) paste(rep("\xFF", x), collapse="")); # Write the unit names in chunks to save memory nbrOfUnits <- length(unitnames); chunkSize <- 100000; nbrOfChunks <- ceiling(nbrOfUnits / chunkSize); # Allocate raw vector raw <- raw(64*chunkSize); for (kk in 1:nbrOfChunks) { # Units for this chunk from <- (kk-1)*chunkSize+1; to <- min(from+chunkSize-1, nbrOfUnits); unitnamesFF <- unitnames[from:to]; # Pad the unit names unitnamesFF <- paste(unitnamesFF, pads[64-nchar(unitnamesFF)], sep=""); # Truncate last chunk? if (chunkSize > length(unitnamesFF)) { raw <- raw[1:(64*length(unitnamesFF))]; } # Write unit names to raw vector raw <- writeBin(con=raw, unitnamesFF, size=1); unitnamesFF <- NULL; # Not needed anymore # Replace all '\xFF' with '\0'. idxs <- which(raw == as.raw(255)); raw[idxs] <- as.raw(0); idxs <- NULL; # Not needed anymore writeBin(con=con, raw); } # for (kk in ...) raw <- NULL; # Not needed anymore bytesOfUnitNames <- 64 * nUnits; offset <- offset + bytesOfUnitNames; fOffset <- seek(con=con, origin="start", rw="write"); if (offset != fOffset) { stop("File format write error (step 3): File offset is not the excepted one: ", fOffset, " != ", offset); } bytesOfQcUnits <- 4 * nQcUnits; offset <- offset + bytesOfQcUnits; bytesOfUnits <- 4 * nUnits; offset <- offset + bytesOfUnits; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # QC units file positions # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if (nQcUnits > 0) { csum <- cumsum(qcUnitLengths); nextOffset <- csum[nQcUnits]; starts <- c(0, csum[-nQcUnits]); starts <- as.integer(offset + starts); writeBin(starts, con = con, size = 4, endian = "little") } else { nextOffset <- 0; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Units file positions # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - offset <- offset + nextOffset; if (nUnits > 0) { csum <- cumsum(unitLengths); nextOffset <- csum[nUnits]; starts <- c(0, csum[-nUnits]); starts <- as.integer(offset + starts); writeBin(starts, con = con, size = 4, endian = "little"); } else { nextOffset <- 0; } } # .initializeCdf() .writeCdfUnit <- function(unit, con, unitname=NULL) { ## 3. Write the unit unitType <- unit$unittype if (!is.numeric(unitType)) { unitType <- switch(unitType, unknown = 0, expression = 1, genotyping = 2, resequencing = 3, tag = 4, copynumber = 5, genotypingcontrol = 6, expressioncontrol = 7) } unitDirection <- unit$unitdirection if (!is.numeric(unitDirection)) { unitDirection <- switch(unitDirection, nodirection = 0, sense = 1, antisense = 2, unknown = 3) } unitInfo <- as.integer(c(unitType, unitDirection, unit$natoms, length(unit$groups), unit$ncells, unit$unitnumber, unit$ncellsperatom)) # Number of bytes: 2+1+4*4+1=20 bytes writeBin(unitInfo[1], con = con, size = 2, endian = "little") writeBin(unitInfo[2], con = con, size = 1, endian = "little") writeBin(unitInfo[3:6], con = con, size = 4, endian = "little") writeBin(unitInfo[7], con = con, size = 1, endian = "little") ## Writing each group in turn # Number of bytes: (18+64)*nbrOfGroups + 14*totalNbrOfCells bytes groupDirections <- c(nodirection=0, sense=1, antisense=2, unknown=3); for(igroup in seq_along(unit$groups)) { group <- unit$groups[[igroup]] groupDirection <- groupDirections[group$groupdirection]; groupDirection <- switch(group$groupdirection, nodirection = 0, sense = 1, antisense = 2, unknown = 3) groupInfo <- as.integer(c(group$natoms, length(group$x), group$ncellsperatom, groupDirection, min(group$atoms, 0))) # Number of bytes: 2*4+2*1+2*4=18 bytes writeBin(groupInfo[1:2], con = con, size = 4, endian = "little") writeBin(groupInfo[3:4], con = con, size = 1, endian = "little") writeBin(groupInfo[5:6], con = con, size = 4, endian = "little") # Number of bytes: 64 bytes suppressWarnings({ writeChar(as.character(names(unit$groups)[igroup]), con = con, nchars = 64, eos = NULL) }) ## Writing each cell in turn cells <- matrix(as.integer(c(group$indexpos, group$x, group$y, group$atom)), ncol = 4) # Number of bytes: 14*nbrOfCells bytes for(icell in seq_along(group$x)) { # Number of bytes: 1*4+2*2+1*4+1*2=14 bytes writeBin(cells[icell, 1], con = con, size = 4, endian = "little") writeBin(cells[icell, 2:3], con = con, size = 2, endian = "little") writeBin(cells[icell, 4], con = con, size = 4, endian = "little") writeChar(as.character(c(group$pbase[icell], group$tbase[icell])), con = con, nchars = c(1,1), eos = NULL) } # for (icell ...) } # for (igroup ...) } # .writeCdfUnit() .writeCdfQcUnit <- function(qcunit, con) { ## 2. Actually write the qcunit type <- qcunit$type; if (!is.numeric(type)) { type <- switch(type, unknown = 0, checkerboardNegative = 1, checkerboardPositive = 2, hybeNegative = 3, hybePositive = 4, textFeaturesNegative = 5, textFeaturesPositive = 6, centralNegative = 7, centralPositive = 8, geneExpNegative = 9, geneExpPositive = 10, cycleFidelityNegative = 11, cycleFidelityPositive = 12, centralCrossNegative = 13, centralCrossPositive = 14, crossHybeNegative = 15, crossHybePositive = 16, SpatialNormNegative = 17, SpatialNormPositive = 18) } # Write 2 + 4 bytes nbrOfBytes <- 6; qcunitInfo <- as.integer(c(type, qcunit$ncells)) writeBin(qcunitInfo[1], con = con, size = 2, endian = "little") writeBin(qcunitInfo[2], con = con, size = 4, endian = "little") # Write 2 + 4 bytes nCells <- length(qcunit$x); nbrOfBytes <- 7*nCells; cells <- matrix(as.integer(c(qcunit$x, qcunit$y, qcunit$length, qcunit$pm, qcunit$background)), ncol = 5) for(icell in seq_along(qcunit$x)) { writeBin(cells[icell, 1:2], con = con, size = 2, endian = "little") writeBin(cells[icell, 3:5], con = con, size = 1, endian = "little") } } # .writeCdfQcUnit() ############################################################################ # HISTORY: # 2013-06-29 # o BUG FIX: Since affxparser 1.30.2/1.31.2, .writeCdfUnit() encoded unit # types incorrectly, iff specified as integers. # o BUG FIX: Likewise, .writeCdfUnit() has always encoded unit directions # incorrectly, iff specified as integers. Moreover, .writeCdfQcUnit() # has always encoded unit types incorrectly, iff specified as integers. # 2013-05-25 /HB # o Removed all gc() in .initializeCdf(). # 2013-01-07 /HB # o GENERALIZATION: .writeCdfUnit() now also encodes unit types # 'genotypingcontrol' and 'expressioncontrol'. # o BUG FIX: .writeCdfUnit() incorrectly encoded the 'unknown' unit type # as 5 and not 0. # 2008-08-09 /HB # o BUG FIX: .writeCdfUnit() did output unit type 'resequencing' and 'tag' # as 4 and 3, and not 3 and 4, respectively. # 2007-11-13 /KH # o BUG FIX: The error message in internal .initializeCdf() would mention # 'qcUnitLengths' when it was meant to say 'unitLengths'. # 2007-07-13 /HB # o While writing unit names in .initializeCdf(), quite a few copies were # created using up a lot of memory. By removing unused objects and # writing unit names in chunks memory usage is now stable and < 200MB. # 2007-02-01 /HB # o Updated to camel case as much as possible to match JBs updates in the # branch. # o Removed non-used arguments 'unitpositions' and 'qcunitpositions' from # .initializeCdf(). # 2007-01-10 /HB # o Added writeCdfHeader(), writeCdfQcUnits() and writeCdfUnits(). With # these it is now possible to build up the CDF in chunks. # o Removed obsolete arguments 'addName' and 'addPositions' and all related # code. Internal variable 'positions' is not needed anymore. # There are no more seek():s in the code. # o Removed obsolete .writeCdfUnit2(). # o Now only every 1000th unit (instead of 100th) is reported. It is now # also a count down. # 2006-12-18 /KS # o Make global replacement "block" -> "group" to maintain consistency # with other code, pursuant to communication from KH. # 2006-10-25 /HB (+KS) # o BUG FIX: .initializeCdf() was writing false file offset for QC units # when the number QC nUnits were zero. This would core dump readCdfNnn(). # 2006-09-21 /HB # o BUG FIX: The 'atom' and 'indexpos' fields were swapped. # o Now suppressing warnings "writeChar: more characters requested..." in # writeCdf(). # 2006-09-11 /HB # o BUG FIX: nRows & nCols were swapped in the CDF header. # 2006-09-09 /HB # o Updated writeCdf() has been validate with compareCdfs() on a few arrays. # o With the below "optimizations" writeCdf() now writes Hu6800.CDF with # units in 130s compared to 140s. # o Now initializeCdf() dumps all unit names at once by first building a # raw vector. This is now much faster than before. # o Now writeCdf() does not seek() around in the file anymore. This should # speed up writing at least a bit. # o Made some optimization, which speeds up the writing a bit. Jumping # around in the file with seek() is expensive and should be avoided. # o Rename writeUnit() to writeCdfUnit() and same for the QC function. # o Added more verbose output and better errror messages for writeCdf(). # 2006-09-07 /HB # o Maybe initalizeCdf(), writeUnit(), and writeQcUnit() should be made # private functions of this package. # o Removed textCdf2binCdf() skeleton. See convertCdf() instead. # o Updated writeCdf() such that the connection is guaranteed to be closed # regardless. ############################################################################