Index: MotifList-class.R
===================================================================
--- MotifList-class.R	(revision 67138)
+++ MotifList-class.R	(working copy)
@@ -4,10 +4,30 @@
           prototype(elementType="matrix")
           )
 
-setGeneric ('add', signature='obj', function (obj, matrices, tbl.metadata) standardGeneric ('add'))
+setValidity("MotifList", function(object) {
+    msg <- NULL
+
+    ## what makes for a valid MotifList?
+    if (is.null(names(object)))
+        msg <- c(msg, "'names()' must not be NULL")
+    else if (any(duplicated(names(object))))
+        msg <- c(msg, "all 'names()' must be unique")
+    ## all(sapply(object, class) == "matrix") works for 0-length object
+    if (!all(sapply(object, class) == "matrix"))
+        msg <- c(msg, "all elements must be of class 'matrix'")
+    if (!all(sapply(object, nrow) == 4))
+        msg <- c(msg, "all elements must have 4 rows")
+    ok <- sapply(object, function(elt) all(colSums(elt) == 1))
+    if (!all(ok))
+        msg <- c(msg, "all elements must have colSums equal to 1")
+
+    if (is.null(msg)) TRUE else msg
+})
+
+setGeneric ('add', function (obj, matrices, tbl.metadata) standardGeneric ('add'),
+            signature='obj')
 #------------------------------------------------------------------------------------------------------------------------
 MotifList = function (matrices=list(),
-                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
                       providerNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
                       providerIds=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
                       geneSymbols=character(),              # ABF2, Rhox11
@@ -22,27 +42,9 @@
                       dataSources=character(),              # ScerTF, UniPROBE
                       experimentTypes=character(),          # NA, protein-binding microarray
                       pubmedIDs=character(),                # 19111667, 1858359
-                      tfFamilies=character())               # NA, NA
+                      tfFamilies=character())
 {  
-  names (matrices) = names
-  count = length (matrices)
-  stopifnot (length (names) == count)
-  stopifnot (length (providerNames) == count)
-  stopifnot (length (providerIds) == count)
-  stopifnot (length (geneSymbols) == count)
-  stopifnot (length (geneIds) == count)
-  stopifnot (length (geneIdTypes) == count)
-  stopifnot (length (proteinIds) == count)
-  stopifnot (length (proteinIdTypes) == count)
-  stopifnot (length (sequenceCounts) == count)
-  stopifnot (length (organisms) == count)
-  stopifnot (length (bindingDomains) == count)
-  stopifnot (length (bindingSequences) == count)
-  stopifnot (length (dataSources) == count)
-  stopifnot (length (experimentTypes) == count)
-  stopifnot (length (pubmedIDs) == count)
-  stopifnot (length (tfFamilies) == count)
-
+  ## these are redundant with DataFrame validity method
   elementMetadata = DataFrame (#name=names,
                                providerName=providerNames,
                                providerId=providerIds,
@@ -63,45 +65,33 @@
   
 } # ctor
 #------------------------------------------------------------------------------------------------------------------------
+## quiet vs. verbose
 MotifDb = function (loadAllSources=TRUE, quiet=TRUE)
 {
   mdb <- MotifList ()
   if (loadAllSources) {
-    data.path = paste (system.file (package='MotifDb'), 'data', sep='/')
+    data.path = system.file(package="MotifDb", "data")
     data.files = dir (data.path, full.names=TRUE)
-    sources.to.exclude = c ('ScerTF.RData')   # pending minor revisions
-    removers = as.integer (sapply (sources.to.exclude, function (source) grep (source, data.files)))
-    if (length (removers) > 0)
-      data.files = data.files [-removers]
-  
-    count = 0
-    if (length (data.files) > 0)   
+    sources.to.exclude = "ScerTF.RData"   # pending minor revisions
+    data.files = data.files[!basename(data.files) %in% sources.to.exclude]
       for (data.file in data.files) {
+        ## see globalVariables(tbl.md, matrices)
         tbl.md = NA; matrices = NA;  # define these to keep 'check' happy.  they are loaded by 'load'
-        variables = load (data.file)
-        mdb = add (mdb, matrices, tbl.md)
-        filename.tokens = strsplit (data.file, '\\/') [[1]]
-        shortFilename = filename.tokens [length (filename.tokens)]
-        if (!quiet) 
-          print (noquote (sprintf ('added %s (%d) matrices, length now: %d', shortFilename, length (matrices), length (mdb))))
+        load (data.file)
+        mdb <- append(mdb, do.call("MotifList", c(matrices, tbl.md)))
+        if (!quiet)
+          message(sprintf ('added %s (%d) matrices, length now: %d',
+                           basename(data.file), length (matrices),
+                           length (mdb)))
       } # for data.file
   
-    if (!quiet) {
+    if (!quiet)
       print (table (mdb$dataSource))
-      }
     } # if loadAllSources
 
-  return (mdb)
-
+  mdb
 } # MotifDb
 #------------------------------------------------------------------------------------------------------------------------
-setMethod ('length', signature = 'MotifList',
-
-  function (x) {
-    return (length (x@listData))
-    })
-
-#------------------------------------------------------------------------------------------------------------------------
 setMethod ('subset', signature = 'MotifList',
 
   function (x, subset, select, drop = FALSE, ...) {
@@ -115,42 +105,26 @@
         stop("'subset' must be coercible to logical")
       i <- i & !is.na(i)
       } # else
-    return (x [i])
+    x [i]
   })
 
 #------------------------------------------------------------------------------------------------------------------------
+## 'add' is too general a name; implement as do.call + append, but
+## then not much added value so omit method?
 setMethod ('add', signature = 'MotifList',
 
   function (obj, matrices, tbl.metadata) { 
 
-     full.names = names (matrices)   # full in this sense:  organism-source-providerName[-possibleUniquifier]
-     size = length (matrices)
-     empty.chars = rep (NA_character_, size)
-     empty.ints = rep (NA_integer_, size)
-     new.list = MotifList (matrices, 
-                           names=full.names,
-                           providerNames=tbl.metadata$providerName,                  # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
-                           providerIds=tbl.metadata$providerId,                      # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
-                           geneSymbols=tbl.metadata$geneSymbol,                      # ABF2, Rhox11n
-                           geneIds=tbl.metadata$geneId,
-                           geneIdTypes=tbl.metadata$geneIdType,
-                           proteinIds=tbl.metadata$proteinId,
-                           proteinIdTypes=tbl.metadata$proteinIdType,
-                           sequenceCounts=tbl.metadata$sequenceCount,                # often NA
-                           organisms=tbl.metadata$organism,                          # Scerevisiae, Mmusculus
-                           bindingDomains=tbl.metadata$bindingDomain,                # NA, Homeo
-                           bindingSequences=tbl.metadata$bindingSequence,    
-                           dataSources=tbl.metadata$dataSource,                      # ScerTF, UniPROBE
-                           experimentTypes=tbl.metadata$experimentType,              # NA, protein-binding microarray, SELEX, 
-                           pubmedIDs=tbl.metadata$pubmedID,                          # 19111667, 1858359
-                           tfFamilies=tbl.metadata$tfFamily)                         # NA
-     obj@listData = c (obj@listData, new.list@listData)
-     elementMetadata (obj) = rbind (elementMetadata (obj), elementMetadata (new.list))
-     return (obj)
+     if (!is.list(matrices))            # allow solo matrix
+         matrices <- list(matrices)
+     ## 1:1 mapping between tbl.metadata (data.frame?) column names
+     ## and MotifDb elementMetadata
+     new.list <- do.call("MotifList", c(matrices, tbl.metadata))
+     append(obj, new.list)
     })
 
 #------------------------------------------------------------------------------------------------------------------------
-setMethod("$", signature="MotifList", 
+setMethod("$", signature="MotifList",   # is this a good idea?
 
   function(x, name) {
     elementMetadata (x)[[name]]
@@ -201,45 +175,38 @@
 {
   matrix.count = length (matrices)
 
-    # incoming matrices have nucleotide rows, position columns.  meme format, however, requires position-rows, and nucleotide-columns
-    # calculate the number of lines of text by counting columns
-  total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol)))
-  predicted.line.count = 12 + (3 * length (matrices)) + total.transposed.matrix.rows
-  s = vector ('character', predicted.line.count)
-
+  ## incoming matrices have nucleotide rows, position columns.  meme
+  ## format, however, requires position-rows, and nucleotide-columns
+  ## calculate the number of lines of text by counting columns
   #printf ("length of pre-allocated vector: %d", length (s))
-  s [1] = 'MEME version 4'
-  s [2] = ''
-  s [3] = 'ALPHABET= ACGT'
-  s [4] = ''
-  s [5] = 'strands: + -'
-  s [6] = ''
-  s [7] = 'Background letter frequencies'
-  s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 '
-  s [9] = '' 
 
-  index = 10
-  for (name in names (matrices)) {
-       # transpose the frequency matrix version of the incoming matrix, hence 'tfMat'
-    tfMat = transformMatrixToMemeRepresentation (matrices [[name]])
-       # meme output may be used by tomtom, which uses matrix names as part of image filenames. 
-       # removed all file-system-unfriendly characters here
+  header = c(
+    'MEME version 4',
+    '',
+    'ALPHABET= ACGT',
+    '',
+    'strands: + -',
+    '',
+    'Background letter frequencies',
+    'A 0.250 C 0.250 G 0.250 T 0.250 ',
+    '')
+
+  tfStrings <- Map(function(name, tfMat) {
+    tfMat = transformMatrixToMemeRepresentation (tfMat)
+    ## meme output may be used by tomtom, which uses matrix names as
+    ## part of image filenames.  removed all file-system-unfriendly
+    ## characters here
     fixed.name = gsub ('\\/', '_', name)
-    s [index] = sprintf ('MOTIF %s', fixed.name)
-    index = index + 1
-    s [index] = sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020', nrow (tfMat), 45, 8.1e-020)
-    index = index + 1
-    for (r in 1:nrow (tfMat)) {
-      new.row = sprintf (' %12.10f  %12.10f  %12.10f  %12.10f', tfMat [r,1], tfMat [r,2], tfMat [r,3], tfMat [r,4])
-      s [index] = new.row
-      index = index + 1
-      }
-    s [index] = ''
-    index = index + 1
-    } # for name
+    id <- sprintf ('MOTIF %s', fixed.name)
+    desc <-
+        sprintf ('letter-probability matrix: alength= 4 w= %d nsites= 45 E=8.1e-020',
+                 nrow (tfMat))
+    mat <- sprintf (' %12.10f  %12.10f  %12.10f  %12.10f',
+                    tfMat[,1], tfMat[,2], tfMat[,3], tfMat[,4])
+    c(id, desc, mat, "")
+  }, names(matricies), matricies)
 
-  invisible (s)
-
+  invisible(c(header, unlist(tfStrings)))
 } # matrixToMemeText
 #------------------------------------------------------------------------------------------------------------------------
 # write to connection with specified format
@@ -250,14 +217,11 @@
 setMethod ('export',  signature=c(object='MotifList', con='character', format='character'),
 
   function (object, con, format, ...) {
-
-    fmt = match.arg (tolower (format), c ('meme', 'transfac'))
-    if (fmt %in% c ('meme', 'transfac')) {
-      text = matrixToMemeText (object)
-      }
+      ## do minimum work unique to this method, then dispatch to avoid
+      ## code duplication
     con = file (con, 'w')
-    cat (text, sep='\n', file=con)
-    close (con)
+    on.exit(close(con))
+    export(object, con, format, ...)
     })
 
 #------------------------------------------------------------------------------------------------------------------------
@@ -271,23 +235,16 @@
   function (object, con, format, ...) {
 
     fmt = match.arg (tolower (format), c ('meme', 'transfac'))
-    if (fmt %in% c ('meme', 'transfac')) {
-      text = matrixToMemeText (object)
-      }
+    ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need
+    ## for test
+    ## let the user manage opened cons
+    if (!isOpen(con)) {
+        open(con)
+        on.exit(close(con))
+    }
     cat (text, sep='\n', file=con)
-    close (con)
     })
 
 #------------------------------------------------------------------------------------------------------------------------
 # write to connection, using default format,  ??? for matrix list, tsv for metadata
-setMethod ('export',  signature=c(object='MotifList',  con='missing', format='character'),
-
-  function (object, con, format,  ...) {
-
-    fmt = match.arg (tolower (format), c ('meme')) # , 'transfac'))
-    if (fmt == 'meme') 
-      text = matrixToMemeText (object)
-    return (text)
-    })
-
-#------------------------------------------------------------------------------------------------------------------------
+## already handled by rtracklayer?