setClass ('MotifList',
          contains='SimpleList',
          representation (elementMetadata="DataFrame"),
          prototype(elementType="matrix")
          )

setGeneric ('add', signature='obj', function (obj, matrices, tbl.metadata) standardGeneric ('add'))
#------------------------------------------------------------------------------------------------------------------------
MotifList = function (matrices=list(),
                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
                      providerNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
                      providerIds=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
                      geneSymbols=character(),              # ABF2, Rhox11
                      geneIds=character(),
                      geneIdTypes=character(),
                      proteinIds=character(),
                      proteinIdTypes=character(),
                      sequenceCounts=integer(),             # often NA
                      organisms=character(),                # Scerevisiae, Mmusculus
                      bindingDomains=character(),       # NA, Homeo
                      bindingSequences=character(),
                      dataSources=character(),              # ScerTF, UniPROBE
                      experimentTypes=character(),          # NA, protein-binding microarray
                      pubmedIDs=character(),                # 19111667, 1858359
                      tfFamilies=character())               # NA, NA
{  
  names (matrices) = names
  count = length (matrices)
  stopifnot (length (names) == count)
  stopifnot (length (providerNames) == count)
  stopifnot (length (providerIds) == count)
  stopifnot (length (geneSymbols) == count)
  stopifnot (length (geneIds) == count)
  stopifnot (length (geneIdTypes) == count)
  stopifnot (length (proteinIds) == count)
  stopifnot (length (proteinIdTypes) == count)
  stopifnot (length (sequenceCounts) == count)
  stopifnot (length (organisms) == count)
  stopifnot (length (bindingDomains) == count)
  stopifnot (length (bindingSequences) == count)
  stopifnot (length (dataSources) == count)
  stopifnot (length (experimentTypes) == count)
  stopifnot (length (pubmedIDs) == count)
  stopifnot (length (tfFamilies) == count)

  elementMetadata = DataFrame (#name=names,
                               providerName=providerNames,
                               providerId=providerIds,
                               geneSymbol=geneSymbols,
                               geneId=geneIds,
                               geneIdType=geneIdTypes,
                               proteinId=proteinIds,
                               proteinIdType=proteinIdTypes,
                               sequenceCount=sequenceCounts,
                               organism=organisms,
                               bindingDomain=bindingDomains,
                               bindingSequence=bindingSequences,
                               dataSource=dataSources,
                               experimentType=experimentTypes,
                               pubmedID=pubmedIDs,
                               tfFamily=tfFamilies)
  new ('MotifList', listData = matrices, elementMetadata=elementMetadata)
  
} # ctor
#------------------------------------------------------------------------------------------------------------------------
MotifDb = function (loadAllSources=TRUE, quiet=TRUE)
{
  mdb <- MotifList ()
  if (loadAllSources) {
    data.path = paste (system.file (package='MotifDb'), 'data', sep='/')
    data.files = dir (data.path, full.names=TRUE)
    sources.to.exclude = c () # 'ScerTF.RData')   # pending minor revisions
    removers = as.integer (sapply (sources.to.exclude, function (source) grep (source, data.files)))
    if (length (removers) > 0)
      data.files = data.files [-removers]
  
    count = 0
    if (length (data.files) > 0)   
      for (data.file in data.files) {
        tbl.md = NA; matrices = NA;  # define these to keep 'check' happy.  they are loaded by 'load'
        variables = load (data.file)
        mdb = add (mdb, matrices, tbl.md)
        filename.tokens = strsplit (data.file, '\\/') [[1]]
        shortFilename = filename.tokens [length (filename.tokens)]
        if (!quiet) 
          print (noquote (sprintf ('added %s (%d) matrices, length now: %d', shortFilename, length (matrices), length (mdb))))
      } # for data.file
  
    if (!quiet) {
      print (table (mdb$dataSource))
      }
    } # if loadAllSources

  return (mdb)

} # MotifDb
#------------------------------------------------------------------------------------------------------------------------
setMethod ('length', signature = 'MotifList',

  function (x) {
    return (length (x@listData))
    })

#------------------------------------------------------------------------------------------------------------------------
setMethod ('subset', signature = 'MotifList',

  function (x, subset, select, drop = FALSE, ...) {

    if (missing (subset)) 
      i <- TRUE
    else {
      i <- eval(substitute (subset), elementMetadata (x), parent.frame ())
      i <- try(as.logical(i), silent=TRUE)
      if (inherits(i, "try-error"))
        stop("'subset' must be coercible to logical")
      i <- i & !is.na(i)
      } # else
    return (x [i])
  })

#------------------------------------------------------------------------------------------------------------------------
setMethod ('add', signature = 'MotifList',

  function (obj, matrices, tbl.metadata) { 

     full.names = names (matrices)   # full in this sense:  organism-source-providerName[-possibleUniquifier]
     size = length (matrices)
     empty.chars = rep (NA_character_, size)
     empty.ints = rep (NA_integer_, size)
     new.list = MotifList (matrices, 
                           names=full.names,
                           providerNames=tbl.metadata$providerName,                  # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
                           providerIds=tbl.metadata$providerId,                      # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
                           geneSymbols=tbl.metadata$geneSymbol,                      # ABF2, Rhox11n
                           geneIds=tbl.metadata$geneId,
                           geneIdTypes=tbl.metadata$geneIdType,
                           proteinIds=tbl.metadata$proteinId,
                           proteinIdTypes=tbl.metadata$proteinIdType,
                           sequenceCounts=tbl.metadata$sequenceCount,                # often NA
                           organisms=tbl.metadata$organism,                          # Scerevisiae, Mmusculus
                           bindingDomains=tbl.metadata$bindingDomain,                # NA, Homeo
                           bindingSequences=tbl.metadata$bindingSequence,    
                           dataSources=tbl.metadata$dataSource,                      # ScerTF, UniPROBE
                           experimentTypes=tbl.metadata$experimentType,              # NA, protein-binding microarray, SELEX, 
                           pubmedIDs=tbl.metadata$pubmedID,                          # 19111667, 1858359
                           tfFamilies=tbl.metadata$tfFamily)                         # NA
     obj@listData = c (obj@listData, new.list@listData)
     elementMetadata (obj) = rbind (elementMetadata (obj), elementMetadata (new.list))
     return (obj)
    })

#------------------------------------------------------------------------------------------------------------------------
setMethod("$", signature="MotifList", 

  function(x, name) {
    elementMetadata (x)[[name]]
    })
#------------------------------------------------------------------------------------------------------------------------
# transpose 4-row matrices to 4-column, so that there are as many rows as there are nucleotides in the motif sequence
# normalize the values, so that at each position the frequency of the 4 bases sums to 1
#
# for example, this count matrix for a sequence 10 bases long
#
#    1  2  3  4  5  6  7  8  9 10
# A  0  3 79 40 66 48 65 11 65  0
# C 94 75  4  3  1  2  5  2  3  3
# G  1  0  3  4  1  0  5  3 28 88
# T  2 19 11 50 29 47 22 81  1  6
#
# becomes this
#            1          2          3          4          5          6          7          8          9         10
# A 0.00000000 0.03092784 0.81443299 0.41237113 0.68041237 0.49484536 0.67010309 0.11340206 0.67010309 0.00000000
# C 0.96907216 0.77319588 0.04123711 0.03092784 0.01030928 0.02061856 0.05154639 0.02061856 0.03092784 0.03092784
# G 0.01030928 0.00000000 0.03092784 0.04123711 0.01030928 0.00000000 0.05154639 0.03092784 0.28865979 0.90721649
# T 0.02061856 0.19587629 0.11340206 0.51546392 0.29896907 0.48453608 0.22680412 0.83505155 0.01030928 0.06185567
#
# and then this
# 
#             A          C          G          T
# 1  0.00000000 0.96907216 0.01030928 0.02061856
# 2  0.03092784 0.77319588 0.00000000 0.19587629
# 3  0.81443299 0.04123711 0.03092784 0.11340206
# 4  0.41237113 0.03092784 0.04123711 0.51546392
# 5  0.68041237 0.01030928 0.01030928 0.29896907
# 6  0.49484536 0.02061856 0.00000000 0.48453608
# 7  0.67010309 0.05154639 0.05154639 0.22680412
# 8  0.11340206 0.02061856 0.03092784 0.83505155
# 9  0.67010309 0.03092784 0.28865979 0.01030928
# 10 0.00000000 0.03092784 0.90721649 0.06185567
transformMatrixToMemeRepresentation = function (m)
{
  stopifnot (class (m) == 'matrix')
  stopifnot (rownames (m) == c ('A', 'C', 'G', 'T'))
  col.sums = round (colSums (m))
  stopifnot (all (col.sums == 1))
  return (t (m))

} # transformMatrixToMemeRepresentation
#------------------------------------------------------------------------------------------------------------------------
matrixToMemeText = function (matrices)
{
  matrix.count = length (matrices)

    # incoming matrices have nucleotide rows, position columns.  meme format, however, requires position-rows, and nucleotide-columns
    # calculate the number of lines of text by counting columns
  total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol)))
  predicted.line.count = 12 + (3 * length (matrices)) + total.transposed.matrix.rows
  s = vector ('character', predicted.line.count)

  #printf ("length of pre-allocated vector: %d", length (s))
  s [1] = 'MEME version 4'
  s [2] = ''
  s [3] = 'ALPHABET= ACGT'
  s [4] = ''
  s [5] = 'strands: + -'
  s [6] = ''
  s [7] = 'Background letter frequencies'
  s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 '
  s [9] = '' 

  index = 10
  for (name in names (matrices)) {
       # transpose the frequency matrix version of the incoming matrix, hence 'tfMat'
    tfMat = transformMatrixToMemeRepresentation (matrices [[name]])
       # meme output may be used by tomtom, which uses matrix names as part of image filenames. 
       # removed all file-system-unfriendly characters here
    fixed.name = gsub ('\\/', '_', name)
    s [index] = sprintf ('MOTIF %s', fixed.name)
    index = index + 1
    s [index] = sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020', nrow (tfMat), 45, 8.1e-020)
    index = index + 1
    for (r in 1:nrow (tfMat)) {
      new.row = sprintf (' %12.10f  %12.10f  %12.10f  %12.10f', tfMat [r,1], tfMat [r,2], tfMat [r,3], tfMat [r,4])
      s [index] = new.row
      index = index + 1
      }
    s [index] = ''
    index = index + 1
    } # for name

  invisible (s)

} # matrixToMemeText
#------------------------------------------------------------------------------------------------------------------------
# write to connection with specified format
# if connection is a character string, create a file by that name, open the file.
# if it is an already-open connection, use that directly
# format includes TRANSFAC, meme (also good for tomtom), and tsv

setMethod ('export',  signature=c(object='MotifList', con='character', format='character'),

  function (object, con, format, ...) {

    fmt = match.arg (tolower (format), c ('meme', 'transfac'))
    if (fmt %in% c ('meme', 'transfac')) {
      text = matrixToMemeText (object)
      }
    con = file (con, 'w')
    cat (text, sep='\n', file=con)
    close (con)
    })

#------------------------------------------------------------------------------------------------------------------------
# write to connection with specified format
# if connection is a character string, create a file by that name, open the file.
# if it is an already-open connection, use that directly
# format includes TRANSFAC, meme (also good for tomtom), and tsv

setMethod ('export',  signature=c(object='MotifList', con='connection', format='character'),

  function (object, con, format, ...) {

    fmt = match.arg (tolower (format), c ('meme', 'transfac'))
    if (fmt %in% c ('meme', 'transfac')) {
      text = matrixToMemeText (object)
      }
    cat (text, sep='\n', file=con)
    close (con)
    })

#------------------------------------------------------------------------------------------------------------------------
# write to connection, using default format,  ??? for matrix list, tsv for metadata
setMethod ('export',  signature=c(object='MotifList',  con='missing', format='character'),

  function (object, con, format,  ...) {

    fmt = match.arg (tolower (format), c ('meme')) # , 'transfac'))
    if (fmt == 'meme') 
      text = matrixToMemeText (object)
    return (text)
    })

#------------------------------------------------------------------------------------------------------------------------