R/MotifList-class.R
b70c1a6e
 setGeneric('query', signature='object', function(object, queryString, ignore.case=TRUE)
             standardGeneric ('query'))
 #-------------------------------------------------------------------------------
 setClass ('MotifList',
           contains='SimpleList',
           representation (elementMetadata='DataFrame'),
           prototype(elementType='matrix')
           )
 
 setValidity('MotifList', function(object) {
     msg = NULL
       ## what makes for a valid MotifList?
     if (length (object) ==  0)
        return (TRUE)
     if (is.null(names(object)))
         msg = c(msg, '"names()" must not be NULL')
     else if (any(duplicated(names(object))))
         msg = c(msg, 'all "names()" must be unique')
     if (!all(sapply(object, is.matrix)))
         msg = c(msg, 'all matrices must be of class "matrix"')
     if (!all(sapply(object, nrow) == 4))
         msg = c(msg, 'all matrices must have 4 rows')
       # all columns of all matrices should be normalized, summing to one.
       # in fact, 2/2086 matrices
       #   Cparvum-UniPROBE-Cgd2_3490.UP00395
       #   Pfalciparum-UniPROBE-PF14_0633.UP00394
       # fail to pass this test.  rounding to one decimal place allows these matrices,
       # also, to pass.   round (0.98, digits=1) --> 1.0
       # see the UniPROBE manpage for the full story on these two matrices
     ok = sapply(object, function(elt) all (round (colSums(elt), digits=1) == 1))
     if (!all(ok))
        msg = c(msg, 'all elements must have colSums equal to 1')
 
     if (is.null(msg)) TRUE else msg
 })
 #-------------------------------------------------------------------------------
 MotifList = function (matrices=list(), tbl.metadata=data.frame ())
 {  
   if (nrow (tbl.metadata) == 0)
     df = DataFrame ()
   else
     df = DataFrame (tbl.metadata, row.names = rownames (tbl.metadata))
   
   object = new ('MotifList', listData=matrices, elementMetadata = df)
 
   if (length (matrices) == 0)
     names (object) = character ()
   else 
     names (object) = rownames (tbl.metadata)
 
   object
   
 } # ctor
 #-------------------------------------------------------------------------------
 setMethod ('subset', signature = 'MotifList',
 
   function (x, subset, select, drop = FALSE, ...) {
 
     if (missing (subset)) 
       i = TRUE
     else {
       i = eval(substitute (subset), elementMetadata (x), parent.frame ())
       i = try(as.logical(i), silent=TRUE)
       if (inherits(i, 'try-error'))
         stop('"subset" must be coercible to logical')
       i = i & !is.na(i)
       } # else
     return (x [i])
   })
 
 #-------------------------------------------------------------------------------
 # transpose 4-row matrices to 4-column, so that there are as many rows as
 # there are nucleotides in the motif sequence. meme requires normalized matrices
 # exactly as we store them
 transformMatrixToMemeRepresentation = function (m)
 {
   return (t (m))
 
 } # transformMatrixToMemeRepresentation
 #-------------------------------------------------------------------------------
 # http://stuff.mit.edu/afs/athena/software/meme_v3.5.4/etc/meme-explanation.html
 # The motif itself is a position-specific probability matrix giving, for each
 # position in the pattern, the observed  frequency ('probability') of each
 # possible letter. The probability matrix is printed 'sideways'--columns
 # correspond  to the letters in the alphabet (in the same order as shown in
 # the simplified motif) and rows corresponding to the  positions of the motif,
 # position one first. The motif is preceded by a line starting with
 # 'letter-probability matrix:' and containing the length of the alphabet,
 # width of the motif, number of occurrences of the motif, and the E-value
 # of the motif.
 matrixToMemeText = function (matrices)
 {
   matrix.count = length (matrices)
 
     # incoming matrices have nucleotide rows, position columns.  meme
     # format, however, requires position-rows, and nucleotide-columns
     # calculate the number of lines of text by counting columns
   total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol)))
   predicted.line.count = 12 + (3 * length (matrices)) +
                            total.transposed.matrix.rows
   #s = vector ('character', predicted.line.count)
   s = character (predicted.line.count)
 
   s [1] = 'MEME version 4'
   s [2] = ''
   s [3] = 'ALPHABET= ACGT'
   s [4] = ''
   s [5] = 'strands: + -'
   s [6] = ''
   s [7] = 'Background letter frequencies'
   s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 '
   s [9] = '' 
 
   index = 10
   for (name in names (matrices)) {
        # transpose the frequency matrix version of the incoming matrix,
        # hence 'tfMat'
     tfMat = transformMatrixToMemeRepresentation (matrices [[name]])
        # meme output may be used by tomtom, which uses matrix names as
        # part of image filenames. removed all file-system-unfriendly
        # characters here
     fixed.name = gsub ('\\/', '_', name)
     s [index] = sprintf ('MOTIF %s', fixed.name)
     index = index + 1
     new.line =
        sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020',
           nrow (tfMat), 45, 8.1e-020)
     s [index] =  new.line
     index = index + 1
     for (r in 1:nrow (tfMat)) {
       new.row = sprintf (' %12.10f  %12.10f  %12.10f  %12.10f', tfMat [r,1],
                           tfMat [r,2], tfMat [r,3], tfMat [r,4])
       s [index] = new.row
       index = index + 1
       }
     s [index] = ''
     index = index + 1
     } # for name
 
   invisible (s)
 
 } # matrixToMemeText
 #-------------------------------------------------------------------------------
 # connection is a character string, create a file by that name, open the file.
 # dispatch to export which dispatches on con='connection'
 
 setMethod ('export',  signature=c(object='MotifList', con='character',
                                   format='character'),
 
   function (object, con, format, ...) {
       ## do minimum work unique to this method, then dispatch to avoid
       ## code duplication
     con = file (con, 'w')
     on.exit(close(con))
     export(object, con, format, ...)
     })
 
 #-------------------------------------------------------------------------------
 # write to connection with specified format
 # format includes TRANSFAC, meme (also good for tomtom), and tsv
 
 setMethod ('export',  signature=c(object='MotifList', con='connection',
                                  format='character'),
 
   function (object, con, format, ...) {
 
8c9d39f9
     fmt = match.arg (tolower (format), c ('meme', 'transfac','jaspar'))
b70c1a6e
     ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need
     ## for test
     ## let the user manage opened cons
     if (!isOpen(con)) {
         open(con)
         on.exit(close(con))
     }
8c9d39f9
     if (fmt == 'meme') {
         text = matrixToMemeText (object)
     } else if (fmt == 'jaspar') {
         text = matrixToJasparText (object)
     }
b70c1a6e
     cat (text, sep='\n', file=con)
8c9d39f9
   })
 
b70c1a6e
 
 #-------------------------------------------------------------------------------
 # write to connection, using default format,  ??? for matrix list, tsv for
 # metadata
 setMethod ('export',  signature=c(object='MotifList',  con='missing',
                                  format='character'),
 
   function (object, con, format,  ...) {
 
8c9d39f9
     fmt = match.arg (tolower (format), c ('meme','jaspar')) # , 'transfac'
b70c1a6e
     if (fmt == 'meme') {
8c9d39f9
         text = paste (matrixToMemeText (object), collapse='\n')
b70c1a6e
       cat (text)
       invisible (text)
8c9d39f9
     } else if (fmt == 'jaspar') {
         text = paste (matrixToJasparText (object), collapse='\n')
       cat (text)
       invisible (text)
     }
       
   })
b70c1a6e
 
 #-------------------------------------------------------------------------------
 setMethod('show', 'MotifList',
 
     function(object) {
       msg = sprintf ('MotifDb object of length %d', length (object))
       cat (msg, '\n', sep='')
       if (length (object) == 0)
         return ()
       
cdaa7b82
       cat ('| Created from downloaded public sources: 2013-Aug-30', '\n', sep='')
b70c1a6e
 
fe92bd53
       tbl.dataSource = as.data.frame (table (mcols (object)$dataSource))
       tbl.org = as.data.frame (table (mcols (object)$organism))
b70c1a6e
       tbl.org = head (tbl.org [order (tbl.org$Freq, decreasing=TRUE),])
       totalMatrixCount = length (object)
fe92bd53
       totalOrganismCount = length (unique (mcols (object)$organism))
b70c1a6e
       dataSourceCount = nrow (tbl.dataSource)
 
       source.singular.or.plural = 'sources'
       if (dataSourceCount == 1)
         source.singular.or.plural = 'source'
       
       msg = sprintf ('| %d position frequency matrices from %d %s:',
                      totalMatrixCount, dataSourceCount, source.singular.or.plural)
       cat (msg, '\n', sep='')
       for (r in 1:nrow (tbl.dataSource)) {
          dataSource = tbl.dataSource$Var1 [r]
          matrixCount = tbl.dataSource$Freq [r]
          msg = sprintf ('| %18s: %4d', dataSource, matrixCount)
          cat (msg, '\n', sep='')
          } # for r
       msg = sprintf ('| %d organism/s', totalOrganismCount)
       cat (msg, '\n', sep='')
       for (r in 1:nrow (tbl.org)) {
          organism = tbl.org$Var1 [r]
          orgCount = tbl.org$Freq [r]
          msg = sprintf ('| %18s: %4d', organism, orgCount)
          cat (msg, '\n', sep='')
          } # for r
       otherOrgCount = totalMatrixCount - sum (tbl.org$Freq)
       if (otherOrgCount > 0) {
         msg = sprintf ('| %18s: %4d', 'other', otherOrgCount)
         cat (msg, '\n', sep='')
         }
     if (!is.null (names (object))) {
       all.names = names (object)
       count = length (all.names)
       if (count <= 10)
         cat (paste (all.names, '\n'), sep='')
       else {
         cat (paste (all.names [1:5], '\n'), sep='')
         cat ('...', '\n', sep='')
         end = length (all.names)
         start = end - 4
         cat (paste (all.names [start:end], '\n'), sep='')
         }
       }
     })
 #-------------------------------------------------------------------------------
 setMethod ('query', 'MotifList',
 
    function (object, queryString, ignore.case=TRUE) {
fe92bd53
        indices = unique (as.integer (unlist (sapply (colnames (mcols (object)), 
b70c1a6e
                     function (colname) 
fe92bd53
                        grep (queryString, mcols (object)[, colname], 
b70c1a6e
                              ignore.case=ignore.case)))))
         object [indices]
       })
 #-------------------------------------------------------------------------------
8c9d39f9
 # Addition on 2017/06/15 from Matt Richards
 
 # This will not exactly match JASPAR because units are PFM and JASPAR uses PCM
 # General JASPAR Format:
 
 # > "Motif Name"\t"Transcription Factor"
 # A [ PCMS ]
 # C [ PCMS ]
 # G [ PCMS ]
 # T [ PCMS ]
 #
 # ...
 
 # Note: the PCMs are space-delimited
 
 matrixToJasparText <- function (matrices)
 {
   matrix.count <- length (matrices)
 
   # Incoming matrices have nucleotide rows, position columns.
   # This is the correct orientation for JASPAR; however, we need to also
   # add brackets and letters to them
 
   # Calculate the number of lines of text by counting matrices and assuming
   # 6 lines per matrix
   
   predicted.line.count <- 6*matrix.count
 
   #s = vector ('character', predicted.line.count)
   s <- character (predicted.line.count)
 
   index <- 1
   
   for (name in names (matrices)) {
 
       # Print the name with an arrow, follwed by the motif
       s[index] <- sprintf('>%s',name)
       index <- index + 1
 
       # For each line of the matrix, print the correct letter and the
       # matrix row surrounded by brackets
b1ae96b9
       #      motif.matrix <- matrices[name][[1]]
 
       # Alter it to make artificial counts
       motif.matrix <- round(1000*matrices[name][[1]])
       
8c9d39f9
       for (r in 1:nrow(motif.matrix)) {
           s[index] <- sprintf("%s [ %s ]",
                               rownames(motif.matrix)[r],
                               paste(motif.matrix[r,],collapse=" "))
           index <- index + 1
       }
 
       s[index] <- ""
       index <- index + 1
           
     } # for name
 
   invisible (s)
 
 } # matrixToJasparText
 #-------------------------------------------------------------------------------