R/MotifList-class.R
b70c1a6e
 setGeneric('query', signature='object', function(object, queryString, ignore.case=TRUE)
2def1eec
               standardGeneric ('query'))
49ebd9dc
 setGeneric('motifToGene', signature='object', function(object, motifs, source) standardGeneric('motifToGene'))
 setGeneric('geneToMotif', signature='object', function(object, geneSymbols, source) standardGeneric('geneToMotif'))
 setGeneric('associateTranscriptionFactors', signature='object',
094b985a
            function(object, tbl.withMotifs,  source, expand.rows) standardGeneric('associateTranscriptionFactors'))
2def1eec
 #------------------------------------------------------------------------------------------------------------------------
b70c1a6e
 setClass ('MotifList',
           contains='SimpleList',
2def1eec
           representation (elementMetadata='DataFrame',
                           manuallyCuratedGeneMotifAssociationTable="data.frame"),
b70c1a6e
           prototype(elementType='matrix')
           )
 
 setValidity('MotifList', function(object) {
     msg = NULL
       ## what makes for a valid MotifList?
     if (length (object) ==  0)
        return (TRUE)
     if (is.null(names(object)))
         msg = c(msg, '"names()" must not be NULL')
     else if (any(duplicated(names(object))))
         msg = c(msg, 'all "names()" must be unique')
     if (!all(sapply(object, is.matrix)))
         msg = c(msg, 'all matrices must be of class "matrix"')
     if (!all(sapply(object, nrow) == 4))
         msg = c(msg, 'all matrices must have 4 rows')
       # all columns of all matrices should be normalized, summing to one.
       # in fact, 2/2086 matrices
       #   Cparvum-UniPROBE-Cgd2_3490.UP00395
       #   Pfalciparum-UniPROBE-PF14_0633.UP00394
       # fail to pass this test.  rounding to one decimal place allows these matrices,
       # also, to pass.   round (0.98, digits=1) --> 1.0
       # see the UniPROBE manpage for the full story on these two matrices
     ok = sapply(object, function(elt) all (round (colSums(elt), digits=1) == 1))
     if (!all(ok))
        msg = c(msg, 'all elements must have colSums equal to 1')
 
     if (is.null(msg)) TRUE else msg
 })
 #-------------------------------------------------------------------------------
 MotifList = function (matrices=list(), tbl.metadata=data.frame ())
2def1eec
 {
b70c1a6e
   if (nrow (tbl.metadata) == 0)
     df = DataFrame ()
   else
     df = DataFrame (tbl.metadata, row.names = rownames (tbl.metadata))
2def1eec
 
   tbl.tfClass.filename <- system.file(package="MotifDb", "extdata", "tfClass.tsv")
   stopifnot(file.exists(tbl.tfClass.filename))
   tbl.tfClass <- read.table(tbl.tfClass.filename, header=TRUE, as.is=TRUE, sep="\t")
 
   object = new ('MotifList', listData=matrices, elementMetadata = df,
                 manuallyCuratedGeneMotifAssociationTable=tbl.tfClass)
b70c1a6e
 
   if (length (matrices) == 0)
     names (object) = character ()
2def1eec
   else
b70c1a6e
     names (object) = rownames (tbl.metadata)
 
   object
2def1eec
 
b70c1a6e
 } # ctor
 #-------------------------------------------------------------------------------
 setMethod ('subset', signature = 'MotifList',
 
   function (x, subset, select, drop = FALSE, ...) {
 
2def1eec
     if (missing (subset))
b70c1a6e
       i = TRUE
     else {
       i = eval(substitute (subset), elementMetadata (x), parent.frame ())
       i = try(as.logical(i), silent=TRUE)
       if (inherits(i, 'try-error'))
         stop('"subset" must be coercible to logical')
       i = i & !is.na(i)
       } # else
     return (x [i])
   })
 
 #-------------------------------------------------------------------------------
 # transpose 4-row matrices to 4-column, so that there are as many rows as
 # there are nucleotides in the motif sequence. meme requires normalized matrices
 # exactly as we store them
 transformMatrixToMemeRepresentation = function (m)
 {
   return (t (m))
 
 } # transformMatrixToMemeRepresentation
 #-------------------------------------------------------------------------------
 # http://stuff.mit.edu/afs/athena/software/meme_v3.5.4/etc/meme-explanation.html
 # The motif itself is a position-specific probability matrix giving, for each
 # position in the pattern, the observed  frequency ('probability') of each
 # possible letter. The probability matrix is printed 'sideways'--columns
 # correspond  to the letters in the alphabet (in the same order as shown in
 # the simplified motif) and rows corresponding to the  positions of the motif,
 # position one first. The motif is preceded by a line starting with
 # 'letter-probability matrix:' and containing the length of the alphabet,
 # width of the motif, number of occurrences of the motif, and the E-value
 # of the motif.
 matrixToMemeText = function (matrices)
 {
   matrix.count = length (matrices)
 
     # incoming matrices have nucleotide rows, position columns.  meme
     # format, however, requires position-rows, and nucleotide-columns
     # calculate the number of lines of text by counting columns
   total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol)))
   predicted.line.count = 12 + (3 * length (matrices)) +
                            total.transposed.matrix.rows
   #s = vector ('character', predicted.line.count)
   s = character (predicted.line.count)
 
   s [1] = 'MEME version 4'
   s [2] = ''
   s [3] = 'ALPHABET= ACGT'
   s [4] = ''
   s [5] = 'strands: + -'
   s [6] = ''
   s [7] = 'Background letter frequencies'
   s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 '
2def1eec
   s [9] = ''
b70c1a6e
 
   index = 10
   for (name in names (matrices)) {
        # transpose the frequency matrix version of the incoming matrix,
        # hence 'tfMat'
     tfMat = transformMatrixToMemeRepresentation (matrices [[name]])
        # meme output may be used by tomtom, which uses matrix names as
        # part of image filenames. removed all file-system-unfriendly
        # characters here
     fixed.name = gsub ('\\/', '_', name)
     s [index] = sprintf ('MOTIF %s', fixed.name)
     index = index + 1
     new.line =
        sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020',
           nrow (tfMat), 45, 8.1e-020)
     s [index] =  new.line
     index = index + 1
     for (r in 1:nrow (tfMat)) {
       new.row = sprintf (' %12.10f  %12.10f  %12.10f  %12.10f', tfMat [r,1],
                           tfMat [r,2], tfMat [r,3], tfMat [r,4])
       s [index] = new.row
       index = index + 1
       }
     s [index] = ''
     index = index + 1
     } # for name
 
   invisible (s)
 
 } # matrixToMemeText
 #-------------------------------------------------------------------------------
 # connection is a character string, create a file by that name, open the file.
 # dispatch to export which dispatches on con='connection'
 
 setMethod ('export',  signature=c(object='MotifList', con='character',
                                   format='character'),
 
   function (object, con, format, ...) {
       ## do minimum work unique to this method, then dispatch to avoid
       ## code duplication
     con = file (con, 'w')
     on.exit(close(con))
     export(object, con, format, ...)
     })
 
 #-------------------------------------------------------------------------------
 # write to connection with specified format
 # format includes TRANSFAC, meme (also good for tomtom), and tsv
 
 setMethod ('export',  signature=c(object='MotifList', con='connection',
                                  format='character'),
 
   function (object, con, format, ...) {
 
8c9d39f9
     fmt = match.arg (tolower (format), c ('meme', 'transfac','jaspar'))
b70c1a6e
     ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need
     ## for test
     ## let the user manage opened cons
     if (!isOpen(con)) {
         open(con)
         on.exit(close(con))
     }
8c9d39f9
     if (fmt == 'meme') {
         text = matrixToMemeText (object)
     } else if (fmt == 'jaspar') {
         text = matrixToJasparText (object)
     }
b70c1a6e
     cat (text, sep='\n', file=con)
8c9d39f9
   })
 
b70c1a6e
 
 #-------------------------------------------------------------------------------
 # write to connection, using default format,  ??? for matrix list, tsv for
 # metadata
 setMethod ('export',  signature=c(object='MotifList',  con='missing',
                                  format='character'),
 
   function (object, con, format,  ...) {
 
8c9d39f9
     fmt = match.arg (tolower (format), c ('meme','jaspar')) # , 'transfac'
b70c1a6e
     if (fmt == 'meme') {
8c9d39f9
         text = paste (matrixToMemeText (object), collapse='\n')
b70c1a6e
       cat (text)
       invisible (text)
8c9d39f9
     } else if (fmt == 'jaspar') {
         text = paste (matrixToJasparText (object), collapse='\n')
       cat (text)
       invisible (text)
     }
2def1eec
 
8c9d39f9
   })
b70c1a6e
 
 #-------------------------------------------------------------------------------
 setMethod('show', 'MotifList',
 
     function(object) {
       msg = sprintf ('MotifDb object of length %d', length (object))
       cat (msg, '\n', sep='')
       if (length (object) == 0)
         return ()
2def1eec
 
cdaa7b82
       cat ('| Created from downloaded public sources: 2013-Aug-30', '\n', sep='')
b70c1a6e
 
fe92bd53
       tbl.dataSource = as.data.frame (table (mcols (object)$dataSource))
       tbl.org = as.data.frame (table (mcols (object)$organism))
b70c1a6e
       tbl.org = head (tbl.org [order (tbl.org$Freq, decreasing=TRUE),])
       totalMatrixCount = length (object)
fe92bd53
       totalOrganismCount = length (unique (mcols (object)$organism))
b70c1a6e
       dataSourceCount = nrow (tbl.dataSource)
 
       source.singular.or.plural = 'sources'
       if (dataSourceCount == 1)
         source.singular.or.plural = 'source'
2def1eec
 
b70c1a6e
       msg = sprintf ('| %d position frequency matrices from %d %s:',
                      totalMatrixCount, dataSourceCount, source.singular.or.plural)
       cat (msg, '\n', sep='')
       for (r in 1:nrow (tbl.dataSource)) {
          dataSource = tbl.dataSource$Var1 [r]
          matrixCount = tbl.dataSource$Freq [r]
          msg = sprintf ('| %18s: %4d', dataSource, matrixCount)
          cat (msg, '\n', sep='')
          } # for r
       msg = sprintf ('| %d organism/s', totalOrganismCount)
       cat (msg, '\n', sep='')
       for (r in 1:nrow (tbl.org)) {
          organism = tbl.org$Var1 [r]
          orgCount = tbl.org$Freq [r]
          msg = sprintf ('| %18s: %4d', organism, orgCount)
          cat (msg, '\n', sep='')
          } # for r
       otherOrgCount = totalMatrixCount - sum (tbl.org$Freq)
       if (otherOrgCount > 0) {
         msg = sprintf ('| %18s: %4d', 'other', otherOrgCount)
         cat (msg, '\n', sep='')
         }
     if (!is.null (names (object))) {
       all.names = names (object)
       count = length (all.names)
       if (count <= 10)
         cat (paste (all.names, '\n'), sep='')
       else {
         cat (paste (all.names [1:5], '\n'), sep='')
         cat ('...', '\n', sep='')
         end = length (all.names)
         start = end - 4
         cat (paste (all.names [start:end], '\n'), sep='')
         }
       }
     })
 #-------------------------------------------------------------------------------
 setMethod ('query', 'MotifList',
 
    function (object, queryString, ignore.case=TRUE) {
2def1eec
        indices = unique (as.integer (unlist (sapply (colnames (mcols (object)),
                     function (colname)
                        grep (queryString, mcols (object)[, colname],
b70c1a6e
                              ignore.case=ignore.case)))))
         object [indices]
       })
 #-------------------------------------------------------------------------------
8c9d39f9
 # Addition on 2017/06/15 from Matt Richards
 
 # This will not exactly match JASPAR because units are PFM and JASPAR uses PCM
 # General JASPAR Format:
 
 # > "Motif Name"\t"Transcription Factor"
 # A [ PCMS ]
 # C [ PCMS ]
 # G [ PCMS ]
 # T [ PCMS ]
 #
 # ...
 
 # Note: the PCMs are space-delimited
 
 matrixToJasparText <- function (matrices)
 {
   matrix.count <- length (matrices)
 
   # Incoming matrices have nucleotide rows, position columns.
   # This is the correct orientation for JASPAR; however, we need to also
   # add brackets and letters to them
 
   # Calculate the number of lines of text by counting matrices and assuming
   # 6 lines per matrix
2def1eec
 
8c9d39f9
   predicted.line.count <- 6*matrix.count
 
   #s = vector ('character', predicted.line.count)
   s <- character (predicted.line.count)
 
   index <- 1
2def1eec
 
8c9d39f9
   for (name in names (matrices)) {
 
       # Print the name with an arrow, follwed by the motif
       s[index] <- sprintf('>%s',name)
       index <- index + 1
 
       # For each line of the matrix, print the correct letter and the
       # matrix row surrounded by brackets
5afdc5a9
       motif.matrix <- matrices[name][[1]]
49ebd9dc
 
b1ae96b9
 
8c9d39f9
       for (r in 1:nrow(motif.matrix)) {
           s[index] <- sprintf("%s [ %s ]",
                               rownames(motif.matrix)[r],
                               paste(motif.matrix[r,],collapse=" "))
           index <- index + 1
       }
 
       s[index] <- ""
       index <- index + 1
2def1eec
 
1f060cb8
   } # for name
 
   # Remove the last blank line
   s <- s[-length(s)]
8c9d39f9
 
   invisible (s)
 
 } # matrixToJasparText
 #-------------------------------------------------------------------------------
2def1eec
 # returns a data.frame with motif, geneSymbol, source, pubmedID columns
49ebd9dc
 setMethod ('motifToGene', 'MotifList',
 
    function (object, motifs, source) {
3b614ce7
      source <- tolower(source)
      stopifnot(source %in% c("motifdb", "tfclass"))
49ebd9dc
      tbl <- data.frame()
3b614ce7
      if(source %in% c("motifdb")){
4acf0d78
         providerId <- NULL   # avoid R CMD check note
49ebd9dc
         tbl <- as.data.frame(subset(mcols(object), providerId %in% motifs))
d030e74b
         if(nrow(tbl) == 0)
            return(data.frame())
49ebd9dc
         tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
         colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
         tbl <- tbl[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")]
4acf0d78
         tbl$source <- "MotifDb"
2def1eec
         }
3b614ce7
      if(source %in% c("tfclass")){
4acf0d78
         motif <- NULL
49ebd9dc
         tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs)
d030e74b
         if(nrow(tbl) == 0)
            return(data.frame())
49ebd9dc
         tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
         tbl <- tbl[order(tbl$motif),]
         rownames(tbl) <- NULL
4acf0d78
         colnames(tbl) <- c("motif", "geneSymbol", "pubmedID")
         tbl$source <- "TFClass"
2def1eec
         }
49ebd9dc
      tbl
2def1eec
      })
 
 #-------------------------------------------------------------------------------
 # returns a data.frame with motif, geneSymbol, source, pubmedID columns
49ebd9dc
 setMethod ('geneToMotif', 'MotifList',
 
    function (object, geneSymbols, source) {
3b614ce7
      source <- tolower(source)
      stopifnot(source %in% c("motifdb", "tfclass"))
49ebd9dc
      extract.mdb <- function(gene){
4acf0d78
         geneSymbol <- NULL # workaround the R CMD check "no visible binding for global variable"
49ebd9dc
         tbl <- as.data.frame(subset(mcols(object), geneSymbol == gene))
         tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
         colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
         tbl
2def1eec
         }
3b614ce7
      if(source %in% c("motifdb")){
49ebd9dc
         tbls <- lapply(geneSymbols, extract.mdb)
         result <- do.call(rbind, tbls)
4acf0d78
         result$source <- "MotifDb"
49ebd9dc
         }
3b614ce7
      if(source %in% c("tfclass")){
49ebd9dc
         tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, tf.gene %in% geneSymbols)
4acf0d78
         tf.gene <- NULL; motif <- NULL  # workaround R CMD CHECK "no visible binding ..." bogus error
49ebd9dc
         tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
         tbl <- tbl[order(tbl$tf.gene),]
         rownames(tbl) <- NULL
4acf0d78
         colnames(tbl) <- c("motif", "geneSymbol", "pubmedID")
         result <- tbl[, c("geneSymbol", "motif", "pubmedID")]
         result$source <- "TFClass"
2def1eec
         }
      result
      })
 
 #-------------------------------------------------------------------------------
49ebd9dc
 setMethod('associateTranscriptionFactors', 'MotifList',
 
094b985a
    function(object, tbl.withMotifs, source, expand.rows){
3b614ce7
      source <- tolower(source)
      stopifnot(source %in% c("motifdb", "tfclass"))
49ebd9dc
      tbl.out <- data.frame()
3b614ce7
      if(source %in% c("motifdb")){
094b985a
            # lookup up in the object metadata, expect one TF geneSymbol per matrix name
         pfm.ids <- tbl.withMotifs[, "motifName"]
49ebd9dc
         matched.rows <- match(pfm.ids, names(as.list(object)))
         #if(length(matched.rows) == nrow(tbl.withMotifs)) {
         tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")]
         tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
         tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
         tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
         } # direct
3b614ce7
      if(source %in% c("tfclass")){
094b985a
         if(! "shortMotif" %in% colnames(tbl.withMotifs)){
            stop("MotifDb::assoicateTranscriptionFactors needs a 'shortMotif' column with the TFClass source")
            }
49ebd9dc
         tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE)
094b985a
         motif.ids <- tbl.withMotifs[, "shortMotif"]
d030e74b
         geneSymbols <- lapply(motif.ids, function(id)
                                  paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif, fixed=TRUE)], collapse=";"))
49ebd9dc
         geneSymbols <- unlist(geneSymbols)
d030e74b
         pubmedIds   <- lapply(motif.ids, function(id)
                                  unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif, fixed=TRUE)]))
49ebd9dc
         pubmedIds   <- as.character(pubmedIds)
         pubmedIds   <- gsub("integer(0)", "", pubmedIds, fixed=TRUE)
         tbl.new     <- data.frame(geneSymbol=geneSymbols, pubmedID=pubmedIds, stringsAsFactors=FALSE)
         tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
         tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
         tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
 
         if(expand.rows){
            rows.with.na <- which(is.na(tbl.out$geneSymbol))
            rows.with.geneSymbol <- setdiff(1:nrow(tbl.out), rows.with.na)
            tbl.asIs <- tbl.out[rows.with.na,]
            tbl.toExpand <- tbl.out[rows.with.geneSymbol,]
            geneSymbols.split <- strsplit(tbl.toExpand$geneSymbol, ";")
            counts <- unlist(lapply(geneSymbols.split, length))
            geneSymbols.split.vec <- unlist(geneSymbols.split)
            tbl.expanded <- splitstackshape::expandRows(tbl.toExpand, counts, count.is.col=FALSE, drop=FALSE)
            stopifnot(length(geneSymbols.split.vec) == nrow(tbl.expanded))
            tbl.expanded$geneSymbol <- geneSymbols.split.vec
            tbl.out <- rbind(tbl.expanded, tbl.asIs)
            }
         } # indirect
      tbl.out
      })
 
 #-------------------------------------------------------------------------------