setGeneric('query', signature='object', function(object, andStrings, orStrings=c(), notStrings=c(), ignore.case=TRUE) standardGeneric ('query')) setGeneric('motifToGene', signature='object', function(object, motifs, source) standardGeneric('motifToGene')) setGeneric('geneToMotif', signature='object', function(object, geneSymbols, source, ignore.case=FALSE) standardGeneric('geneToMotif')) setGeneric('associateTranscriptionFactors', signature='object', function(object, tbl.withMotifs, source, expand.rows, motifColumnName="motifName") standardGeneric('associateTranscriptionFactors')) #setGeneric('matchMotif', signature='object', function(object, motifs, genomeName, regions, pval.cutoff, # fimoDataFrameStyle=FALSE) standardGeneric('matchMotif')) #------------------------------------------------------------------------------------------------------------------------ setClass ('MotifList', contains='SimpleList', representation (elementMetadata='DataFrame', manuallyCuratedGeneMotifAssociationTable="data.frame"), prototype(elementType='matrix') ) setValidity('MotifList', function(object) { msg = NULL ## what makes for a valid MotifList? if (length (object) == 0) return (TRUE) if (is.null(names(object))) msg = c(msg, '"names()" must not be NULL') else if (any(duplicated(names(object)))) msg = c(msg, 'all "names()" must be unique') if (!all(sapply(object, is.matrix))) msg = c(msg, 'all matrices must be of class "matrix"') if (!all(sapply(object, nrow) == 4)) msg = c(msg, 'all matrices must have 4 rows') # all columns of all matrices should be normalized, summing to one. # in fact, 2/2086 matrices # Cparvum-UniPROBE-Cgd2_3490.UP00395 # Pfalciparum-UniPROBE-PF14_0633.UP00394 # fail to pass this test. rounding to one decimal place allows these matrices, # also, to pass. round (0.98, digits=1) --> 1.0 # see the UniPROBE manpage for the full story on these two matrices ok = sapply(object, function(elt) all (round (colSums(elt), digits=1) == 1)) if (!all(ok)) msg = c(msg, 'all elements must have colSums equal to 1') if (is.null(msg)) TRUE else msg }) #------------------------------------------------------------------------------- MotifList = function (matrices=list(), tbl.metadata=data.frame ()) { if (nrow (tbl.metadata) == 0) df = DataFrame () else df = DataFrame (tbl.metadata, row.names = rownames (tbl.metadata)) tbl.tfClass.filename <- system.file(package="MotifDb", "extdata", "tfClass.tsv") stopifnot(file.exists(tbl.tfClass.filename)) tbl.tfClass <- read.table(tbl.tfClass.filename, header=TRUE, as.is=TRUE, sep="\t") object = new ('MotifList', listData=matrices, elementMetadata = df, manuallyCuratedGeneMotifAssociationTable=tbl.tfClass) if (length (matrices) == 0) names (object) = character () else names (object) = rownames (tbl.metadata) object } # ctor #------------------------------------------------------------------------------- # setMethod('matchMotif', signature='MotifList', # # function(object, motifs, genomeName, regions, pval.cutoff, fimoDataFrameStyle=FALSE){ # motifs.pfmatrix <- lapply(motifs, function(motif) convert_motifs(motif, "TFBSTools-PFMatrix")) # motifs.pfmList <- do.call(PFMatrixList, motifs.pfmatrix) # gr.list <- motifmatchr::matchMotifs(motifs.pfmList, regions, genome=genomeName, out="positions", p.cutoff=pval.cutoff) # result <- gr.list # if(fimoDataFrameStyle){ # gr <- unlist(gr.list) # motif.names <- names(gr) # names(gr) <- NULL # tbl <- as.data.frame(gr) # tbl$motif_id <- motif.names # colnames(tbl)[1] <- "chrom" # tbl$chrom <- as.character(tbl$chrom) # colnames(tbl)[grep("score", colnames(tbl))] <- "mood.score" # new.order <- order(tbl$start, decreasing=FALSE) # tbl <- tbl[new.order,] # result <- tbl # } # return(result) # }) # #------------------------------------------------------------------------------- setMethod ('subset', signature = 'MotifList', function (x, subset, select, drop = FALSE, ...) { if (missing (subset)) i = TRUE else { i = eval(substitute (subset), elementMetadata (x), parent.frame ()) i = try(as.logical(i), silent=TRUE) if (inherits(i, 'try-error')) stop('"subset" must be coercible to logical') i = i & !is.na(i) } # else return (x [i]) }) #------------------------------------------------------------------------------- # transpose 4-row matrices to 4-column, so that there are as many rows as # there are nucleotides in the motif sequence. meme requires normalized matrices # exactly as we store them transformMatrixToMemeRepresentation = function (m) { return (t (m)) } # transformMatrixToMemeRepresentation #------------------------------------------------------------------------------- # http://stuff.mit.edu/afs/athena/software/meme_v3.5.4/etc/meme-explanation.html # The motif itself is a position-specific probability matrix giving, for each # position in the pattern, the observed frequency ('probability') of each # possible letter. The probability matrix is printed 'sideways'--columns # correspond to the letters in the alphabet (in the same order as shown in # the simplified motif) and rows corresponding to the positions of the motif, # position one first. The motif is preceded by a line starting with # 'letter-probability matrix:' and containing the length of the alphabet, # width of the motif, number of occurrences of the motif, and the E-value # of the motif. matrixToMemeText = function (matrices) { matrix.count = length (matrices) # incoming matrices have nucleotide rows, position columns. meme # format, however, requires position-rows, and nucleotide-columns # calculate the number of lines of text by counting columns total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol))) predicted.line.count = 12 + (3 * length (matrices)) + total.transposed.matrix.rows #s = vector ('character', predicted.line.count) s = character (predicted.line.count) s [1] = 'MEME version 4' s [2] = '' s [3] = 'ALPHABET= ACGT' s [4] = '' s [5] = 'strands: + -' s [6] = '' s [7] = 'Background letter frequencies' s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 ' s [9] = '' index = 10 for (name in names (matrices)) { # transpose the frequency matrix version of the incoming matrix, # hence 'tfMat' tfMat = transformMatrixToMemeRepresentation (matrices [[name]]) # meme output may be used by tomtom, which uses matrix names as # part of image filenames. removed all file-system-unfriendly # characters here fixed.name = gsub ('\\/', '_', name) s [index] = sprintf ('MOTIF %s', fixed.name) index = index + 1 new.line = sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=%f', nrow (tfMat), 45, 8.1e-020) s [index] = new.line index = index + 1 for (r in 1:nrow (tfMat)) { new.row = sprintf (' %12.10f %12.10f %12.10f %12.10f', tfMat [r,1], tfMat [r,2], tfMat [r,3], tfMat [r,4]) s [index] = new.row index = index + 1 } s [index] = '' index = index + 1 } # for name invisible (s) } # matrixToMemeText #------------------------------------------------------------------------------- # connection is a character string, create a file by that name, open the file. # dispatch to export which dispatches on con='connection' setMethod ('export', signature=c(object='MotifList', con='character', format='character'), function (object, con, format, ...) { ## do minimum work unique to this method, then dispatch to avoid ## code duplication con = file (con, 'w') on.exit(close(con)) export(object, con, format, ...) }) #------------------------------------------------------------------------------- # write to connection with specified format # format includes TRANSFAC, meme (also good for tomtom), and tsv setMethod ('export', signature=c(object='MotifList', con='connection', format='character'), function (object, con, format, ...) { fmt = match.arg (tolower (format), c ('meme', 'transfac','jaspar')) ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need ## for test ## let the user manage opened cons if (!isOpen(con)) { open(con) on.exit(close(con)) } if (fmt == 'meme') { text = matrixToMemeText (object) } else if (fmt == 'jaspar') { text = matrixToJasparText (object) } cat (text, sep='\n', file=con) }) #------------------------------------------------------------------------------- # write to connection, using default format, ??? for matrix list, tsv for # metadata setMethod ('export', signature=c(object='MotifList', con='missing', format='character'), function (object, con, format, ...) { fmt = match.arg (tolower (format), c ('meme','jaspar')) # , 'transfac' if (fmt == 'meme') { text = paste (matrixToMemeText (object), collapse='\n') cat (text) invisible (text) } else if (fmt == 'jaspar') { text = paste (matrixToJasparText (object), collapse='\n') cat (text) invisible (text) } }) #------------------------------------------------------------------------------- setMethod('show', 'MotifList', function(object) { msg = sprintf ('MotifDb object of length %d', length (object)) cat (msg, '\n', sep='') if (length (object) == 0) return () cat ('| Created from downloaded public sources, last update: 2022-Mar-04', '\n', sep='') tbl.dataSource = as.data.frame (table (mcols (object)$dataSource)) tbl.org = as.data.frame (table (mcols (object)$organism)) tbl.org = head (tbl.org [order (tbl.org$Freq, decreasing=TRUE),]) totalMatrixCount = length (object) totalOrganismCount = length (unique (mcols (object)$organism)) dataSourceCount = nrow (tbl.dataSource) source.singular.or.plural = 'sources' if (dataSourceCount == 1) source.singular.or.plural = 'source' msg = sprintf ('| %d position frequency matrices from %d %s:', totalMatrixCount, dataSourceCount, source.singular.or.plural) cat (msg, '\n', sep='') for (r in 1:nrow (tbl.dataSource)) { dataSource = tbl.dataSource$Var1 [r] matrixCount = tbl.dataSource$Freq [r] msg = sprintf ('| %18s: %4d', dataSource, matrixCount) cat (msg, '\n', sep='') } # for r msg = sprintf ('| %d organism/s', totalOrganismCount) cat (msg, '\n', sep='') for (r in 1:nrow (tbl.org)) { organism = tbl.org$Var1 [r] orgCount = tbl.org$Freq [r] msg = sprintf ('| %18s: %4d', organism, orgCount) cat (msg, '\n', sep='') } # for r otherOrgCount = totalMatrixCount - sum (tbl.org$Freq) if (otherOrgCount > 0) { msg = sprintf ('| %18s: %4d', 'other', otherOrgCount) cat (msg, '\n', sep='') } if (!is.null (names (object))) { all.names = names (object) count = length (all.names) if (count <= 10) cat (paste (all.names, '\n'), sep='') else { cat (paste (all.names [1:5], '\n'), sep='') cat ('...', '\n', sep='') end = length (all.names) start = end - 4 cat (paste (all.names [start:end], '\n'), sep='') } } }) #------------------------------------------------------------------------------- #setMethod ('query', 'MotifList', # # function (object, queryString, ignore.case=TRUE) { # indices = unique (as.integer (unlist (sapply (colnames (mcols (object)), # function (colname) # grep (queryString, mcols (object)[, colname], # ignore.case=ignore.case))))) # object [indices] # }) #------------------------------------------------------------------------------- setMethod ('query', 'MotifList', function (object, andStrings, orStrings=c(), notStrings=c(), ignore.case=TRUE) { find.indices <- function(queryString) {unique(as.integer(unlist(sapply(colnames(mcols(object)), function(colname) grep(queryString, mcols(object)[,colname],ignore.case=ignore.case))))) } # setup defaults and.indices <- list(seq_len(length(object))) or.indices <- list(seq_len(length(object))) not.indices <- list(c()) if(length(andStrings) > 0) and.indices <- lapply(andStrings, find.indices) if(length(orStrings) > 0) or.indices <- lapply(orStrings, find.indices) if(length(notStrings) > 0) not.indices <- lapply(notStrings, find.indices) # start with the indices of all elements final.indices <- seq_len(length(object)) # get the cumulative intersection of all the "and" terms # this steadily dimishes the set of indices for(indices in and.indices){ final.indices <- intersect(final.indices, indices) #message(sprintf(" final.indices length is now %d", length(final.indices))) } # lump all of the "or" terms together: they all get included final.indices <- intersect(unlist(or.indices), final.indices) # finally reduce the set to exclude all indices of all "not" terms for(indices in not.indices) final.indices <- setdiff(final.indices, indices) object [final.indices] }) #------------------------------------------------------------------------------- # Addition on 2017/06/15 from Matt Richards # This will not exactly match JASPAR because units are PFM and JASPAR uses PCM # General JASPAR Format: # > "Motif Name"\t"Transcription Factor" # A [ PCMS ] # C [ PCMS ] # G [ PCMS ] # T [ PCMS ] # # ... # Note: the PCMs are space-delimited matrixToJasparText <- function (matrices) { matrix.count <- length (matrices) # Incoming matrices have nucleotide rows, position columns. # This is the correct orientation for JASPAR; however, we need to also # add brackets and letters to them # Calculate the number of lines of text by counting matrices and assuming # 6 lines per matrix predicted.line.count <- 6*matrix.count #s = vector ('character', predicted.line.count) s <- character (predicted.line.count) index <- 1 for (name in names (matrices)) { # Print the name with an arrow, follwed by the motif s[index] <- sprintf('>%s',name) index <- index + 1 # For each line of the matrix, print the correct letter and the # matrix row surrounded by brackets motif.matrix <- matrices[name][[1]] for (r in 1:nrow(motif.matrix)) { s[index] <- sprintf("%s [ %s ]", rownames(motif.matrix)[r], paste(motif.matrix[r,],collapse=" ")) index <- index + 1 } s[index] <- "" index <- index + 1 } # for name # Remove the last blank line s <- s[-length(s)] invisible (s) } # matrixToJasparText #------------------------------------------------------------------------------- # returns a data.frame with motif, geneSymbol, source, pubmedID columns # setMethod ('oldMotifToGene', 'MotifList', # # function (object, motifs, source) { # source <- tolower(source) # stopifnot(source %in% c("motifdb", "tfclass")) # tbl <- data.frame() # if(source %in% c("motifdb")){ # providerId <- NULL # avoid R CMD check note # tbl <- as.data.frame(subset(mcols(object), providerId %in% motifs)) # if(nrow(tbl) == 0) # return(data.frame()) # tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")]) # colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID") # tbl <- tbl[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")] # if(nrow(tbl) > 0) # tbl$source <- "MotifDb" # } # if(source %in% c("tfclass")){ # motif <- NULL # tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs) # if(nrow(tbl) == 0) # return(data.frame()) # tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")]) # tbl <- tbl[order(tbl$motif),] # rownames(tbl) <- NULL # colnames(tbl) <- c("motif", "geneSymbol", "pubmedID") # if(nrow(tbl) > 0) # tbl$source <- "TFClass" # } # tbl # }) # oldMotifToGene # #------------------------------------------------------------------------------- # returns a data.frame with motif, geneSymbol, source, pubmedID columns setMethod ('motifToGene', 'MotifList', function (object, motifs, source) { # for MotifDb, motif names come in a variety of forms, and our first step # is to convert them all, if needed, into that which is found in # the "providerId" column of the MotifDb metadata table. # # first check to see if the supplied motif name is actually a MotifDb # matrix list name, e.g., Hsapiens-HOCOMOCOv10-IKZF1_HUMAN.H10MO.C # when those cases are discovered, they are translated to the matrices # providerId - which is our standard currency for lookup, using # the mcols (the metadata, the annotation) which accompanies # each pfm matrix tbl.mdb <- data.frame() tbl.tfc <- data.frame() name.map <- as.list(motifs) names(name.map) <- motifs for(i in seq_len(length(motifs))){ x <-match(motifs[i], names(MotifDb)); if(!is.na(x)){ newValue <- mcols(MotifDb[motifs[i]])$providerId names(name.map)[i] <- newValue motifs[i] <-newValue } } # for i source <- tolower(source) stopifnot(all(source %in% c("motifdb", "tfclass"))) if("motifdb" %in% source){ providerId <- NULL # avoid R CMD check note tbl.mdb <- as.data.frame(subset(mcols(object), providerId %in% motifs)) if(nrow(tbl.mdb) > 0){ tbl.mdb <- unique(tbl.mdb [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")]) colnames(tbl.mdb) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID") tbl.mdb <- tbl.mdb[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")] tbl.mdb$source <- "MotifDb" tbl.mdb <- tbl.mdb[, c("motif", "geneSymbol", "pubmedID", "organism", "source")] rownames(tbl.mdb) <- NULL } # nrow of tbl.mdb > 0 } # motifDb if("tfclass" %in% source){ motif <- NULL tbl.tfc <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs) if(nrow(tbl.tfc) > 0){ tbl.tfc <- unique(tbl.tfc[, c("motif", "tf.gene", "pubmedID")]) tbl.tfc <- tbl.tfc[order(tbl.tfc$motif),] rownames(tbl.tfc) <- NULL colnames(tbl.tfc) <- c("motif", "geneSymbol", "pubmedID") tbl.tfc$source <- "TFClass" tbl.tfc$organism <- "Hsapiens" } # nrow(tbl.tfc) > 0 } # tfclass if(nrow(tbl.mdb) == 0 && nrow(tbl.tfc) == 0) return(data.frame()) tbl.out <- rbind(tbl.mdb, tbl.tfc) dups <- which(duplicated(tbl.out[, c("motif", "geneSymbol", "organism", "source")])) if(length(dups) > 0) tbl.out <- tbl.out[-dups,] if(length(name.map) > 0) tbl.out$motif <- as.character(name.map[tbl.out$motif]) tbl.out }) #------------------------------------------------------------------------------- # returns a data.frame with motif, geneSymbol, source, pubmedID columns setMethod ('geneToMotif', 'MotifList', function (object, geneSymbols, source, ignore.case=FALSE) { source <- tolower(source) stopifnot(all(source %in% c("motifdb", "tfclass"))) extract.mdb <- function(gene){ geneSymbol <- NULL # workaround the R CMD check "no visible binding for global variable" if(ignore.case) tbl <- as.data.frame(subset(mcols(object), tolower(geneSymbol) == tolower(gene))) else tbl <- as.data.frame(subset(mcols(object), geneSymbol == gene)) tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")]) colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID") tbl } if("motifdb" %in% source){ tbls <- lapply(geneSymbols, extract.mdb) result <- do.call(rbind, tbls) if(nrow(result) > 0) result$source <- "MotifDb" } if("tfclass" %in% source){ if(ignore.case) tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, tolower(tf.gene) %in% tolower(geneSymbols)) else tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, tf.gene %in% geneSymbols) tf.gene <- NULL; motif <- NULL # workaround R CMD CHECK "no visible binding ..." bogus error tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")]) tbl <- tbl[order(tbl$tf.gene),] rownames(tbl) <- NULL colnames(tbl) <- c("motif", "geneSymbol", "pubmedID") result <- tbl[, c("geneSymbol", "motif", "pubmedID")] if(nrow(result) > 0) result$source <- "TFClass" } result }) #------------------------------------------------------------------------------- setMethod('associateTranscriptionFactors', 'MotifList', function(object, tbl.withMotifs, source, expand.rows, motifColumnName="motifName"){ stopifnot(motifColumnName %in% colnames(tbl.withMotifs)) tbl.tf <- motifToGene(object, unique(tbl.withMotifs[, motifColumnName]), source) merge(tbl.withMotifs, tbl.tf, by.x=motifColumnName, by.y="motif", all.x=TRUE) }) #------------------------------------------------------------------------------- # setMethod('associateTranscriptionFactors', 'MotifList', # # function(object, tbl.withMotifs, source, expand.rows){ # source <- tolower(source) # stopifnot(source %in% c("motifdb", "tfclass")) # tbl.out <- data.frame() # if(source %in% c("motifdb")){ # # lookup up in the object metadata, expect one TF geneSymbol per matrix name # pfm.ids <- tbl.withMotifs[, "motifName"] # matched.rows <- match(pfm.ids, names(as.list(object))) # #if(length(matched.rows) == nrow(tbl.withMotifs)) { # tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")] # tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA # tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA # tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new)) # } # direct # if(source %in% c("tfclass")){ # if(! "shortMotif" %in% colnames(tbl.withMotifs)){ # stop("MotifDb::assoicateTranscriptionFactors needs a 'shortMotif' column with the TFClass source") # } # tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE) # motif.ids <- tbl.withMotifs[, "shortMotif"] # geneSymbols <- lapply(motif.ids, function(id) # paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif, fixed=TRUE)], collapse=";")) # geneSymbols <- unlist(geneSymbols) # pubmedIds <- lapply(motif.ids, function(id) # unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif, fixed=TRUE)])) # pubmedIds <- as.character(pubmedIds) # pubmedIds <- gsub("integer(0)", "", pubmedIds, fixed=TRUE) # tbl.new <- data.frame(geneSymbol=geneSymbols, pubmedID=pubmedIds, stringsAsFactors=FALSE) # tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA # tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA # tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new)) # # if(expand.rows){ # rows.with.na <- which(is.na(tbl.out$geneSymbol)) # rows.with.geneSymbol <- setdiff(1:nrow(tbl.out), rows.with.na) # tbl.asIs <- tbl.out[rows.with.na,] # tbl.toExpand <- tbl.out[rows.with.geneSymbol,] # geneSymbols.split <- strsplit(tbl.toExpand$geneSymbol, ";") # counts <- unlist(lapply(geneSymbols.split, length)) # geneSymbols.split.vec <- unlist(geneSymbols.split) # tbl.expanded <- splitstackshape::expandRows(tbl.toExpand, counts, count.is.col=FALSE, drop=FALSE) # stopifnot(length(geneSymbols.split.vec) == nrow(tbl.expanded)) # tbl.expanded$geneSymbol <- geneSymbols.split.vec # tbl.out <- rbind(tbl.expanded, tbl.asIs) # } # } # indirect # tbl.out # }) # #-------------------------------------------------------------------------------