... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: MotifDb |
2 | 2 |
Type: Package |
3 | 3 |
Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs |
4 |
-Version: 1.19.6 |
|
5 |
-Date: 2017-09-07 |
|
4 |
+Version: 1.19.7 |
|
5 |
+Date: 2017-09-12 |
|
6 | 6 |
Author: Paul Shannon, Matt Richards |
7 | 7 |
Maintainer: Paul Shannon <pshannon@systemsbiology.org> |
8 | 8 |
Depends: R (>= 2.15.0), methods, BiocGenerics, S4Vectors, IRanges, Biostrings |
... | ... |
@@ -3,8 +3,7 @@ setGeneric('query', signature='object', function(object, queryString, ignore.cas |
3 | 3 |
setGeneric('motifToGene', signature='object', function(object, motifs, source) standardGeneric('motifToGene')) |
4 | 4 |
setGeneric('geneToMotif', signature='object', function(object, geneSymbols, source) standardGeneric('geneToMotif')) |
5 | 5 |
setGeneric('associateTranscriptionFactors', signature='object', |
6 |
- function(object, tbl.withMotifs, motif.column.name, source, expand.rows) |
|
7 |
- standardGeneric('associateTranscriptionFactors')) |
|
6 |
+ function(object, tbl.withMotifs, source, expand.rows) standardGeneric('associateTranscriptionFactors')) |
|
8 | 7 |
#------------------------------------------------------------------------------------------------------------------------ |
9 | 8 |
setClass ('MotifList', |
10 | 9 |
contains='SimpleList', |
... | ... |
@@ -400,23 +399,25 @@ setMethod ('geneToMotif', 'MotifList', |
400 | 399 |
#------------------------------------------------------------------------------- |
401 | 400 |
setMethod('associateTranscriptionFactors', 'MotifList', |
402 | 401 |
|
403 |
- function(object, tbl.withMotifs, motif.column.name, source, expand.rows){ |
|
402 |
+ function(object, tbl.withMotifs, source, expand.rows){ |
|
404 | 403 |
stopifnot(source %in% c("MotifDb", "TFClass")) |
405 | 404 |
tbl.out <- data.frame() |
406 | 405 |
if(source %in% c("MotifDb")){ |
407 |
- # "direct" means: lookup up in the object metadata, expect one TF geneSymbol per matrix name |
|
408 |
- pfm.ids <- tbl.withMotifs[, motif.column.name] |
|
406 |
+ # lookup up in the object metadata, expect one TF geneSymbol per matrix name |
|
407 |
+ pfm.ids <- tbl.withMotifs[, "motifName"] |
|
409 | 408 |
matched.rows <- match(pfm.ids, names(as.list(object))) |
410 | 409 |
#if(length(matched.rows) == nrow(tbl.withMotifs)) { |
411 | 410 |
tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")] |
412 | 411 |
tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA |
413 | 412 |
tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA |
414 | 413 |
tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new)) |
415 |
- #} |
|
416 | 414 |
} # direct |
417 | 415 |
if(source %in% c("TFClass")){ |
416 |
+ if(! "shortMotif" %in% colnames(tbl.withMotifs)){ |
|
417 |
+ stop("MotifDb::assoicateTranscriptionFactors needs a 'shortMotif' column with the TFClass source") |
|
418 |
+ } |
|
418 | 419 |
tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE) |
419 |
- motif.ids <- tbl.withMotifs[, motif.column.name] |
|
420 |
+ motif.ids <- tbl.withMotifs[, "shortMotif"] |
|
420 | 421 |
geneSymbols <- lapply(motif.ids, function(id) paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif)], collapse=";")) |
421 | 422 |
geneSymbols <- unlist(geneSymbols) |
422 | 423 |
pubmedIds <- lapply(motif.ids, function(id) unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif)])) |
... | ... |
@@ -809,16 +809,17 @@ test.associateTranscriptionFactors <- function() |
809 | 809 |
# "Hsapiens-jaspar2016-RUNX1-MA0002.1" "Hsapiens-jaspar2016-TFAP2A-MA0003.1" |
810 | 810 |
|
811 | 811 |
motif.names <- names(pfms[1:5]) |
812 |
- tbl <- data.frame(motifLongName=motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
813 |
- tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motifLongName", source="MotifDb", expand.rows=FALSE) |
|
812 |
+ tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
813 |
+ tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE) |
|
814 | 814 |
checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2)) |
815 | 815 |
checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno))) |
816 |
+ checkEquals(tbl.anno$geneSymbol, c("RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "AR")) |
|
816 | 817 |
|
817 | 818 |
# now add in a bogus motif name, one for which there cannot possibly be a TF |
818 | 819 |
|
819 | 820 |
motif.names[3] <- "bogus" |
820 |
- tbl <- data.frame(motifLongName=motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
821 |
- tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motifLongName", source="MotifDb", expand.rows=FALSE) |
|
821 |
+ tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
822 |
+ tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE) |
|
822 | 823 |
checkTrue(is.na(tbl.anno$geneSymbol[3])) |
823 | 824 |
checkTrue(is.na(tbl.anno$pubmedID[3])) |
824 | 825 |
|
... | ... |
@@ -826,65 +827,42 @@ test.associateTranscriptionFactors <- function() |
826 | 827 |
# the tfFamily |
827 | 828 |
# "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2" |
828 | 829 |
|
830 |
+ motif.names <- names(pfms[1:5]) |
|
829 | 831 |
short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)]))) |
830 |
- tbl <- data.frame(motif=short.motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
832 |
+ tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
831 | 833 |
|
832 |
- tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motif", source="TFClass", expand.rows=FALSE) |
|
834 |
+ tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE) |
|
833 | 835 |
checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2)) |
834 | 836 |
|
837 |
+ # TFClass only annotates MA0003.3, none of the others |
|
838 |
+ checkTrue(all(is.na(tbl.anno$geneSymbol[-4]))) |
|
839 |
+ checkTrue(all(is.na(tbl.anno$pubmedID[-4]))) |
|
840 |
+ checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E") |
|
841 |
+ checkEquals(tbl.anno$pubmedID[4], "23180794") |
|
842 |
+ |
|
843 |
+ # now ask for expandsion of the semicolon separated list |
|
844 |
+ tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE) |
|
845 |
+ checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2)) |
|
846 |
+ checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol)) |
|
847 |
+ |
|
835 | 848 |
# now add in a bogus motif name, one for which there cannot possibly be a TF |
836 | 849 |
|
837 | 850 |
motif.names <- names(pfms[1:5]) |
838 | 851 |
short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)]))) |
839 |
- short.motif.names[3] <- "bogus" |
|
840 |
- tbl <- data.frame(motif=short.motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
852 |
+ short.motif.names[4] <- "bogus" |
|
853 |
+ tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
841 | 854 |
|
842 |
- tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motif", source="TFClass", expand.rows=FALSE) |
|
855 |
+ tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE) |
|
843 | 856 |
checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2)) |
857 |
+ # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA |
|
858 |
+ checkTrue(all(is.na(tbl.anno$geneSymbol))) |
|
859 |
+ checkTrue(all(is.na(tbl.anno$pubmedID))) |
|
844 | 860 |
|
845 |
-} # test.associateTranscriptionFactors |
|
846 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
847 |
-# MA0003.3 is mapped by TFClass to TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E (pumbedID 23180794) |
|
848 |
-# use it as a first test of the "exapand.rows" argument to associateTranscriptionFactors method |
|
849 |
-# MA0017.1 is unmapped by TFClass |
|
850 |
-# MA0017.2 is mapped to NR2F1 |
|
851 |
-test.associateTranscriptionFactors_expandRows <- function() |
|
852 |
-{ |
|
853 |
- printf("--- test.assoicateTranscriptionFactors") |
|
861 |
+ # now make sure that the absence of the TFClass-specific "shortMotif" field is detected |
|
862 |
+ motif.names <- names(pfms[1:5]) |
|
863 |
+ tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE) |
|
864 |
+ checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE) |
|
854 | 865 |
|
855 |
- mdb <- MotifDb |
|
856 |
- motifs <- c("MA0003.3", "MA0017.1", "MA0017.2") |
|
857 |
- tbl <- data.frame(motif=motifs, stringsAsFactors=FALSE) |
|
858 |
- tbl.anno <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif", source="TFClass", expand.rows=FALSE) |
|
859 |
- checkEquals(dim(tbl.anno), c(3,3)) |
|
860 |
- checkEquals(tbl.anno$geneSymbol[c(1,3)], c("TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E", "NR2F1")) |
|
861 |
- checkTrue(is.na(tbl.anno$geneSymbol[2])) |
|
862 |
- |
|
863 |
- tbl.annoX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif", source="TFClass", expand.rows=TRUE) |
|
864 |
- checkEquals(dim(tbl.annoX), c(7,3)) |
|
865 |
- checkEquals(tbl.annoX$motif, c(rep("MA0003.3", 5), "MA0017.2", "MA0017.1")) |
|
866 |
- checkEquals(tbl.annoX$geneSymbol[1:6], c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E", "NR2F1")) |
|
867 |
- checkTrue(is.na(tbl.annoX$geneSymbol[7])) |
|
868 |
- |
|
869 |
- # now a large scale test |
|
870 |
- set.seed(37) |
|
871 |
- mdb.human <- query(mdb, "sapiens") # > 4000 human matrices |
|
872 |
- indices <- sample(1:length(mdb.human), size=250) |
|
873 |
- motif.long.names <- names(mdb.human)[indices] |
|
874 |
- motif.short.names <- mcols(mdb.human)[indices, "providerId"] |
|
875 |
- tbl <- data.frame(motif.short=motif.short.names, motif.long=motif.long.names, stringsAsFactors=FALSE) |
|
876 |
- |
|
877 |
- tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.long", source="MotifDb", expand.rows=FALSE) |
|
878 |
- tbl.anno.mdbX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.long", source="MotifDb", expand.rows=TRUE) |
|
879 |
- |
|
880 |
- tbl.anno.tfc <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.short", source="TFClass", expand.rows=FALSE) |
|
881 |
- checkEquals(nrow(tbl.anno.tfc), length(motif.short.names)) |
|
882 |
- checkTrue(length(grep(";", tbl.anno.tfc$geneSymbol)) > 20) |
|
883 |
- |
|
884 |
- tbl.anno.tfcX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.short", source="TFClass", expand.rows=TRUE) |
|
885 |
- checkTrue(nrow(tbl.anno.tfcX) > nrow(tbl.anno.tfc)) # 250 vs 779 |
|
886 |
- checkEquals(length(grep(";", tbl.anno.tfcX$geneSymbol)), 0) |
|
887 |
- |
|
888 |
- |
|
889 |
-} # test.associateTranscriptionFactors_expandRows |
|
866 |
+ |
|
867 |
+} # test.associateTranscriptionFactors |
|
890 | 868 |
#------------------------------------------------------------------------------------------------------------------------ |