Browse code

hocomoco core|secondary names now used for matrices & rownames mcols

paul-shannon authored on 18/03/2020 20:22:29
Showing5 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.29.4
4
+Version: 1.29.6
5 5
 Date: 2020-03-18
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
8 8
Binary files a/inst/extdata/hocomoco11.RData and b/inst/extdata/hocomoco11.RData differ
... ...
@@ -1192,6 +1192,11 @@ test.hocomoco11.with.reliabilityScores <- function()
1192 1192
 {
1193 1193
    printf("--- test.hocomoco11.with.reliabilityScores")
1194 1194
 
1195
+     #-------------------------------------------------------------------------
1196
+     # these queries rely primarily upon the dataSoure column of the metadata
1197
+     # subsequent checks below look at metadata rownames and matrix names
1198
+     #-------------------------------------------------------------------------
1199
+
1195 1200
    checkEquals(length(query(MotifDb, "hocomoco")), 1834)
1196 1201
    checkEquals(length(query(MotifDb, "hocomocov10")), 1066)
1197 1202
    checkEquals(length(query(MotifDb, "hocomocov11")), 768)
... ...
@@ -1210,6 +1215,15 @@ test.hocomoco11.with.reliabilityScores <- function()
1210 1215
    checkEquals(length(query(MotifDb, "hocomocov11-core-D")), 0)
1211 1216
    checkEquals(length(query(MotifDb, "hocomocov11-secondary-D")), 290)
1212 1217
 
1218
+     #-------------------------------------------------------------------------
1219
+     # check matrix names
1220
+     #-------------------------------------------------------------------------
1221
+   checkEquals(length(grep("HOCOMOCOv11-core-A", names(MotifDb))), 181)
1222
+   checkEquals(length(grep("HOCOMOCOv11-secondary-A", names(MotifDb))), 46)
1223
+
1224
+   checkEquals(length(grep("HOCOMOCOv11-core-A", rownames(mcols(MotifDb)))), 181)
1225
+   checkEquals(length(grep("HOCOMOCOv11-secondary-A", rownames(mcols(MotifDb)))), 46)
1226
+
1213 1227
 } # test.hocomoco11.with.reliabilityScores
1214 1228
 #------------------------------------------------------------------------------------------------------------------------
1215 1229
 if(!interactive())
1216 1230
new file mode 100644
... ...
@@ -0,0 +1,181 @@
1
+# MotifDb/inst/scripes/import/HOCOMOCO/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+options (stringsAsFactors=FALSE)
4
+printf <- function(...) print(noquote(sprintf(...)))
5
+library(RCurl)
6
+#------------------------------------------------------------------------------------------------------------------------
7
+run = function (dataDir)
8
+{
9
+  dataDir <- file.path(dataDir)
10
+  rawMatrixList <- readRawMatrices (dataDir)
11
+  matrices <- extractMatrices (rawMatrixList)
12
+  
13
+  tbl.md <- createMetadataTable (dataDir, matrices,
14
+                                 raw.metadata.filename="md-raw.tsv")
15
+  matrices <- normalizeMatrices (matrices)
16
+  matrices <- renameMatrices (matrices, tbl.md)
17
+  
18
+  serializedFile <- file.path(dataDir, "HOCOMOCO.RData")
19
+  printf("writing %s to %s", "HOCOMOCO.RData", dataDir)
20
+  
21
+  save (matrices, tbl.md, file=serializedFile)
22
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
23
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
24
+  
25
+} # run
26
+#------------------------------------------------------------------------------------------------------------------------
27
+readRawMatrices = function (dataDir)
28
+{
29
+  # our convention is that there is a shared "dataDir" visible to
30
+  # the importer, and that within that directory there is one
31
+  # subdirectory for each data source.
32
+  # for this example importer, that directory will be <dataDir>/test
33
+  # within which we will look for one small file "sample.pcm"
34
+  
35
+  
36
+  filename <- file.path(dataDir, "HOCOMOCO", "HOCOMOCOv9_AD_PLAINTEXT_H_WPCM.txt") #old filename: "hoco.pcm"
37
+  printf("checking for readable matrix file:")
38
+  printf("     %s", filename)
39
+  stopifnot(file.exists(filename))
40
+  
41
+  all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
42
+  title.lines = grep ('^>', all.lines)
43
+  title.line.count <<- length (title.lines)
44
+  max = title.line.count - 1
45
+  
46
+  pwms = list ()
47
+  #loops through all motifs in the matrix file, one motif at a time
48
+  for (i in 1:max) {
49
+    start.line = title.lines [i]
50
+    end.line = title.lines [i+1] - 1
51
+    new.pwm = parsePwm (all.lines [start.line:end.line])
52
+    pwms = c (pwms, list (new.pwm))
53
+  } # for i
54
+  
55
+  
56
+  invisible (pwms)
57
+  
58
+} # readRawMatrices
59
+#------------------------------------------------------------------------------------------------------------------------
60
+extractMatrices = function (pwm.list)
61
+{
62
+  matrices = sapply (pwm.list, function (element) element$matrix)
63
+  matrix.names <- sapply (pwm.list, function (element) element$title)
64
+  matrix.names <- sub("^> ", "", matrix.names)
65
+  names (matrices) <- matrix.names
66
+  
67
+  matrices
68
+  
69
+} # extractMatrices
70
+#------------------------------------------------------------------------------------------------------------------------
71
+createMetadataTable = function (dataDir, matrices, raw.metadata.filename)
72
+{
73
+  filename <- file.path(dataDir, "HOCOMOCO", "md-raw.tsv")
74
+  printf("checking for readable metadata file:")
75
+  printf("   %s", filename)
76
+  stopifnot(file.exists(filename))
77
+  
78
+  tbl.raw <- read.table(filename, sep="\t", header=TRUE, as.is=TRUE)
79
+  tbl.md = data.frame ()
80
+  matrix.ids = names(matrices)
81
+  geturlname <- function(name){
82
+    h = getCurlHandle()
83
+    z <- getURL(paste0("www.uniprot.org/uniprot/?query=",name),
84
+                followlocation=TRUE, curl=h)
85
+    getCurlInfo(h)$effective.url # catch the url redirect
86
+  }
87
+  for (matrix.id in matrix.ids) {
88
+    matrix <- matrices[[matrix.id]]
89
+    short.matrix.name <- sub("\\..*$", "", matrix.id)
90
+    #stopifnot(length(grep(short.matrix.name, tbl.raw$symbol)) == 1)
91
+    #md <- as.list(subset(tbl.raw, symbol==short.matrix.name))
92
+    dataSource <- "HOCOMOCOv9_AD_PLAINTEXT_H_PWM_hg19"
93
+    organism <- "Hsapiens"
94
+    
95
+    split.matrix.name <- unlist(strsplit(short.matrix.name, "_"))[1]
96
+    shorter.matrix.name <- split.matrix.name
97
+    #if (grepl(split.matrix.name, "+")){
98
+    #  shorter.matrix.name <- unlist(strsplit(split.matrix.name, "+"))[1]
99
+    #}
100
+    
101
+    #uri <- paste0("www.uniprot.org/uniprot/?query=",idStr)
102
+    if (nchar(short.matrix.name) <=9){#!("+" %in% shorter.matrix.name)
103
+      idStr <- paste0(shorter.matrix.name, "_HUMAN")
104
+      protIDURL <- geturlname(idStr) #gets the URL for the proteinID from the geneSymbol
105
+      protID <- unlist(strsplit(protIDURL, "http://www.uniprot.org/uniprot/"))[-1]
106
+      }else{
107
+        protID <- rep(NA,1)
108
+      }
109
+      
110
+    new.row = list (providerName=matrix.id,
111
+                    providerId=matrix.id, #"HOCOMOCO v8 and ChiPMunk 3.1"
112
+                    dataSource="HOCOMOCOv9_AD_PLAINTEXT_H_PWM_hg19",
113
+                    geneSymbol=shorter.matrix.name, #md$symbol
114
+                    geneId="9606",
115
+                    geneIdType="ENTREZ",
116
+                    proteinId=protID,
117
+                    proteinIdType="UNIPROT",
118
+                    organism="Hsapiens",
119
+                    sequenceCount=max(colSums(matrix)),
120
+                    bindingSequence=NA_character_,
121
+                    bindingDomain=NA,
122
+                    tfFamily=NA, #family
123
+                    experimentType="low- and high-throughput methods",
124
+                    pubmedID="23175603")
125
+    printf("matrix.id: %s", matrix.id);
126
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
127
+    full.name = sprintf ('%s-%s-%s', organism, dataSource, matrix.id)
128
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
129
+  } # for matrix.id
130
+  
131
+  invisible (tbl.md)
132
+  
133
+} # createMetadataTable
134
+#------------------------------------------------------------------------------------------------------------------------
135
+renameMatrices = function (matrices, tbl.md)
136
+{
137
+  stopifnot (length (matrices) == nrow (tbl.md))
138
+  names (matrices) = rownames (tbl.md)
139
+  invisible (matrices)
140
+  
141
+} # renameMatrices
142
+#------------------------------------------------------------------------------------------------------------------------
143
+normalizeMatrices = function (matrices)
144
+{
145
+  mtx.normalized = sapply (matrices,
146
+                           function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
147
+  
148
+  invisible (mtx.normalized)
149
+  
150
+} # normalizeMatrices
151
+#------------------------------------------------------------------------------------------------------------------------
152
+parsePwm = function (text)
153
+{
154
+  lines = strsplit (text, '\t')
155
+  stopifnot(length(lines)==5) # title line, one line for each base
156
+  title = lines [[1]][1]
157
+  line.count = length(lines)
158
+  
159
+  # expect 4 rows, and a number of columns we can discern from
160
+  # the incoming text.
161
+  secondLineParsed <- strsplit(lines[[2]], " ")[[1]]
162
+  class(secondLineParsed) <- "numeric"
163
+  cols <- length(secondLineParsed)
164
+  result <- matrix (nrow=4, ncol=cols,
165
+                    dimnames=list(c('A','C','G','T'),
166
+                                  as.character(1:cols)))
167
+  # loop over the four lines (for each base respectively)
168
+  row = 1
169
+  
170
+  for(i in 2:line.count){
171
+    linesParsed <- strsplit(lines[[i]], " ")[[1]]
172
+    class(linesParsed) <- "numeric"
173
+    result [row,] = as.numeric (linesParsed)
174
+    row = row + 1
175
+  } # for i
176
+  
177
+  #return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
178
+  return (list (title=title, matrix=result))
179
+  
180
+} # parsePwm
181
+#----------------------------------------------------------------------------------------------------
0 182
new file mode 100644
... ...
@@ -0,0 +1,399 @@
1
+# hocomoco.file <- "HOCOMOCOv11_core_HUMAN_mono_jaspar_format.txt"
2
+# metadata.file <- "HOCOMOCOv11_core_annotation_HUMAN_mono.tsv"
3
+# outputFile <- "hocomocov11-core.RData"
4
+
5
+hocomoco.file <- "HOCOMOCOv11_full_HUMAN_mono_jaspar_format.txt"
6
+metadata.file <- "HOCOMOCOv11_full_annotation_HUMAN_mono.tsv"
7
+outputFile <- "hocomocov11-secondary.RData"
8
+# MotifDb/inst/scripes/import/HOCOMOCO/import.R
9
+#------------------------------------------------------------------------------------------------------------------------
10
+library(RUnit)
11
+library(RCurl)
12
+#------------------------------------------------------------------------------------------------------------------------
13
+runTests <- function()
14
+{
15
+  test_readRawMatrices()
16
+  test_extractMatrices()
17
+  test_createMetaDataTable()
18
+  test_renameMatrices()
19
+  test_normalizeMatrices()
20
+
21
+} # runTests
22
+#------------------------------------------------------------------------------------------------------------------------
23
+readRawMatrices <- function (dataDir, count=-1)
24
+{
25
+  # our convention is that there is a shared "dataDir" visible to
26
+  # the importer, and that within that directory there is one
27
+  # subdirectory for each data source.
28
+  # for this example importer, that directory will be <dataDir>/test
29
+  # within which we will look for one small file "sample.pcm"
30
+
31
+  filename <- file.path(dataDir, hocomoco.file)
32
+
33
+  stopifnot(file.exists(filename))
34
+
35
+  all.lines <- scan(filename, what=character(0), sep='\n', quiet=TRUE)
36
+  title.lines <- grep('^>', all.lines)
37
+  title.line.count <- length(title.lines)
38
+  max <- title.line.count - 1
39
+  if(count > 0)
40
+     max <- count
41
+
42
+  pwms <- list ()
43
+
44
+  for (i in 1:max){ # loop through all motifs in the matrix file, one motif at a time
45
+    start.line <- title.lines[i]
46
+    end.line <- title.lines[i+1] - 1
47
+    new.pwm <- parsePwm(all.lines [start.line:end.line])
48
+    pwms <- c(pwms, list(new.pwm))
49
+    } # for i
50
+
51
+  invisible (pwms)
52
+
53
+} # readRawMatrices
54
+#------------------------------------------------------------------------------------------------------------------------
55
+test_readRawMatrices <- function()
56
+{
57
+   printf("--- test_readRawMatrices")
58
+   pwms <- readRawMatrices("./", count=3)
59
+   checkEquals(length(pwms), 3)
60
+   pwm.1 <- pwms[[1]]
61
+
62
+   checkEquals(pwm.1$title, "AHR_HUMAN.H11MO.0.B")
63
+   mtx <- pwm.1$matrix
64
+   checkEquals(dim(mtx), c(4, 9))
65
+   checkEquals(rownames(mtx), c("A", "C", "G", "T"))
66
+   checkEquals(sum(mtx), 1386)
67
+
68
+} # test_readRawMatrices
69
+#------------------------------------------------------------------------------------------------------------------------
70
+# simple operation: for each element in the pwm.list, create an intem in a new list where
71
+# rather than a sublist of title and matrix, each element is simply a matrix, with the element named
72
+extractMatrices <- function(pwm.list)
73
+{
74
+  matrices <- sapply(pwm.list, function (element) element$matrix)
75
+  matrix.names <- sapply(pwm.list, function (element) element$title)
76
+  matrix.names <- sub("^> ", "", matrix.names)
77
+  names(matrices) <- matrix.names
78
+  matrices
79
+
80
+} # extractMatrices
81
+#------------------------------------------------------------------------------------------------------------------------
82
+test_extractMatrices <- function()
83
+{
84
+   printf("--- test_extractMatrices")
85
+   pwms <- readRawMatrices("./", count=3)
86
+   matrices <- extractMatrices(pwms)
87
+   checkEquals(length(matrices), 3)
88
+   checkEquals(names(matrices), c("AHR_HUMAN.H11MO.0.B", "AIRE_HUMAN.H11MO.0.C", "ALX1_HUMAN.H11MO.0.B"))
89
+
90
+} # test_extractMatrices
91
+#------------------------------------------------------------------------------------------------------------------------
92
+createMetaDataTable <- function(dataDir, matrices, raw.metadata.filename)
93
+{
94
+  filename <- file.path(dataDir, raw.metadata.filename)
95
+    #printf("checking for readable metadata file:")
96
+    #printf("   %s", filename)
97
+  stopifnot(file.exists(filename))
98
+
99
+  tbl.raw <- read.table(filename, sep="\t", header=TRUE, as.is=TRUE)
100
+  tbl.md <- data.frame ()
101
+  matrix.ids <- names(matrices)
102
+
103
+  for (matrix.id in matrix.ids) {
104
+    tokens <- strsplit(matrix.id, ".", fixed=TRUE)[[1]]
105
+    reliability.score <- tokens[length(tokens)]
106
+    matrix <- matrices[[matrix.id]]
107
+    geneSymbol <- sub("_HUMAN.*$", "", matrix.id)
108
+    tbl.sub <- subset(tbl.raw, Model==matrix.id)
109
+    entrez.gene <- tbl.sub$EntrezGene[1]
110
+    uniprot.id <- tbl.sub$UniProt.ID[1]
111
+    tf.family <- paste(tbl.sub$TF.family[1], tbl.sub$TF.subfamily[1], sep=": ")
112
+    experiment.type <- tbl.sub$Data.source
113
+    dataSource <- sprintf("HOCOMOCOv11%s", reliability.score)
114
+    organism <- "Hsapiens"
115
+
116
+    new.row <- list (providerName=matrix.id,
117
+                     providerId=matrix.id, #"HOCOMOCO v8 and ChiPMunk 3.1"
118
+                     dataSource=dataSource,
119
+                     geneSymbol=geneSymbol,
120
+                     geneId=entrez.gene,
121
+                     geneIdType="ENTREZ",
122
+                     proteinId=uniprot.id,
123
+                     proteinIdType="UNIPROT",
124
+                     organism="Hsapiens",
125
+                     sequenceCount=max(colSums(matrix)),
126
+                     bindingSequence=NA_character_,
127
+                     bindingDomain=NA,
128
+                     tfFamily=tf.family,
129
+                     experimentType=experiment.type,
130
+                     pubmedID="23175603")
131
+    #printf("matrix.id: %s", matrix.id);
132
+       # inefficient but acceptable:
133
+    tbl.md <- rbind(tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
134
+    full.name <- sprintf ('%s-%s-%s', organism, dataSource, matrix.id)
135
+    rownames (tbl.md) [nrow (tbl.md)] <- full.name
136
+    } # for matrix.id
137
+
138
+  invisible (tbl.md)
139
+
140
+} # createMetaDataTable
141
+#------------------------------------------------------------------------------------------------------------------------
142
+test_createMetaDataTable <- function()
143
+{
144
+   printf("--- test_createMetaDataTable")
145
+
146
+   pwms <- readRawMatrices("./", count=3)
147
+   matrices <- extractMatrices(pwms)
148
+
149
+   tbl.md <- createMetaDataTable("./", matrices, raw.metadata.filename=metadata.file)
150
+   checkEquals(dim(tbl.md), c(3, 15))
151
+     # make sure the reliability score (A-D) is searchably found in the matrix rowname
152
+   checkEquals(length(grep("HOCOMOCOv11[ABCD]", rownames(tbl.md))), 3)
153
+   checkEquals(length(grep("HOCOMOCOv11[ABCD]", tbl.md$dataSource)), 3)
154
+
155
+   pwms <- readRawMatrices("./", count=-1)
156
+   matrices <- extractMatrices(pwms)
157
+   tbl.md <- createMetaDataTable("./", matrices, raw.metadata.filename=metadata.file)
158
+   checkEquals(length(grep("HOCOMOCOv11[ABCD]", rownames(tbl.md))), nrow(tbl.md))
159
+
160
+} # test_createMetaDataTable
161
+#------------------------------------------------------------------------------------------------------------------------
162
+# we now have metadata rownames, one per matrix, each of which has all (or at least at lot) of information
163
+# used in querying.  For instance, the hocomoco give name for the first matrix is
164
+#   "AHR_HUMAN.H11MO.0.B"
165
+# we transform that here to
166
+#   "Hsapiens-HOCOMOCOv11B-AHR_HUMAN.H11MO.0.B"
167
+renameMatrices <- function (matrices, tbl.md)
168
+{
169
+  stopifnot(length(matrices) == nrow(tbl.md))
170
+  names(matrices) <- rownames(tbl.md)
171
+  invisible(matrices)
172
+
173
+} # renameMatrices
174
+#------------------------------------------------------------------------------------------------------------------------
175
+test_renameMatrices <- function()
176
+{
177
+   printf("--- test_renameMatrices")
178
+   pwms <- readRawMatrices("./", count=3)
179
+   matrices <- extractMatrices(pwms)
180
+   checkEquals(names(matrices), c("AHR_HUMAN.H11MO.0.B",
181
+                                  "AIRE_HUMAN.H11MO.0.C",
182
+                                  "ALX1_HUMAN.H11MO.0.B"))
183
+   tbl.md <- createMetaDataTable("./", matrices, raw.metadata.filename=metadata.file)
184
+   matrices.renamed <- renameMatrices(matrices, tbl.md)
185
+
186
+   checkEquals(names(matrices.renamed), c("Hsapiens-HOCOMOCOv11B-AHR_HUMAN.H11MO.0.B",
187
+                                          "Hsapiens-HOCOMOCOv11C-AIRE_HUMAN.H11MO.0.C",
188
+                                          "Hsapiens-HOCOMOCOv11B-ALX1_HUMAN.H11MO.0.B"))
189
+
190
+} # test_renameMatrices
191
+#------------------------------------------------------------------------------------------------------------------------
192
+normalizeMatrices <- function(matrices)
193
+{
194
+  mtx.normalized <- sapply (matrices,
195
+                           function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
196
+
197
+  invisible (mtx.normalized)
198
+
199
+} # normalizeMatrices
200
+#------------------------------------------------------------------------------------------------------------------------
201
+test_normalizeMatrices <- function()
202
+{
203
+   printf("--- test_normalizeMatrices")
204
+
205
+   pwms <- readRawMatrices("./", count=3)
206
+   matrices <- extractMatrices(pwms)
207
+   matrices.norm <- normalizeMatrices(matrices)
208
+
209
+     # after normalization, each matrix column should total to 1.0
210
+     # so sum(matrix) should be equals to the number of columns
211
+     # after normalization, the summed row counts should be in the same order,
212
+     # indicating that the structure has been preserved
213
+
214
+   for(i in 1:3){
215
+     checkEquals(sum(matrices.norm[[i]]), ncol(matrices.norm[[i]]))
216
+     checkEquals(order(rowSums(matrices[[i]])), order(rowSums(matrices.norm[[i]])))
217
+     } # for i
218
+
219
+} # test_normalizeMatrices
220
+#------------------------------------------------------------------------------------------------------------------------
221
+parsePwm <- function (text)
222
+{
223
+  lines <- strsplit (text, '\t')
224
+  stopifnot(length(lines)==5) # title line, one line for each base
225
+  title <- sub(">", "", lines [[1]][1], fixed=TRUE)
226
+  line.count <- length(lines)
227
+
228
+  # expect 4 rows, and a number of columns we can discern from
229
+  # the incoming text.
230
+  secondLineParsed <- unlist(strsplit(lines[[2]], " "))
231
+  cols <- length(secondLineParsed)
232
+  result <- matrix (nrow=4, ncol=cols,
233
+                    dimnames=list(c('A','C','G','T'),
234
+                                  as.character(1:cols)))
235
+  # loop over the four lines (for each base respectively)
236
+  row <- 1
237
+
238
+  for(i in 2:line.count){
239
+    lineParsed <- unlist(strsplit(lines[[i]], "\t"))
240
+    #print(lineParsed)
241
+    result [row,] <- as.numeric(lineParsed)
242
+    row <- row + 1
243
+    } # for i
244
+
245
+  #browser()
246
+
247
+  #return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
248
+  return (list (title=title, matrix=result))
249
+
250
+} # parsePwm
251
+#----------------------------------------------------------------------------------------------------
252
+readMetaDataTable <- function()
253
+{
254
+   tbl <- read.table(metadata.file, sep="\t", header=TRUE, as.is=TRUE)
255
+   dim(tbl)
256
+
257
+    # Model                                AHR_HUMAN.H11MO.0.B
258
+    # Transcription.factor                                 AHR
259
+    # Model.length                                           9
260
+    # Quality                                                B
261
+    # Model.rank                                             0
262
+    # Consensus                                      dKhGCGTGh
263
+    # Model.release                                 HOCOMOCOv9
264
+    # Data.source                                  Integrative
265
+    # Best.auROC..human.                                  <NA>
266
+    # Best.auROC..mouse.                                  <NA>
267
+    # Peak.sets.in.benchmark..human.                      <NA>
268
+    # Peak.sets.in.benchmark..mouse.                      <NA>
269
+    # Aligned.words                                        157
270
+    # TF.family                      PAS domain factors{1.2.5}
271
+    # TF.subfamily                   Ahr-like factors{1.2.5.1}
272
+    # HGNC                                                 348
273
+    # EntrezGene                                           196
274
+    # UniProt.ID                                     AHR_HUMAN
275
+    # UniProt.AC                                        P35869
276
+
277
+
278
+    #                     Mmusculus-jaspar2014-Arnt-MA0004
279
+    #     providerName    "MA0004.1 Arnt"
280
+    #     providerId      "MA0004.1 Arnt"
281
+    #     dataSource      "jaspar2014"
282
+    #     geneSymbol      "Arnt"
283
+    #     geneId          NA
284
+    #     geneIdType      NA
285
+    #     proteinId       "P53762"
286
+    #     proteinIdType   "UNIPROT"
287
+    #     organism        "Mmusculus"
288
+    #     sequenceCount   "20"
289
+    #     bindingSequence NA
290
+    #     bindingDomain   NA
291
+    #     tfFamily        "Helix-Loop-Helix"
292
+    #     experimentType  "SELEX"
293
+    #     pubmedID        "24194598"
294
+
295
+
296
+
297
+
298
+} # readMetaDataTable
299
+#----------------------------------------------------------------------------------------------------
300
+run <- function (dataDir=".")
301
+{
302
+  rawMatrixList <- readRawMatrices("./", dataDir)
303
+  length(rawMatrixList)
304
+  matrices <- extractMatrices (rawMatrixList)
305
+  length(matrices)
306
+  matrices[1]
307
+  tbl.md <- createMetaDataTable(dataDir, matrices, raw.metadata.filename=metadata.file)
308
+  matrices <- normalizeMatrices(matrices)
309
+  matrices <- renameMatrices(matrices, tbl.md)
310
+
311
+  serializedFile <- file.path(dataDir, outputFile)
312
+  printf("writing %s to %s", outputFile, dataDir)
313
+
314
+  save(matrices, tbl.md, file=serializedFile)
315
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
316
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
317
+
318
+} # run
319
+#------------------------------------------------------------------------------------------------------------------------
320
+blendCoreAndSecondaryDataSets <- function()
321
+{
322
+   load("hocomocov11-full.Rdata")
323
+   matrices.secondary <- matrices
324
+   tbl.secondary <- tbl.md
325
+   length(matrices.secondary);   # 768
326
+   dim(tbl.secondary)            # 768 15
327
+
328
+   checkTrue(all(names(matrices.secondary) %in% rownames(tbl.secondary)))
329
+
330
+   load("hocomocov11-core.RData")
331
+   matrices.core <- matrices
332
+   tbl.core <- tbl.md
333
+   checkTrue(all(names(matrices.core) %in% rownames(tbl.core)))
334
+   length(matrices.core);  # 400
335
+   dim(tbl.core)           # 400 15
336
+
337
+   checkTrue(all(names(matrices.core) %in% rownames(tbl.secondary)))
338
+   checkTrue(all(names(matrices.core) %in% names(matrices.secondary)))
339
+
340
+      # above tests establish that tbl.secondary and matrices.secondary are a proper superset of the core set
341
+      # all we need is to now mark the tbl.secondary$dataSource to distinguish core & secondary
342
+
343
+   mapping <- match(rownames(tbl.core), rownames(tbl.secondary))
344
+   checkEquals(length(mapping), 400)
345
+   unmapped.secondary.only <- setdiff(seq_len(nrow(tbl.secondary)), mapping)
346
+
347
+   checkEquals(length(unmapped.secondary.only), 368)
348
+
349
+     # are all core matrices in secondary, with the same names
350
+   checkTrue(all(names(matrices.core) == names(matrices.secondary)[mapping]))
351
+   checkTrue(all(rownames(tbl.core) == rownames(tbl.secondary)[mapping]))
352
+
353
+   nrow(subset(tbl.secondary, geneId==""))  # 3
354
+   rownames(subset(tbl.secondary, geneId==""))
355
+      # [1] "Hsapiens-HOCOMOCOv11D-FOXO6_HUMAN.H11MO.0.D"
356
+      # [2] "Hsapiens-HOCOMOCOv11D-KLF14_HUMAN.H11MO.0.D"
357
+      # [3] "Hsapiens-HOCOMOCOv11D-ZN713_HUMAN.H11MO.0.D"
358
+
359
+      # FOXO6 forkhead box O6 [Homo sapiens (human)]
360
+      #   Gene ID: 100132074, updated on 13-Mar-2020
361
+      # KLF14 Kruppel like factor 14 [Homo sapiens (human)]
362
+      #   Gene ID: 136259, updated on 13-Mar-2020
363
+      # ZNF713 zinc finger protein 713 [Homo sapiens (human)],
364
+      #   Gene ID: 349075, updated on 13-Mar-2020
365
+
366
+    tbl.secondary["Hsapiens-HOCOMOCOv11D-FOXO6_HUMAN.H11MO.0.D", "geneId"] <- "100132074"
367
+    tbl.secondary["Hsapiens-HOCOMOCOv11D-KLF14_HUMAN.H11MO.0.D", "geneId"] <- "136259"
368
+    tbl.secondary["Hsapiens-HOCOMOCOv11D-ZN713_HUMAN.H11MO.0.D", "geneId"] <- "349075"
369
+
370
+    nrow(subset(tbl.secondary, geneId==""))  # was 3, should now be zero
371
+
372
+    length(mapping)
373
+    length(unmapped.secondary.only)
374
+
375
+     # revise all entries in the dataSource column
376
+     # the dataSource starts off: HOCOMOCOv11[ABCD]
377
+     # we want, in order to support MotifDb::query
378
+     #  HOCOMOCOv11-[core|secondary]-[ABCD]
379
+
380
+   x <- tbl.secondary$dataSource
381
+     # pick off just the category: A, B, C, D
382
+   class <- substr(x, 12, 12)
383
+
384
+   tbl.secondary$dataSource[unmapped.secondary.only] <- paste0("HOCOMOCOv11-secondary-", class[unmapped.secondary.only])
385
+   tbl.secondary$dataSource[mapping] <-  paste0("HOCOMOCOv11-core-", class[mapping])
386
+
387
+     # now revise all of the tbl.md rownames: they need "core" and "secondary"
388
+     # inserted also
389
+
390
+   x <- rownames(tbl.md)
391
+
392
+
393
+   matrices <- matrices.secondary
394
+   tbl.md <- tbl.secondary
395
+   save(matrices, tbl.md, file="hocomoco11.RData")
396
+
397
+
398
+} # blendCoreAndSecondaryDataSets
399
+#------------------------------------------------------------------------------------------------------------------------