Browse code

Added homer data w_o organisms

Matthew Richards authored on 24/05/2017 22:17:18
Showing3 changed files

1 1
new file mode 100644
2 2
Binary files /dev/null and b/inst/extdata/homer.RData differ
3 3
similarity index 99%
4 4
rename from inst/scripts/import/homer/custom.motifs.txt
5 5
rename to inst/scripts/import/homer/custom.motifs.mod.txt
... ...
@@ -646,7 +646,7 @@
646 646
 0.502	0.054	0.119	0.325
647 647
 0.272	0.054	0.292	0.382
648 648
 0.084	0.748	0.053	0.115
649
->AGGTCAAGGTCA	RAR:RXR(NR),DR5/ES-RAR-ChIP-Seq(GSE56893)/Homer	9.389522	-1136.226412	0	T:507.0(17.99%),B:367.0(0.81%),P:1e-493	Tpos:50.7,Tstd:22.6,Bpos:50.9,Bstd:31.8,StrandBias:0.2,Multiplicity:1.10
649
+>AGGTCAAGGTCA	RAR:RXR(NR),DR5/ES-RAR-ChIP-Seq(GSE56893)/Homer_1	9.389522	-1136.226412	0	T:507.0(17.99%),B:367.0(0.81%),P:1e-493	Tpos:50.7,Tstd:22.6,Bpos:50.9,Bstd:31.8,StrandBias:0.2,Multiplicity:1.10
650 650
 0.662	0.057	0.280	0.001
651 651
 0.001	0.001	0.985	0.013
652 652
 0.014	0.001	0.630	0.355
... ...
@@ -659,7 +659,7 @@
659 659
 0.257	0.138	0.190	0.415
660 660
 0.071	0.639	0.189	0.101
661 661
 0.602	0.032	0.246	0.120
662
->RGGTCADNNAGAGGTCAV	RAR:RXR(NR),DR5/ES-RAR-ChIP-Seq(GSE56893)/Homer	11.193397	-1186.419162	0	T:330.0(11.71%),B:58.3(0.13%),P:1e-515	Tpos:49.7,Tstd:20.8,Bpos:53.3,Bstd:33.5,StrandBias:-0.3,Multiplicity:1.04
662
+>RGGTCADNNAGAGGTCAV	RAR:RXR(NR),DR5/ES-RAR-ChIP-Seq(GSE56893)/Homer_2	11.193397	-1186.419162	0	T:330.0(11.71%),B:58.3(0.13%),P:1e-515	Tpos:49.7,Tstd:20.8,Bpos:53.3,Bstd:33.5,StrandBias:-0.3,Multiplicity:1.04
663 663
 0.509	0.001	0.489	0.001
664 664
 0.019	0.001	0.961	0.019
665 665
 0.001	0.001	0.803	0.195
666 666
new file mode 100644
... ...
@@ -0,0 +1,173 @@
1
+# MotifDb/inst/scripts/import/homer/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+options (stringsAsFactors=FALSE)
4
+printf <- function(...) print(noquote(sprintf(...)))
5
+library(RCurl)
6
+#------------------------------------------------------------------------------------------------------------------------
7
+run = function (dataDir)
8
+{
9
+  dataDir <- file.path(dataDir, "homer")
10
+  rawMatrixList <- readRawMatrices (dataDir)
11
+  matrices <- extractMatrices (rawMatrixList)
12
+ browser() 
13
+  tbl.md <- createMetadataTable(matrices) #(dataDir, matrices,
14
+  #raw.metadata.filename="md-raw.tsv")
15
+#  browser()
16
+  matrices <- normalizeMatrices (matrices)
17
+  matrices <- renameMatrices (matrices, tbl.md)
18
+  
19
+  serializedFile <- file.path(dataDir, "homer.RData")
20
+  printf("writing %s to %s", "homer.RData", dataDir)
21
+  
22
+  save (matrices, tbl.md, file=serializedFile)
23
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
24
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
25
+  
26
+} # run
27
+#------------------------------------------------------------------------------------------------------------------------
28
+readRawMatrices = function (dataDir)
29
+{
30
+  # our convention is that there is a shared "dataDir" visible to
31
+  # the importer, and that within that directory there is one
32
+  # subdirectory for each data source.
33
+  # for this example importer, that directory will be <dataDir>/test
34
+  # within which we will look for one small file "sample.pcm"
35
+  
36
+  filename <- file.path(dataDir, "custom.motifs.mod.txt")
37
+  printf("checking for readable human matrix file:")
38
+  printf("     %s", filename)
39
+  stopifnot(file.exists(filename))
40
+  
41
+  all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
42
+  title.lines = grep ('^>', all.lines)
43
+  title.line.count <<- length (title.lines)
44
+  max = title.line.count - 1
45
+  
46
+  pwms = list ()
47
+  #loops through all motifs in the matrix file, one motif at a time
48
+  for (i in 1:max) {
49
+    start.line = title.lines [i]
50
+    end.line = title.lines [i+1] - 1
51
+    new.pwm = parsePwm (all.lines [start.line:end.line])
52
+    pwms = c (pwms, list (new.pwm))
53
+  } # for i
54
+    
55
+  #browser()
56
+  invisible (pwms)
57
+  
58
+} # readRawMatrices
59
+#------------------------------------------------------------------------------------------------------------------------
60
+extractMatrices = function (pwm.list)
61
+{
62
+  matrices = sapply (pwm.list, function (element) element$matrix)
63
+  matrix.names <- sapply (pwm.list, function (element) element$title)
64
+  matrix.names <- sub("^> ", "", matrix.names)
65
+  names (matrices) <- matrix.names
66
+  
67
+  matrices
68
+  
69
+} # extractMatrices
70
+#------------------------------------------------------------------------------------------------------------------------
71
+createMetadataTable = function (matrices)
72
+{
73
+  #  browser()
74
+    tbl.md = data.frame ()
75
+    matrix.ids = names(matrices)
76
+  geturlname <- function(name){
77
+    h = getCurlHandle()
78
+    z <- getURL(paste0("www.uniprot.org/uniprot/?query=",name),
79
+                followlocation=TRUE, curl=h)
80
+    getCurlInfo(h)$effective.url # catch the url redirect
81
+  }
82
+    # Assume we have either Human or Mouse
83
+  for (matrix.id in matrix.ids) {
84
+      my.matrix <- matrices[[matrix.id]]
85
+      # Split up the ID pieces (symbol, organism, database, version)
86
+      #    short.matrix.name <- sub("\\..*$", "", matrix.id)
87
+      id.pieces <- unlist(strsplit(matrix.id, "\\/"))
88
+      # Piece 1 has TF; Piece 2 has Origin; Piece 3 has program/tool/resource
89
+      tf <- sub("\\(.*\\)","", id.pieces[[1]])
90
+     
91
+      organism <- NA
92
+    
93
+      dataSource <- "HOMER"
94
+      
95
+    # split.matrix.name <- unlist(strsplit(short.matrix.name, "_"))[1]
96
+    # shorter.matrix.name <- split.matrix.name
97
+    #if (grepl(split.matrix.name, "+")){
98
+    #  shorter.matrix.name <- unlist(strsplit(split.matrix.name, "+"))[1]
99
+    #}
100
+    
101
+    #uri <- paste0("www.uniprot.org/uniprot/?query=",idStr)
102
+    if (nchar(id.pieces[1]) <=9){#!("+" %in% shorter.matrix.name)
103
+      idStr <- paste0(id.pieces[1], "_", id.pieces[2])
104
+      protIDURL <- geturlname(idStr) #gets the URL for the proteinID from the geneSymbol
105
+      protID <- unlist(strsplit(protIDURL, "http://www.uniprot.org/uniprot/"))[-1]
106
+      }else{
107
+        protID <- rep(NA,1)
108
+      }
109
+      
110
+    new.row = list (providerName=matrix.id,
111
+                    providerId=matrix.id, 
112
+                    dataSource= dataSource,
113
+                    geneSymbol= tf, #md$symbol
114
+                    geneId= NA,
115
+                    geneIdType= NA,
116
+                    proteinId=protID,
117
+                    proteinIdType="UNIPROT",
118
+                    organism= organism,
119
+                    sequenceCount=max(colSums(my.matrix)),
120
+                    bindingSequence=NA_character_,
121
+                    bindingDomain=NA,
122
+                    tfFamily=NA, #family
123
+                    experimentType="low- and high-throughput methods",
124
+                    pubmedID="26586801")
125
+    printf("matrix.id: %s", matrix.id);
126
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
127
+    full.name = sprintf ('%s-%s-%s', organism, dataSource, matrix.id)
128
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
129
+  } # for matrix.id
130
+  browser()
131
+  invisible (tbl.md)
132
+  
133
+} # createMetadataTable
134
+#------------------------------------------------------------------------------------------------------------------------
135
+renameMatrices = function (matrices, tbl.md)
136
+{
137
+  stopifnot (length (matrices) == nrow (tbl.md))
138
+  names (matrices) = rownames (tbl.md)
139
+  invisible (matrices)
140
+  
141
+} # renameMatrices
142
+#------------------------------------------------------------------------------------------------------------------------
143
+normalizeMatrices = function (matrices)
144
+{
145
+  mtx.normalized = sapply (matrices,
146
+                           function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
147
+  
148
+  invisible (mtx.normalized)
149
+  
150
+} # normalizeMatrices
151
+#------------------------------------------------------------------------------------------------------------------------
152
+parsePwm = function (text)
153
+{
154
+  lines = strsplit (text, '\t')
155
+  # Remove the arrow from sequence; save it and title separately
156
+  consensus.sequence <- sub(">", "", lines [[1]][1])
157
+  title <- lines[[1]][2]
158
+  line.count = length(lines)
159
+  
160
+  # browser()
161
+  result = matrix (nrow=line.count-1, ncol=4, dimnames=list(1:(line.count-1), c ('A','C','G','T')))
162
+  row = 1
163
+  for (line in lines [2:line.count]) {
164
+    result [row,] = as.numeric (line)
165
+    row = row + 1
166
+  } # for line
167
+
168
+  result <- t(result)
169
+  
170
+  return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
171
+  
172
+} # parsePwm
173
+#----------------------------------------------------------------------------------------------------