Browse code

Adds Mirsynergy/ MIMOSA/ CNEr/ to the repos.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/CNEr@86790 bc3139a8-67e5-0310-9ffc-ced21a209358

Marc Carlson authored on 25/02/2014 18:49:52
Showing 145 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,25 @@
1
+Package: CNEr
2
+Version: 0.99.8
3
+Date: 2014-02-13
4
+Title: CNE detection and visualization.
5
+Description: Large-scale identification and advanced visualization of
6
+        sets of conserved noncoding elements.
7
+Author: Ge Tan <ge.tan09@imperial.ac.uk>
8
+Maintainer: Ge Tan <ge.tan09@imperial.ac.uk>
9
+Imports: Biostrings(>= 2.26.3), RSQLite(>= 0.11.4), GenomicRanges(>=
10
+        1.12.5), rtracklayer(>= 1.20.4), RMySQL(>= 0.9-3), XVector(>=
11
+        0.2.0), DBI(>= 0.2-7), GenomicAlignments(>= 0.99.17), methods,
12
+        IRanges(>= 1.19.38)
13
+Depends: R (>= 3.0.2)
14
+Suggests: Gviz(>= 1.7.4), RUnit, BiocGenerics
15
+LinkingTo: IRanges, XVector
16
+System Requirements: blat
17
+License: GPL-2 | file LICENSE
18
+License_restricts_use: yes
19
+URL: http://ancora.genereg.net/
20
+Type: Package
21
+biocViews: GeneRegulation, Visualization, DataImport
22
+NeedsCompilation: yes
23
+LazyData: no
24
+Collate: AllGenerics.r AllClasses.r utils.r ceScan.r plot.r
25
+        makeGeneDbFromUCSC.r io.r
0 26
new file mode 100644
... ...
@@ -0,0 +1,17 @@
1
+CONTENTS AND COPYRIGHT
2
+
3
+The source code under src/ucsc comes from Jim Kent's source utilities.
4
+Any usage of this code should follow the license restriction.
5
+
6
+The following lines are taken from README of Kent's utilities.
7
+
8
+This directory contains the entire source tree for the
9
+UCSC Genome Bioinformatics Group's suite of biological analysis
10
+and web display programs as well as some of Jim Kent's own tools.
11
+All files are copyrighted, but license is hereby granted for personal,
12
+academic, and non-profit use.  A license is also granted for the contents
13
+of the src/lib, src/inc and src/utils and python directories for commercial
14
+users.  Commercial users should contact kent@soe.ucsc.edu for access to other
15
+modules.  Commercial users interested in the UCSC Genome Browser in particular
16
+please see
17
+    http://genome.ucsc.edu/license/.
0 18
new file mode 100644
... ...
@@ -0,0 +1,110 @@
1
+useDynLib(CNEr, .registration = TRUE)
2
+
3
+### -----------------------------------------------------------------
4
+### Import
5
+### -----------------------------------------------------------------
6
+import(IRanges)
7
+
8
+### -----------------------------------------------------------------
9
+### Import S4 classes defined in other packages
10
+###
11
+#import(Biostrings)
12
+#import(GenomicRanges)
13
+importClassesFrom(methods, ANY, character, integer, missing)
14
+
15
+### -----------------------------------------------------------------
16
+### Import S4 methods defined in other packages
17
+###
18
+importMethodsFrom(rtracklayer, import.bed)
19
+importMethodsFrom(GenomicRanges, seqnames, seqlengths, strand)
20
+importMethodsFrom(XVector, subseq)
21
+importMethodsFrom(DBI, dbGetQuery, dbConnect, dbDisconnect, dbWriteTable)
22
+importMethodsFrom(methods, initialize, show)
23
+
24
+### -----------------------------------------------------------------
25
+### Import ordinary functions, variables in other packages
26
+###
27
+importFrom(GenomicRanges, GRanges, Seqinfo)
28
+importFrom(GenomicAlignments, explodeCigarOps, CIGAR_OPS, explodeCigarOpLengths)
29
+importFrom(Biostrings, DNAStringSet)
30
+importFrom(RMySQL, MySQL)
31
+importFrom(RSQLite, SQLite)
32
+importFrom(methods, is, new)
33
+
34
+### -----------------------------------------------------------------
35
+### Export S4 classes defined in CNEr
36
+###
37
+exportClasses(Axt,
38
+              CNE)
39
+
40
+### -----------------------------------------------------------------
41
+### Export S4 methods for generics not defined in CNEr
42
+###
43
+exportMethods(
44
+  "[", 
45
+  c,
46
+  ## class.r
47
+  length,
48
+  score
49
+)
50
+
51
+### -----------------------------------------------------------------
52
+### Export non-generic functions
53
+###
54
+
55
+export(
56
+  ## utils.r
57
+  reverseCigar,
58
+  binFromCoordRange,
59
+  binRangesFromCoordRange,
60
+  binRestrictionString,
61
+  readCNERangesFromSQLite,
62
+  #queryAnnotationSQLite,
63
+  fetchChromSizes,
64
+
65
+  ## AllClasses.r
66
+  Axt,
67
+  CNE,
68
+
69
+  # io.r
70
+  readBed,
71
+  readAxt,
72
+  axtInfo,
73
+  writeAxt,
74
+
75
+  ## ceScan.r
76
+  ceScan, 
77
+  cneMerge,
78
+  blatCNE,
79
+  ceScanOneStep,
80
+
81
+  ## makeGeneDbFromUCSC.r
82
+  #supportedUCSCtables,
83
+  #queryrefGene, queryknownGene, queryensGene,
84
+  #makeGeneDbFromUCSC,
85
+
86
+  ## plot.r
87
+  CNEDensity
88
+)
89
+
90
+### -----------------------------------------------------------------
91
+### Export S4 generics defined in CNEr + export corresponding methods
92
+###
93
+
94
+exportMethods(
95
+  ## AllClasses.r
96
+  targetRanges, queryRanges,
97
+  targetSeqs, querySeqs,
98
+  symCount,
99
+  subAxt,
100
+  assembly1, assembly2,
101
+  CNE1, CNE2,
102
+  thresholds,
103
+  CNEMerged, CNERepeatsFiltered,
104
+  
105
+  ## utils.r
106
+  saveCNEToSQLite,
107
+  ## plot.r
108
+  CNEDensity
109
+)
110
+
0 111
new file mode 100644
... ...
@@ -0,0 +1,382 @@
1
+
2
+### -----------------------------------------------------------------
3
+### Axt class
4
+### Exported!
5
+setClass(Class="Axt",
6
+         slots=c(targetRanges="GRanges",
7
+                 targetSeqs="DNAStringSet",
8
+                 queryRanges="GRanges",
9
+                 querySeqs="DNAStringSet",
10
+                 score="integer",
11
+                 symCount="integer"
12
+                 )
13
+         )
14
+
15
+setValidity("Axt",
16
+            function(object){
17
+              if(!isConstant(c(length(object@targetRanges), 
18
+                             length(object@targetSeqs),
19
+                             length(object@queryRanges), 
20
+                             length(object@querySeqs),
21
+                             length(object@score), 
22
+                             length(object@symCount))))
23
+                return("The lengths of targetRanges, targetSeqs,
24
+                       queryRanges, querySeqs, score and symCount
25
+                       must have be same!")
26
+              if(any(object@symCount <= 0L))
27
+                return("Then symCount must be larger than 0!")
28
+              return(TRUE)
29
+            }
30
+            )
31
+
32
+### -----------------------------------------------------------------
33
+### CNE class
34
+### 
35
+setClass(Class="CNE",
36
+         slots=c(assembly1="character",
37
+                 assembly2="character",
38
+                 thresholds="character",
39
+                 CNE1="list",
40
+                 CNE2="list",
41
+                 CNEMerged="list",
42
+                 CNERepeatsFiltered="list",
43
+                 alignMethod="character"
44
+                 )
45
+         )
46
+
47
+setValidity("CNE",
48
+            function(object){
49
+              if(length(object@assembly1) != 1L)
50
+                return("The name of assembly1 must be length 1!")
51
+              if(length(object@alignMethod) != 1L)
52
+                return("The align method must be length 1!")
53
+              if(length(object@assembly2) != 1L)
54
+                return("The name of assembly2 must be length 1!")
55
+              if(!all(grepl("^\\d+_\\d+$", object@thresholds)))
56
+                return("The thresholds must be in format of 49_50!")
57
+              if(any(as.integer(
58
+                       sapply(strsplit(object@thresholds, "_"), "[", 2))
59
+                 < as.integer(
60
+                       sapply(strsplit(object@thresholds, "_"), "[", 1))))
61
+                return("The window size cannot be smaller than identity score!")
62
+              if(length(object@CNE1) != length(object@thresholds) ||
63
+                 length(object@CNE2) != length(object@thresholds) ||
64
+                 length(object@CNEMerged) != length(object@thresholds) ||
65
+                 length(object@CNERepeatsFiltered) != length(object@thresholds))
66
+                return("The number of cne tables must be same with
67
+                       number of thresholds!")
68
+              return(TRUE)
69
+            }
70
+            )
71
+
72
+
73
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
74
+### Axt Slot getters and setters.
75
+###
76
+setMethod("targetRanges", "Axt", function(x) x@targetRanges)
77
+setMethod("targetSeqs", "Axt", function(x) x@targetSeqs)
78
+setMethod("queryRanges", "Axt", function(x) x@queryRanges)
79
+setMethod("querySeqs", "Axt", function(x) x@querySeqs)
80
+setMethod("score", "Axt", function(x) x@score)
81
+setMethod("symCount", "Axt", function(x) x@symCount)
82
+setMethod("length", "Axt", function(x) length(targetRanges(x)))
83
+
84
+### -----------------------------------------------------------------
85
+### CNE Slot getters and setters.
86
+###
87
+setMethod("assembly1", "CNE", function(x) x@assembly1)
88
+setMethod("assembly2", "CNE", function(x) x@assembly2)
89
+setMethod("CNE1", "CNE", function(x) x@CNE1)
90
+setMethod("CNE2", "CNE", function(x) x@CNE2)
91
+setMethod("thresholds", "CNE", function(x) x@thresholds)
92
+setMethod("CNEMerged", "CNE", function(x) x@CNEMerged)
93
+setMethod("CNERepeatsFiltered", "CNE", function(x) x@CNERepeatsFiltered)
94
+
95
+### -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
96
+### Axt Constructor.
97
+###
98
+Axt <- function(targetRanges=GRanges(), targetSeqs=DNAStringSet(),
99
+               queryRanges=GRanges(), querySeqs=DNAStringSet(),
100
+               score=integer(), symCount=integer()){
101
+  new("Axt", targetRanges=targetRanges, targetSeqs=targetSeqs,
102
+      queryRanges=queryRanges, querySeqs=querySeqs,
103
+      score=score, symCount=symCount)
104
+}
105
+
106
+### -----------------------------------------------------------------
107
+### CNE constructor.
108
+###
109
+CNE <- function(assembly1=character(), assembly2=character(),
110
+                thresholds=character(),
111
+                CNE1=list(), CNE2=list(),
112
+                CNEMerged=list(), CNERepeatsFiltered=list(),
113
+                alignMethod=character()
114
+                ){
115
+  new("CNE", assembly1=assembly1, assembly2=assembly2,
116
+      thresholds=thresholds, CNE1=CNE1, CNE2=CNE2,
117
+      CNEMerged=CNEMerged, CNERepeatsFiltered=CNERepeatsFiltered,
118
+      alignMethod=alignMethod)
119
+}
120
+
121
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
122
+### Updating and cloning.
123
+###
124
+### An object is either 'update'd in place (usually with a replacement
125
+### method) or 'clone'd (copied), with specified slots/fields overridden.
126
+
127
+### For an object with a pure S4 slot representation, these both map to
128
+### initialize. Reference classes will want to override 'update'. Other
129
+### external representations need further customization.
130
+
131
+setMethod("update", "Axt",
132
+          function(object, ..., check=TRUE){
133
+            initialize(object, ...)
134
+          }
135
+          )
136
+
137
+setMethod("update", "CNE",
138
+          function(object, ..., check=TRUE){
139
+            initialize(object, ...)
140
+          }
141
+          )
142
+
143
+setMethod("clone", "ANY",  # not exported
144
+    function(x, ...)
145
+    {
146
+        if (nargs() > 1L)
147
+            initialize(x, ...)
148
+        else
149
+            x
150
+    }
151
+)
152
+
153
+
154
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
155
+### Subsetting and combining.
156
+###
157
+
158
+setMethod("[", "Axt",
159
+          function(x, i, ..., drop){
160
+            if(length(list(...)) > 0L)
161
+              stop("invalid subsetting")
162
+            if(missing(i))
163
+              return(x)
164
+            #i = IRanges:::normalizeSingleBracketSubscript(i, x)
165
+            ans_targetRanges <- targetRanges(x)[i]
166
+            ans_targetSeqs <- targetSeqs(x)[i]
167
+            ans_queryRanges <- queryRanges(x)[i]
168
+            ans_querySeqs <- querySeqs(x)[i]
169
+            ans_score <- score(x)[i]
170
+            ans_symCount <- symCount(x)[i]
171
+            clone(x, targetRanges=ans_targetRanges, targetSeqs=ans_targetSeqs,
172
+                  queryRanges=ans_queryRanges, querySeqs=ans_querySeqs,
173
+                  score=ans_score, symCount=ans_symCount)
174
+          }
175
+          )
176
+
177
+setMethod("c", "Axt",
178
+          function(x, ...){
179
+            if(missing(x)){
180
+              args <- unname(list(...))
181
+              x <- args[[1L]]
182
+            }else{
183
+              args <- unname(list(x, ...))
184
+            }
185
+            if(length(args) == 1L)
186
+              return(x)
187
+            arg_is_null <- sapply(args, is.null)
188
+            if(any(arg_is_null))
189
+              args[arg_is_null] <- NULL
190
+            if(!all(sapply(args, is, class(x))))
191
+              stop("all arguments in '...' must be ", 
192
+                   class(x), " objects (or NULLs)")
193
+            new_targetRanges <- do.call(c, lapply(args, targetRanges))
194
+            new_targetSeqs <- do.call(c, lapply(args, targetSeqs))
195
+            new_queryRanges <- do.call(c, lapply(args, queryRanges))
196
+            new_querySeqs <- do.call(c, lapply(args, querySeqs))
197
+            new_score <- do.call(c, lapply(args, score))
198
+            new_symCount <- do.call(c, lapply(args, symCount))
199
+
200
+            initialize(x,
201
+                       targetRanges=new_targetRanges,
202
+                       targetSeqs=new_targetSeqs,
203
+                       queryRanges=new_queryRanges,
204
+                       querySeqs=new_querySeqs,
205
+                       score=new_score,
206
+                       symCount=new_symCount)
207
+          }
208
+          )
209
+
210
+
211
+setMethod("subAxt", "Axt",
212
+## This is to fetch the Axts within the specific chrs, starts, ends 
213
+## based on target sequences.
214
+          function(x, chr, start, end, #strand=c("+", "-", "*"), 
215
+                   select=c("target", "query"),
216
+                   type=c("any", "within"),
217
+                   qSize=NULL){
218
+            type <- match.arg(type)
219
+            select <- match.arg(select)
220
+            if(select == "query"){
221
+              if(is.null(qSize))
222
+                stop("When selecting on query alignments, 
223
+                     qSize must be provided.")
224
+              if(!is(qSize, "integer"))
225
+                stop("qSize must be an integer object.")
226
+            }
227
+            if(select == "target"){
228
+              searchGRanges = GRanges(seqnames=chr,
229
+                                      ranges=IRanges(start=start, end=end),
230
+                                      strand="+")
231
+              if(length(searchGRanges) == 0)
232
+                return(x)
233
+              index <- queryHits(findOverlaps(targetRanges(x), 
234
+                                             searchGRanges, type=type, 
235
+                                             select="all"))
236
+            }else if(select == "query"){
237
+              # first search Axts on positive strand
238
+              searchGRanges = GRanges(seqnames=chr,
239
+                                      ranges=IRanges(start=start, end=end),
240
+                                      strand="+")
241
+              if(length(searchGRanges) == 0)
242
+                return(x)
243
+              indexPositive <- queryHits(findOverlaps(queryRanges(x),
244
+                                                     searchGRanges, type=type,
245
+                                                     select="all"))
246
+              # then search Axts on negative strand.
247
+              # we need to prepare the searchGRanges on negative strand.
248
+              searchGRanges <- GRanges(seqnames=chr,
249
+                                      ranges=IRanges(start=qSize-end+1,
250
+                                                     end=qSize-start+1),
251
+                                      strand="-")
252
+              indexNegative <- queryHits(findOverlaps(queryRanges(x),
253
+                                                     searchGRanges, type=type,
254
+                                                     select="all"))
255
+              index <- sort(c(indexPositive, indexNegative))
256
+            }else{
257
+              stop("Wrong select!")
258
+            }
259
+            return(x[index])
260
+          }
261
+          )
262
+
263
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
264
+### "show" method.
265
+###
266
+### 'x' must be an XString or MaskedXString object.
267
+toSeqSnippet <- function(x, width)
268
+{
269
+    if (width < 7L)
270
+        width <- 7L
271
+    seqlen <- length(x)
272
+    if (seqlen <= width) {
273
+        as.character(x)
274
+    } else {
275
+        w1 <- (width - 2) %/% 2
276
+        w2 <- (width - 3) %/% 2
277
+        paste(as.character(subseq(x, start=1, width=w1)),
278
+              "...",
279
+              as.character(subseq(x, end=seqlen, width=w2)),
280
+              sep="")
281
+    }
282
+}
283
+
284
+
285
+.axt.show_frame_line <- function(x, i, iW, tNameW, tStartW, tEndW, 
286
+                                qNameW, qStartW, qEndW, scoreW){
287
+  cat(format(i, width=iW, justify="right"), " ",
288
+      format(as.character(seqnames(targetRanges(x)[i])), 
289
+             width=tNameW, justify="right"), " ",
290
+      format(start(targetRanges(x)[i]), width=tStartW, justify="right"), " ",
291
+      format(end(targetRanges(x)[i]), width=tEndW, justify="right"), " ",
292
+      format(as.character(seqnames(queryRanges(x)[i])), 
293
+             width=qNameW, justify="right"), " ",
294
+      format(start(queryRanges(x)[i]), width=qStartW, justify="right"), " ",
295
+      format(end(queryRanges(x)[i]), width=qEndW, justify="right"), " ",
296
+      format(as.character(strand(queryRanges(x))[i]), 
297
+             width=1, justify="right"), " ",
298
+      format(score(x)[i], width=scoreW, justify="right"), " ",
299
+      sep=""
300
+      )
301
+  cat("\n")
302
+  snippetWidth <- getOption("width")
303
+  seq_snippet <- toSeqSnippet(targetSeqs(x)[[i]], snippetWidth)
304
+  cat(seq_snippet)
305
+  cat("\n")
306
+  seq_snippet <- toSeqSnippet(querySeqs(x)[[i]], snippetWidth)
307
+  cat(seq_snippet)
308
+  cat("\n")
309
+}
310
+
311
+showAxt <- function(x, margin="", half_nrow=5L){
312
+  lx <- length(x)
313
+  if(is.null((head_nrow = getOption("showHeadLines"))))
314
+    head_nrow = half_nrow
315
+  if(is.null((tail_nrow = getOption("showTailLines"))))
316
+    tail_nrow = half_nrow
317
+  iW = nchar(as.character(lx))
318
+  if(lx < (2*half_nrow+1L) | (lx < (head_nrow+tail_nrow+1L))) {
319
+    tNameW <- max(nchar(as.character(seqnames(targetRanges(x)))))
320
+    tStartW <- max(nchar(as.character(start(targetRanges(x)))))
321
+    tEndW <- max(nchar(as.character(end(targetRanges(x)))))
322
+    qNameW <- max(nchar(as.character(seqnames(queryRanges(x)))))
323
+    qStartW <- max(nchar(as.character(start(queryRanges(x)))))
324
+    qEndW <- max(nchar(as.character(end(queryRanges(x)))))
325
+    scoreW <- max(nchar(as.character(score(x))))
326
+    for(i in seq_len(lx))
327
+      .axt.show_frame_line(x, i, iW, tNameW, tStartW, tEndW, 
328
+                           qNameW, qStartW, qEndW, scoreW)
329
+  }else{
330
+    tNameW <- max(nchar(as.character(seqnames(targetRanges(x)
331
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)]))))
332
+    tStartW <- max(nchar(as.character(start(targetRanges(x)
333
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)]))))
334
+    tEndW <- max(nchar(as.character(end(targetRanges(x)
335
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)]))))
336
+    qNameW <- max(nchar(as.character(seqnames(queryRanges(x)
337
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)]))))
338
+    qStartW <- max(nchar(as.character(start(queryRanges(x)
339
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)]))))
340
+    qEndW <- max(nchar(as.character(end(queryRanges(x)
341
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)]))))
342
+    scoreW <- max(nchar(as.character(score(x)
343
+                        [c(1:head_nrow, (lx-tail_nrow+1L):lx)])))
344
+    if(head_nrow > 0){
345
+      for(i in 1:head_nrow)
346
+        .axt.show_frame_line(x, i, iW, tNameW, tStartW, tEndW, 
347
+                             qNameW, qStartW, qEndW, scoreW)
348
+    }
349
+    cat(format("...", width=iW, justify="right"),
350
+        format("...", width=tNameW, justify="right"),
351
+        format("...", width=tStartW, justify="right"),
352
+        format("...", width=tEndW, justify="right"),
353
+        format("...", width=qNameW, justify="right"),
354
+        format("...", width=qStartW, justify="right"),
355
+        format("...", width=qEndW, justify="right"),
356
+        format("...", width=scoreW, justify="right")
357
+        )
358
+    cat("\n")
359
+    if(tail_nrow > 0){
360
+      for(i in (lx-tail_nrow+1L):lx)
361
+        .axt.show_frame_line(x, i, iW, tNameW, tStartW, tEndW, 
362
+                             qNameW, qStartW, qEndW, scoreW)
363
+    }
364
+  }
365
+}
366
+    #out = makePrettyMatrixForCompactPrintingAxt(x, .makeNakedMatFromAxt)
367
+    #if(nrow(out) != 0L)
368
+    #      rownames(out) = paste0(margin, rownames(out))
369
+    #  print(out, quote=FALSE, right=TRUE)
370
+
371
+setMethod("show", "Axt",
372
+          function(object){
373
+            lx <- length(object)
374
+            cat(" A ", class(object), " with ", length(object), " ", 
375
+                ifelse(lx == 1L, "alignment pair", "alignment pairs"), 
376
+                ":\n", sep="")
377
+            if(lx != 0){
378
+              showAxt(object, margin="  ")
379
+            }
380
+          }
381
+)
382
+
0 383
new file mode 100644
... ...
@@ -0,0 +1,49 @@
1
+
2
+### -----------------------------------------------------------------
3
+### Axt object related
4
+###
5
+setGeneric("targetRanges", function(x) standardGeneric("targetRanges"))
6
+setGeneric("targetSeqs", function(x) standardGeneric("targetSeqs"))
7
+setGeneric("queryRanges", function(x) standardGeneric("queryRanges"))
8
+setGeneric("querySeqs", function(x) standardGeneric("querySeqs"))
9
+setGeneric("symCount", function(x) standardGeneric("symCount"))
10
+setGeneric("subAxt", function(x, chr, start, end, #strand=c("+", "-", "*"),
11
+                              select=c("target", "query"),
12
+                              type=c("any", "within"),
13
+                              qSize=NULL) 
14
+                      standardGeneric("subAxt")
15
+                      )
16
+
17
+### -----------------------------------------------------------------
18
+### CNE class related
19
+###
20
+setGeneric("assembly1", function(x) standardGeneric("assembly1"))
21
+setGeneric("assembly2", function(x) standardGeneric("assembly2"))
22
+setGeneric("CNE1", function(x) standardGeneric("CNE1"))
23
+setGeneric("CNE2", function(x) standardGeneric("CNE2"))
24
+setGeneric("thresholds", function(x) standardGeneric("thresholds"))
25
+setGeneric("CNEMerged", function(x) standardGeneric("CNEMerged"))
26
+setGeneric("CNERepeatsFiltered", function(x) 
27
+           standardGeneric("CNERepeatsFiltered"))
28
+
29
+setGeneric("saveCNEToSQLite", 
30
+           function(CNE, dbName, tableName, overwrite=FALSE) 
31
+             standardGeneric("saveCNEToSQLite"))
32
+setGeneric("CNEDensity",
33
+           function(dbName, tableName, assembly1, assembly2, threshold,
34
+                    chr, start, end, windowSize, minLength=NULL)
35
+             standardGeneric("CNEDensity"))
36
+
37
+### -----------------------------------------------------------------
38
+### general
39
+### Not Exported!
40
+setGeneric("clone", function(x, ...) standardGeneric("clone"))
41
+
42
+### -----------------------------------------------------------------
43
+### ceScan
44
+### Exported!
45
+setGeneric("ceScan", 
46
+           function(axts, tFilter, qFilter, qSizes, thresholds="49_50")
47
+           standardGeneric("ceScan"))
48
+
49
+
0 50
new file mode 100644
... ...
@@ -0,0 +1,382 @@
1
+
2
+### -----------------------------------------------------------------
3
+### The main function for scanning the axts and get the CNEs
4
+### Not exported!
5
+ceScanR <- function(axts, tFilter=NULL, qFilter=NULL, qSizes=NULL, 
6
+                   thresholds=c("49_50")){
7
+  ## Here the returned tStart and qStart are 1-based coordinates. 
8
+  ## Of course ends are also 1-based.
9
+  if(!is.null(qFilter))
10
+    if(is.null(qSizes) || !is(qSizes, "Seqinfo"))
11
+      stop("qSizes must exist and be a Seqinfo object when qFilter exists")
12
+  
13
+  winSize <- as.integer(sapply(strsplit(thresholds, "_"), "[", 2))
14
+  minScore <- as.integer(sapply(strsplit(thresholds, "_"), "[", 1))
15
+  resFiles <- tempfile(pattern=paste(minScore, winSize, "ceScan", sep="-"), 
16
+                       tmpdir=tempdir(), fileext="")
17
+  ## How stupid I am...
18
+  if(is.null(tFilter) && is.null(qFilter)){
19
+    .Call2("myCeScan", NULL, NULL, NULL,
20
+           NULL, NULL, NULL,
21
+           NULL, NULL,
22
+           as.character(seqnames(targetRanges(axts))),
23
+           start(targetRanges(axts)), end(targetRanges(axts)),
24
+           as.character(strand(targetRanges(axts))),
25
+           as.character(targetSeqs(axts)),
26
+           as.character(seqnames(queryRanges(axts))),
27
+           start(queryRanges(axts)), end(queryRanges(axts)),
28
+           as.character(strand(queryRanges(axts))),
29
+           as.character(querySeqs(axts)),
30
+           score(axts), symCount(axts), winSize, minScore,
31
+           as.character(resFiles),
32
+           PACKAGE="CNEr")
33
+  }else if(is.null(tFilter) && !is.null(qFilter)){
34
+    .Call2("myCeScan", NULL, NULL, NULL,
35
+           as.character(seqnames(qFilter)), start(qFilter), end(qFilter),
36
+           as.character(seqnames(qSizes)), as.integer(seqlengths(qSizes)),
37
+           as.character(seqnames(targetRanges(axts))),
38
+           start(targetRanges(axts)), end(targetRanges(axts)),
39
+           as.character(strand(targetRanges(axts))),
40
+           as.character(targetSeqs(axts)),
41
+           as.character(seqnames(queryRanges(axts))),
42
+           start(queryRanges(axts)), end(queryRanges(axts)),
43
+           as.character(strand(queryRanges(axts))),
44
+           as.character(querySeqs(axts)),
45
+           score(axts), symCount(axts), winSize, minScore,
46
+           as.character(resFiles),
47
+           PACKAGE="CNEr")
48
+  }else if(!is.null(tFilter) && is.null(qFilter)){
49
+    .Call2("myCeScan", as.character(seqnames(tFilter)), 
50
+           start(tFilter), end(tFilter),
51
+           NULL, NULL, NULL,
52
+           NULL, NULL,
53
+           as.character(seqnames(targetRanges(axts))),
54
+           start(targetRanges(axts)), end(targetRanges(axts)),
55
+           as.character(strand(targetRanges(axts))),
56
+           as.character(targetSeqs(axts)),
57
+           as.character(seqnames(queryRanges(axts))),
58
+           start(queryRanges(axts)), end(queryRanges(axts)),
59
+           as.character(strand(queryRanges(axts))),
60
+           as.character(querySeqs(axts)),
61
+           score(axts), symCount(axts), winSize, minScore,
62
+           as.character(resFiles),
63
+           PACKAGE="CNEr")
64
+  }else{
65
+    .Call2("myCeScan", as.character(seqnames(tFilter)), 
66
+           start(tFilter), end(tFilter),
67
+              as.character(seqnames(qFilter)), start(qFilter), end(qFilter),
68
+              as.character(seqnames(qSizes)), as.integer(seqlengths(qSizes)), 
69
+              as.character(seqnames(targetRanges(axts))), 
70
+              start(targetRanges(axts)), end(targetRanges(axts)), 
71
+              as.character(strand(targetRanges(axts))), 
72
+              as.character(targetSeqs(axts)),
73
+              as.character(seqnames(queryRanges(axts))), 
74
+              start(queryRanges(axts)), end(queryRanges(axts)), 
75
+              as.character(strand(queryRanges(axts))), 
76
+              as.character(querySeqs(axts)),
77
+              score(axts), symCount(axts), winSize, minScore, 
78
+              as.character(resFiles),
79
+              PACKAGE="CNEr")
80
+  }
81
+  CNE <- lapply(resFiles, 
82
+               function(x){
83
+                 res <- read.table(x, header=FALSE, sep="\t", as.is=TRUE)
84
+               colnames(res) <- c("tName", "tStart", "tEnd", "qName", "qStart", 
85
+                                 "qEnd", "strand", "score", "cigar")
86
+               return(res)})
87
+  names(CNE) <-  paste(minScore, winSize, sep="_")
88
+  unlink(resFiles)
89
+  return(CNE)
90
+}
91
+
92
+
93
+### -----------------------------------------------------------------
94
+### Another main function for CNEs identification, 
95
+### but it takes the axt files and bed files as input
96
+### Not exported!
97
+ceScanFile <- function(axtFiles, tFilterFile=NULL, qFilterFile=NULL, 
98
+                       qSizes=NULL,
99
+                       thresholds=c("49_50")){
100
+  ## Here the returned tStart and qStart are 1-based coordinates. 
101
+  ## Of course ends are also 1-based.
102
+  if(!is.null(qFilterFile))
103
+    if(is.null(qSizes) || !is(qSizes, "Seqinfo"))
104
+      stop("qSizes must exist and be a Seqinfo object when qFilter exists")
105
+  winSize <- as.integer(sapply(strsplit(thresholds, "_"), "[", 2))
106
+  minScore <- as.integer(sapply(strsplit(thresholds, "_"), "[", 1))
107
+  resFiles <- tempfile(pattern=paste(minScore, winSize, "ceScan", sep="-"), 
108
+                      tmpdir=tempdir(), fileext="")
109
+  .Call2("myCeScanFile", axtFiles, tFilterFile, qFilterFile, 
110
+        as.character(seqnames(qSizes)), as.character(seqlengths(qSizes)),
111
+        winSize, minScore,
112
+        resFiles, PACKAGE="CNEr")
113
+  CNE <- lapply(resFiles,
114
+               function(x){
115
+                 res <- read.table(x, header=FALSE, sep="\t", as.is=TRUE)
116
+                 colnames(res) <- c("tName", "tStart", "tEnd", "qName", 
117
+                                   "qStart", "qEnd", "strand", "score", "cigar")
118
+                 return(res)})
119
+  names(CNE) <-  paste(minScore, winSize, sep="_")
120
+  unlink(resFiles)
121
+  return(CNE)
122
+}
123
+
124
+### -----------------------------------------------------------------
125
+### The S4 methods for ceScan
126
+### Exported!
127
+setMethod("ceScan", signature(axts="Axt", tFilter="GRanges", qFilter="GRanges",
128
+                              qSizes="Seqinfo"),
129
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
130
+            ceScanR(axts, tFilter=tFilter, qFilter=qFilter, 
131
+                    qSizes=qSizes, thresholds=thresholds)
132
+          }
133
+          )
134
+setMethod("ceScan", signature(axts="Axt", tFilter="missing", qFilter="GRanges",
135
+                              qSizes="Seqinfo"),
136
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
137
+            ceScanR(axts, tFilter=NULL, qFilter=qFilter,
138
+                    qSizes=qSizes, thresholds=thresholds)
139
+          }
140
+          )
141
+setMethod("ceScan", signature(axts="Axt", tFilter="missing", qFilter="missing",
142
+                              qSizes="missing"),
143
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
144
+            ceScanR(axts, tFilter=NULL, qFilter=NULL,
145
+                    qSizes=NULL, thresholds=thresholds)
146
+          }
147
+          )
148
+setMethod("ceScan", signature(axts="Axt", tFilter="GRanges", qFilter="missing",
149
+                              qSizes="missing"),
150
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
151
+            ceScanR(axts, tFilter=tFilter, qFilter=NULL,
152
+                    qSizes=NULL, thresholds=thresholds)
153
+          }
154
+          )
155
+setMethod("ceScan", signature(axts="character", tFilter="character", 
156
+                              qFilter="character", qSizes="Seqinfo"),
157
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
158
+            ceScanFile(axtFiles=axts, tFilterFile=tFilter, qFilterFile=qFilter,
159
+                       qSizes=qSizes, thresholds=thresholds)
160
+          }
161
+          )
162
+setMethod("ceScan", signature(axts="character", tFilter="missing",
163
+                              qFilter="character", qSizes="Seqinfo"),
164
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
165
+            ceScanFile(axtFiles=axts, tFilterFile=NULL, qFilterFile=qFilter,
166
+                       qSizes=qSizes, thresholds=thresholds)
167
+          }
168
+          )
169
+setMethod("ceScan", signature(axts="character", tFilter="missing",
170
+                              qFilter="missing", qSizes="missing"),
171
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
172
+            ceScanFile(axtFiles=axts, tFilterFile=NULL, qFilterFile=NULL,
173
+                       qSizes=NULL, thresholds=thresholds)
174
+          }
175
+          )
176
+setMethod("ceScan", signature(axts="character", tFilter="character",
177
+                              qFilter="missing", qSizes="missing"),
178
+          function(axts, tFilter, qFilter, qSizes, thresholds="49_50"){
179
+            ceScanFile(axtFiles=axts, tFilterFile=tFilter, qFilterFile=NULL,
180
+                       qSizes=NULL, thresholds=thresholds)
181
+          }
182
+          )
183
+
184
+### -----------------------------------------------------------------
185
+### Merge two side cnes
186
+### Exported!
187
+cneMerge <- function(cne1, cne2){
188
+  # In this function, cne's start is 1-based coordinates. ends are 1-based too. 
189
+  # Although in cne1 and cne2, query strand can be negative, 
190
+  # but the coordinate is already on positive strand.
191
+  ## first reverse the cne2's cigar
192
+  ## cne2 = transform(cne2, cigar=chartr("DI", "ID", cigar))
193
+  cne2$cigar <- chartr("DI", "ID", cne2$cigar)
194
+  if(any(cne2$strand == "-")){
195
+    #cne2[cne2$strand=="-", ] = transform(subset(cne2, strand=="-"), 
196
+    #                                      cigar=reverseCigar(cigar))
197
+    cne2[cne2$strand == "-", "cigar"] <- 
198
+      reverseCigar(cne2[cne2$strand == "-", "cigar"])
199
+  }
200
+  colnames(cne2) <- c("qName", "qStart", "qEnd", "tName", 
201
+                     "tStart", "tEnd", "strand", "score", "cigar")
202
+  cne1T <- GRanges(seqnames=cne1$tName, 
203
+                   ranges=IRanges(start=cne1$tStart, end=cne1$tEnd), 
204
+                   strand="+")
205
+  cne1Q <- GRanges(seqnames=cne1$qName, 
206
+                   ranges=IRanges(start=cne1$qStart, end=cne1$qEnd), 
207
+                   strand="+")
208
+  cne2T <- GRanges(seqnames=cne2$tName, 
209
+                   ranges=IRanges(start=cne2$tStart, end=cne2$tEnd), 
210
+                   strand="+")
211
+  cne2Q <- GRanges(seqnames=cne2$qName, 
212
+                   ranges=IRanges(start=cne2$qStart, end=cne2$qEnd), 
213
+                   strand="+")
214
+  cneT <- c(cne1T, cne2T)
215
+  cneQ <- c(cne1Q, cne2Q)
216
+  # Here, I just removed the CNEs which are within another big CNEs. 
217
+  # In very rare cases(1 in 100000), some cnes may just connect 
218
+  # and need to merge them. Needs to be done in the future (perhaps not easy to be done in R).
219
+  cneT_overlap <- findOverlaps(cneT, type="within", 
220
+                              ignoreSelf=TRUE, ignoreRedundant=TRUE)
221
+  #cneT_overlap1 = findOverlaps(cneT, type="equal", 
222
+                              #ignoreSelf=TRUE, ignoreRedundant=TRUE)
223
+  #cneT_overlap2 = findOverlaps(cneT, type="any", 
224
+                              #ignoreSelf=TRUE, ignoreRedundant=TRUE)
225
+  cneQ_overlap <- findOverlaps(cneQ, type="within", 
226
+                              ignoreSelf=TRUE, ignoreRedundant=TRUE)
227
+  #cneQ_overlap1 = findOverlaps(cneQ, type="equal", 
228
+                              #ignoreSelf=TRUE, ignoreRedundant=TRUE)
229
+  #cneQ_overlap2 = findOverlaps(cneQ, type="any", 
230
+                              #ignoreSelf=TRUE, ignoreRedundant=TRUE)
231
+  redundance <- IRanges::intersect(cneT_overlap, cneQ_overlap)
232
+  #any_overlap = intersect(cneT_overlap2, cneQ_overlap2)
233
+  #foo = setdiff(any_overlap, redundance)
234
+  #paste(subjectHits(foo), queryHits(foo), sep=",") 
235
+  # %in% paste(queryHits(redundance), subjectHits(redundance), sep=",")
236
+  res <- rbind(cne1, cne2)[-queryHits(redundance), ] 
237
+  # After the merge, we'd better name them as 1 and 2 
238
+  # rather than the tName and qName. Use the names in mysql cne db.
239
+  colnames(res) <- c("chr1", "start1", "end1", "chr2", "start2", "end2", 
240
+                    "strand", "similarity", "cigar")
241
+  return(res)
242
+}
243
+
244
+#cutoffs1 = 4
245
+#cutoffs2 = 8
246
+
247
+blatCNE <- function(CNE, winSize, cutoffs1, cutoffs2, 
248
+                    assembly1Twobit, assembly2Twobit, 
249
+                    blatOptions=NULL, cutIdentity=90, 
250
+                    tmpDir=tempdir(), blatBinary="blat"){
251
+  # In this function, the input CNE's start and end are 1-based coordinates.
252
+  blatOptionsALL <- list("DEF_BLAT_OPT_WSLO"=
253
+                         "-tileSize=9 -minScore=24 -repMatch=16384",
254
+                         "DEF_BLAT_OPT_WSMID"=
255
+                         "-tileSize=10 -minScore=28 -repMatch=4096",
256
+                         "DEF_BLAT_OPT_WSHI"=
257
+                         "-tileSize=11 -minScore=30 -repMatch=1024")
258
+  if(!is(winSize, "integer"))
259
+    stop("winSize must be an integer!")
260
+  if(is.null(blatOptions)){
261
+    if(winSize > 45L)
262
+      blatOptions <- blatOptionsALL[["DEF_BLAT_OPT_WSHI"]]
263
+    else if(winSize > 35L)
264
+      blatOptions <- blatOptionsALL[["DEF_BLAT_OPT_WSMID"]]
265
+    else
266
+      blatOptions <- blatOptionsALL[["DEF_BLAT_OPT_WSLO"]]
267
+  }
268
+  if(cutIdentity > 100  || cutIdentity < 0)
269
+    stop("cutIdentity must be between 0 and 100!")
270
+  if(!is(cutoffs1, "integer"))
271
+    stop("cutoffs1 must be an integer!")
272
+  if(!is(cutoffs2, "integer"))
273
+    stop("cutoffs2 must be an integer!")
274
+  
275
+  .run_blat <- function(cne, cutIdentity, whichAssembly, 
276
+                        assemblyTwobit, blatBinary, blatOptions, tmpDir){
277
+    temp_cne <- tempfile(pattern="cne-", tmpdir=tmpDir)
278
+    temp_psl <- tempfile(pattern="psl-", tmpdir=tmpDir)
279
+    # For Blat, the start is 0-based and end is 1-based. 
280
+    # So make cne's coordinates to comply with it.
281
+    if(whichAssembly == 1){
282
+      cne <- paste0(assemblyTwobit, ":", cne[,"chr1"], 
283
+                    ":", cne[,"start1"]-1, "-", cne[,"end1"])
284
+    }else{
285
+      cne <- paste0(assemblyTwobit, ":", cne[,"chr2"], 
286
+                    ":", cne[,"start2"]-1, "-", cne[,"end2"])
287
+    }
288
+    cne <- unique(cne)
289
+    writeLines(cne, con=temp_cne)
290
+    cmd <- paste0(blatBinary, " ", blatOptions, " ",
291
+                  "-minIdentity=", cutIdentity,
292
+                  " ", assemblyTwobit, " ", temp_cne, " ", temp_psl)
293
+    my.system(cmd)
294
+    unlink(temp_cne)
295
+    return(temp_psl)
296
+  }
297
+  psl1Fn <- .run_blat(CNE, cutIdentity, 1, assembly1Twobit, 
298
+                      blatBinary, blatOptions, tmpDir)
299
+  psl2Fn <- .run_blat(CNE, cutIdentity, 2, assembly2Twobit, 
300
+                      blatBinary, blatOptions, tmpDir)
301
+  psl1 <- read.table(psl1Fn, header=FALSE, sep="\t", skip=5, as.is=TRUE)
302
+  psl2 <- read.table(psl2Fn, header=FALSE, sep="\t", skip=5, as.is=TRUE)
303
+  colnames(psl1) <- colnames(psl2) <- 
304
+    c("matches", "misMatches", "repMatches", "nCount", 
305
+      "qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert", 
306
+      "strand", "qName", "qSize", "qStart", "qEnd", "tName", 
307
+      "tSize", "tStart", "tEnd", "blockCount", "blockSizes", 
308
+      "qStarts", "tStarts")
309
+  ##psl1 = subset(psl1, matches/qSize >= cutIdentity/100)
310
+  ##for some reason, subset will cause error duing the Bioconductor build
311
+  psl1 <- psl1[psl1$matches / psl1$qSize >= cutIdentity/100, ]
312
+  ##psl2 = subset(psl2, matches/qSize >= cutIdentity/100)
313
+  psl2 <- psl2[psl2$matches / psl2$qSize >= cutIdentity/100, ]
314
+  psl1 <- table(psl1[ , "qName"])
315
+  psl2 <- table(psl2[ , "qName"])
316
+  CNEtNameIndex <- unname(psl1[paste0(CNE[ ,1], ":", CNE[ ,2]-1, "-", 
317
+                                      CNE[ ,3])])
318
+  CNEtNameIndex[is.na(CNEtNameIndex)] <- 0
319
+  CNEqNameIndex <- unname(psl2[paste0(CNE[ ,4], ":", CNE[ ,5]-1, "-", 
320
+                                      CNE[ ,6])])
321
+  CNEqNameIndex[is.na(CNEqNameIndex)] <- 0
322
+  CNE <- CNE[CNEtNameIndex <= cutoffs1 & CNEqNameIndex <= cutoffs2, ]
323
+  return(CNE)
324
+  # Here, the CNE's starts and ends are still 1-based.
325
+}
326
+
327
+
328
+ceScanOneStep <- function(axt1, filter1=NULL, sizes1, assembly1, twoBit1,
329
+                          axt2, filter2=NULL, sizes2, assembly2, twoBit2,
330
+                          thresholds=c("49_50"),
331
+                          blatBinary="blat",
332
+                          blatCutoff1, blatCutoff2
333
+                          ){
334
+  if(grepl("_", assembly1) || grepl("_", assembly2))
335
+    stop("The assembly name must not contain \"_\"")
336
+  if(assembly1 < assembly2){
337
+    .ceScanSwap(axt1=axt1, filter1=filter1, sizes1=sizes1, assembly1=assembly1,
338
+                twoBit1=twoBit1, axt2=axt2, filter2=filter2, sizes2=sizes2,
339
+                assembly2=assembly2, twoBit2=twoBit2, thresholds=thresholds,
340
+                blatBinary=blatBinary, blatCutoff1=blatCutoff1, 
341
+                blatCutoff2=blatCutoff2)
342
+  }else{
343
+    .ceScanSwap(axt1=axt2, filter1=filter2, sizes1=sizes2, assembly1=assembly2,
344
+                twoBit1=twoBit2, axt2=axt1, filter2=filter1,
345
+                sizes2=sizes1, assembly2=assembly1, twoBit2=twoBit1,
346
+                thresholds=thresholds, blatBinary=blatBinary,
347
+                blatCutoff1=blatCutoff2, blatCutoff2=blatCutoff1)
348
+  }
349
+}
350
+
351
+.ceScanSwap <- function(axt1, filter1=NULL, sizes1, assembly1, twoBit1,
352
+                        axt2, filter2=NULL, sizes2, assembly2, twoBit2,
353
+                        thresholds=c("49_50"),
354
+                        blatBinary="blat",
355
+                        blatCutoff1, blatCutoff2
356
+                        ){
357
+  ## In this function, we make sure "assembly1" is smaller than "assembly2".
358
+  ## danRer7 is smaller than hg19.
359
+  ## This is just for easier database storage table name: "danRer7_hg19_49_50"
360
+  CNE1 <- ceScan(axt1, filter1, filter2, sizes2, thresholds)
361
+  CNE2 <- ceScan(axt2, filter2, filter1, sizes1, thresholds)
362
+  CNEMerged <- mapply(cneMerge, CNE1, CNE2, SIMPLIFY=FALSE)
363
+  names(CNEMerged) = paste(assembly1, assembly2, thresholds, sep="_")
364
+  CNEBlated <- list()
365
+  for(i in 1:length(CNEMerged)){
366
+    CNEBlated[[names(CNEMerged)[i]]] <- 
367
+      blatCNE(CNEMerged[[i]], as.integer(sub(".+_.+_\\d+_", "", 
368
+                                           names(CNEMerged)[i])),
369
+              cutoffs1=blatCutoff1, cutoffs2=blatCutoff2,
370
+              assembly1Twobit=twoBit1, assembly2Twobit=twoBit2,
371
+              blatBinary=blatBinary)
372
+  }
373
+  ans <- CNE(assembly1=assembly1, assembly2=assembly2,
374
+             thresholds=thresholds,
375
+             CNE1=CNE1, CNE2=CNE2, CNEMerged=CNEMerged,
376
+             CNERepeatsFiltered=CNEBlated,
377
+             alignMethod=blatBinary)
378
+  return(ans)
379
+}
380
+
381
+
382
+
0 383
new file mode 100644
... ...
@@ -0,0 +1,84 @@
1
+### -----------------------------------------------------------------
2
+### read the bed file into GRanges.
3
+###
4
+readBedR <- function(bedFile){
5
+## This GRanges is in 1-based.
6
+  bed <- import.bed(bedFile, asRangedData=FALSE)
7
+  strand(bed) <- "+"
8
+  bed <- reduce(bed)
9
+}
10
+
11
+### -----------------------------------------------------------------
12
+### read the bed file (with only 3 columns) into GRanges.
13
+### Exported!
14
+readBed <- function(bedFile=NULL){
15
+## This GRanges have the different coordinates system 
16
+## with the original bed file. i.e. with 1-based start end coordinates.
17
+  if(is.null(bedFile)){
18
+    return(NULL)
19
+  }
20
+  if(!file.exists(bedFile)){
21
+    stop("No such file ", bedFile) 
22
+  }
23
+  bed <- .Call2("myReadBed", bedFile, PACKAGE="CNEr")
24
+  bed <- GRanges(seqnames=Rle(bed[[1]]),
25
+                 ranges=IRanges(start=bed[[2]], end=bed[[3]]),
26
+                 strand=factor("+"))
27
+  return(bed)
28
+}
29
+
30
+### -----------------------------------------------------------------
31
+### read the axt files into an axt object.
32
+### Exported!
33
+readAxt <- function(axtFiles){
34
+  # Read axt files into R axt object.
35
+  # The coordinates are 1-based for start and end.
36
+  index_noexists <- !file.exists(axtFiles)
37
+  if(any(index_noexists)){
38
+    stop("No such file ", paste(axtFiles[index_noexists], sep=" "))
39
+  }
40
+  myAxt <- .Call2("myReadAxt", axtFiles, PACKAGE="CNEr")
41
+  axts <- Axt(targetRanges=GRanges(seqnames=Rle(myAxt[[1]]),
42
+                                   ranges=IRanges(start=myAxt[[2]],
43
+                                                  end=myAxt[[3]]),
44
+                                   strand=Rle(myAxt[[4]])),
45
+              targetSeqs=DNAStringSet(myAxt[[5]]),
46
+              queryRanges=GRanges(seqnames=Rle(myAxt[[6]]),
47
+                                  ranges=IRanges(start=myAxt[[7]],
48
+                                                 end=myAxt[[8]]),
49
+                                  strand=Rle(myAxt[[9]])),
50
+              querySeqs=DNAStringSet(myAxt[[10]]),
51
+              score=myAxt[[11]],
52
+              symCount=myAxt[[12]]
53
+            )
54
+  return(axts)
55
+}
56
+
57
+### -----------------------------------------------------------------
58
+### read the axt files and return the widths of all the alignments
59
+### Exported!
60
+axtInfo <- function(axtFiles){
61
+  ans <- .Call2("axt_info", axtFiles, PACKAGE="CNEr")
62
+  return(ans)
63
+}
64
+
65
+### -----------------------------------------------------------------
66
+### write the Axt object to an axt file
67
+### Exported!
68
+writeAxt <- function(axt, con){
69
+  firstLine <- paste(0:(length(axt)-1), seqnames(targetRanges(axt)),
70
+                     start(targetRanges(axt)), end(targetRanges(axt)),
71
+                     seqnames(queryRanges(axt)),
72
+                     start(queryRanges(axt)), end(queryRanges(axt)),
73
+                     strand(queryRanges(axt)), score(axt)
74
+                     )
75
+  secondLine <- targetSeqs(axt)
76
+  thirdLine <- querySeqs(axt)
77
+  wholeLines <- paste(firstLine, as.character(targetSeqs(axt)), 
78
+                      as.character(querySeqs(axt)),
79
+                      "", sep="\n")
80
+  writeLines(wholeLines, con)
81
+}
82
+
83
+
84
+
0 85
new file mode 100644
... ...
@@ -0,0 +1,122 @@
1
+## This is used to download the annotation from UCSC and 
2
+### incooperate it for the tracks display
3
+
4
+#.SUPPORTED_UCSC_TABLES = c(
5
+#  ## tablename (unique key)   track             subtrack    auxiliary tablename
6
+#  "knownGene",              "UCSC Genes",       NA,         
7
+#  "refGene",                "RefSeq Genes",     NA,
8
+#  "ensGene",                "Ensembl Genes",    NA
9
+#  )
10
+
11
+.SUPPORTED_UCSC_TABLES = list(
12
+    "knownGene"    = c("knownGene", "kgXref"),
13
+    "refGene"  = c("refGene"),
14
+    "ensGene" = c("ensGene", "ensemblToGeneName")
15
+    )
16
+
17
+supportedUCSCtables = function(){
18
+  .SUPPORTED_UCSC_TABLES
19
+}
20
+
21
+queryrefGene = function(con){
22
+  query = "SELECT distinct name, name2, chrom, strand, exonStarts, exonEnds
23
+                   FROM refGene
24
+                   ORDER BY name2, name"
25
+  ans = dbGetQuery(con, query)
26
+  # process the ans into one exon per line
27
+  exonStarts = strsplit(ans$exonStarts, ",")
28
+  exonEnds = strsplit(ans$exonEnds, ",")
29
+  stopifnot(all(sapply(exonStarts, length) == sapply(exonEnds, length)))
30
+  repNum = sapply(exonStarts, length)
31
+  res = data.frame(chromosome=rep(ans$chrom, repNum),
32
+                   start=as.integer(unlist(exonStarts))+1,
33
+                   end=as.integer(unlist(exonEnds)),
34
+                   strand=rep(ans$strand, repNum),
35
+                   gene=rep(ans$name2, repNum),
36
+                   transcript=rep(ans$name, repNum),
37
+                   symbol=rep(ans$name2, repNum))
38
+  return(res)
39
+}
40
+
41
+queryknownGene = function(con){
42
+  query = "SELECT distinct kgID, geneSymbol, chrom, strand, exonStarts, exonEnds
43
+                   FROM knownGene, kgXref WHERE knownGene.name=kgXref.kgID 
44
+                   ORDER BY geneSymbol, kgID"
45
+  ans = dbGetQuery(con, query)
46
+  # process the ans into one exon per line
47
+  exonStarts = strsplit(ans$exonStarts, ",")
48
+  exonEnds = strsplit(ans$exonEnds, ",")
49
+  stopifnot(all(sapply(exonStarts, length) == sapply(exonEnds, length)))
50
+  repNum = sapply(exonStarts, length)
51
+  res = data.frame(chromosome=rep(ans$chrom, repNum),
52
+                   start=as.integer(unlist(exonStarts))+1,
53
+                   # The internal ucsc database use the 0-based start, 
54
+                   # 1-based end. We only use 1-based.
55
+                   end=as.integer(unlist(exonEnds)),
56
+                   strand=rep(ans$strand, repNum),
57
+                   gene=rep(ans$geneSymbol, repNum),
58
+                   transcript=rep(ans$kgID, repNum),
59
+                   symbol=rep(ans$geneSymbol, repNum))
60
+  return(res)
61
+}
62
+
63
+queryensGene = function(con){
64
+  query = "SELECT distinct chrom, strand, exonStarts, exonEnds, 
65
+    ensGene.name2, ensGene.name, ensemblToGeneName.value
66
+    FROM ensGene, ensemblToGeneName WHERE ensGene.name=ensemblToGeneName.name
67
+    ORDER BY ensGene.name, ensemblToGeneName.value"
68
+  ans = dbGetQuery(con, query)
69
+  # process the ans into one exon per line
70
+  exonStarts = strsplit(ans$exonStarts, ",")
71
+  exonEnds = strsplit(ans$exonEnds, ",")
72
+  stopifnot(all(sapply(exonStarts, length) == sapply(exonEnds, length)))
73
+  repNum = sapply(exonStarts, length)
74
+  res = data.frame(chromosome=rep(ans$chrom, repNum),
75
+                   start=as.integer(unlist(exonStarts))+1,
76
+                   end=as.integer(unlist(exonEnds)),
77
+                   strand=rep(ans$strand, repNum),
78
+                   gene=rep(ans$name2, repNum),
79
+                   transcript=rep(ans$name, repNum),
80
+                   symbol=rep(ans$value, repNum))
81
+  return(res)
82
+}
83
+
84
+makeGeneDbFromUCSC = function(genome="hg19",
85
+                              tablename="refGene",
86
+                              host="genome-mysql.cse.ucsc.edu",
87
+                              user="genome",
88
+                              password=NULL,
89
+                              dbnameSQLite="geneAnnotation.sqlite",
90
+                              tablenameSQLite=paste(genome, tablename, sep="_"),
91
+                              overwrite=FALSE 
92
+                              ){
93
+  if(!isSingleString(genome))
94
+    stop("'genome' must be a single string")
95
+  if(!isSingleString(tablename))
96
+    stop("'tablename' must be a single string")
97
+  if(!tablename %in% names(.SUPPORTED_UCSC_TABLES))
98
+    stop("table \"", tablename, "\" is not supported")
99
+  if(!isSingleString(host))
100
+    stop("'url' must be a single string")
101
+  con = dbConnect(MySQL(), user=user, password=password, 
102
+                  dbname=genome, host=host)
103
+  tableNames = .SUPPORTED_UCSC_TABLES[[tablename]] 
104
+  message("Download the ", tablename, " table ... ")
105
+  ans = switch(tablename,
106
+               "refGene"=queryrefGene(con),
107
+               "knownGene"=queryknownGene(con),
108
+               "ensGene"=queryensGene(con)
109
+               )
110
+  dbDisconnect(con)
111
+  # add the bin column
112
+  ans$bin = binFromCoordRange(ans$start, ans$end)
113
+  # reorder the columns, not necessary
114
+  ans = ans[ ,c("bin","chromosome","start","end","strand",
115
+                "gene", "transcript","symbol")]
116
+  con = dbConnect(SQLite(), dbname=dbnameSQLite)
117
+  dbWriteTable(con, tablenameSQLite, ans, overwrite=overwrite)
118
+  dbDisconnect(con)
119
+}
120
+
121
+
122
+
0 123
new file mode 100644
... ...
@@ -0,0 +1,200 @@
1
+### -----------------------------------------------------------------
2
+### Fetch the CNE coordinates from SQL and compute the densities
3
+### Exported!
4
+setMethod("CNEDensity", 
5
+          signature(tableName="character", assembly1="character",
6
+                    assembly2="missing", threshold="missing"),
7
+          function(dbName, tableName, assembly1, assembly2, threshold,
8
+                   chr, start, end, windowSize, minLength=NULL){
9
+            if(!grepl("^.+_.+_\\d+_\\d+$", tableName))
10
+              stop("The tableName should be in the format danRer7_hg19_49_50.")
11
+            assemblyNames <- strsplit(tableName, "_")[[1]][1:2]
12
+            stopifnot(assembly1 %in% assemblyNames)
13
+            if(which(assembly1 == assemblyNames) == 1){
14
+              whichAssembly = "L"
15
+            }else{
16
+              whichAssembly = "R"
17
+            }
18
+            .CNEDensityInternal(dbName=dbName, tableName=tableName,
19
+                                whichAssembly=whichAssembly,
20
+                                chr=chr, start=start, end=end,
21
+                                windowSize=windowSize, minLength=minLength)
22
+          }
23
+          )
24
+setMethod("CNEDensity", signature(tableName="missing", assembly1="character",
25
+                                  assembly2="character", threshold="character"),
26
+          function(dbName, tableName, assembly1, assembly2, threshold,
27
+                   chr, start, end, windowSize, minLength=NULL){
28
+            stopifnot(length(threshold) == 1L)
29
+            builtTableName = paste(paste(sort(c(assembly1, assembly2)), 
30
+                                         sep="_", collapse="_"), threshold, 
31
+                                   sep="_", collapse="_")
32
+            if(which(assembly1 == sort(c(assembly1, assembly2))) == 1){
33
+              whichAssembly = "L"
34
+            }else{
35
+              whichAssembly = "R"
36
+            }
37
+            .CNEDensityInternal(dbName=dbName, tableName=builtTableName,
38
+                                whichAssembly=whichAssembly,
39
+                                chr=chr, start=start, end=end,
40
+                                windowSize=windowSize, minLength=minLength)
41
+          }
42
+          )
43
+
44
+
45
+
46
+.CNEDensityInternal <- function(dbName, tableName, whichAssembly=c("L","R"), 
47
+                       chr, start, end, windowSize, 
48
+                       minLength=NULL){
49
+  nrGraphs <- 1
50
+  CNEstart <- start
51
+  CNEend <- end
52
+  # This is the pipeline of doing the density plot
53
+  # The windowSize is in kb.
54
+  whichAssembly <- match.arg(whichAssembly)
55
+  if(!is(windowSize, "integer"))
56
+    stop("windowSize must be an integer!")
57
+  windowSize <- windowSize * 1000
58
+  CNElength <- CNEend - CNEstart + 1
59
+  pixel_width <- 2048
60
+  if(CNElength <= pixel_width) {
61
+    step_size <- 1
62
+  }else{
63
+    step_size <- as.integer(CNElength/pixel_width)
64
+    if(step_size > windowSize/10)
65
+      step_size <- windowSize/10
66
+    while(windowSize %% step_size){
67
+      step_size <- step_size - 1
68
+    }
69
+  }
70
+  # make things easier
71
+  if(windowSize %% 2 == 0)
72
+    windowSize <- windowSize - 1L
73
+  context_start <- as.integer(max(CNEstart - (windowSize-1L)/2, 1))
74
+  context_end <- as.integer(CNEend + (windowSize-1)/2)
75
+  #win_nr_steps = windowSize / step_size
76
+  #context_start = CNEstart - as.integer(((win_nr_steps-1)*step_size)/2+0.5)
77
+  #if(context_start < 1)
78
+  #  context_start = 1
79
+  #context_end = CNEend + 
80
+  #  as.integer(((win_nr_steps-1)*step_size)/2+step_size+0.5)
81
+  ranges <- readCNERangesFromSQLite(dbName, tableName, chr, 
82
+                                    context_start, context_end, 
83
+                                    whichAssembly, minLength)
84
+  # Implement get_cne_ranges_in_region_partitioned_by_other_chr later!!!
85
+  ranges <- reduce(ranges)
86
+  covAll <- coverage(ranges, width=context_end)
87
+  runMeanAll <- runmean(covAll, k=windowSize, "constant")
88
+  resStart <- max(CNEstart, (windowSize-1)/2+1)
89
+  resEnd <- min(CNEend, context_end-(windowSize-1)/2)
90
+  resCoords <- seq(resStart, resEnd, by=step_size)
91
+  if(nrGraphs == 1){
92
+    runMeanRes <- runMeanAll[resCoords]*100
93
+    res <- cbind(resCoords, as.numeric(runMeanRes))
94
+    colnames(res) <- c("coordinates", "y")
95
+  }else{
96
+    runMeanRes <- lapply(runMeanAll, "[", resCoords)
97
+    runMeanRes <- lapply(runMeanRes, "*", 100)
98
+    res <- list()
99
+    for(i in 1:length(runMeanRes)){
100
+      res[[names(runMeanRes)[i]]] <- cbind(resCoords, 
101
+                                           as.numeric(runMeanRes[[i]]))
102
+      colnames(res[[names(runMeanRes)[i]]]) <- c("coordinates", "y")
103
+    }
104
+  }
105
+  return(res)
106
+}
107
+
108
+
109
+#calc_window_scores = function(CNEstart, CNEend, ranges, 
110
+                        # win_nr_steps, step_size){
111
+#  ## Here the starts and ends are 1-based.
112
+#  CNElength = CNEend - CNEstart + 1
113
+#  win_size = win_nr_steps * step_size
114
+#  offsetBlk = as.integer(((win_nr_steps-1)*step_size)/2+0.5)
115
+#  context_start = CNEstart - offsetBlk
116
+#  if(context_start < 1)
117
+#    context_start = 1
118
+#  context_end = CNEend + offsetBlk
119
+#  context_size = context_end - context_start + 1
120
+#  #nr_blocks = as.integer(context_size/step_size) + 
121
+   # ifelse(context_size%%step_size, 1, 0)
122
+#  #blk_scores = numeric(ifelse(nr_blocks>win_nr_steps, 
123
+  # nr_blocks, win_nr_steps+1))
124
+#
125
+#  covAll = coverage(ranges, width=context_end)
126
+#   
127
+#  #runMeanAll = runmean(covAll, k=windowSize, "constant")
128
+#  #resStart = max(CNEstart, (windowSize-1)/2+1)
129
+#  #resEnd = min(CNEend, computeEnd-(windowSize-1)/2)
130
+#  #height = runMeanAll[resStart:resEnd]*100
131
+#}
132
+
133
+#listToPlot = list(a=res, b=res)
134
+
135
+#plotCNE = function(listToPlot, horizonscale=2, nbands=3){
136
+#  mergedDf = as.data.frame(do.call(rbind, listToPlot))
137
+#  mergedDf$grouping = rep(names(listToPlot), sapply(listToPlot, nrow))
138
+#  mergedDf = mergedDf[ ,c("coordinates", "grouping", "y")]
139
+#  p = horizon.panel.ggplot(mergedDf, horizonscale=horizonscale, nbands=nbands)
140
+#  #if(!is.null(file)){
141
+#  #  postscript(file=file)
142
+#  #  on.exit(dev.off())
143
+#  #}
144
+#  return(p)
145
+#}
146
+
147
+#horizon.panel.ggplot = function(mergedDf, horizonscale=2, 
148
+#                                nbands=3, my.title="fun"){
149
+#  #require(ggplot2)
150
+#  #require(reshape2)
151
+#  origin = 0
152
+#  #require(RColorBrewer)
153
+#  #col.brew = brewer.pal(name="RdBu",n=10)
154
+#  #col.brew = c("#67001F", "#B2182B", "#D6604D", 
155
+#                #"#F4A582", "#FDDBC7", "#D1E5F0", 
156
+#                #"#92C5DE", "#4393C3", "#2166AC", "#053061")
157
+#  col.brew = c("yellow", "orange", "red", "chartreuse", "blue")
158
+#  colnames(mergedDf) = c("coordinates", "grouping", "y")
159
+#  for(i in 1:nbands){
160
+#    #do positive
161
+#    mergedDf[ ,paste("ypos",i,sep="")] =