Browse code

version 1.10.1

Fix a bug of ordering meta columns in readAncora


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/branches/RELEASE_3_4/madman/Rpacks/CNEr@123027 bc3139a8-67e5-0310-9ffc-ced21a209358

Ge Tan authored on 25/10/2016 15:39:13
Showing 16 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: CNEr 
2
-Version: 1.10.0
3
-Date: 2016-10-06
2
+Version: 1.10.1
3
+Date: 2016-10-25
4 4
 Title: CNE Detection and Visualization
5 5
 Description: Large-scale identification and advanced visualization 
6 6
              of sets of conserved noncoding elements.
... ...
@@ -25,7 +25,8 @@ Imports: Biostrings (>= 2.33.4),
25 25
          poweRlaw (>= 0.60.3),
26 26
          annotate (>= 1.50.0),
27 27
          GO.db (>= 3.3.0),
28
-         R.utils (>= 2.3.0)
28
+         R.utils (>= 2.3.0),
29
+         KEGGREST (>= 1.14.0)
29 30
 Depends: R (>= 3.2.2)
30 31
 Suggests: Gviz (>= 1.7.4),
31 32
           BiocStyle,
... ...
@@ -66,4 +67,5 @@ Collate:
66 67
         WholeGenomeAlignment.R
67 68
         Ancora.R
68 69
         CNE-methods.R
69
-        GO.R
70 70
\ No newline at end of file
71
+        GO.R
72
+        KEGG.R
... ...
@@ -62,6 +62,7 @@ importFrom(GO.db, GOCCANCESTOR, GOCCOFFSPRING, GOCCCHILDREN,
62 62
                   GOMFANCESTOR, GOMFOFFSPRING, GOMFCHILDREN)
63 63
 importFrom("grDevices", "jpeg", "pdf", "png", "postscript")
64 64
 importFrom(R.utils, gunzip, gzip)
65
+importFrom(KEGGREST, keggGet)
65 66
 
66 67
 ### -----------------------------------------------------------------
67 68
 ### Export S4 classes defined in CNEr
... ...
@@ -1,3 +1,9 @@
1
+CHANGES IN Bioc 3.5
2
+------------------------
3
+NEW FEATURES
4
+    o Add function orgKEGGIds2EntrezIDs to fetch the mapping between KEGG IDs
5
+      and Entrez IDs
6
+    
1 7
 CHANGES IN Bioc 3.4
2 8
 ------------------------
3 9
 NEW FEATURES
... ...
@@ -50,8 +50,8 @@ readAncora <- function(fn, assembly=NULL,
50 50
                                   strand="*",
51 51
                                   name=paste0(cne[[1]], ":", 
52 52
                                               (cne[[2]]+1), "-", cne[[3]]),
53
-                                  seqnames2=cne[[1]],
54 53
                                   itemRgb=chr2colour(cne[[1]]),
54
+                                  seqnames2=cne[[1]],
55 55
                                   seqinfo=seqinfoQuery)
56 56
                      )
57 57
   if(is.null(assembly)){
... ...
@@ -144,13 +144,13 @@ makeCNEDensity <- function(x, outputDir=".",
144 144
   message("Making bedGraph files...")
145 145
   bedFirst <- reduce(bedFirst, ignore.strand=TRUE)
146 146
   covFirst <- coverage(bedFirst)
147
-  densityFirst <- runmean(covFirst, k=windowSizeFirst*1000, 
148
-                          endrule = "constant") * 100
147
+  densityFirst <- suppressWarnings(runmean(covFirst, k=windowSizeFirst*1000, 
148
+                          endrule = "constant") * 100)
149 149
   
150 150
   bedSecond <- reduce(bedSecond, ignore.strand=TRUE)
151 151
   covSecond <- coverage(bedSecond)
152
-  densitySecond <- runmean(covSecond, k=windowSizeSecond*1000,
153
-                           endrule = "constant") * 100
152
+  densitySecond <- suppressWarnings(runmean(covSecond, k=windowSizeSecond*1000,
153
+                           endrule = "constant") * 100)
154 154
  
155 155
   firstTrackLine <- new("GraphTrackLine",
156 156
                         name=paste(genomeFirst, "CNEs density", threshold),
... ...
@@ -2,7 +2,8 @@
2 2
 ### make GRBs from CNEs
3 3
 ### Exported!
4 4
 makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1, 
5
-                     background=c("chromosome", "genome")){#,
5
+                     background=c("chromosome", "genome"),
6
+                     minCNEs=1L){#,
6 7
                      #byChrom=FALSE
7 8
   if(!is(x, "GRangesList")){
8 9
     stop("The input `x` must be a `GRangesList` object!")
... ...
@@ -21,7 +22,7 @@ makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1,
21 22
     names(xList) <- as.character(1:length(xList))
22 23
   }
23 24
   x <- unlist(x)
24
-  x <- reduce(x)
25
+  x <- reduce(x, ignore.strand=TRUE)
25 26
   cov <- coverage(x)
26 27
 
27 28
   # Guess winSize from genome size if NULL
... ...
@@ -36,7 +37,7 @@ makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1,
36 37
     stop("The `genes` must be a `GRanges` object!")
37 38
   }
38 39
 
39
-  density <- runmean(cov, k=winSize,  endrule="constant")
40
+  density <- suppressWarnings(runmean(cov, k=winSize,  endrule="constant"))
40 41
   if(background == "genome"){
41 42
     # calculate the background percentage of coverage
42 43
     totalGenomeSize <- 
... ...
@@ -93,11 +94,28 @@ makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1,
93 94
   for(i in 1:length(xList)){
94 95
     hitsCNEs <- findOverlaps(xList[[i]], clusterRanges,
95 96
                              ignore.strand=TRUE, type="within")
96
-    cnes <- sapply(split(queryHits(hitsCNEs), subjectHits(hitsCNEs)), length)
97
+    
98
+    ## Add the number of CNEs
99
+    cnes <- lengths(split(queryHits(hitsCNEs), subjectHits(hitsCNEs)))
97 100
     mcols(clusterRanges)[[names(xList)[i]]] <- 0L
98 101
     mcols(clusterRanges)[[names(xList)[i]]][as.integer(names(cnes))] <- cnes
102
+    
103
+    ## Add the CNE ranges
104
+    cnes <- split(xList[[i]][queryHits(hitsCNEs)], subjectHits(hitsCNEs))
105
+    missingCNEs <- setdiff(seq_len(length(clusterRanges)),
106
+                           subjectHits(hitsCNEs))
107
+    for(j in missingCNEs){
108
+      ## This is really a bad implementation. 
109
+      cnes[[as.character(j)]] <- GRanges()
110
+    }
111
+    mcols(clusterRanges)[[paste(names(xList)[i], "CNE")]] <- 
112
+      cnes[order(as.integer(names(cnes)))]
99 113
   }
100 114
   
115
+  # Filter out the GRBs with few CNEs
116
+  indexToKeep <- apply(as.data.frame(mcols(clusterRanges)[names(xList)]) >= 
117
+                         minCNEs, 1, any)
118
+  clusterRanges <- clusterRanges[indexToKeep]
119
+  
101 120
   return(clusterRanges)
102
-}
103
-
121
+}
104 122
\ No newline at end of file
105 123
new file mode 100644
... ...
@@ -0,0 +1,53 @@
1
+### -----------------------------------------------------------------
2
+### orgKEGGIds2EntrezIDs: This script is supposed to parse the html page of 
3
+###              certain species's pathway from KEGG and 
4
+###              download the associated information for each KEGG pathway ID.
5
+### Exported!
6
+orgKEGGIds2EntrezIDs <- function(organism="Homo sapiens"){
7
+  ## Species mapping
8
+  organismMapping <- read.table("http://rest.kegg.jp/list/organism",
9
+                                header=FALSE, sep="\t", quote="",
10
+                                comment.char="")
11
+  organismID <- organismMapping[grepl(organism, organismMapping$V3,
12
+                                      ignore.case=TRUE),
13
+                                2, drop=TRUE]
14
+  if(length(organismID) == 0L){
15
+    stop("The provided organism is not available.",
16
+         "Please refer to http://rest.kegg.jp/list/organism for available",
17
+         "organisms")
18
+  }
19
+  html <- readLines(paste0("http://www.genome.jp/kegg-bin/show_organism?menu_type=pathway_maps&org=", organismID))
20
+  html <- grep("^\\d{5}&", html, value=TRUE)
21
+  ### Hopefully the ID is always 5-digit
22
+  pathwayIDs <- paste0(organismID, substr(html, 1L, 5L))
23
+  groups <- sample(rep_len(1L:ceiling(length(pathwayIDs) / 10),
24
+                           length.out=length(pathwayIDs)))
25
+  pathwayIDs <- split(pathwayIDs, groups)
26
+  
27
+  ## query with KEGG Rest server with 10 entries (maximal) a time, 
28
+  ## and 200s to 400s gap between each query.
29
+  query <- lapply(pathwayIDs,
30
+                  function(x){Sys.sleep(sample(200L:400L, size=1L));keggGet(x)})
31
+  
32
+  ## re-organise the query object
33
+  pathways <- list()
34
+  for(i in 1:length(query)){
35
+    for(j in 1:length(query[[i]])){
36
+      pathways[[query[[i]][[j]]$ENTRY]] <-
37
+        query[[i]][[j]]
38
+    }
39
+  }
40
+  ## Get the Pathway IDs to Entrez Gene IDs mapping
41
+  pathwayIDs2GeneIDs <- list()
42
+  for(i in 1:length(pathways)){
43
+    genesInfo <- pathways[[i]]$GENE
44
+    if(is.null(genesInfo)){
45
+      pathwayIDs2GeneIDs[[pathways[[i]]$ENTRY]] <- NA
46
+      next
47
+    }
48
+    pathwayIDs2GeneIDs[[pathways[[i]]$ENTRY]] <-
49
+      genesInfo[seq(1, length(genesInfo), by=2)]
50
+  }
51
+  pathwayIDs2GeneIDs <- pathwayIDs2GeneIDs[!is.na(pathwayIDs2GeneIDs)]
52
+  return(pathwayIDs2GeneIDs)
53
+}
0 54
\ No newline at end of file
... ...
@@ -49,7 +49,7 @@ CNEDensity <- function(dbName, tableName, chr, start, end,
49 49
   # Implement get_cne_ranges_in_region_partitioned_by_other_chr later!!!
50 50
   ranges <- reduce(ranges)
51 51
   covAll <- coverage(ranges, width=context_end)
52
-  runMeanAll <- runmean(covAll, k=windowSize, "constant")
52
+  runMeanAll <- suppressWarnings(runmean(covAll, k=windowSize, "constant"))
53 53
   ans <- as(runMeanAll, "GRanges")
54 54
   ans$score <- ans$score * 100
55 55
   return(ans)
... ...
@@ -52,7 +52,7 @@ CNE(assembly1Fn=character(1), assembly2Fn=character(1),
52 52
     \item{CNE12}{Object of class \code{"GRangePairs"}:
53 53
       The preliminary CNEs from axt file with assembly1 as reference.}
54 54
     \item{CNE21}{Object of class \code{"GRangePairs"}:
55
-      The preliminart CNEs from axt file with assembly2 as reference.}
55
+      The preliminary CNEs from axt file with assembly2 as reference.}
56 56
     \item{CNEMerged}{Object of class \code{"GRangePairs"}:
57 57
       The CNEs after merging CNE1 and CNE2.}
58 58
     \item{CNEFinal}{Object of class \code{"GRangePairs"}:
... ...
@@ -9,7 +9,7 @@
9 9
 }
10 10
 \usage{
11 11
   makeGRBs(x, winSize=NULL, genes=NULL, ratio=1,
12
-           background=c("chromosome", "genome"))
12
+           background=c("chromosome", "genome"), minCNEs=1L)
13 13
 }
14 14
   
15 15
 \arguments{
... ...
@@ -33,21 +33,26 @@
33 33
   }
34 34
   \item{background}{
35 35
     \code{character}(1):
36
-    can be "chromosome" or "genome". When slice the CNE density,
36
+    can be "chromosome" or "genome". When using \code{slice} for the CNE density,
37 37
     the background is calculated on a per-chromosome or whole-genome basis.
38 38
   }
39
+  \item{minCNEs}{
40
+    \code{integer}(1): the minimal number of CNEs that a GRB needs to have.
41
+  }
39 42
 }
40 43
 \details{
41
-  First we calculated the CNE densities from the CNEs.
44
+  First we calculate the CNE densities from the CNEs.
42 45
   Then we segment the regions according to the values of CNE densities.
43 46
   The regions with CNE densities above the expected CNE densities * ratio are
44 47
   considered as putative GRBs.
45
-  As a last step, the putative GRBs that do not encompass any gene
46
-  are filtered out.
48
+  Putative GRBs that do not encompass any gene are filtered out.
49
+  Finally, the GRBs that have fewer than \code{minCNEs} number of CNEs will
50
+  be filtered out.
47 51
 }
48 52
 \value{
49
-  A \code{GRanges} object of GRB coordinates is returned, with the number of CNEs
50
-  within each GRB.
53
+  A \code{GRanges} object of GRB coordinates is returned.
54
+  The numbers of CNEs and the coordinates of CNEs within each GRB are
55
+  returned as a metadata column.
51 56
 }
52 57
 
53 58
 \author{
... ...
@@ -69,5 +74,5 @@
69 74
   makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2,
70 75
            background="genome")
71 76
   makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2,
72
-           background="chromosome")
77
+           background="chromosome", minCNEs=3L)
73 78
 }
74 79
new file mode 100644
... ...
@@ -0,0 +1,31 @@
1
+\name{orgKEGGIds2EntrezIDs}
2
+\alias{orgKEGGIds2EntrezIDs}
3
+\title{
4
+  Fetch mapping from KEGG IDs to Entrez IDs
5
+}
6
+\description{
7
+  Given the desired organism name, fetch the mapping between KEGG IDs and Entrez
8
+  gene IDs.
9
+}
10
+\usage{
11
+  orgKEGGIds2EntrezIDs(organism="Homo sapiens")
12
+}
13
+
14
+\arguments{
15
+  \item{organism}{
16
+    \code{character}(1): the name of prganism to query. It has to be available
17
+    at http://rest.kegg.jp/list/organism.
18
+  }
19
+}
20
+\value{
21
+  A \code{list} of Entrez gene IDs with KEGG IDs as names.
22
+}
23
+\author{
24
+  Ge Tan
25
+}
26
+
27
+\examples{
28
+  \donttest{
29
+    orgKEGGIds2EntrezIDs(organism="Homo sapiens")
30
+  }
31
+}
0 32
\ No newline at end of file
... ...
@@ -6,8 +6,8 @@
6 6
 }
7 7
 \description{
8 8
   CNE widths can follow heavy tailed distribution that are associated with power-laws.
9
-  This function plot the reverse cumulative density distribution of CNE widths,
10
-  and fit a discrete power-law distribution.
9
+  This function plots the reverse cumulative density distribution of CNE widths,
10
+  and fits a discrete power-law distribution.
11 11
   Goodness of fit can also be evaluated.
12 12
 }
13 13
 \usage{
... ...
@@ -1,10 +1,10 @@
1 1
 \name{read.rmskFasta}
2 2
 \alias{read.rmskFasta}
3 3
 \title{
4
-  Read a soft reoeat masked fasta
4
+  Read a soft repeat masked fasta
5 5
 }
6 6
 \description{
7
-  Read a soft repeat masked fasta file into a \code{GRanges}.
7
+  Read a soft repeat masked fasta file into a \code{GRanges} object.
8 8
 }
9 9
 \usage{
10 10
   read.rmskFasta(fn)
... ...
@@ -5,7 +5,7 @@
5 5
   Read the cne file from Ancora format.
6 6
 }
7 7
 \description{
8
-  Read the Ancora CNE file into a \code{GRanges} or \code{GRangePairs}.
8
+  Read the Ancora CNE file into a \code{GRanges} or \code{GRangePairs} object.
9 9
 }
10 10
 \usage{
11 11
   readAncora(fn, assembly=NULL, tAssemblyFn=NULL, qAssemblyFn=NULL)
... ...
@@ -6,7 +6,7 @@
6 6
 }
7 7
 \description{
8 8
 Query the SQLite database based on chromosome, 
9
-coordinates and some other criterias.
9
+coordinates and some other criteria.
10 10
 Primarily not intended to be used directly.
11 11
 For the CNE density plot, \code{fetchCNEDensity} function should be used.
12 12
 }
... ...
@@ -15,7 +15,7 @@
15 15
   \item{distance}{
16 16
     It can be "far", "medium" or "close". It defines the scoring matrix used in
17 17
     \emph{lastz} aligner.
18
-    Generally, if two species are close to each other at human and chimp level,
18
+    Generally, if two species are close to each other, for example human and chimp,
19 19
     "close" should be used.
20 20
     If two species have a divergence time of 100 MYA, "far" should be used.
21 21
     In other cases, "medium" should be used.
... ...
@@ -55,7 +55,7 @@ Then we can run `lastz` function to generate the _lav_ files.
55 55
 assemblyDir <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit"
56 56
 axtDir <- "/Users/gtan/OneDrive/Project/CSC/CNEr/axt"
57 57
 assemblyTarget <- file.path(system.file("extdata",
58
-                              package="BSgenome.Drerio.UCSC.danRer10"),
58
+                            package="BSgenome.Drerio.UCSC.danRer10"),
59 59
                             "single_sequences.2bit")
60 60
 assemblyQuery <- file.path(system.file("extdata",
61 61
                                        package="BSgenome.Hsapiens.UCSC.hg38"),
... ...
@@ -156,7 +156,7 @@ netSyntenicFile <- chainNetSyntenic(allPreChain, assemblyTarget, assemblyQuery,
156 156
                                                       sub("\\.2bit$", "",
157 157
                                                       basename(assemblyQuery),
158 158
                                                       ignore.case = TRUE),
159
-                                              ".noClass.net")),
159
+                                               ".noClass.net")),
160 160
                      binaryChainNet="chainNet", binaryNetSyntenic="netSyntenic")
161 161
 ```
162 162
 
... ...
@@ -184,5 +184,4 @@ document was compiled:
184 184
 
185 185
 ```{r sessionInfo, echo=FALSE}
186 186
 sessionInfo()
187
-```
188
-
187
+```
189 188
\ No newline at end of file