Fix a bug of ordering meta columns in readAncora
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/branches/RELEASE_3_4/madman/Rpacks/CNEr@123027 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
Package: CNEr |
2 |
-Version: 1.10.0 |
|
3 |
-Date: 2016-10-06 |
|
2 |
+Version: 1.10.1 |
|
3 |
+Date: 2016-10-25 |
|
4 | 4 |
Title: CNE Detection and Visualization |
5 | 5 |
Description: Large-scale identification and advanced visualization |
6 | 6 |
of sets of conserved noncoding elements. |
... | ... |
@@ -25,7 +25,8 @@ Imports: Biostrings (>= 2.33.4), |
25 | 25 |
poweRlaw (>= 0.60.3), |
26 | 26 |
annotate (>= 1.50.0), |
27 | 27 |
GO.db (>= 3.3.0), |
28 |
- R.utils (>= 2.3.0) |
|
28 |
+ R.utils (>= 2.3.0), |
|
29 |
+ KEGGREST (>= 1.14.0) |
|
29 | 30 |
Depends: R (>= 3.2.2) |
30 | 31 |
Suggests: Gviz (>= 1.7.4), |
31 | 32 |
BiocStyle, |
... | ... |
@@ -66,4 +67,5 @@ Collate: |
66 | 67 |
WholeGenomeAlignment.R |
67 | 68 |
Ancora.R |
68 | 69 |
CNE-methods.R |
69 |
- GO.R |
|
70 | 70 |
\ No newline at end of file |
71 |
+ GO.R |
|
72 |
+ KEGG.R |
... | ... |
@@ -62,6 +62,7 @@ importFrom(GO.db, GOCCANCESTOR, GOCCOFFSPRING, GOCCCHILDREN, |
62 | 62 |
GOMFANCESTOR, GOMFOFFSPRING, GOMFCHILDREN) |
63 | 63 |
importFrom("grDevices", "jpeg", "pdf", "png", "postscript") |
64 | 64 |
importFrom(R.utils, gunzip, gzip) |
65 |
+importFrom(KEGGREST, keggGet) |
|
65 | 66 |
|
66 | 67 |
### ----------------------------------------------------------------- |
67 | 68 |
### Export S4 classes defined in CNEr |
... | ... |
@@ -50,8 +50,8 @@ readAncora <- function(fn, assembly=NULL, |
50 | 50 |
strand="*", |
51 | 51 |
name=paste0(cne[[1]], ":", |
52 | 52 |
(cne[[2]]+1), "-", cne[[3]]), |
53 |
- seqnames2=cne[[1]], |
|
54 | 53 |
itemRgb=chr2colour(cne[[1]]), |
54 |
+ seqnames2=cne[[1]], |
|
55 | 55 |
seqinfo=seqinfoQuery) |
56 | 56 |
) |
57 | 57 |
if(is.null(assembly)){ |
... | ... |
@@ -144,13 +144,13 @@ makeCNEDensity <- function(x, outputDir=".", |
144 | 144 |
message("Making bedGraph files...") |
145 | 145 |
bedFirst <- reduce(bedFirst, ignore.strand=TRUE) |
146 | 146 |
covFirst <- coverage(bedFirst) |
147 |
- densityFirst <- runmean(covFirst, k=windowSizeFirst*1000, |
|
148 |
- endrule = "constant") * 100 |
|
147 |
+ densityFirst <- suppressWarnings(runmean(covFirst, k=windowSizeFirst*1000, |
|
148 |
+ endrule = "constant") * 100) |
|
149 | 149 |
|
150 | 150 |
bedSecond <- reduce(bedSecond, ignore.strand=TRUE) |
151 | 151 |
covSecond <- coverage(bedSecond) |
152 |
- densitySecond <- runmean(covSecond, k=windowSizeSecond*1000, |
|
153 |
- endrule = "constant") * 100 |
|
152 |
+ densitySecond <- suppressWarnings(runmean(covSecond, k=windowSizeSecond*1000, |
|
153 |
+ endrule = "constant") * 100) |
|
154 | 154 |
|
155 | 155 |
firstTrackLine <- new("GraphTrackLine", |
156 | 156 |
name=paste(genomeFirst, "CNEs density", threshold), |
... | ... |
@@ -2,7 +2,8 @@ |
2 | 2 |
### make GRBs from CNEs |
3 | 3 |
### Exported! |
4 | 4 |
makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1, |
5 |
- background=c("chromosome", "genome")){#, |
|
5 |
+ background=c("chromosome", "genome"), |
|
6 |
+ minCNEs=1L){#, |
|
6 | 7 |
#byChrom=FALSE |
7 | 8 |
if(!is(x, "GRangesList")){ |
8 | 9 |
stop("The input `x` must be a `GRangesList` object!") |
... | ... |
@@ -21,7 +22,7 @@ makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1, |
21 | 22 |
names(xList) <- as.character(1:length(xList)) |
22 | 23 |
} |
23 | 24 |
x <- unlist(x) |
24 |
- x <- reduce(x) |
|
25 |
+ x <- reduce(x, ignore.strand=TRUE) |
|
25 | 26 |
cov <- coverage(x) |
26 | 27 |
|
27 | 28 |
# Guess winSize from genome size if NULL |
... | ... |
@@ -36,7 +37,7 @@ makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1, |
36 | 37 |
stop("The `genes` must be a `GRanges` object!") |
37 | 38 |
} |
38 | 39 |
|
39 |
- density <- runmean(cov, k=winSize, endrule="constant") |
|
40 |
+ density <- suppressWarnings(runmean(cov, k=winSize, endrule="constant")) |
|
40 | 41 |
if(background == "genome"){ |
41 | 42 |
# calculate the background percentage of coverage |
42 | 43 |
totalGenomeSize <- |
... | ... |
@@ -93,11 +94,28 @@ makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1, |
93 | 94 |
for(i in 1:length(xList)){ |
94 | 95 |
hitsCNEs <- findOverlaps(xList[[i]], clusterRanges, |
95 | 96 |
ignore.strand=TRUE, type="within") |
96 |
- cnes <- sapply(split(queryHits(hitsCNEs), subjectHits(hitsCNEs)), length) |
|
97 |
+ |
|
98 |
+ ## Add the number of CNEs |
|
99 |
+ cnes <- lengths(split(queryHits(hitsCNEs), subjectHits(hitsCNEs))) |
|
97 | 100 |
mcols(clusterRanges)[[names(xList)[i]]] <- 0L |
98 | 101 |
mcols(clusterRanges)[[names(xList)[i]]][as.integer(names(cnes))] <- cnes |
102 |
+ |
|
103 |
+ ## Add the CNE ranges |
|
104 |
+ cnes <- split(xList[[i]][queryHits(hitsCNEs)], subjectHits(hitsCNEs)) |
|
105 |
+ missingCNEs <- setdiff(seq_len(length(clusterRanges)), |
|
106 |
+ subjectHits(hitsCNEs)) |
|
107 |
+ for(j in missingCNEs){ |
|
108 |
+ ## This is really a bad implementation. |
|
109 |
+ cnes[[as.character(j)]] <- GRanges() |
|
110 |
+ } |
|
111 |
+ mcols(clusterRanges)[[paste(names(xList)[i], "CNE")]] <- |
|
112 |
+ cnes[order(as.integer(names(cnes)))] |
|
99 | 113 |
} |
100 | 114 |
|
115 |
+ # Filter out the GRBs with few CNEs |
|
116 |
+ indexToKeep <- apply(as.data.frame(mcols(clusterRanges)[names(xList)]) >= |
|
117 |
+ minCNEs, 1, any) |
|
118 |
+ clusterRanges <- clusterRanges[indexToKeep] |
|
119 |
+ |
|
101 | 120 |
return(clusterRanges) |
102 |
-} |
|
103 |
- |
|
121 |
+} |
|
104 | 122 |
\ No newline at end of file |
105 | 123 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,53 @@ |
1 |
+### ----------------------------------------------------------------- |
|
2 |
+### orgKEGGIds2EntrezIDs: This script is supposed to parse the html page of |
|
3 |
+### certain species's pathway from KEGG and |
|
4 |
+### download the associated information for each KEGG pathway ID. |
|
5 |
+### Exported! |
|
6 |
+orgKEGGIds2EntrezIDs <- function(organism="Homo sapiens"){ |
|
7 |
+ ## Species mapping |
|
8 |
+ organismMapping <- read.table("http://rest.kegg.jp/list/organism", |
|
9 |
+ header=FALSE, sep="\t", quote="", |
|
10 |
+ comment.char="") |
|
11 |
+ organismID <- organismMapping[grepl(organism, organismMapping$V3, |
|
12 |
+ ignore.case=TRUE), |
|
13 |
+ 2, drop=TRUE] |
|
14 |
+ if(length(organismID) == 0L){ |
|
15 |
+ stop("The provided organism is not available.", |
|
16 |
+ "Please refer to http://rest.kegg.jp/list/organism for available", |
|
17 |
+ "organisms") |
|
18 |
+ } |
|
19 |
+ html <- readLines(paste0("http://www.genome.jp/kegg-bin/show_organism?menu_type=pathway_maps&org=", organismID)) |
|
20 |
+ html <- grep("^\\d{5}&", html, value=TRUE) |
|
21 |
+ ### Hopefully the ID is always 5-digit |
|
22 |
+ pathwayIDs <- paste0(organismID, substr(html, 1L, 5L)) |
|
23 |
+ groups <- sample(rep_len(1L:ceiling(length(pathwayIDs) / 10), |
|
24 |
+ length.out=length(pathwayIDs))) |
|
25 |
+ pathwayIDs <- split(pathwayIDs, groups) |
|
26 |
+ |
|
27 |
+ ## query with KEGG Rest server with 10 entries (maximal) a time, |
|
28 |
+ ## and 200s to 400s gap between each query. |
|
29 |
+ query <- lapply(pathwayIDs, |
|
30 |
+ function(x){Sys.sleep(sample(200L:400L, size=1L));keggGet(x)}) |
|
31 |
+ |
|
32 |
+ ## re-organise the query object |
|
33 |
+ pathways <- list() |
|
34 |
+ for(i in 1:length(query)){ |
|
35 |
+ for(j in 1:length(query[[i]])){ |
|
36 |
+ pathways[[query[[i]][[j]]$ENTRY]] <- |
|
37 |
+ query[[i]][[j]] |
|
38 |
+ } |
|
39 |
+ } |
|
40 |
+ ## Get the Pathway IDs to Entrez Gene IDs mapping |
|
41 |
+ pathwayIDs2GeneIDs <- list() |
|
42 |
+ for(i in 1:length(pathways)){ |
|
43 |
+ genesInfo <- pathways[[i]]$GENE |
|
44 |
+ if(is.null(genesInfo)){ |
|
45 |
+ pathwayIDs2GeneIDs[[pathways[[i]]$ENTRY]] <- NA |
|
46 |
+ next |
|
47 |
+ } |
|
48 |
+ pathwayIDs2GeneIDs[[pathways[[i]]$ENTRY]] <- |
|
49 |
+ genesInfo[seq(1, length(genesInfo), by=2)] |
|
50 |
+ } |
|
51 |
+ pathwayIDs2GeneIDs <- pathwayIDs2GeneIDs[!is.na(pathwayIDs2GeneIDs)] |
|
52 |
+ return(pathwayIDs2GeneIDs) |
|
53 |
+} |
|
0 | 54 |
\ No newline at end of file |
... | ... |
@@ -49,7 +49,7 @@ CNEDensity <- function(dbName, tableName, chr, start, end, |
49 | 49 |
# Implement get_cne_ranges_in_region_partitioned_by_other_chr later!!! |
50 | 50 |
ranges <- reduce(ranges) |
51 | 51 |
covAll <- coverage(ranges, width=context_end) |
52 |
- runMeanAll <- runmean(covAll, k=windowSize, "constant") |
|
52 |
+ runMeanAll <- suppressWarnings(runmean(covAll, k=windowSize, "constant")) |
|
53 | 53 |
ans <- as(runMeanAll, "GRanges") |
54 | 54 |
ans$score <- ans$score * 100 |
55 | 55 |
return(ans) |
... | ... |
@@ -52,7 +52,7 @@ CNE(assembly1Fn=character(1), assembly2Fn=character(1), |
52 | 52 |
\item{CNE12}{Object of class \code{"GRangePairs"}: |
53 | 53 |
The preliminary CNEs from axt file with assembly1 as reference.} |
54 | 54 |
\item{CNE21}{Object of class \code{"GRangePairs"}: |
55 |
- The preliminart CNEs from axt file with assembly2 as reference.} |
|
55 |
+ The preliminary CNEs from axt file with assembly2 as reference.} |
|
56 | 56 |
\item{CNEMerged}{Object of class \code{"GRangePairs"}: |
57 | 57 |
The CNEs after merging CNE1 and CNE2.} |
58 | 58 |
\item{CNEFinal}{Object of class \code{"GRangePairs"}: |
... | ... |
@@ -9,7 +9,7 @@ |
9 | 9 |
} |
10 | 10 |
\usage{ |
11 | 11 |
makeGRBs(x, winSize=NULL, genes=NULL, ratio=1, |
12 |
- background=c("chromosome", "genome")) |
|
12 |
+ background=c("chromosome", "genome"), minCNEs=1L) |
|
13 | 13 |
} |
14 | 14 |
|
15 | 15 |
\arguments{ |
... | ... |
@@ -33,21 +33,26 @@ |
33 | 33 |
} |
34 | 34 |
\item{background}{ |
35 | 35 |
\code{character}(1): |
36 |
- can be "chromosome" or "genome". When slice the CNE density, |
|
36 |
+ can be "chromosome" or "genome". When using \code{slice} for the CNE density, |
|
37 | 37 |
the background is calculated on a per-chromosome or whole-genome basis. |
38 | 38 |
} |
39 |
+ \item{minCNEs}{ |
|
40 |
+ \code{integer}(1): the minimal number of CNEs that a GRB needs to have. |
|
41 |
+ } |
|
39 | 42 |
} |
40 | 43 |
\details{ |
41 |
- First we calculated the CNE densities from the CNEs. |
|
44 |
+ First we calculate the CNE densities from the CNEs. |
|
42 | 45 |
Then we segment the regions according to the values of CNE densities. |
43 | 46 |
The regions with CNE densities above the expected CNE densities * ratio are |
44 | 47 |
considered as putative GRBs. |
45 |
- As a last step, the putative GRBs that do not encompass any gene |
|
46 |
- are filtered out. |
|
48 |
+ Putative GRBs that do not encompass any gene are filtered out. |
|
49 |
+ Finally, the GRBs that have fewer than \code{minCNEs} number of CNEs will |
|
50 |
+ be filtered out. |
|
47 | 51 |
} |
48 | 52 |
\value{ |
49 |
- A \code{GRanges} object of GRB coordinates is returned, with the number of CNEs |
|
50 |
- within each GRB. |
|
53 |
+ A \code{GRanges} object of GRB coordinates is returned. |
|
54 |
+ The numbers of CNEs and the coordinates of CNEs within each GRB are |
|
55 |
+ returned as a metadata column. |
|
51 | 56 |
} |
52 | 57 |
|
53 | 58 |
\author{ |
... | ... |
@@ -69,5 +74,5 @@ |
69 | 74 |
makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2, |
70 | 75 |
background="genome") |
71 | 76 |
makeGRBs(cneList, winSize=200, genes=refGenesDanRer10, ratio=1.2, |
72 |
- background="chromosome") |
|
77 |
+ background="chromosome", minCNEs=3L) |
|
73 | 78 |
} |
74 | 79 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,31 @@ |
1 |
+\name{orgKEGGIds2EntrezIDs} |
|
2 |
+\alias{orgKEGGIds2EntrezIDs} |
|
3 |
+\title{ |
|
4 |
+ Fetch mapping from KEGG IDs to Entrez IDs |
|
5 |
+} |
|
6 |
+\description{ |
|
7 |
+ Given the desired organism name, fetch the mapping between KEGG IDs and Entrez |
|
8 |
+ gene IDs. |
|
9 |
+} |
|
10 |
+\usage{ |
|
11 |
+ orgKEGGIds2EntrezIDs(organism="Homo sapiens") |
|
12 |
+} |
|
13 |
+ |
|
14 |
+\arguments{ |
|
15 |
+ \item{organism}{ |
|
16 |
+ \code{character}(1): the name of prganism to query. It has to be available |
|
17 |
+ at http://rest.kegg.jp/list/organism. |
|
18 |
+ } |
|
19 |
+} |
|
20 |
+\value{ |
|
21 |
+ A \code{list} of Entrez gene IDs with KEGG IDs as names. |
|
22 |
+} |
|
23 |
+\author{ |
|
24 |
+ Ge Tan |
|
25 |
+} |
|
26 |
+ |
|
27 |
+\examples{ |
|
28 |
+ \donttest{ |
|
29 |
+ orgKEGGIds2EntrezIDs(organism="Homo sapiens") |
|
30 |
+ } |
|
31 |
+} |
|
0 | 32 |
\ No newline at end of file |
... | ... |
@@ -6,8 +6,8 @@ |
6 | 6 |
} |
7 | 7 |
\description{ |
8 | 8 |
CNE widths can follow heavy tailed distribution that are associated with power-laws. |
9 |
- This function plot the reverse cumulative density distribution of CNE widths, |
|
10 |
- and fit a discrete power-law distribution. |
|
9 |
+ This function plots the reverse cumulative density distribution of CNE widths, |
|
10 |
+ and fits a discrete power-law distribution. |
|
11 | 11 |
Goodness of fit can also be evaluated. |
12 | 12 |
} |
13 | 13 |
\usage{ |
... | ... |
@@ -1,10 +1,10 @@ |
1 | 1 |
\name{read.rmskFasta} |
2 | 2 |
\alias{read.rmskFasta} |
3 | 3 |
\title{ |
4 |
- Read a soft reoeat masked fasta |
|
4 |
+ Read a soft repeat masked fasta |
|
5 | 5 |
} |
6 | 6 |
\description{ |
7 |
- Read a soft repeat masked fasta file into a \code{GRanges}. |
|
7 |
+ Read a soft repeat masked fasta file into a \code{GRanges} object. |
|
8 | 8 |
} |
9 | 9 |
\usage{ |
10 | 10 |
read.rmskFasta(fn) |
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
Read the cne file from Ancora format. |
6 | 6 |
} |
7 | 7 |
\description{ |
8 |
- Read the Ancora CNE file into a \code{GRanges} or \code{GRangePairs}. |
|
8 |
+ Read the Ancora CNE file into a \code{GRanges} or \code{GRangePairs} object. |
|
9 | 9 |
} |
10 | 10 |
\usage{ |
11 | 11 |
readAncora(fn, assembly=NULL, tAssemblyFn=NULL, qAssemblyFn=NULL) |
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
} |
7 | 7 |
\description{ |
8 | 8 |
Query the SQLite database based on chromosome, |
9 |
-coordinates and some other criterias. |
|
9 |
+coordinates and some other criteria. |
|
10 | 10 |
Primarily not intended to be used directly. |
11 | 11 |
For the CNE density plot, \code{fetchCNEDensity} function should be used. |
12 | 12 |
} |
... | ... |
@@ -15,7 +15,7 @@ |
15 | 15 |
\item{distance}{ |
16 | 16 |
It can be "far", "medium" or "close". It defines the scoring matrix used in |
17 | 17 |
\emph{lastz} aligner. |
18 |
- Generally, if two species are close to each other at human and chimp level, |
|
18 |
+ Generally, if two species are close to each other, for example human and chimp, |
|
19 | 19 |
"close" should be used. |
20 | 20 |
If two species have a divergence time of 100 MYA, "far" should be used. |
21 | 21 |
In other cases, "medium" should be used. |
... | ... |
@@ -55,7 +55,7 @@ Then we can run `lastz` function to generate the _lav_ files. |
55 | 55 |
assemblyDir <- "/Users/gtan/OneDrive/Project/CSC/CNEr/2bit" |
56 | 56 |
axtDir <- "/Users/gtan/OneDrive/Project/CSC/CNEr/axt" |
57 | 57 |
assemblyTarget <- file.path(system.file("extdata", |
58 |
- package="BSgenome.Drerio.UCSC.danRer10"), |
|
58 |
+ package="BSgenome.Drerio.UCSC.danRer10"), |
|
59 | 59 |
"single_sequences.2bit") |
60 | 60 |
assemblyQuery <- file.path(system.file("extdata", |
61 | 61 |
package="BSgenome.Hsapiens.UCSC.hg38"), |
... | ... |
@@ -156,7 +156,7 @@ netSyntenicFile <- chainNetSyntenic(allPreChain, assemblyTarget, assemblyQuery, |
156 | 156 |
sub("\\.2bit$", "", |
157 | 157 |
basename(assemblyQuery), |
158 | 158 |
ignore.case = TRUE), |
159 |
- ".noClass.net")), |
|
159 |
+ ".noClass.net")), |
|
160 | 160 |
binaryChainNet="chainNet", binaryNetSyntenic="netSyntenic") |
161 | 161 |
``` |
162 | 162 |
|
... | ... |
@@ -184,5 +184,4 @@ document was compiled: |
184 | 184 |
|
185 | 185 |
```{r sessionInfo, echo=FALSE} |
186 | 186 |
sessionInfo() |
187 |
-``` |
|
188 |
- |
|
187 |
+``` |
|
189 | 188 |
\ No newline at end of file |