git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@128651 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,250 +0,0 @@ |
1 |
- |
|
2 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
3 |
-## ## |
|
4 |
-## Project : seqTools ## |
|
5 |
-## Created : 26.August.2013 ## |
|
6 |
-## Author : W. Kaisers ## |
|
7 |
-## File : kMer.r ## |
|
8 |
-## Content : Functionality for counting DNA k-mers ## |
|
9 |
-## (independent of fastq or fasta files) ## |
|
10 |
-## countKmers, countDnaKmers, revCountDnaKmers, ## |
|
11 |
-## countGenomeKmers, countSpliceKmers ## |
|
12 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
13 |
- |
|
14 |
- |
|
15 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
16 |
-## Counts DNA k-mers on specified regions inside single (character) sequence |
|
17 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
18 |
-countDnaKmers <- function(dna, k, start=1, width=nchar(dna) - k + 1) |
|
19 |
-{ |
|
20 |
- if(!is.character(dna)) |
|
21 |
- stop("'dna' must be character.") |
|
22 |
- |
|
23 |
- if(length(dna) != 1) |
|
24 |
- stop("'dna' must have length 1.") |
|
25 |
- |
|
26 |
- if(is.numeric(start)) |
|
27 |
- start <- as.integer(start) |
|
28 |
- |
|
29 |
- if(is.numeric(width)) |
|
30 |
- width <- as.integer(width) |
|
31 |
- |
|
32 |
- if(length(width) == 1) |
|
33 |
- width <- rep(width, length(start)) |
|
34 |
- |
|
35 |
- if(is.numeric(k)) |
|
36 |
- k <- as.integer(k) |
|
37 |
- |
|
38 |
- if(length(k)!=1) |
|
39 |
- stop("'k' must have length 1.") |
|
40 |
- |
|
41 |
- if(k < 1) |
|
42 |
- stop("'k' must be positive.") |
|
43 |
- |
|
44 |
- if(k > max_k) |
|
45 |
- stop("'k' must not exceed", max_k, ".") |
|
46 |
- |
|
47 |
- nc <- nchar(dna) |
|
48 |
- if(k > nc) |
|
49 |
- stop("'k' must be <= nchar(dna).") |
|
50 |
- if(any(start + width + k > nc + 2)) |
|
51 |
- stop("Search region exceeds string end.") |
|
52 |
- |
|
53 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
54 |
- ## Counts N's |
|
55 |
- ## ToDo: Return value |
|
56 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
57 |
- nn <- integer(length(start)) |
|
58 |
- return(.Call("count_dna_Kmers", |
|
59 |
- dna, start, width, k, nn, PACKAGE="seqTools")) |
|
60 |
- |
|
61 |
-} |
|
62 |
- |
|
63 |
- |
|
64 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
65 |
-## Counts DNA k-mers on specified regions inside single (character) sequence |
|
66 |
-## in reverse direction |
|
67 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
68 |
-revCountDnaKmers <- function(dna, k, start, width) |
|
69 |
-{ |
|
70 |
- if(!is.character(dna)) |
|
71 |
- stop("'dna' must be character.") |
|
72 |
- |
|
73 |
- if(length(dna) != 1) |
|
74 |
- stop("'dna' must have length 1.") |
|
75 |
- |
|
76 |
- if(is.numeric(start)) |
|
77 |
- start <- as.integer(start) |
|
78 |
- |
|
79 |
- if(is.numeric(width)) |
|
80 |
- width <- as.integer(width) |
|
81 |
- |
|
82 |
- if(length(width) == 1) |
|
83 |
- width <- rep(width, length(start)) |
|
84 |
- |
|
85 |
- if(length(width) != length(start)) |
|
86 |
- stop("'width' must have length 1 or the same length as 'start'.") |
|
87 |
- |
|
88 |
- if(is.numeric(k)) |
|
89 |
- k <- as.integer(k) |
|
90 |
- |
|
91 |
- if(any(width + k > start)) |
|
92 |
- stop("'width' must be <= 'start' - 'k'.") |
|
93 |
- |
|
94 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
95 |
- ## nn contains N counts |
|
96 |
- ## ToDo: Add value of nn to returned object |
|
97 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
98 |
- nn <- integer(length(start)) |
|
99 |
- |
|
100 |
- return(.Call("rev_count_dna_Kmers", |
|
101 |
- dna, start, width, k, nn, PACKAGE="seqTools")) |
|
102 |
-} |
|
103 |
- |
|
104 |
- |
|
105 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
106 |
-## Counts DNA k-mers on specified regions inside multiple (character) sequences |
|
107 |
-## in possibly reversed direction (depending on strand) |
|
108 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
109 |
-countGenomeKmers <- function(dna, seqid, start, width, strand, k) |
|
110 |
-{ |
|
111 |
- if(!is.character(dna)) |
|
112 |
- stop("'dna' must be character.") |
|
113 |
- |
|
114 |
- if(!is.numeric(seqid)) |
|
115 |
- seqid <- as.integer(seqid) |
|
116 |
- rg <- range(seqid) |
|
117 |
- |
|
118 |
- if(rg[1] < 0) |
|
119 |
- stop("Negative seqid's are not allowed.") |
|
120 |
- |
|
121 |
- if(rg[2] > length(dna)) |
|
122 |
- stop("Out of range seqid's.") |
|
123 |
- |
|
124 |
- if(!is.numeric(start)) |
|
125 |
- stop("'start' must be numeric.") |
|
126 |
- start <- as.integer(start) |
|
127 |
- |
|
128 |
- if(!is.numeric(width)) |
|
129 |
- stop("'width' must be numeric") |
|
130 |
- width <- as.integer(width) |
|
131 |
- |
|
132 |
- if(is.factor(strand)) |
|
133 |
- strand <- as.integer(strand) |
|
134 |
- else |
|
135 |
- { |
|
136 |
- if(!is.numeric("strand")) |
|
137 |
- strand <- as.integer(strand) |
|
138 |
- } |
|
139 |
- |
|
140 |
- nStart <- length(start) |
|
141 |
- if( (length(seqid) != nStart) | (length(width) != nStart) | |
|
142 |
- (length(strand) != nStart) ) |
|
143 |
- stop("'seqid', 'start', 'width' and 'strand' must have same length.") |
|
144 |
- |
|
145 |
- if(length(k) != 1) |
|
146 |
- stop("'k' must be a single value.") |
|
147 |
- |
|
148 |
- if(!is.numeric(k)) |
|
149 |
- stop("'k' must be numeric.") |
|
150 |
- k<-as.integer(k) |
|
151 |
- |
|
152 |
- if(k <= 0) |
|
153 |
- stop("'k' must be >=1") |
|
154 |
- |
|
155 |
- if(k > max_k) |
|
156 |
- stop("'k' must not exceed", max_k, ".") |
|
157 |
- |
|
158 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
159 |
- ## Counts N's |
|
160 |
- ## ToDo: Return value |
|
161 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
162 |
- nn <- integer(length(start)) |
|
163 |
- return(.Call("count_genome_Kmers", dna, seqid, start, width, |
|
164 |
- strand, k, nn, PACKAGE="seqTools")) |
|
165 |
-} |
|
166 |
- |
|
167 |
- |
|
168 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
169 |
-## Counts DNA k-mers on each border of a splice-site defined by wLend and |
|
170 |
-## wRstart in range of size width |
|
171 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
172 |
- |
|
173 |
-countSpliceKmers <- function(dna, seqid, lEnd, rStart, width, strand, k) |
|
174 |
-{ |
|
175 |
- if(!is.character(dna)) |
|
176 |
- stop("'dna' must be character.") |
|
177 |
- |
|
178 |
- if(!is.numeric(seqid)) |
|
179 |
- stop("'seqid' must be numeric.") |
|
180 |
- seqid <- as.integer(seqid) |
|
181 |
- |
|
182 |
- rg <- range(seqid) |
|
183 |
- if(rg[1] < 0) |
|
184 |
- stop("Negative seqid's are not allowed.") |
|
185 |
- |
|
186 |
- if(rg[2] > length(dna)) |
|
187 |
- stop("Out of range seqid's.") |
|
188 |
- |
|
189 |
- if(!is.numeric(lEnd)) |
|
190 |
- stop("'lEnd' must be numeric.") |
|
191 |
- lEnd <- as.integer(lEnd) |
|
192 |
- |
|
193 |
- if(!is.numeric(rStart)) |
|
194 |
- stop("'rStart' must be numeric.") |
|
195 |
- rStart <- as.integer(rStart) |
|
196 |
- |
|
197 |
- if(!is.numeric(width)) |
|
198 |
- stop("'width' must be numeric.") |
|
199 |
- width <- as.integer(width) |
|
200 |
- |
|
201 |
- if(is.factor(strand)) |
|
202 |
- strand <- as.integer(strand) |
|
203 |
- else |
|
204 |
- { |
|
205 |
- if(!is.numeric("strand")) |
|
206 |
- strand <- as.integer(strand) |
|
207 |
- } |
|
208 |
- |
|
209 |
- nStart <- length(lEnd) |
|
210 |
- if(length(seqid) != nStart | length(rStart) != nStart | |
|
211 |
- length(width) != nStart | length(strand) != nStart) |
|
212 |
- { |
|
213 |
- stop(paste("'seqid', 'lEnd', 'rStart', 'width'", |
|
214 |
- "and 'strand' must have equal length.")) |
|
215 |
- } |
|
216 |
- if(!is.numeric(k)) |
|
217 |
- stop("'k' must be numeric.") |
|
218 |
- k <- as.integer(k) |
|
219 |
- |
|
220 |
- if(length(k) != 1) |
|
221 |
- stop("'k' must be a single value.") |
|
222 |
- |
|
223 |
- if(k > max_k) |
|
224 |
- stop("'k' must not exceed", max_k, ".") |
|
225 |
- |
|
226 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
227 |
- ## Plus strand |
|
228 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
229 |
- plus_strand<-strand == 1 |
|
230 |
- |
|
231 |
- if(sum(plus_strand) > 0) |
|
232 |
- { |
|
233 |
- if(any((lEnd[plus_strand] - width[plus_strand] - k + 1) < 0)) |
|
234 |
- stop("lEnd must be >= width+k-1 for all +-strand coordinates") |
|
235 |
- } |
|
236 |
- |
|
237 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
238 |
- ## Counts N's |
|
239 |
- ## ToDo: Return value |
|
240 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
241 |
- nn <- integer(length = nStart) |
|
242 |
- |
|
243 |
- return(.Call("count_splice_Kmers", dna, seqid, lEnd, rStart, width, |
|
244 |
- strand, k, nn, PACKAGE="seqTools")) |
|
245 |
-} |
|
246 |
- |
|
247 |
- |
|
248 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
249 |
-## END OF FILE |
|
250 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |