git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@128652 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,1123 +0,0 @@ |
1 |
- |
|
2 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
3 |
-## ## |
|
4 |
-## Project : seqTools ## |
|
5 |
-## Created : 26.August.2013 ## |
|
6 |
-## Author : W. Kaisers ## |
|
7 |
-## Content : Doing some diagnostic and interventional tasks on fastq ## |
|
8 |
-## and fasta ## |
|
9 |
-## esp. DNA k-mer counts. ## |
|
10 |
-## Version : 0.99.34 ## |
|
11 |
-## ## |
|
12 |
-## Changelog : ## |
|
13 |
-## 26.08.13 : 0.0.1 Project created ## |
|
14 |
-## 03.09.13 : 0.0.6 C-Code valgrind tested ## |
|
15 |
-## 27.09.13 : 0.1.0 Added fastqDnaKmers ## |
|
16 |
-## 14.10.13 : 0.1.1 Added C-library for parsing gz fasta and fastq files ## |
|
17 |
-## 17.10.13 : 0.1.3 C-Wrapper for fastq working. ## |
|
18 |
-## 17.10.13 : 0.1.6 First version of R package ## |
|
19 |
-## 21.10.13 : 0.3.0 New C library for fastq parsing ## |
|
20 |
-## 28.10.13 : 0.4.0 Added fastq-loc functions ## |
|
21 |
-## 29.10.13 : 0.4.4 seqTools C-code valgrind tested. ## |
|
22 |
-## 01.11.13 : 0.5.0 Distance matrices implemented ## |
|
23 |
-## 02.11.13 : 0.5.1 First working version with clustering based on ## |
|
24 |
-## K-mers ## |
|
25 |
-## 07.11.13 : 0.5.4 countFastaKmers now resizes arrays automatically ## |
|
26 |
-## 09.11.13 : 0.6.0 count_fastq now resizes arrays automatically ## |
|
27 |
-## 11.11.13 : 0.6.5 Added fastq simulation functions ## |
|
28 |
-## 19.11.13 : 0.7.0 Added trimFastq function ## |
|
29 |
-## 30.11.13 : 0.9.0 Separated R source files ## |
|
30 |
-## 22.12.13 : 0.99.2 Added '['-operator for Fastqq class ## |
|
31 |
-## 19.01.14 : 0.99.7 Added zlib version control for correction of ## |
|
32 |
-## gzBuffer ## |
|
33 |
-## error ## |
|
34 |
-## + checks: cran win-builder + valgrind ## |
|
35 |
-## 21.05.14 : 0.99.34 Corrected error in count_fasta_Kmers ## |
|
36 |
-## which freezed function ## |
|
37 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
38 |
- |
|
39 |
-.onUnload <- function(libpath) { library.dynam.unload("seqTools", libpath) } |
|
40 |
- |
|
41 |
-## see: http://bioconductor.org/developers/how-to/coding-style/ |
|
42 |
- |
|
43 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
44 |
-## Data collection functions: |
|
45 |
-## Fastqq, fastqKmerLocs, fastqKmerSubsetLocs |
|
46 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
47 |
- |
|
48 |
-fastqq <- function(filenames, k=6, probeLabel) |
|
49 |
-{ |
|
50 |
- k <- as.integer(k) |
|
51 |
- |
|
52 |
- tl <- list() |
|
53 |
- tl$start <- Sys.time() |
|
54 |
- filenames <- path.expand(filenames) |
|
55 |
- |
|
56 |
- res <- .Call("count_fastq", filenames, k, PACKAGE="seqTools") |
|
57 |
- |
|
58 |
- tl$end <- Sys.time() |
|
59 |
- res@collectTime <- tl |
|
60 |
- |
|
61 |
- # Correct minSeqLen when empty files are counted |
|
62 |
- if(any(res@nReads==0)) |
|
63 |
- res@seqLen[1,res@nReads==0] <- 0 |
|
64 |
- |
|
65 |
- if(!missing(probeLabel)) |
|
66 |
- { |
|
67 |
- if(length(probeLabel) == res@nFiles) |
|
68 |
- res@probeLabel <- as.character(probeLabel) |
|
69 |
- else{ |
|
70 |
- warning("[Fastqq] probeLabel and filenames must have equal length.") |
|
71 |
- res@probeLabel <- as.character(1:res@nFiles) |
|
72 |
- } |
|
73 |
- }else{ |
|
74 |
- res@probeLabel <- as.character(1:res@nFiles) |
|
75 |
- } |
|
76 |
- return(res) |
|
77 |
-} |
|
78 |
- |
|
79 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
80 |
-## fastq K-mer locs |
|
81 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
82 |
- |
|
83 |
-fastqKmerLocs <- function(filenames, k=4) |
|
84 |
-{ |
|
85 |
- if(!is.numeric(k)) |
|
86 |
- stop("'k' must be numeric.") |
|
87 |
- k <- as.integer(k) |
|
88 |
- |
|
89 |
- if( (k < 0) || (k > max_k) ) |
|
90 |
- stop("'k' must be in range 0, ... , 16.") |
|
91 |
- |
|
92 |
- return(.Call("fastq_Kmer_locs", filenames, k, PACKAGE="seqTools")) |
|
93 |
-} |
|
94 |
- |
|
95 |
- |
|
96 |
-fastqKmerSubsetLocs <- function(filenames, k=4, kIndex) |
|
97 |
-{ |
|
98 |
- # Returns list with matrix elements. |
|
99 |
- if(!is.numeric(k)) |
|
100 |
- stop("'k' must be numeric.") |
|
101 |
- |
|
102 |
- k <- as.integer(k) |
|
103 |
- |
|
104 |
- if( (k < 0) || (k > max_k) ) |
|
105 |
- stop("'k' must be in range 0, ... , ", max_k, ".") |
|
106 |
- |
|
107 |
- if(!is.numeric(kIndex)) |
|
108 |
- stop("'kIndex' must be numeric.") |
|
109 |
- kIndex <- as.integer(kIndex) |
|
110 |
- |
|
111 |
- if(any(kIndex < 0)) |
|
112 |
- stop("No negative 'kIndex' values allowed.") |
|
113 |
- |
|
114 |
- if(any(kIndex > (4^k)) ) |
|
115 |
- stop("'kIndex' out of range (>4^k).") |
|
116 |
- |
|
117 |
- return(.Call("fastq_KmerSubset_locs", |
|
118 |
- filenames, k, kIndex, PACKAGE="seqTools")) |
|
119 |
-} |
|
120 |
- |
|
121 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
122 |
-## End: Data collection functions. |
|
123 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
124 |
- |
|
125 |
- |
|
126 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
127 |
-## Standard slot accessor functions |
|
128 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
129 |
- |
|
130 |
- |
|
131 |
-setMethod("fileNames", "Fastqq", function(object) |
|
132 |
- {return(object@filenames)}) |
|
133 |
- |
|
134 |
-setMethod("collectTime", "Fastqq", function(object) |
|
135 |
- {return(object@collectTime)}) |
|
136 |
- |
|
137 |
-setMethod("collectDur", "Fastqq", function(object) { |
|
138 |
-return(as.numeric(difftime(object@collectTime$end, object@collectTime$start, |
|
139 |
- units = "secs"))) |
|
140 |
-}) |
|
141 |
- |
|
142 |
-setMethod("getK", "Fastqq", function(object) {return(object@k)}) |
|
143 |
- |
|
144 |
-setMethod("nFiles", "Fastqq", function(object) {return(object@nFiles)}) |
|
145 |
- |
|
146 |
-setMethod("nNnucs", "Fastqq", function(object) {return(object@nN)}) |
|
147 |
- |
|
148 |
-setMethod("nReads", "Fastqq", function(object) {return(object@nReads)}) |
|
149 |
- |
|
150 |
-setMethod("maxSeqLen", "Fastqq", function(object) {return(object@maxSeqLen)}) |
|
151 |
- |
|
152 |
-setMethod("seqLenCount", "Fastqq", function(object) |
|
153 |
-{ |
|
154 |
- res<-object@seqLenCount |
|
155 |
- colnames(res) <- object@probeLabel |
|
156 |
- return(res) |
|
157 |
-}) |
|
158 |
- |
|
159 |
-setMethod("nucFreq", "Fastqq", function(object, i) |
|
160 |
-{ |
|
161 |
- if(missing(i)) |
|
162 |
- stop("Argument 'i' is not optional.") |
|
163 |
- |
|
164 |
- if(!is.numeric(i)) |
|
165 |
- stop("'i' must be numeric.") |
|
166 |
- |
|
167 |
- i <- as.integer(i) |
|
168 |
- if( (i < 1) || (i > object@nFiles) ) |
|
169 |
- stop("'i' must be >0 and < nFiles.") |
|
170 |
- |
|
171 |
- return(object@nac[[i]]) |
|
172 |
-}) |
|
173 |
- |
|
174 |
-setMethod("gcContent", "Fastqq", function(object, i) |
|
175 |
-{ |
|
176 |
- if(missing(i)) |
|
177 |
- stop("Argument 'i' is not optional.") |
|
178 |
- |
|
179 |
- if(!is.numeric(i)) |
|
180 |
- stop("'i' must be numeric.") |
|
181 |
- |
|
182 |
- i <- as.integer(i) |
|
183 |
- if( (i < 1) || (i > object@nFiles)) |
|
184 |
- stop("'i' must be >0 and < nFiles.") |
|
185 |
- |
|
186 |
- return(object@gcContent[, i]) |
|
187 |
-}) |
|
188 |
- |
|
189 |
- |
|
190 |
-setMethod("gcContentMatrix", "Fastqq", function(object) |
|
191 |
-{ |
|
192 |
- gcc <- object@gcContent |
|
193 |
- colnames(gcc) <- object@probeLabel |
|
194 |
- return(gcc) |
|
195 |
-}) |
|
196 |
- |
|
197 |
-setMethod("seqLen", "Fastqq", function(object) |
|
198 |
-{ |
|
199 |
- sql <- object@seqLen |
|
200 |
- colnames(sql) <- object@probeLabel |
|
201 |
- return(sql) |
|
202 |
-}) |
|
203 |
- |
|
204 |
-setMethod("kmerCount", "Fastqq", function(object) |
|
205 |
-{ |
|
206 |
- kmer <- object@kmer |
|
207 |
- colnames(kmer) <- object@probeLabel |
|
208 |
- return(kmer) |
|
209 |
-}) |
|
210 |
- |
|
211 |
- |
|
212 |
-setMethod("probeLabel", "Fastqq", function(object){return(object@probeLabel)}) |
|
213 |
-setReplaceMethod("probeLabel", "Fastqq", function(object, value) |
|
214 |
-{ |
|
215 |
- if(length(value) != nFiles(object)) |
|
216 |
- stop("'value' must have length ", nFiles(object), ".") |
|
217 |
- |
|
218 |
- val <- as.character(value) |
|
219 |
- if(any(table(val)) > 1) |
|
220 |
- { |
|
221 |
- warning("[probeLabel <- .Fastqq] probeLabel unique suffix added.") |
|
222 |
- val <- paste(1:nFiles(object), val, sep="_") |
|
223 |
- } |
|
224 |
- object@probeLabel <- val |
|
225 |
- |
|
226 |
- return(object) |
|
227 |
-}) |
|
228 |
- |
|
229 |
-setMethod("phred", signature="Fastqq", definition=function(object, i) |
|
230 |
-{ |
|
231 |
- if(missing(i)) |
|
232 |
- stop("Argument 'i' is not optional.") |
|
233 |
- |
|
234 |
- if(!is.numeric(i)) |
|
235 |
- stop("'i' must be numeric.") |
|
236 |
- |
|
237 |
- i <- as.integer(i) |
|
238 |
- if( (i < 1) || (i > object@nFiles) ) |
|
239 |
- stop("'i' must be >0 and < nFiles.") |
|
240 |
- |
|
241 |
- return(object@phred[[i]]) |
|
242 |
-}) |
|
243 |
- |
|
244 |
- |
|
245 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
246 |
-## End: Standard slot accessor functions |
|
247 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
248 |
- |
|
249 |
- |
|
250 |
-setMethod("show", "Fastqq", function(object) |
|
251 |
-{ |
|
252 |
- bm <- Sys.localeconv()[7] |
|
253 |
- w <- 20 |
|
254 |
- r <- "right" |
|
255 |
- cat("Class : ", format(class(object), w=w, j=r) , "\n", sep="") |
|
256 |
- cat("nFiles : ", format(format(nFiles(object) , big.m=bm), w=w, j=r) , "\n", sep="") |
|
257 |
- cat("maxSeqLen : ", format(format(maxSeqLen(object) , big.m=bm), w=w, j=r) , "\n", sep="") |
|
258 |
- cat("k (Kmer len): ", format(format(getK(object) , big.m=bm), w=w, j=r) , "\n", sep="") |
|
259 |
- cat("\n") |
|
260 |
- cat("nReads : ", format(format(sum(as.numeric(nReads(object))) , big.m=bm), w=w, j=r) , "\n", sep="") |
|
261 |
- cat("nr N nuc : ", format(format(sum(nNnucs(object)) , big.m=bm), w=w, j=r) , "\n", sep="") |
|
262 |
- cat("Min seq len : ", format(format(min(seqLen(object)[1, ]), big.m=bm), w=w, j=r) , "\n", sep="") |
|
263 |
- cat("Max seq len : ", format(format(max(seqLen(object)[2, ]), big.m=bm), w=w, j=r) , "\n", sep="") |
|
264 |
- return(invisible()) |
|
265 |
-}) |
|
266 |
- |
|
267 |
- |
|
268 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
269 |
-## Phred related functions |
|
270 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
271 |
- |
|
272 |
-setMethod("phredQuantiles", "Fastqq", function(object, quantiles, i, ...) |
|
273 |
-{ |
|
274 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
275 |
- # Checking arguments |
|
276 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
277 |
- |
|
278 |
- ## Check quantiles argument |
|
279 |
- if(missing(quantiles)) |
|
280 |
- stop("'quantiles' argument is not optional") |
|
281 |
- |
|
282 |
- if(!is.numeric(quantiles)) |
|
283 |
- stop("Quantiles must be numeric.") |
|
284 |
- |
|
285 |
- if(!(all(quantiles >= 0) & all(quantiles <= 1) )) |
|
286 |
- stop("All quantiles mustbe in [0, 1]") |
|
287 |
- |
|
288 |
- quantiles <- sort(unique(round(quantiles, 2))) |
|
289 |
- |
|
290 |
- ## Check 'i' argument |
|
291 |
- if(missing(i)) |
|
292 |
- stop("'i' argument is not optional") |
|
293 |
- |
|
294 |
- if(!is.numeric(i)) |
|
295 |
- stop("'i' must be numeric.") |
|
296 |
- |
|
297 |
- if(length(i) > 1) |
|
298 |
- stop("'i' must have length 1") |
|
299 |
- |
|
300 |
- i <- as.integer(i) |
|
301 |
- if( (i < 1) || (i > object@nFiles) ) |
|
302 |
- stop("'i' must be >0 and <nFiles.") |
|
303 |
- |
|
304 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
305 |
- # Count qual values for each sequence position |
|
306 |
- # Convert integer counts into column-wise relative values |
|
307 |
- # Maximum counted read length |
|
308 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
309 |
- vec <- 1:seqLen(object)[2, i] |
|
310 |
- qrel <- as.data.frame(apply(object@phred[[i]][, vec], 2, rel_int)) |
|
311 |
- names(qrel) <- vec |
|
312 |
- |
|
313 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
314 |
- # Walk through each column and extract row number |
|
315 |
- # for given quantile values |
|
316 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
317 |
- res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools") |
|
318 |
- return(res) |
|
319 |
-}) |
|
320 |
- |
|
321 |
- |
|
322 |
-setMethod("plotPhredQuant", "Fastqq", function(object, i, main, ...){ |
|
323 |
- if(!is.numeric(i)) |
|
324 |
- stop("'i' must be numeric.") |
|
325 |
- |
|
326 |
- i <- as.integer(i) |
|
327 |
- |
|
328 |
- if( (i < 1) || (i > object@nFiles) ) |
|
329 |
- stop("'i' must be >0 and <nFiles.") |
|
330 |
- |
|
331 |
- quant <- c(0.1, 0.25, 0.5, 0.75, 0.9) |
|
332 |
- cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4") |
|
333 |
- qq <- phredQuantiles(object, quant, i) |
|
334 |
- maxQ = floor(1.2*max(qq)) |
|
335 |
- xv <- 1:ncol(qq) |
|
336 |
- |
|
337 |
- if(missing(main)) |
|
338 |
- { |
|
339 |
- main <- paste("Position wise Phred Quantiles (", |
|
340 |
- probeLabel(object)[i], ")", sep="") |
|
341 |
- } |
|
342 |
- |
|
343 |
- plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1, |
|
344 |
- ylab = "Phred score", xlab="Sequence position", main=main, ...) |
|
345 |
- |
|
346 |
- lines(xv, qq[1, ], col=cols[1], lty=2) |
|
347 |
- lines(xv, qq[2, ], col=cols[2], lty=1) |
|
348 |
- lines(xv, qq[3, ], col=cols[3], lwd=2) |
|
349 |
- lines(xv, qq[4, ], col=cols[4], lty=1) |
|
350 |
- lines(xv, qq[5, ], col=cols[5], lty=2) |
|
351 |
- |
|
352 |
- legend("top", ncol=6, lty=c(2, 1, 1, 1, 2), |
|
353 |
- lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, |
|
354 |
- legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8) |
|
355 |
- return(invisible()) |
|
356 |
-}) |
|
357 |
- |
|
358 |
- |
|
359 |
- |
|
360 |
- |
|
361 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
362 |
-## Global Phred content functions |
|
363 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
364 |
-setMethod("phredDist", "Fastqq", function(object, i){ |
|
365 |
- idx <- 1:nFiles(object) |
|
366 |
- |
|
367 |
- if(missing(i)) |
|
368 |
- i <- idx |
|
369 |
- else |
|
370 |
- { |
|
371 |
- if(!is.numeric(i)) |
|
372 |
- stop("'i' must be numeric.") |
|
373 |
- |
|
374 |
- if(!all(is.element(i, idx))) |
|
375 |
- stop("'i' is out of range.") |
|
376 |
- } |
|
377 |
- |
|
378 |
- phred <- Reduce("+", object@phred[i]) |
|
379 |
- phred <- matrix(as.numeric(phred), nrow=nrow(phred)) |
|
380 |
- |
|
381 |
- phred_vals <- apply(phred, 1, sum) |
|
382 |
- phred_dist <- phred_vals/sum(phred_vals) |
|
383 |
- names(phred_dist) <- rownames(object@phred[[1]]) |
|
384 |
- |
|
385 |
- return(phred_dist) |
|
386 |
-}) |
|
387 |
- |
|
388 |
- |
|
389 |
-setMethod("plotPhredDist", "Fastqq", function(object, i, maxp=45, col, ...) |
|
390 |
-{ |
|
391 |
- if(!is.numeric(maxp)) |
|
392 |
- stop("maxp must be numeric") |
|
393 |
- |
|
394 |
- if(!is.integer(maxp)) |
|
395 |
- maxp<-as.integer(maxp) |
|
396 |
- |
|
397 |
- if(maxp <= 0) |
|
398 |
- stop("maxp must be >=0") |
|
399 |
- |
|
400 |
- if(missing(col)) |
|
401 |
- col <- topo.colors(10)[3] |
|
402 |
- |
|
403 |
- phred <- phredDist(object, i) |
|
404 |
- maxy <- ceiling(max(phred) * 5) / 5 |
|
405 |
- x <- 1:maxp |
|
406 |
- xmax <- 10 * (ceiling(maxp / 10)) |
|
407 |
- |
|
408 |
- plot(x, phred[x], ylim=c(0, maxy), xlim=c(0, xmax), type="l", lwd=2, |
|
409 |
- col=col, yaxt="n", bty="n", xlab="Phred value", |
|
410 |
- ylab="Content (%)", ...) |
|
411 |
- |
|
412 |
- ylb <- 0:(10 * maxy) / 10 |
|
413 |
- axis(side=2, at=ylb, labels=100 * ylb, las=1) |
|
414 |
- return(invisible()) |
|
415 |
-}) |
|
416 |
- |
|
417 |
- |
|
418 |
-# + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
419 |
-# Not exported: |
|
420 |
-# + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
421 |
-pd_l10 <- function(x){ x <- phredDist(x); return(sum(x[1:10]) / sum(x))} |
|
422 |
- |
|
423 |
- |
|
424 |
-setMethod("propPhred","Fastqq",function(object, greater=30, less=93) |
|
425 |
-{ |
|
426 |
- if(!is.numeric(greater)) |
|
427 |
- stop("'greater' must be numeric.") |
|
428 |
- |
|
429 |
- if(length(greater) != 1) |
|
430 |
- stop("'greater' must have length 1.") |
|
431 |
- |
|
432 |
- if(!is.numeric(less)) |
|
433 |
- stop("'less' must be numeric.") |
|
434 |
- |
|
435 |
- if(length(less) != 1) |
|
436 |
- stop("'less must have length 1.") |
|
437 |
- |
|
438 |
- ## + + + + + + + + + + + + + + + + + + + + + + ## |
|
439 |
- ## greater and less shall be used as |
|
440 |
- ## array indices: increase greater |
|
441 |
- ## + + + + + + + + + + + + + + + + + + + + + + ## |
|
442 |
- greater<-as.integer(greater+1) |
|
443 |
- less<-as.integer(less) |
|
444 |
- |
|
445 |
- if(greater < 1) |
|
446 |
- stop("'greater' must be >=0.") |
|
447 |
- |
|
448 |
- if(less > 93) |
|
449 |
- stop("'less' must be < 94.") |
|
450 |
- |
|
451 |
- if(greater >= less) |
|
452 |
- stop("'greater' must be <= 'less'") |
|
453 |
- |
|
454 |
- n <- nFiles(object) |
|
455 |
- res <- numeric(n) |
|
456 |
- for(i in 1:n) |
|
457 |
- { |
|
458 |
- pd <- phredDist(object, i) |
|
459 |
- res[i] <- sum(pd[greater:less]) |
|
460 |
- } |
|
461 |
- names(res) <- probeLabel(object) |
|
462 |
- return(res) |
|
463 |
-}) |
|
464 |
- |
|
465 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
466 |
-## End: Global Phred content functions |
|
467 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
468 |
- |
|
469 |
-setMethod("mergedPhred", "Fastqq", function(object){ |
|
470 |
- sql <- seqLen(object) |
|
471 |
- maxSeqLen <- max(sql[2, ]) |
|
472 |
- |
|
473 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
474 |
- ## as.numeric: Sum on integer is likely to exceed |
|
475 |
- ## maximal 32-bit integer values |
|
476 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
477 |
- |
|
478 |
- if(sql[2, 1] < maxSeqLen) |
|
479 |
- { |
|
480 |
- mp <- as.numeric(.Call("r_enlarge_int_mat", object@phred[[1]], |
|
481 |
- c(nrow(object@phred[[1]]), maxSeqLen), PACKAGE="seqTools")) |
|
482 |
- }else{ |
|
483 |
- mp <- as.numeric(object@phred[[1]]) |
|
484 |
- } |
|
485 |
- |
|
486 |
- |
|
487 |
- n <- nFiles(object) |
|
488 |
- if(n > 1) |
|
489 |
- { |
|
490 |
- for(i in 2:n) |
|
491 |
- { |
|
492 |
- if(sql[2, i] < maxSeqLen) |
|
493 |
- { |
|
494 |
- mp <- mp + as.numeric(.Call("r_enlarge_int_mat", |
|
495 |
- object@phred[[i]], |
|
496 |
- c(nrow(object@phred[[i]]), maxSeqLen), |
|
497 |
- PACKAGE="seqTools")) |
|
498 |
- }else{ |
|
499 |
- mp <- mp + as.numeric(object@phred[[i]]) |
|
500 |
- } |
|
501 |
- } |
|
502 |
- } |
|
503 |
- mp <- matrix(mp, ncol = maxSeqLen) |
|
504 |
- rownames(mp) <- rownames(object@phred[[1]]) |
|
505 |
- colnames(mp) <- 1:maxSeqLen |
|
506 |
- return(mp) |
|
507 |
-}) |
|
508 |
- |
|
509 |
-setMethod("mergedPhredQuantiles", "Fastqq", function(object, quantiles) |
|
510 |
-{ |
|
511 |
- if(!(all(quantiles >= 0) & all(quantiles <= 1)) ) |
|
512 |
- stop("[mergedPhredQuantiles.Fastqq] all quantiles mustbe in [0, 1]") |
|
513 |
- quantiles <- sort(unique(round(quantiles, 2))) |
|
514 |
- |
|
515 |
- sql <- seqLen(object) |
|
516 |
- maxSeqLen <- max(sql[2, ]) |
|
517 |
- |
|
518 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
519 |
- ## Count qual values for each sequence position |
|
520 |
- ## Convert counts into column-wise relative values |
|
521 |
- ## Maximum counted read length |
|
522 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
523 |
- vec <- 1:maxSeqLen |
|
524 |
- mrg <- mergedPhred(object) |
|
525 |
- qrel <- as.data.frame(apply(mrg[, vec], 2, rel_real)) |
|
526 |
- names(qrel) |
|
527 |
- names(qrel) <- vec |
|
528 |
- |
|
529 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
530 |
- ## Walk through each column and extract row number |
|
531 |
- ## for given quantile values |
|
532 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
533 |
- res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools") |
|
534 |
- return(res) |
|
535 |
-}) |
|
536 |
- |
|
537 |
- |
|
538 |
-setMethod("plotMergedPhredQuant", "Fastqq", function(object, main, ...) |
|
539 |
-{ |
|
540 |
- quant <- c(0.1, 0.25, 0.5, 0.75, 0.9) |
|
541 |
- cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4") |
|
542 |
- qq <- mergedPhredQuantiles(object, quant) |
|
543 |
- maxQ = floor(1.2*max(qq)) |
|
544 |
- xv <- 1:ncol(qq) |
|
545 |
- |
|
546 |
- if(missing(main)) |
|
547 |
- main <- paste("Merged position wise Phred Quantiles.", sep = "") |
|
548 |
- |
|
549 |
- plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1, |
|
550 |
- ylab="Phred score", xlab="Sequence position", main=main, ...) |
|
551 |
- |
|
552 |
- |
|
553 |
- lines(xv, qq[1, ], col=cols[1], lty=2) |
|
554 |
- lines(xv, qq[2, ], col=cols[2], lty=1) |
|
555 |
- lines(xv, qq[3, ], col=cols[3], lwd=2) |
|
556 |
- lines(xv, qq[4, ], col=cols[4], lty=1) |
|
557 |
- lines(xv, qq[5, ], col=cols[5], lty=2) |
|
558 |
- |
|
559 |
- legend("top", ncol=6, lty=c(2, 1, 1, 1, 2), |
|
560 |
- lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, |
|
561 |
- legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8) |
|
562 |
- |
|
563 |
- return(invisible()) |
|
564 |
-}) |
|
565 |
- |
|
566 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
567 |
-## End: Phred related functions |
|
568 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
569 |
- |
|
570 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
571 |
-## Nucleotide frequency related functions |
|
572 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
573 |
- |
|
574 |
- |
|
575 |
-setMethod("plotNucFreq", "Fastqq", function(object, i, main, maxx, ...) |
|
576 |
-{ |
|
577 |
- if(!is.numeric(i)) |
|
578 |
- stop("'i' must be numeric.") |
|
579 |
- |
|
580 |
- i <- as.integer(i) |
|
581 |
- |
|
582 |
- if( (i < 1) || (i > object@nFiles) ) |
|
583 |
- stop("'i' must be >0 and <nFiles.") |
|
584 |
- |
|
585 |
- maxSeqlen <- max(seqLen(object)[2, ]) |
|
586 |
- if(missing(maxx)) |
|
587 |
- { |
|
588 |
- maxx <- maxSeqlen |
|
589 |
- } |
|
590 |
- else |
|
591 |
- { |
|
592 |
- if(!is.numeric(maxx)) |
|
593 |
- stop("'maxx' must be numeric.") |
|
594 |
- maxx <- as.integer(maxx) |
|
595 |
- if(maxx < 1) |
|
596 |
- stop("'maxx' must be >0.") |
|
597 |
- if(maxx > maxSeqlen) |
|
598 |
- maxx <- maxSeqlen |
|
599 |
- } |
|
600 |
- |
|
601 |
- # Skip extra line |
|
602 |
- x <- 1:maxx |
|
603 |
- nac <- object@nac[[i]][1:4, x] |
|
604 |
- nacrel <- apply(nac, 2, rel_int) |
|
605 |
- maxY = round(1.4 * max(nacrel), 1) |
|
606 |
- |
|
607 |
- # Maximum counted read length |
|
608 |
- nacrel <- apply(nac, 2, rel_int) |
|
609 |
- cols <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00") |
|
610 |
- |
|
611 |
- if(missing(main)) |
|
612 |
- main <- paste("Position wise Nucleotide frequency (", |
|
613 |
- probeLabel(object)[i], ")", sep="") |
|
614 |
- |
|
615 |
- plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1, |
|
616 |
- ylab="Nucleotide fequency", xlab="sequence position", |
|
617 |
- main=main, ...) |
|
618 |
- |
|
619 |
- lines(x, nacrel[1, ], col=cols[1], lwd=2) |
|
620 |
- lines(x, nacrel[2, ], col=cols[2], lwd=2) |
|
621 |
- lines(x, nacrel[3, ], col=cols[3], lwd=2) |
|
622 |
- lines(x, nacrel[4, ], col=cols[4], lwd=2) |
|
623 |
- |
|
624 |
- legend("top", ncol=6, |
|
625 |
- lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, |
|
626 |
- legend=c("A", "C", "G", "T"), bty="n", cex=0.8) |
|
627 |
- |
|
628 |
- return(invisible()) |
|
629 |
-}) |
|
630 |
- |
|
631 |
- |
|
632 |
-setMethod("plotGCcontent", "Fastqq", function(object,main,...) |
|
633 |
-{ |
|
634 |
- cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", |
|
635 |
- "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A") |
|
636 |
- |
|
637 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
638 |
- ## Normalize matrix to colsum = 1 |
|
639 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
640 |
- |
|
641 |
- gc <- apply(object@gcContent, 2, rel_int) |
|
642 |
- maxY = round(1.3*max(gc), 2) |
|
643 |
- nf <- nFiles(object) |
|
644 |
- x <- 1:nrow(gc) |
|
645 |
- |
|
646 |
- if(missing(main)) |
|
647 |
- main<-"GC content" |
|
648 |
- |
|
649 |
- plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1, |
|
650 |
- ylab="Proportion of reads (%)", xlab="Relative GC content (%)", |
|
651 |
- main=main, ...) |
|
652 |
- |
|
653 |
- for(i in 1:nf) |
|
654 |
- lines(x, gc[, i], col=cols[i], lwd=2) |
|
655 |
- |
|
656 |
- legend("right", lwd=2, col=cols, legend=probeLabel(object), |
|
657 |
- bty="n", cex=0.8) |
|
658 |
- |
|
659 |
- return(invisible()) |
|
660 |
-}) |
|
661 |
- |
|
662 |
- |
|
663 |
-setMethod("plotNucCount", "Fastqq", function(object, nucs=16, maxx, ...) |
|
664 |
-{ |
|
665 |
- |
|
666 |
- # j = 16: N, j = 2:3: gc |
|
667 |
- if(!is.numeric(nucs)) |
|
668 |
- stop("'nucs' must be numeric.") |
|
669 |
- |
|
670 |
- nucs <- as.integer(nucs) |
|
671 |
- if(any(nucs < 1) || any(nucs > 19)) |
|
672 |
- stop("'nucs' must be >0 and <20.") |
|
673 |
- |
|
674 |
- maxSeqlen <- max(seqLen(object)[2, ]) |
|
675 |
- |
|
676 |
- if(missing(maxx)) |
|
677 |
- { |
|
678 |
- maxx <- maxSeqlen |
|
679 |
- } |
|
680 |
- else |
|
681 |
- { |
|
682 |
- if(!is.numeric(maxx)) |
|
683 |
- stop("'maxx' must be numeric.") |
|
684 |
- |
|
685 |
- maxx <- as.integer(maxx) |
|
686 |
- |
|
687 |
- if(maxx<1) |
|
688 |
- stop("'maxx must be >0.") |
|
689 |
- |
|
690 |
- if(maxx > maxSeqlen) |
|
691 |
- maxx <- maxSeqlen |
|
692 |
- } |
|
693 |
- |
|
694 |
- n <- nFiles(object) |
|
695 |
- fvec <- 1:n |
|
696 |
- |
|
697 |
- i <- 1 |
|
698 |
- |
|
699 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
700 |
- ## Skip extra line |
|
701 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
702 |
- x <- 1:maxx |
|
703 |
- nac <- object@nac[[i]][, x] |
|
704 |
- nacrel <- apply(nac, 2, rel_int) |
|
705 |
- |
|
706 |
- if(length(nucs) == 1){ |
|
707 |
- dfr <- data.frame(a = nacrel[nucs, ]) |
|
708 |
- }else{ |
|
709 |
- dfr <- data.frame(a = apply(nacrel[nucs, ], 2, sum)) |
|
710 |
- } |
|
711 |
- |
|
712 |
- if(n > 1) |
|
713 |
- { |
|
714 |
- for(i in 2:n) |
|
715 |
- { |
|
716 |
- nac <- object@nac[[i]][, x] |
|
717 |
- nacrel <- apply(nac, 2, rel_int) |
|
718 |
- if(length(nucs) == 1){ |
|
719 |
- dfr[, i] <- nacrel[nucs, ] |
|
720 |
- }else{ |
|
721 |
- dfr[, i] <- apply(nacrel[nucs, ], 2, sum) |
|
722 |
- } |
|
723 |
- } |
|
724 |
- } |
|
725 |
- |
|
726 |
- maxY = round(1.4 * max(dfr), 3) |
|
727 |
- cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", |
|
728 |
- "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A") |
|
729 |
- |
|
730 |
- nv <- paste(iupac_chars[nucs], collapse = "") |
|
731 |
- |
|
732 |
- plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1, |
|
733 |
- ylab="Nucleotide fequency", xlab="sequence position", |
|
734 |
- main=paste("Position wise Nucleotide frequency: '", |
|
735 |
- nv, "'", sep="")) |
|
736 |
- |
|
737 |
- for(i in fvec) |
|
738 |
- lines(x, dfr[, i], col=cols[i %% 10], lwd=2) |
|
739 |
- |
|
740 |
- legend("top", ncol=6, |
|
741 |
- lwd=c(1, 1, 2, 1, 1), col=cols[fvec %% 10], xjust=0.5, |
|
742 |
- legend=probeLabel(object), bty="n", cex=0.8) |
|
743 |
- |
|
744 |
- return(invisible()) |
|
745 |
-}) |
|
746 |
- |
|
747 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
748 |
-## End: Nucleotide frequency related functions |
|
749 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
750 |
- |
|
751 |
- |
|
752 |
- |
|
753 |
-setMethod("plotKmerCount", "Fastqq", |
|
754 |
- function(object, index, mxey, main="K-mer count", ...) |
|
755 |
-{ |
|
756 |
- n <- nFiles(object) |
|
757 |
- if(missing(index)) |
|
758 |
- { |
|
759 |
- index <- 1:n |
|
760 |
- }else{ |
|
761 |
- if(!is.numeric(index)) |
|
762 |
- stop("'index' must be numeric.") |
|
763 |
- |
|
764 |
- index <- sort(unique(as.integer(index))) |
|
765 |
- |
|
766 |
- if(any(index < 0) || any(index > n)) |
|
767 |
- stop("'index' must be in 1, .., ", n, " ( = nFiles).") |
|
768 |
- } |
|
769 |
- |
|
770 |
- if(!missing(mxey)) |
|
771 |
- { |
|
772 |
- if(!is.numeric(mxey)) |
|
773 |
- stop("'mxey' must be numeric.") |
|
774 |
- mxey <- as.integer(mxey) |
|
775 |
- |
|
776 |
- if(mxey<1) |
|
777 |
- stop("'mxey' must be positive.") |
|
778 |
- } |
|
779 |
- |
|
780 |
- cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", |
|
781 |
- "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A") |
|
782 |
- pk <- 6 |
|
783 |
- if(object@k <= pk) |
|
784 |
- { |
|
785 |
- pk <- object@k |
|
786 |
- x <- 1:(4^pk) |
|
787 |
- |
|
788 |
- if(missing(mxey)) |
|
789 |
- lg2y <- floor(log2(1.2 * (max(object@kmer)))) |
|
790 |
- else |
|
791 |
- lg2y <- mxey |
|
792 |
- |
|
793 |
- maxY <- 2^lg2y |
|
794 |
- |
|
795 |
- |
|
796 |
- plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1, |
|
797 |
- ylab="K-mer count", xlab="K-mer index", main=main, |
|
798 |
- axes=FALSE, ...) |
|
799 |
- |
|
800 |
- for(i in index) |
|
801 |
- lines(x, object@kmer[, i], col=cols[i], lwd=2) |
|
802 |
- } |
|
803 |
- else |
|
804 |
- { |
|
805 |
- x <- 1:(4^pk) |
|
806 |
- melt_factor <- as.integer(4^(object@k - pk)) |
|
807 |
- |
|
808 |
- y_factor <- max(.Call("melt_vector", object@kmer[, 1], melt_factor, |
|
809 |
- PACKAGE="seqTools")) / max(object@kmer[, 1]) |
|
810 |
- |
|
811 |
- if(missing(mxey)) |
|
812 |
- lg2y <- floor(log2(1.2 * (max(object@kmer)) * y_factor)) |
|
813 |
- else |
|
814 |
- lg2y <- mxey |
|
815 |
- maxY <- 2^lg2y |
|
816 |
- |
|
817 |
- plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1, |
|
818 |
- ylab="K-mer count", xlab="K-mer index", |
|
819 |
- main=main, axes=FALSE, ...) |
|
820 |
- |
|
821 |
- for(i in index) |
|
822 |
- { |
|
823 |
- lines(x, .Call("melt_vector", object@kmer[, i], melt_factor, |
|
824 |
- PACKAGE="seqTools"), col=cols[i], lwd=2) |
|
825 |
- } |
|
826 |
- } |
|
827 |
- axis(side=1, at=0:4 * 4^(pk - 1), labels=c("A", "C", "G", "T", "")) |
|
828 |
- |
|
829 |
- axis(side=2, at=c(0, maxY), |
|
830 |
- labels=c(0, paste("2^", lg2y, sep="")), las=1) |
|
831 |
- |
|
832 |
- legend("right", lwd=2, col=cols[index], |
|
833 |
- legend=probeLabel(object)[index], bty="n", cex=0.8) |
|
834 |
- |
|
835 |
- return(invisible()) |
|
836 |
-}) |
|
837 |
- |
|
838 |
- |
|
839 |
- |
|
840 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
841 |
-## Merging and melting Fastqq objects |
|
842 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
843 |
- |
|
844 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
845 |
-setMethod("mergeFastqq", "Fastqq", function(lhs, rhs) |
|
846 |
-{ |
|
847 |
- |
|
848 |
- res <- new("Fastqq") |
|
849 |
- ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## |
|
850 |
- ## Simple concatenations |
|
851 |
- ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## |
|
852 |
- |
|
853 |
- res@filenames <- c(lhs@filenames, rhs@filenames) |
|
854 |
- res@nFiles <- lhs@nFiles+rhs@nFiles |
|
855 |
- res@nReads <- c(lhs@nReads, rhs@nReads) |
|
856 |
- res@nN <- c(lhs@nN, rhs@nN) |
|
857 |
- res@seqLenCount <- cbind(lhs@seqLenCount, rhs@seqLenCount) |
|
858 |
- res@gcContent <- cbind(gcContentMatrix(lhs), gcContentMatrix(rhs)) |
|
859 |
- res@seqLen <- cbind(lhs@seqLen, rhs@seqLen) |
|
860 |
- |
|
861 |
- ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## |
|
862 |
- ## Singularize probeLabel |
|
863 |
- ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## |
|
864 |
- |
|
865 |
- res@probeLabel <- c(lhs@probeLabel, rhs@probeLabel) |
|
866 |
- if(any(table(res@probeLabel) > 1)) |
|
867 |
- { |
|
868 |
- message("[mergeFastqq] Singularizing probeLabel (new suffix).") |
|
869 |
- res@probeLabel <- paste(1:res@nFiles, res@probeLabel, sep="_") |
|
870 |
- } |
|
871 |
- |
|
872 |
- ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## |
|
873 |
- ## Eventually resize arrays when lhs and rhs have different maxSeqLen |
|
874 |
- ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ## |
|
875 |
- if(lhs@maxSeqLen > rhs@maxSeqLen) |
|
876 |
- { |
|
877 |
- message("[mergeFastqq] Resizing rhs.") |
|
878 |
- msl <- lhs@maxSeqLen |
|
879 |
- |
|
880 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
881 |
- ## nac |
|
882 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
883 |
- |
|
884 |
- new_dim <- as.integer(c(nrow(rhs@nac), msl)) |
|
885 |
- rhs_nac <- .Call("r_enlarge_int_mat", rhs@nac, new_dim, |
|
886 |
- PACKAGE="seqTools") |
|
887 |
- res@nac <- c(lhs@nac, rhs_nac) |
|
888 |
- |
|
889 |
- # seqLenCount |
|
890 |
- new_dim <- as.integer(c(msl, rhs@nFiles)) |
|
891 |
- rhs_seqLenCount <- .Call("r_enlarge_int_mat", |
|
892 |
- rhs@seqLenCount, new_dim, PACKAGE="seqTools") |
|
893 |
- |
|
894 |
- res@seqLenCount <- c(lhs@seqLenCount, rhs_seqLenCount) |
|
895 |
- |
|
896 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
897 |
- ## phred |
|
898 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
899 |
- |
|
900 |
- new_dim <- as.integer(nrow(rhs@phred), msl) |
|
901 |
- rhs_phred_list <- list() |
|
902 |
- |
|
903 |
- for(i in 1:nFiles(rhs)){ |
|
904 |
- rhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", |
|
905 |
- rhs@phred[[i]], new_dim, PACKAGE="seqTools") |
|
906 |
- } |
|
907 |
- |
|
908 |
- res@phred <- c(lhs@phred, rhs_phred_list) |
|
909 |
- res@maxSeqLen <- lhs@maxSeqLen |
|
910 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
911 |
- |
|
912 |
- } else if(rhs@maxSeqLen > lhs@maxSeqLen) |
|
913 |
- { |
|
914 |
- |
|
915 |
- message("[mergeFastqq] Resizing lhs.") |
|
916 |
- msl <- rhs@maxSeqLen |
|
917 |
- |
|
918 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
919 |
- ## nac |
|
920 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
921 |
- |
|
922 |
- new_dim <- as.integer(c(nrow(lhs@nac), msl)) |
|
923 |
- |
|
924 |
- lhs_nac <- .Call("r_enlarge_int_mat", |
|
925 |
- lhs@nac, new_dim, PACKAGE="seqTools") |
|
926 |
- |
|
927 |
- res@nac <- c(rhs@nac, lhs_nac) |
|
928 |
- |
|
929 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
930 |
- ## seqLenCount |
|
931 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
932 |
- |
|
933 |
- new_dim <- as.integer(c(msl, lhs@nFiles)) |
|
934 |
- |
|
935 |
- lhs_seqLenCount <- .Call("r_enlarge_int_mat", |
|
936 |
- lhs@seqLenCount, new_dim, PACKAGE="seqTools") |
|
937 |
- |
|
938 |
- res@seqLenCount <- c(rhs@seqLenCount, lhs_seqLenCount) |
|
939 |
- |
|
940 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
941 |
- ## phred |
|
942 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
943 |
- |
|
944 |
- new_dim <- as.integer(nrow(lhs@phred), msl) |
|
945 |
- lhs_phred_list <- list() |
|
946 |
- |
|
947 |
- for(i in 1:nFiles(lhs)) |
|
948 |
- { |
|
949 |
- lhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", |
|
950 |
- lhs@phred[[i]], new_dim, PACKAGE="seqTools") |
|
951 |
- } |
|
952 |
- res@phred <- c(rhs@phred, lhs_phred_list) |
|
953 |
- res@maxSeqLen <- rhs@maxSeqLen |
|
954 |
- |
|
955 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
956 |
- |
|
957 |
- } else { |
|
958 |
- |
|
959 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
960 |
- ## rhs@maxSeqLen == lhs@maxSeqLen |
|
961 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
962 |
- |
|
963 |
- res@maxSeqLen = lhs@maxSeqLen |
|
964 |
- res@nac <- c(lhs@nac, rhs@nac) |
|
965 |
- res@phred <- c(lhs@phred, rhs@phred) |
|
966 |
- |
|
967 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
968 |
- |
|
969 |
- } |
|
970 |
- |
|
971 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
972 |
- ## Eventually melt down k-mer count matrix |
|
973 |
- ## when lhs and rhs have different k |
|
974 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
975 |
- res@k <- pmin(lhs@k, rhs@k) |
|
976 |
- |
|
977 |
- kml <- kmerCount(lhs) |
|
978 |
- if(lhs@k > res@k) |
|
979 |
- { |
|
980 |
- kml <- .Call("melt_kmer_matrix", |
|
981 |
- kml, c(lhs@k, res@k), PACKAGE="seqTools") |
|
982 |
- } |
|
983 |
- |
|
984 |
- kmr <- kmerCount(rhs) |
|
985 |
- if(rhs@k > res@k) |
|
986 |
- { |
|
987 |
- kmr <- .Call("melt_kmer_matrix", |
|
988 |
- kmr, c(rhs@k, res@k), PACKAGE="seqTools") |
|
989 |
- } |
|
990 |
- res@kmer <- cbind(kml, kmr) |
|
991 |
- |
|
992 |
- fkml <- lhs@firstKmer |
|
993 |
- if(lhs@k > res@k) |
|
994 |
- { |
|
995 |
- fkml <- .Call("melt_kmer_matrix", |
|
996 |
- fkml, c(lhs@k, res@k), PACKAGE="seqTools") |
|
997 |
- } |
|
998 |
- fkmr <- rhs@firstKmer |
|
999 |
- |
|
1000 |
- if(rhs@k > res@k) |
|
1001 |
- { |
|
1002 |
- fkmr <- .Call("melt_kmer_matrix", |
|
1003 |
- fkmr, c(rhs@k, res@k), PACKAGE="seqTools") |
|
1004 |
- } |
|
1005 |
- res@firstKmer <- cbind(fkml, fkmr) |
|
1006 |
- |
|
1007 |
- return(res) |
|
1008 |
-}) |
|
1009 |
- |
|
1010 |
- |
|
1011 |
-setMethod("meltDownK", "Fastqq", function(object, newK) |
|
1012 |
-{ |
|
1013 |
- |
|
1014 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1015 |
- if(!is.numeric(newK)) |
|
1016 |
- stop("'newK' must be numeric") |
|
1017 |
- |
|
1018 |
- newK <- as.integer(newK) |
|
1019 |
- |
|
1020 |
- if(length(newK) != 1) |
|
1021 |
- stop("'newK' must have length 1.") |
|
1022 |
- |
|
1023 |
- if(newK < 1 || newK >= getK(object)) |
|
1024 |
- stop("'newK' must be >= 1 and <= ", getK(object), ".") |
|
1025 |
- |
|
1026 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1027 |
- res <- new("Fastqq") |
|
1028 |
- res@filenames <- object@filenames |
|
1029 |
- res@nFiles <- object@nFiles |
|
1030 |
- res@k <- newK |
|
1031 |
- res@maxSeqLen <- object@maxSeqLen |
|
1032 |
- |
|
1033 |
- res@kmer <- .Call("melt_kmer_matrix", |
|
1034 |
- object@kmer, c(getK(object), newK), PACKAGE="seqTools") |
|
1035 |
- |
|
1036 |
- res@firstKmer <- .Call("melt_kmer_matrix", |
|
1037 |
- object@firstKmer, c(getK(object), newK), PACKAGE="seqTools") |
|
1038 |
- |
|
1039 |
- res@nReads <- object@nReads |
|
1040 |
- res@nN <- object@nN |
|
1041 |
- res@seqLenCount <- object@seqLenCount |
|
1042 |
- res@gcContent <- object@gcContent |
|
1043 |
- res@nac <- object@nac |
|
1044 |
- res@phred <- object@phred |
|
1045 |
- res@seqLen <- object@seqLen |
|
1046 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
1047 |
- |
|
1048 |
- return(res) |
|
1049 |
-}) |
|
1050 |
- |
|
1051 |
- |
|
1052 |
-listMelt <- function(x, oldK, newK) |
|
1053 |
-{ |
|
1054 |
- f <- function(x) .Call("melt_kmer_matrix", x, |
|
1055 |
- as.integer(c(oldK, newK)), PACKAGE="seqTools") |
|
1056 |
- |
|
1057 |
- return(lapply(x, f)) |
|
1058 |
-} |
|
1059 |
- |
|
1060 |
- |
|
1061 |
- |
|
1062 |
-setMethod("[", "Fastqq", function(x, i, j, drop="missing") |
|
1063 |
-{ |
|
1064 |
- |
|
1065 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
1066 |
- res <- new("Fastqq") |
|
1067 |
- res@filenames <- x@filenames[i] |
|
1068 |
- res@probeLabel <- x@probeLabel[i] |
|
1069 |
- res@nFiles <- length(i) |
|
1070 |
- res@k <- x@k |
|
1071 |
- res@seqLen <- x@seqLen[, i, drop=FALSE] |
|
1072 |
- res@maxSeqLen <- max(res@seqLen[2, ]) |
|
1073 |
- res@kmer <- x@kmer[, i, drop=FALSE] |
|
1074 |
- res@firstKmer <- x@firstKmer[, i, drop=FALSE] |
|
1075 |
- res@nN <- x@nN[i] |
|
1076 |
- res@seqLenCount <- x@seqLenCount[, i, drop=FALSE] |
|
1077 |
- res@gcContent <- x@gcContent[, i, drop=FALSE] |
|
1078 |
- res@nac <- x@nac[i] |
|
1079 |
- res@phred <- x@phred[i] |
|
1080 |
- res@nReads <- x@nReads[i] |
|
1081 |
- res@collectTime <- x@collectTime |
|
1082 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
1083 |
- |
|
1084 |
- return(res) |
|
1085 |
-}) |
|
1086 |
- |
|
1087 |
- |
|
1088 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1089 |
-## Calculation of distance matrix based on Canberra distance |
|
1090 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1091 |
- |
|
1092 |
-setMethod("cbDistMatrix", "Fastqq", |
|
1093 |
- function(object, nReadNorm=max(nReads(object))) |
|
1094 |
-{ |
|
1095 |
- if(!is.numeric(nReadNorm)) |
|
1096 |
- stop("'nReadNorm' must be numeric.") |
|
1097 |
- nReadNorm <- as.integer(nReadNorm) |
|
1098 |
- |
|
1099 |
- if(nReadNorm < max(nReads(object))) |
|
1100 |
- stop("'nReadNorm' must be greater than all nRead.") |
|
1101 |
- |
|
1102 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1103 |
- ## Column-wise normalizing read counts (by upscaling) |
|
1104 |
- ## so that column sums become nearly equal in order to |
|
1105 |
- ## compensate sequencing depth artifacts in |
|
1106 |
- ## Canberra distance values |
|
1107 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1108 |
- |
|
1109 |
- scale <- nReadNorm/nReads(object) |
|
1110 |
- |
|
1111 |
- scaled <- .Call("scale_kmer_matrix", |
|
1112 |
- kmerCount(object), scale, PACKAGE="seqTools") |
|
1113 |
- |
|
1114 |
- colnames(scaled) <- object@probeLabel |
|
1115 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1116 |
- |
|
1117 |
- return(.Call("cb_distance_matrix", scaled, PACKAGE="seqTools")) |
|
1118 |
-}) |
|
1119 |
- |
|
1120 |
- |
|
1121 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
1122 |
-## END OF FILE |
|
1123 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |