git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@128653 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,387 +0,0 @@ |
1 |
- |
|
2 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
3 |
-## ## |
|
4 |
-## Project : seqTools ## |
|
5 |
-## Created : 26.August.2013 ## |
|
6 |
-## Author : W. Kaisers ## |
|
7 |
-## Version : 0.99.13 ## |
|
8 |
-## File : simFunctions.r ## |
|
9 |
-## Content : Functions which produce fastq files with simulated data ## |
|
10 |
-## (Done for testing Fastqq and DNA k-mer counting) ## |
|
11 |
-## writeSimFastq, writeSimContFastq, simFastqqRunTimes, ## |
|
12 |
-## sim_fq ## |
|
13 |
-## ## |
|
14 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
15 |
- |
|
16 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
17 |
-## Create fastq files with simulated k-mers |
|
18 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
19 |
- |
|
20 |
-writeSimFastq<-function(k=6, nk=5, nSeq=10, filename="sim.fq.gz") |
|
21 |
-{ |
|
22 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
23 |
- # k |
|
24 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
25 |
- if(!is.numeric(k)) |
|
26 |
- stop("'k' must be numeric.") |
|
27 |
- if(length(k) != 1) |
|
28 |
- stop("'k' must have length 1.") |
|
29 |
- k <- as.integer(k) |
|
30 |
- if( (k < 1) || (k > max_k) ) |
|
31 |
- stop("'k' out of range.") |
|
32 |
- |
|
33 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
34 |
- # nk |
|
35 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
36 |
- if(!is.numeric(nk)) |
|
37 |
- stop("'nk' must be numeric.") |
|
38 |
- if(length(nk) != 1) |
|
39 |
- stop("'nk' must have length 1.") |
|
40 |
- nk<-as.integer(nk) |
|
41 |
- if( (nk < 1) || (nk > 1000) ) |
|
42 |
- stop("nk must be positive and <1000.") |
|
43 |
- |
|
44 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
45 |
- # nSeq |
|
46 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
47 |
- if(!is.numeric(nSeq)) |
|
48 |
- stop("'nSeq' must be numeric.") |
|
49 |
- if(nSeq < 1) |
|
50 |
- stop("'nSeq' must be positive.") |
|
51 |
- |
|
52 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
53 |
- # filename |
|
54 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
55 |
- if(!is.character(filename)) |
|
56 |
- stop("'filename' must be string.") |
|
57 |
- |
|
58 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
59 |
- # Do the work |
|
60 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
61 |
- bm <- Sys.localeconv()[7] |
|
62 |
- val <- as.integer(c(k, nk, nSeq)) |
|
63 |
- |
|
64 |
- pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools") |
|
65 |
- res <- .Call("gzwrite_fastq_dna", val, pseq, filename, PACKAGE="seqTools") |
|
66 |
- |
|
67 |
- message("[writeSimFastq] file '", basename(filename), "': ", |
|
68 |
- format(res,big.mark=bm), " Bytes written.") |
|
69 |
- |
|
70 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
71 |
- # Terminate function. |
|
72 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
73 |
- return(invisible()) |
|
74 |
-} |
|
75 |
- |
|
76 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
77 |
-## writeSimContFastq |
|
78 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
79 |
-writeSimContFastq<-function(k=6, nk=5, nSeq=10, pos=1, |
|
80 |
- kIndex=1, nContam=nSeq, filename="simc.fq.gz") |
|
81 |
-{ |
|
82 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
83 |
- # k |
|
84 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
85 |
- if(!is.numeric(k)) |
|
86 |
- stop("k must be numeric.") |
|
87 |
- if(length(k) != 1) |
|
88 |
- stop("k must have length 1.") |
|
89 |
- k<-as.integer(k) |
|
90 |
- if( (k < 1) || (k > max_k) ) |
|
91 |
- stop("k out of range.") |
|
92 |
- |
|
93 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
94 |
- # nk |
|
95 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
96 |
- if(!is.numeric(nk)) |
|
97 |
- stop("nk must be numeric.") |
|
98 |
- if(length(nk) != 1) |
|
99 |
- stop("nk must have length 1.") |
|
100 |
- nk<-as.integer(nk) |
|
101 |
- if( (nk < 1) || (nk > 1000) ) |
|
102 |
- stop("nk must be positive and <1000.") |
|
103 |
- |
|
104 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
105 |
- # nSeq |
|
106 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
107 |
- if(!is.numeric(nSeq)) |
|
108 |
- stop("nSeq must be numeric.") |
|
109 |
- if(length(nSeq) != 1) |
|
110 |
- stop("nSeq must have length 1.") |
|
111 |
- if(nSeq < 1) |
|
112 |
- stop("nSeq must be positive.") |
|
113 |
- |
|
114 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
115 |
- # pos |
|
116 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
117 |
- if(!is.numeric(pos)) |
|
118 |
- stop("pos must be numeric.") |
|
119 |
- pos<-as.integer(pos) |
|
120 |
- if( any(pos < 1) || any(pos > (k * (nk - 1) + 1))) |
|
121 |
- stop("pos out of range.") |
|
122 |
- |
|
123 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
124 |
- # writeSimContFastq should take 1-based positions |
|
125 |
- # while "set_dna_k_mer" takes 0-based positions |
|
126 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
127 |
- pos<-pos - 1L |
|
128 |
- |
|
129 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
130 |
- # kIndex |
|
131 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
132 |
- if(!is.numeric(kIndex)) |
|
133 |
- stop("kIndex must be numeric.") |
|
134 |
- |
|
135 |
- kIndex <- as.integer(kIndex) |
|
136 |
- |
|
137 |
- if(length(pos) != length(kIndex)) |
|
138 |
- stop("pos and kIndex must have equal length.") |
|
139 |
- |
|
140 |
- if( any(kIndex < 1L) || any(kIndex > 4^k) ) |
|
141 |
- stop("kIndex out of range.") |
|
142 |
- |
|
143 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
144 |
- # writeSimContFastq should take 1-based k-indices |
|
145 |
- # while "set_dna_k_mer" takes 0-based k-indices |
|
146 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
147 |
- kIndex <- kIndex - 1L |
|
148 |
- |
|
149 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
150 |
- # nContam: Deterministic replacements are copied |
|
151 |
- # into the first 'nContam' reads |
|
152 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
153 |
- if(!is.numeric(nContam)) |
|
154 |
- stop("nContam must be integer.") |
|
155 |
- |
|
156 |
- nContam <- as.integer(nContam) |
|
157 |
- |
|
158 |
- if(length(nContam) != 1) |
|
159 |
- stop("nContam must have length 1.") |
|
160 |
- |
|
161 |
- if( (nContam < 1) || (nContam > nSeq) ) |
|
162 |
- stop("nContam must be >0 and <nSeq.") |
|
163 |
- |
|
164 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
165 |
- # filename |
|
166 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
167 |
- if(!is.character(filename)) |
|
168 |
- stop("filename must be character.") |
|
169 |
- |
|
170 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
171 |
- # Do the work |
|
172 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
173 |
- bm <- Sys.localeconv()[7] |
|
174 |
- |
|
175 |
- val <- as.integer(c(k, nk, nSeq)) |
|
176 |
- |
|
177 |
- pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools") |
|
178 |
- |
|
179 |
- prseq <- .Call("set_dna_k_mer", |
|
180 |
- val, pseq,pos, kIndex, nContam, PACKAGE="seqTools") |
|
181 |
- |
|
182 |
- res <- .Call("gzwrite_fastq_dna", |
|
183 |
- val, prseq, filename, PACKAGE="seqTools") |
|
184 |
- |
|
185 |
- message("[writeSimContFastq] file '", basename(filename), |
|
186 |
- "': ", format(res,big.mark=bm), " Bytes written.") |
|
187 |
- |
|
188 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
189 |
- # Terminate function |
|
190 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
191 |
- return(invisible()) |
|
192 |
-} |
|
193 |
- |
|
194 |
- |
|
195 |
- |
|
196 |
-simFastqqRunTimes<-function(k, nSeq, filedir=".") |
|
197 |
-{ |
|
198 |
- if(missing(k)) |
|
199 |
- k <- 2:(max_k) |
|
200 |
- else |
|
201 |
- { |
|
202 |
- if(!is.numeric(k)) |
|
203 |
- stop("'k' must be numeric.") |
|
204 |
- |
|
205 |
- k <- sort(unique(as.integer(k))) |
|
206 |
- |
|
207 |
- if(any(k < 1) || any(k > max_k)) |
|
208 |
- stop("'k' out of range.") |
|
209 |
- } |
|
210 |
- |
|
211 |
- if(missing(nSeq)) |
|
212 |
- nSeq <- as.integer(c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7)) |
|
213 |
- else { |
|
214 |
- if(!is.numeric(nSeq)) |
|
215 |
- stop("'nSeq' must be numeric.") |
|
216 |
- nSeq <- as.integer(nSeq) |
|
217 |
- if(any(nSeq < 1) ) |
|
218 |
- stop("'nSeq' must be positive.") |
|
219 |
- } |
|
220 |
- |
|
221 |
- if(!is.character(filedir)) |
|
222 |
- stop("'filedir' must be character.") |
|
223 |
- |
|
224 |
- if(!file.exists(filedir)) |
|
225 |
- { |
|
226 |
- if(!dir.create(filedir)) |
|
227 |
- stop("Cannot create filedir '", filedir, "'.") |
|
228 |
- } |
|
229 |
- |
|
230 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
231 |
- # Write simulated fastq files |
|
232 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
233 |
- message("[simFastqqRunTimes] Creating fastq files:") |
|
234 |
- fqFiles <- character(length(nSeq)) |
|
235 |
- bm <- Sys.localeconv()[7] |
|
236 |
- |
|
237 |
- for(j in 1:length(nSeq)) |
|
238 |
- { |
|
239 |
- fqFiles[j] <- file.path(filedir, |
|
240 |
- paste("sfqrt_nSeq", nSeq[j], ".fq.gz", sep="")) |
|
241 |
- |
|
242 |
- message("[simFastqqRunTimes] (",format(j , w=3),"/", length(nSeq), |
|
243 |
- ") nSeq=", format(nSeq[j], big.mark=bm, w=12)) |
|
244 |
- |
|
245 |
- # Allways equal sized reads: k=6, read length=102 |
|
246 |
- writeSimFastq(k=6, nk=17, nSeq=nSeq[j], filename=fqFiles[j]) |
|
247 |
- } |
|
248 |
- |
|
249 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
250 |
- # Prepare result table |
|
251 |
- # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
252 |
- nSim <- length(k) * length(nSeq) |
|
253 |
- m <- 1 |
|
254 |
- res<-data.frame(id=1:nSim, k=0, nSeq=0, runtime=0) |
|
255 |
- |
|
256 |
- for(i in 1:length(k)) |
|
257 |
- { |
|
258 |
- for(j in 1:length(nSeq)) |
|
259 |
- { |
|
260 |
- message("[simFastqqRunTimes] (", |
|
261 |
- format(m, width=3),"/", nSim, ") Fastqq run.") |
|
262 |
- |
|
263 |
- fq <- fastqq(fqFiles[j], k=k[i]) |
|
264 |
- res$k[m] <- k[i] |
|
265 |
- res$nSeq[m] <- nSeq[j] |
|
266 |
- res$runtime[m] <- collectDur(fq) |
|
267 |
- m <- m + 1 |
|
268 |
- } |
|
269 |
- } |
|
270 |
- return(res) |
|
271 |
-} |
|
272 |
- |
|
273 |
- |
|
274 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
275 |
-## Encapsulates whole simulation studies |
|
276 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
277 |
- |
|
278 |
-sim_fq<-function(nRep=2, nContamVec=c(100, 1000), grSize=20, nSeq=1e4, |
|
279 |
- k=6, kIndex=1365, pos=20) |
|
280 |
-{ |
|
281 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
282 |
- ## Arguments |
|
283 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
284 |
- |
|
285 |
- # nContam : Absolute number of contaminated sequences |
|
286 |
- # grSize : Nr of fastq files in control and contamination group |
|
287 |
- # nSeq : Nr of reads per fastq file |
|
288 |
- # k : k-mer length used in fastqq function |
|
289 |
- # kIndex : k-mer index of contaminating sequence default is "CCCCCC" |
|
290 |
- # pos : Contamination is inserted at position in read sequence, |
|
291 |
- # default is 20 |
|
292 |
- |
|
293 |
- if(length(kIndex) != length(pos)) |
|
294 |
- stop("'kIndex' and 'pos' must have equal length.") |
|
295 |
- |
|
296 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
297 |
- |
|
298 |
- |
|
299 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
300 |
- ## Fixed internal values |
|
301 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
302 |
- |
|
303 |
- # ksim : k-mer sized used for simulation |
|
304 |
- ksim <- 6 |
|
305 |
- # nksim : number of k-mers per read (determines read length) |
|
306 |
- nksim <- 17 |
|
307 |
- |
|
308 |
- |
|
309 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
310 |
- ## Write simulated fastq files for control group |
|
311 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
312 |
- |
|
313 |
- # ctrl_vec=leaf labels for control group |
|
314 |
- ctrl_vec <- 1:grSize |
|
315 |
- ctrl_files <- paste("sfq_i", ctrl_vec, "_ctrl.fq.gz", sep = "") |
|
316 |
- |
|
317 |
- for(i in ctrl_vec){ |
|
318 |
- writeSimFastq(k=ksim, nk=nksim, nSeq=nSeq, |
|
319 |
- filename = ctrl_files[i]) |
|
320 |
- } |
|
321 |
- |
|
322 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
323 |
- ## Prepare values for contamination and result |
|
324 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
325 |
- |
|
326 |
- # cont_vec=leaf labels for contamination group |
|
327 |
- # (must be nonoverlapping with ctrl_vec) |
|
328 |
- cont_vec <- ctrl_vec + 100 |
|
329 |
- contamVecLen <- length(nContamVec) |
|
330 |
- nSim = nRep * contamVecLen |
|
331 |
- # Result data.frame |
|
332 |
- res <- data.frame(id=1:nSim, nContam=0, rep=0, sum=NA) |
|
333 |
- |
|
334 |
- m <- 1 |
|
335 |
- for(i in 1:length(nContamVec)) |
|
336 |
- { |
|
337 |
- nContam<-nContamVec[i] |
|
338 |
- for(j in 1:nRep) |
|
339 |
- { |
|
340 |
- message("[sim_fq] Simulation (", |
|
341 |
- format(m, width=3), "/", nSim, ").") |
|
342 |
- |
|
343 |
- cont_files <- paste("sfq_i", cont_vec, "_cont.fq.gz", sep="") |
|
344 |
- |
|
345 |
- for(i in 1:length(cont_vec)) |
|
346 |
- writeSimContFastq(k=ksim, nk=nksim, nSeq=nSeq, pos=pos, |
|
347 |
- nContam=nContam, kIndex=kIndex, |
|
348 |
- filename=cont_files[i]) |
|
349 |
- |
|
350 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
351 |
- ## read Fastqq |
|
352 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
353 |
- ctrl_fqq <- fastqq(ctrl_files, k=k) |
|
354 |
- probeLabel(ctrl_fqq) <- ctrl_vec |
|
355 |
- cont_fqq <- fastqq(cont_files, k=k) |
|
356 |
- probeLabel(cont_fqq) <- cont_vec |
|
357 |
- |
|
358 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
359 |
- ## merge, dist, hclust |
|
360 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
361 |
- mrg <- mergeFastqq(ctrl_fqq, cont_fqq) |
|
362 |
- mtx <- cbDistMatrix(mrg) |
|
363 |
- hc <- hclust(as.dist(mtx)) |
|
364 |
- |
|
365 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
366 |
- ## Calculate results and write into res |
|
367 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
368 |
- res$nContam[m] <- nContam |
|
369 |
- res$rep[m] <- j |
|
370 |
- |
|
371 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
372 |
- ## Calculate how many labels of the contamination group |
|
373 |
- ## are in the first half of leaf labels |
|
374 |
- ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
375 |
- res$sum[m] <- sum(hc$order[ctrl_vec] > grSize) |
|
376 |
- m <- m + 1 |
|
377 |
- } |
|
378 |
- } |
|
379 |
- res$lat <- pmin(res$sum, grSize - res$sum) / grSize * 100 |
|
380 |
- res$perc <- res$nContam / nSeq * 100 |
|
381 |
- return(res) |
|
382 |
-} |
|
383 |
- |
|
384 |
- |
|
385 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
386 |
-## END OF FILE |
|
387 |
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |