git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@128661 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,387 @@ |
1 |
+ |
|
2 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
3 |
+## ## |
|
4 |
+## Project : seqTools ## |
|
5 |
+## Created : 26.August.2013 ## |
|
6 |
+## Author : W. Kaisers ## |
|
7 |
+## Version : 0.99.13 ## |
|
8 |
+## File : simFunctions.r ## |
|
9 |
+## Content : Functions which produce fastq files with simulated data ## |
|
10 |
+## (Done for testing Fastqq and DNA k-mer counting) ## |
|
11 |
+## writeSimFastq, writeSimContFastq, simFastqqRunTimes, ## |
|
12 |
+## sim_fq ## |
|
13 |
+## ## |
|
14 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
15 |
+ |
|
16 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
17 |
+## Create fastq files with simulated k-mers |
|
18 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
19 |
+ |
|
20 |
+writeSimFastq<-function(k=6, nk=5, nSeq=10, filename="sim.fq.gz") |
|
21 |
+{ |
|
22 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
23 |
+ # k |
|
24 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
25 |
+ if(!is.numeric(k)) |
|
26 |
+ stop("'k' must be numeric.") |
|
27 |
+ if(length(k) != 1) |
|
28 |
+ stop("'k' must have length 1.") |
|
29 |
+ k <- as.integer(k) |
|
30 |
+ if( (k < 1) || (k > max_k) ) |
|
31 |
+ stop("'k' out of range.") |
|
32 |
+ |
|
33 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
34 |
+ # nk |
|
35 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
36 |
+ if(!is.numeric(nk)) |
|
37 |
+ stop("'nk' must be numeric.") |
|
38 |
+ if(length(nk) != 1) |
|
39 |
+ stop("'nk' must have length 1.") |
|
40 |
+ nk<-as.integer(nk) |
|
41 |
+ if( (nk < 1) || (nk > 1000) ) |
|
42 |
+ stop("nk must be positive and <1000.") |
|
43 |
+ |
|
44 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
45 |
+ # nSeq |
|
46 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
47 |
+ if(!is.numeric(nSeq)) |
|
48 |
+ stop("'nSeq' must be numeric.") |
|
49 |
+ if(nSeq < 1) |
|
50 |
+ stop("'nSeq' must be positive.") |
|
51 |
+ |
|
52 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
53 |
+ # filename |
|
54 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
55 |
+ if(!is.character(filename)) |
|
56 |
+ stop("'filename' must be string.") |
|
57 |
+ |
|
58 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
59 |
+ # Do the work |
|
60 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
61 |
+ bm <- Sys.localeconv()[7] |
|
62 |
+ val <- as.integer(c(k, nk, nSeq)) |
|
63 |
+ |
|
64 |
+ pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools") |
|
65 |
+ res <- .Call("gzwrite_fastq_dna", val, pseq, filename, PACKAGE="seqTools") |
|
66 |
+ |
|
67 |
+ message("[writeSimFastq] file '", basename(filename), "': ", |
|
68 |
+ format(res,big.mark=bm), " Bytes written.") |
|
69 |
+ |
|
70 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
71 |
+ # Terminate function. |
|
72 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
73 |
+ return(invisible()) |
|
74 |
+} |
|
75 |
+ |
|
76 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
77 |
+## writeSimContFastq |
|
78 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
79 |
+writeSimContFastq<-function(k=6, nk=5, nSeq=10, pos=1, |
|
80 |
+ kIndex=1, nContam=nSeq, filename="simc.fq.gz") |
|
81 |
+{ |
|
82 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
83 |
+ # k |
|
84 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
85 |
+ if(!is.numeric(k)) |
|
86 |
+ stop("k must be numeric.") |
|
87 |
+ if(length(k) != 1) |
|
88 |
+ stop("k must have length 1.") |
|
89 |
+ k<-as.integer(k) |
|
90 |
+ if( (k < 1) || (k > max_k) ) |
|
91 |
+ stop("k out of range.") |
|
92 |
+ |
|
93 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
94 |
+ # nk |
|
95 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
96 |
+ if(!is.numeric(nk)) |
|
97 |
+ stop("nk must be numeric.") |
|
98 |
+ if(length(nk) != 1) |
|
99 |
+ stop("nk must have length 1.") |
|
100 |
+ nk<-as.integer(nk) |
|
101 |
+ if( (nk < 1) || (nk > 1000) ) |
|
102 |
+ stop("nk must be positive and <1000.") |
|
103 |
+ |
|
104 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
105 |
+ # nSeq |
|
106 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
107 |
+ if(!is.numeric(nSeq)) |
|
108 |
+ stop("nSeq must be numeric.") |
|
109 |
+ if(length(nSeq) != 1) |
|
110 |
+ stop("nSeq must have length 1.") |
|
111 |
+ if(nSeq < 1) |
|
112 |
+ stop("nSeq must be positive.") |
|
113 |
+ |
|
114 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
115 |
+ # pos |
|
116 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
117 |
+ if(!is.numeric(pos)) |
|
118 |
+ stop("pos must be numeric.") |
|
119 |
+ pos<-as.integer(pos) |
|
120 |
+ if( any(pos < 1) || any(pos > (k * (nk - 1) + 1))) |
|
121 |
+ stop("pos out of range.") |
|
122 |
+ |
|
123 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
124 |
+ # writeSimContFastq should take 1-based positions |
|
125 |
+ # while "set_dna_k_mer" takes 0-based positions |
|
126 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
127 |
+ pos<-pos - 1L |
|
128 |
+ |
|
129 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
130 |
+ # kIndex |
|
131 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
132 |
+ if(!is.numeric(kIndex)) |
|
133 |
+ stop("kIndex must be numeric.") |
|
134 |
+ |
|
135 |
+ kIndex <- as.integer(kIndex) |
|
136 |
+ |
|
137 |
+ if(length(pos) != length(kIndex)) |
|
138 |
+ stop("pos and kIndex must have equal length.") |
|
139 |
+ |
|
140 |
+ if( any(kIndex < 1L) || any(kIndex > 4^k) ) |
|
141 |
+ stop("kIndex out of range.") |
|
142 |
+ |
|
143 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
144 |
+ # writeSimContFastq should take 1-based k-indices |
|
145 |
+ # while "set_dna_k_mer" takes 0-based k-indices |
|
146 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
147 |
+ kIndex <- kIndex - 1L |
|
148 |
+ |
|
149 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
150 |
+ # nContam: Deterministic replacements are copied |
|
151 |
+ # into the first 'nContam' reads |
|
152 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
153 |
+ if(!is.numeric(nContam)) |
|
154 |
+ stop("nContam must be integer.") |
|
155 |
+ |
|
156 |
+ nContam <- as.integer(nContam) |
|
157 |
+ |
|
158 |
+ if(length(nContam) != 1) |
|
159 |
+ stop("nContam must have length 1.") |
|
160 |
+ |
|
161 |
+ if( (nContam < 1) || (nContam > nSeq) ) |
|
162 |
+ stop("nContam must be >0 and <nSeq.") |
|
163 |
+ |
|
164 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
165 |
+ # filename |
|
166 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
167 |
+ if(!is.character(filename)) |
|
168 |
+ stop("filename must be character.") |
|
169 |
+ |
|
170 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
171 |
+ # Do the work |
|
172 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
173 |
+ bm <- Sys.localeconv()[7] |
|
174 |
+ |
|
175 |
+ val <- as.integer(c(k, nk, nSeq)) |
|
176 |
+ |
|
177 |
+ pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools") |
|
178 |
+ |
|
179 |
+ prseq <- .Call("set_dna_k_mer", |
|
180 |
+ val, pseq,pos, kIndex, nContam, PACKAGE="seqTools") |
|
181 |
+ |
|
182 |
+ res <- .Call("gzwrite_fastq_dna", |
|
183 |
+ val, prseq, filename, PACKAGE="seqTools") |
|
184 |
+ |
|
185 |
+ message("[writeSimContFastq] file '", basename(filename), |
|
186 |
+ "': ", format(res,big.mark=bm), " Bytes written.") |
|
187 |
+ |
|
188 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
189 |
+ # Terminate function |
|
190 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
191 |
+ return(invisible()) |
|
192 |
+} |
|
193 |
+ |
|
194 |
+ |
|
195 |
+ |
|
196 |
+simFastqqRunTimes<-function(k, nSeq, filedir=".") |
|
197 |
+{ |
|
198 |
+ if(missing(k)) |
|
199 |
+ k <- 2:(max_k) |
|
200 |
+ else |
|
201 |
+ { |
|
202 |
+ if(!is.numeric(k)) |
|
203 |
+ stop("'k' must be numeric.") |
|
204 |
+ |
|
205 |
+ k <- sort(unique(as.integer(k))) |
|
206 |
+ |
|
207 |
+ if(any(k < 1) || any(k > max_k)) |
|
208 |
+ stop("'k' out of range.") |
|
209 |
+ } |
|
210 |
+ |
|
211 |
+ if(missing(nSeq)) |
|
212 |
+ nSeq <- as.integer(c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7)) |
|
213 |
+ else { |
|
214 |
+ if(!is.numeric(nSeq)) |
|
215 |
+ stop("'nSeq' must be numeric.") |
|
216 |
+ nSeq <- as.integer(nSeq) |
|
217 |
+ if(any(nSeq < 1) ) |
|
218 |
+ stop("'nSeq' must be positive.") |
|
219 |
+ } |
|
220 |
+ |
|
221 |
+ if(!is.character(filedir)) |
|
222 |
+ stop("'filedir' must be character.") |
|
223 |
+ |
|
224 |
+ if(!file.exists(filedir)) |
|
225 |
+ { |
|
226 |
+ if(!dir.create(filedir)) |
|
227 |
+ stop("Cannot create filedir '", filedir, "'.") |
|
228 |
+ } |
|
229 |
+ |
|
230 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
231 |
+ # Write simulated fastq files |
|
232 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
233 |
+ message("[simFastqqRunTimes] Creating fastq files:") |
|
234 |
+ fqFiles <- character(length(nSeq)) |
|
235 |
+ bm <- Sys.localeconv()[7] |
|
236 |
+ |
|
237 |
+ for(j in 1:length(nSeq)) |
|
238 |
+ { |
|
239 |
+ fqFiles[j] <- file.path(filedir, |
|
240 |
+ paste("sfqrt_nSeq", nSeq[j], ".fq.gz", sep="")) |
|
241 |
+ |
|
242 |
+ message("[simFastqqRunTimes] (",format(j , w=3),"/", length(nSeq), |
|
243 |
+ ") nSeq=", format(nSeq[j], big.mark=bm, w=12)) |
|
244 |
+ |
|
245 |
+ # Allways equal sized reads: k=6, read length=102 |
|
246 |
+ writeSimFastq(k=6, nk=17, nSeq=nSeq[j], filename=fqFiles[j]) |
|
247 |
+ } |
|
248 |
+ |
|
249 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
250 |
+ # Prepare result table |
|
251 |
+ # + + + + + + + + + + + + + + + + + + + + + + + + + + + + # |
|
252 |
+ nSim <- length(k) * length(nSeq) |
|
253 |
+ m <- 1 |
|
254 |
+ res<-data.frame(id=1:nSim, k=0, nSeq=0, runtime=0) |
|
255 |
+ |
|
256 |
+ for(i in 1:length(k)) |
|
257 |
+ { |
|
258 |
+ for(j in 1:length(nSeq)) |
|
259 |
+ { |
|
260 |
+ message("[simFastqqRunTimes] (", |
|
261 |
+ format(m, width=3),"/", nSim, ") Fastqq run.") |
|
262 |
+ |
|
263 |
+ fq <- fastqq(fqFiles[j], k=k[i]) |
|
264 |
+ res$k[m] <- k[i] |
|
265 |
+ res$nSeq[m] <- nSeq[j] |
|
266 |
+ res$runtime[m] <- collectDur(fq) |
|
267 |
+ m <- m + 1 |
|
268 |
+ } |
|
269 |
+ } |
|
270 |
+ return(res) |
|
271 |
+} |
|
272 |
+ |
|
273 |
+ |
|
274 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
275 |
+## Encapsulates whole simulation studies |
|
276 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
277 |
+ |
|
278 |
+sim_fq<-function(nRep=2, nContamVec=c(100, 1000), grSize=20, nSeq=1e4, |
|
279 |
+ k=6, kIndex=1365, pos=20) |
|
280 |
+{ |
|
281 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
282 |
+ ## Arguments |
|
283 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
284 |
+ |
|
285 |
+ # nContam : Absolute number of contaminated sequences |
|
286 |
+ # grSize : Nr of fastq files in control and contamination group |
|
287 |
+ # nSeq : Nr of reads per fastq file |
|
288 |
+ # k : k-mer length used in fastqq function |
|
289 |
+ # kIndex : k-mer index of contaminating sequence default is "CCCCCC" |
|
290 |
+ # pos : Contamination is inserted at position in read sequence, |
|
291 |
+ # default is 20 |
|
292 |
+ |
|
293 |
+ if(length(kIndex) != length(pos)) |
|
294 |
+ stop("'kIndex' and 'pos' must have equal length.") |
|
295 |
+ |
|
296 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
297 |
+ |
|
298 |
+ |
|
299 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
300 |
+ ## Fixed internal values |
|
301 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
302 |
+ |
|
303 |
+ # ksim : k-mer sized used for simulation |
|
304 |
+ ksim <- 6 |
|
305 |
+ # nksim : number of k-mers per read (determines read length) |
|
306 |
+ nksim <- 17 |
|
307 |
+ |
|
308 |
+ |
|
309 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
310 |
+ ## Write simulated fastq files for control group |
|
311 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
312 |
+ |
|
313 |
+ # ctrl_vec=leaf labels for control group |
|
314 |
+ ctrl_vec <- 1:grSize |
|
315 |
+ ctrl_files <- paste("sfq_i", ctrl_vec, "_ctrl.fq.gz", sep = "") |
|
316 |
+ |
|
317 |
+ for(i in ctrl_vec){ |
|
318 |
+ writeSimFastq(k=ksim, nk=nksim, nSeq=nSeq, |
|
319 |
+ filename = ctrl_files[i]) |
|
320 |
+ } |
|
321 |
+ |
|
322 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
323 |
+ ## Prepare values for contamination and result |
|
324 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
325 |
+ |
|
326 |
+ # cont_vec=leaf labels for contamination group |
|
327 |
+ # (must be nonoverlapping with ctrl_vec) |
|
328 |
+ cont_vec <- ctrl_vec + 100 |
|
329 |
+ contamVecLen <- length(nContamVec) |
|
330 |
+ nSim = nRep * contamVecLen |
|
331 |
+ # Result data.frame |
|
332 |
+ res <- data.frame(id=1:nSim, nContam=0, rep=0, sum=NA) |
|
333 |
+ |
|
334 |
+ m <- 1 |
|
335 |
+ for(i in 1:length(nContamVec)) |
|
336 |
+ { |
|
337 |
+ nContam<-nContamVec[i] |
|
338 |
+ for(j in 1:nRep) |
|
339 |
+ { |
|
340 |
+ message("[sim_fq] Simulation (", |
|
341 |
+ format(m, width=3), "/", nSim, ").") |
|
342 |
+ |
|
343 |
+ cont_files <- paste("sfq_i", cont_vec, "_cont.fq.gz", sep="") |
|
344 |
+ |
|
345 |
+ for(i in 1:length(cont_vec)) |
|
346 |
+ writeSimContFastq(k=ksim, nk=nksim, nSeq=nSeq, pos=pos, |
|
347 |
+ nContam=nContam, kIndex=kIndex, |
|
348 |
+ filename=cont_files[i]) |
|
349 |
+ |
|
350 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
351 |
+ ## read Fastqq |
|
352 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
353 |
+ ctrl_fqq <- fastqq(ctrl_files, k=k) |
|
354 |
+ probeLabel(ctrl_fqq) <- ctrl_vec |
|
355 |
+ cont_fqq <- fastqq(cont_files, k=k) |
|
356 |
+ probeLabel(cont_fqq) <- cont_vec |
|
357 |
+ |
|
358 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
359 |
+ ## merge, dist, hclust |
|
360 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
361 |
+ mrg <- mergeFastqq(ctrl_fqq, cont_fqq) |
|
362 |
+ mtx <- cbDistMatrix(mrg) |
|
363 |
+ hc <- hclust(as.dist(mtx)) |
|
364 |
+ |
|
365 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
366 |
+ ## Calculate results and write into res |
|
367 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
368 |
+ res$nContam[m] <- nContam |
|
369 |
+ res$rep[m] <- j |
|
370 |
+ |
|
371 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
372 |
+ ## Calculate how many labels of the contamination group |
|
373 |
+ ## are in the first half of leaf labels |
|
374 |
+ ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
375 |
+ res$sum[m] <- sum(hc$order[ctrl_vec] > grSize) |
|
376 |
+ m <- m + 1 |
|
377 |
+ } |
|
378 |
+ } |
|
379 |
+ res$lat <- pmin(res$sum, grSize - res$sum) / grSize * 100 |
|
380 |
+ res$perc <- res$nContam / nSeq * 100 |
|
381 |
+ return(res) |
|
382 |
+} |
|
383 |
+ |
|
384 |
+ |
|
385 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |
|
386 |
+## END OF FILE |
|
387 |
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ## |