Browse code

Reimport with .R suffix

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@128661 bc3139a8-67e5-0310-9ffc-ced21a209358

Wolfgang Kaisers authored on 13/04/2017 15:56:51
Showing 1 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,387 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  Version   :   0.99.13                                                    ##
8
+##  File      :   simFunctions.r                                             ##
9
+##  Content   :   Functions which produce fastq files with simulated data    ##
10
+##                (Done for testing Fastqq and DNA k-mer counting)           ##
11
+##                writeSimFastq, writeSimContFastq, simFastqqRunTimes,       ##
12
+##                sim_fq                                                     ##
13
+##                                                                           ##
14
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
15
+
16
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
17
+## Create fastq files with simulated k-mers
18
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
19
+
20
+writeSimFastq<-function(k=6, nk=5, nSeq=10, filename="sim.fq.gz")
21
+{
22
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
23
+    # k
24
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
25
+    if(!is.numeric(k))
26
+        stop("'k' must be numeric.")
27
+    if(length(k) != 1)
28
+        stop("'k' must have length 1.")
29
+    k <- as.integer(k)    
30
+    if( (k < 1) || (k > max_k) )
31
+        stop("'k' out of range.")
32
+    
33
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
34
+    # nk
35
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
36
+    if(!is.numeric(nk))
37
+        stop("'nk' must be numeric.")
38
+    if(length(nk) != 1)
39
+        stop("'nk' must have length 1.")
40
+    nk<-as.integer(nk)
41
+    if( (nk < 1) || (nk > 1000) )
42
+        stop("nk must be positive and <1000.")
43
+    
44
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
45
+    # nSeq
46
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
47
+    if(!is.numeric(nSeq))
48
+        stop("'nSeq' must be numeric.")
49
+    if(nSeq < 1)
50
+        stop("'nSeq' must be positive.")
51
+    
52
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
53
+    # filename
54
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
55
+    if(!is.character(filename))
56
+        stop("'filename' must be string.")
57
+    
58
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
59
+    # Do the work
60
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
61
+    bm <- Sys.localeconv()[7]
62
+    val <- as.integer(c(k, nk, nSeq))
63
+
64
+    pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools")
65
+    res <- .Call("gzwrite_fastq_dna", val, pseq, filename, PACKAGE="seqTools")
66
+
67
+    message("[writeSimFastq] file '", basename(filename), "': ",
68
+            format(res,big.mark=bm), " Bytes written.")
69
+  
70
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
71
+    # Terminate function.
72
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
73
+    return(invisible())
74
+}
75
+
76
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
77
+## writeSimContFastq
78
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
79
+writeSimContFastq<-function(k=6, nk=5, nSeq=10, pos=1, 
80
+                kIndex=1, nContam=nSeq, filename="simc.fq.gz")
81
+{
82
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
83
+    # k
84
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
85
+    if(!is.numeric(k))
86
+        stop("k must be numeric.")
87
+    if(length(k) != 1)
88
+        stop("k must have length 1.")
89
+    k<-as.integer(k)    
90
+    if( (k < 1) || (k > max_k) )
91
+        stop("k out of range.")
92
+    
93
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
94
+    # nk
95
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
96
+    if(!is.numeric(nk))
97
+        stop("nk must be numeric.")
98
+    if(length(nk) != 1)
99
+        stop("nk must have length 1.")
100
+    nk<-as.integer(nk)
101
+    if( (nk < 1) || (nk > 1000) )
102
+        stop("nk must be positive and <1000.") 
103
+    
104
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
105
+    # nSeq
106
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
107
+    if(!is.numeric(nSeq))
108
+        stop("nSeq must be numeric.")
109
+    if(length(nSeq) != 1)
110
+        stop("nSeq must have length 1.")
111
+    if(nSeq < 1)
112
+        stop("nSeq must be positive.")
113
+    
114
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
115
+    # pos
116
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
117
+    if(!is.numeric(pos))
118
+        stop("pos must be numeric.")
119
+    pos<-as.integer(pos)
120
+    if( any(pos < 1) || any(pos > (k * (nk - 1) + 1)))
121
+        stop("pos out of range.")
122
+    
123
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
124
+    # writeSimContFastq should take 1-based positions
125
+    # while "set_dna_k_mer" takes    0-based positions
126
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
127
+    pos<-pos - 1L
128
+    
129
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
130
+    # kIndex
131
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
132
+    if(!is.numeric(kIndex))
133
+        stop("kIndex must be numeric.")
134
+    
135
+    kIndex <- as.integer(kIndex)
136
+    
137
+    if(length(pos) != length(kIndex))
138
+        stop("pos and kIndex must have equal length.")
139
+    
140
+    if( any(kIndex < 1L) || any(kIndex > 4^k) )
141
+        stop("kIndex out of range.")
142
+    
143
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
144
+    # writeSimContFastq should take 1-based k-indices
145
+    # while "set_dna_k_mer" takes    0-based k-indices
146
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
147
+    kIndex <- kIndex - 1L
148
+    
149
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
150
+    # nContam: Deterministic replacements are copied
151
+    # into the first 'nContam' reads
152
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
153
+    if(!is.numeric(nContam))
154
+        stop("nContam must be integer.")
155
+    
156
+    nContam <- as.integer(nContam)
157
+    
158
+    if(length(nContam) != 1)
159
+        stop("nContam must have length 1.")
160
+    
161
+    if( (nContam < 1) || (nContam > nSeq) )
162
+        stop("nContam must be >0 and <nSeq.")
163
+    
164
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
165
+    # filename
166
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
167
+    if(!is.character(filename))
168
+        stop("filename must be character.")
169
+    
170
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
171
+    # Do the work
172
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
173
+    bm <- Sys.localeconv()[7]
174
+    
175
+    val <- as.integer(c(k, nk, nSeq))
176
+    
177
+    pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools")
178
+    
179
+    prseq <- .Call("set_dna_k_mer", 
180
+                val, pseq,pos, kIndex, nContam, PACKAGE="seqTools")
181
+  
182
+    res <- .Call("gzwrite_fastq_dna",
183
+                    val, prseq, filename, PACKAGE="seqTools")
184
+  
185
+    message("[writeSimContFastq] file '", basename(filename), 
186
+            "': ", format(res,big.mark=bm), " Bytes written.")
187
+  
188
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
189
+    # Terminate function
190
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
191
+    return(invisible())
192
+}
193
+
194
+
195
+
196
+simFastqqRunTimes<-function(k, nSeq, filedir=".")
197
+{
198
+    if(missing(k))
199
+        k <- 2:(max_k)
200
+    else
201
+    {
202
+        if(!is.numeric(k))
203
+            stop("'k' must be numeric.")
204
+        
205
+        k <- sort(unique(as.integer(k)))
206
+
207
+        if(any(k < 1) || any(k > max_k))
208
+            stop("'k' out of range.")
209
+    }
210
+    
211
+    if(missing(nSeq))
212
+        nSeq <- as.integer(c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7))
213
+    else {
214
+        if(!is.numeric(nSeq))
215
+            stop("'nSeq' must be numeric.")
216
+        nSeq <- as.integer(nSeq)
217
+        if(any(nSeq < 1) )
218
+            stop("'nSeq' must be positive.")
219
+    }
220
+    
221
+    if(!is.character(filedir))
222
+        stop("'filedir' must be character.")
223
+    
224
+    if(!file.exists(filedir))
225
+    {
226
+        if(!dir.create(filedir))
227
+            stop("Cannot create filedir '", filedir, "'.")
228
+    }
229
+    
230
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
231
+    # Write simulated fastq files
232
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
233
+    message("[simFastqqRunTimes] Creating fastq files:")
234
+    fqFiles <- character(length(nSeq))
235
+    bm <- Sys.localeconv()[7]
236
+    
237
+    for(j in 1:length(nSeq))
238
+    {
239
+        fqFiles[j] <- file.path(filedir,
240
+            paste("sfqrt_nSeq", nSeq[j], ".fq.gz", sep=""))
241
+        
242
+        message("[simFastqqRunTimes] (",format(j , w=3),"/", length(nSeq),
243
+            ") nSeq=", format(nSeq[j], big.mark=bm, w=12))
244
+        
245
+        # Allways equal sized reads: k=6, read length=102
246
+        writeSimFastq(k=6, nk=17, nSeq=nSeq[j], filename=fqFiles[j])    
247
+    }
248
+    
249
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
250
+    # Prepare result table
251
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
252
+    nSim <- length(k) * length(nSeq)
253
+    m <- 1
254
+    res<-data.frame(id=1:nSim, k=0, nSeq=0, runtime=0)
255
+    
256
+    for(i in 1:length(k))
257
+    {
258
+        for(j in 1:length(nSeq))
259
+        {
260
+            message("[simFastqqRunTimes] (",
261
+                        format(m, width=3),"/", nSim, ") Fastqq run.")
262
+            
263
+            fq <- fastqq(fqFiles[j], k=k[i])
264
+            res$k[m] <- k[i]
265
+            res$nSeq[m] <- nSeq[j]
266
+            res$runtime[m] <- collectDur(fq)
267
+            m <- m + 1
268
+        }
269
+    }
270
+    return(res)
271
+}
272
+
273
+
274
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
275
+## Encapsulates whole simulation studies
276
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
277
+
278
+sim_fq<-function(nRep=2, nContamVec=c(100, 1000), grSize=20, nSeq=1e4,
279
+                k=6, kIndex=1365, pos=20)
280
+{
281
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
282
+    ## Arguments
283
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
284
+    
285
+    # nContam   : Absolute number of contaminated sequences
286
+    # grSize    : Nr of fastq files in control and contamination group
287
+    # nSeq      : Nr of reads per fastq file
288
+    # k         : k-mer length used in fastqq function
289
+    # kIndex    : k-mer index of contaminating sequence default is "CCCCCC"
290
+    # pos       : Contamination is inserted at position in read sequence, 
291
+    #               default is 20
292
+  
293
+    if(length(kIndex) != length(pos))
294
+        stop("'kIndex' and 'pos' must have equal length.")
295
+    
296
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
297
+    
298
+    
299
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
300
+    ## Fixed internal values
301
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
302
+    
303
+    # ksim        : k-mer sized used for simulation
304
+    ksim <- 6
305
+    # nksim     : number of k-mers per read (determines read length)
306
+    nksim <- 17
307
+    
308
+    
309
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
310
+    ## Write simulated fastq files for control group
311
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
312
+    
313
+    # ctrl_vec=leaf labels for control group
314
+    ctrl_vec <- 1:grSize
315
+    ctrl_files <- paste("sfq_i", ctrl_vec, "_ctrl.fq.gz", sep = "")
316
+    
317
+    for(i in ctrl_vec){
318
+        writeSimFastq(k=ksim, nk=nksim, nSeq=nSeq,
319
+                                        filename = ctrl_files[i])
320
+    }
321
+    
322
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
323
+    ## Prepare values for contamination and result
324
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
325
+    
326
+    # cont_vec=leaf labels for contamination group
327
+    # (must be nonoverlapping with ctrl_vec)
328
+    cont_vec <- ctrl_vec + 100
329
+    contamVecLen <- length(nContamVec)
330
+    nSim = nRep * contamVecLen
331
+    # Result data.frame
332
+    res <- data.frame(id=1:nSim, nContam=0, rep=0, sum=NA)
333
+    
334
+    m <- 1
335
+    for(i in 1:length(nContamVec))
336
+    {
337
+        nContam<-nContamVec[i]
338
+        for(j in 1:nRep)
339
+        {
340
+            message("[sim_fq] Simulation (",
341
+                                    format(m, width=3), "/", nSim, ").")
342
+            
343
+            cont_files <- paste("sfq_i", cont_vec, "_cont.fq.gz", sep="")
344
+            
345
+            for(i in 1:length(cont_vec))
346
+                writeSimContFastq(k=ksim, nk=nksim, nSeq=nSeq, pos=pos,
347
+                    nContam=nContam, kIndex=kIndex,
348
+                    filename=cont_files[i])
349
+
350
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
351
+            ## read Fastqq
352
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
353
+            ctrl_fqq <- fastqq(ctrl_files, k=k)
354
+            probeLabel(ctrl_fqq) <- ctrl_vec
355
+            cont_fqq <- fastqq(cont_files, k=k)
356
+            probeLabel(cont_fqq) <- cont_vec
357
+            
358
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
359
+            ## merge, dist, hclust
360
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
361
+            mrg <- mergeFastqq(ctrl_fqq, cont_fqq)
362
+            mtx <- cbDistMatrix(mrg)
363
+            hc <- hclust(as.dist(mtx))
364
+            
365
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
366
+            ## Calculate results and write into res
367
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
368
+            res$nContam[m] <- nContam
369
+            res$rep[m] <- j
370
+            
371
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
372
+            ## Calculate how many labels of the contamination group
373
+            ## are in the first half of leaf labels
374
+            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
375
+            res$sum[m] <- sum(hc$order[ctrl_vec] > grSize)
376
+            m <- m + 1    
377
+        }
378
+    }
379
+    res$lat <- pmin(res$sum, grSize - res$sum) / grSize * 100
380
+    res$perc <- res$nContam / nSeq * 100
381
+    return(res)
382
+}
383
+
384
+
385
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
386
+## END OF FILE
387
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##