Browse code

Prepare for reimport

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@128653 bc3139a8-67e5-0310-9ffc-ced21a209358

Wolfgang Kaisers authored on 13/04/2017 15:55:03
Showing 1 changed files

1 1
deleted file mode 100644
... ...
@@ -1,387 +0,0 @@
1
-
2
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
-##                                                                           ##
4
-##  Project   :   seqTools                                                   ##
5
-##  Created   :   26.August.2013                                             ##
6
-##  Author    :   W. Kaisers                                                 ##
7
-##  Version   :   0.99.13                                                    ##
8
-##  File      :   simFunctions.r                                             ##
9
-##  Content   :   Functions which produce fastq files with simulated data    ##
10
-##                (Done for testing Fastqq and DNA k-mer counting)           ##
11
-##                writeSimFastq, writeSimContFastq, simFastqqRunTimes,       ##
12
-##                sim_fq                                                     ##
13
-##                                                                           ##
14
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
15
-
16
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
17
-## Create fastq files with simulated k-mers
18
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
19
-
20
-writeSimFastq<-function(k=6, nk=5, nSeq=10, filename="sim.fq.gz")
21
-{
22
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
23
-    # k
24
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
25
-    if(!is.numeric(k))
26
-        stop("'k' must be numeric.")
27
-    if(length(k) != 1)
28
-        stop("'k' must have length 1.")
29
-    k <- as.integer(k)    
30
-    if( (k < 1) || (k > max_k) )
31
-        stop("'k' out of range.")
32
-    
33
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
34
-    # nk
35
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
36
-    if(!is.numeric(nk))
37
-        stop("'nk' must be numeric.")
38
-    if(length(nk) != 1)
39
-        stop("'nk' must have length 1.")
40
-    nk<-as.integer(nk)
41
-    if( (nk < 1) || (nk > 1000) )
42
-        stop("nk must be positive and <1000.")
43
-    
44
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
45
-    # nSeq
46
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
47
-    if(!is.numeric(nSeq))
48
-        stop("'nSeq' must be numeric.")
49
-    if(nSeq < 1)
50
-        stop("'nSeq' must be positive.")
51
-    
52
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
53
-    # filename
54
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
55
-    if(!is.character(filename))
56
-        stop("'filename' must be string.")
57
-    
58
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
59
-    # Do the work
60
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
61
-    bm <- Sys.localeconv()[7]
62
-    val <- as.integer(c(k, nk, nSeq))
63
-
64
-    pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools")
65
-    res <- .Call("gzwrite_fastq_dna", val, pseq, filename, PACKAGE="seqTools")
66
-
67
-    message("[writeSimFastq] file '", basename(filename), "': ",
68
-            format(res,big.mark=bm), " Bytes written.")
69
-  
70
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
71
-    # Terminate function.
72
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
73
-    return(invisible())
74
-}
75
-
76
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
77
-## writeSimContFastq
78
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
79
-writeSimContFastq<-function(k=6, nk=5, nSeq=10, pos=1, 
80
-                kIndex=1, nContam=nSeq, filename="simc.fq.gz")
81
-{
82
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
83
-    # k
84
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
85
-    if(!is.numeric(k))
86
-        stop("k must be numeric.")
87
-    if(length(k) != 1)
88
-        stop("k must have length 1.")
89
-    k<-as.integer(k)    
90
-    if( (k < 1) || (k > max_k) )
91
-        stop("k out of range.")
92
-    
93
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
94
-    # nk
95
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
96
-    if(!is.numeric(nk))
97
-        stop("nk must be numeric.")
98
-    if(length(nk) != 1)
99
-        stop("nk must have length 1.")
100
-    nk<-as.integer(nk)
101
-    if( (nk < 1) || (nk > 1000) )
102
-        stop("nk must be positive and <1000.") 
103
-    
104
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
105
-    # nSeq
106
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
107
-    if(!is.numeric(nSeq))
108
-        stop("nSeq must be numeric.")
109
-    if(length(nSeq) != 1)
110
-        stop("nSeq must have length 1.")
111
-    if(nSeq < 1)
112
-        stop("nSeq must be positive.")
113
-    
114
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
115
-    # pos
116
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
117
-    if(!is.numeric(pos))
118
-        stop("pos must be numeric.")
119
-    pos<-as.integer(pos)
120
-    if( any(pos < 1) || any(pos > (k * (nk - 1) + 1)))
121
-        stop("pos out of range.")
122
-    
123
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
124
-    # writeSimContFastq should take 1-based positions
125
-    # while "set_dna_k_mer" takes    0-based positions
126
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
127
-    pos<-pos - 1L
128
-    
129
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
130
-    # kIndex
131
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
132
-    if(!is.numeric(kIndex))
133
-        stop("kIndex must be numeric.")
134
-    
135
-    kIndex <- as.integer(kIndex)
136
-    
137
-    if(length(pos) != length(kIndex))
138
-        stop("pos and kIndex must have equal length.")
139
-    
140
-    if( any(kIndex < 1L) || any(kIndex > 4^k) )
141
-        stop("kIndex out of range.")
142
-    
143
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
144
-    # writeSimContFastq should take 1-based k-indices
145
-    # while "set_dna_k_mer" takes    0-based k-indices
146
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
147
-    kIndex <- kIndex - 1L
148
-    
149
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
150
-    # nContam: Deterministic replacements are copied
151
-    # into the first 'nContam' reads
152
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
153
-    if(!is.numeric(nContam))
154
-        stop("nContam must be integer.")
155
-    
156
-    nContam <- as.integer(nContam)
157
-    
158
-    if(length(nContam) != 1)
159
-        stop("nContam must have length 1.")
160
-    
161
-    if( (nContam < 1) || (nContam > nSeq) )
162
-        stop("nContam must be >0 and <nSeq.")
163
-    
164
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
165
-    # filename
166
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
167
-    if(!is.character(filename))
168
-        stop("filename must be character.")
169
-    
170
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
171
-    # Do the work
172
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
173
-    bm <- Sys.localeconv()[7]
174
-    
175
-    val <- as.integer(c(k, nk, nSeq))
176
-    
177
-    pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools")
178
-    
179
-    prseq <- .Call("set_dna_k_mer", 
180
-                val, pseq,pos, kIndex, nContam, PACKAGE="seqTools")
181
-  
182
-    res <- .Call("gzwrite_fastq_dna",
183
-                    val, prseq, filename, PACKAGE="seqTools")
184
-  
185
-    message("[writeSimContFastq] file '", basename(filename), 
186
-            "': ", format(res,big.mark=bm), " Bytes written.")
187
-  
188
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
189
-    # Terminate function
190
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
191
-    return(invisible())
192
-}
193
-
194
-
195
-
196
-simFastqqRunTimes<-function(k, nSeq, filedir=".")
197
-{
198
-    if(missing(k))
199
-        k <- 2:(max_k)
200
-    else
201
-    {
202
-        if(!is.numeric(k))
203
-            stop("'k' must be numeric.")
204
-        
205
-        k <- sort(unique(as.integer(k)))
206
-
207
-        if(any(k < 1) || any(k > max_k))
208
-            stop("'k' out of range.")
209
-    }
210
-    
211
-    if(missing(nSeq))
212
-        nSeq <- as.integer(c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7))
213
-    else {
214
-        if(!is.numeric(nSeq))
215
-            stop("'nSeq' must be numeric.")
216
-        nSeq <- as.integer(nSeq)
217
-        if(any(nSeq < 1) )
218
-            stop("'nSeq' must be positive.")
219
-    }
220
-    
221
-    if(!is.character(filedir))
222
-        stop("'filedir' must be character.")
223
-    
224
-    if(!file.exists(filedir))
225
-    {
226
-        if(!dir.create(filedir))
227
-            stop("Cannot create filedir '", filedir, "'.")
228
-    }
229
-    
230
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
231
-    # Write simulated fastq files
232
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
233
-    message("[simFastqqRunTimes] Creating fastq files:")
234
-    fqFiles <- character(length(nSeq))
235
-    bm <- Sys.localeconv()[7]
236
-    
237
-    for(j in 1:length(nSeq))
238
-    {
239
-        fqFiles[j] <- file.path(filedir,
240
-            paste("sfqrt_nSeq", nSeq[j], ".fq.gz", sep=""))
241
-        
242
-        message("[simFastqqRunTimes] (",format(j , w=3),"/", length(nSeq),
243
-            ") nSeq=", format(nSeq[j], big.mark=bm, w=12))
244
-        
245
-        # Allways equal sized reads: k=6, read length=102
246
-        writeSimFastq(k=6, nk=17, nSeq=nSeq[j], filename=fqFiles[j])    
247
-    }
248
-    
249
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
250
-    # Prepare result table
251
-    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
252
-    nSim <- length(k) * length(nSeq)
253
-    m <- 1
254
-    res<-data.frame(id=1:nSim, k=0, nSeq=0, runtime=0)
255
-    
256
-    for(i in 1:length(k))
257
-    {
258
-        for(j in 1:length(nSeq))
259
-        {
260
-            message("[simFastqqRunTimes] (",
261
-                        format(m, width=3),"/", nSim, ") Fastqq run.")
262
-            
263
-            fq <- fastqq(fqFiles[j], k=k[i])
264
-            res$k[m] <- k[i]
265
-            res$nSeq[m] <- nSeq[j]
266
-            res$runtime[m] <- collectDur(fq)
267
-            m <- m + 1
268
-        }
269
-    }
270
-    return(res)
271
-}
272
-
273
-
274
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
275
-## Encapsulates whole simulation studies
276
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
277
-
278
-sim_fq<-function(nRep=2, nContamVec=c(100, 1000), grSize=20, nSeq=1e4,
279
-                k=6, kIndex=1365, pos=20)
280
-{
281
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
282
-    ## Arguments
283
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
284
-    
285
-    # nContam   : Absolute number of contaminated sequences
286
-    # grSize    : Nr of fastq files in control and contamination group
287
-    # nSeq      : Nr of reads per fastq file
288
-    # k         : k-mer length used in fastqq function
289
-    # kIndex    : k-mer index of contaminating sequence default is "CCCCCC"
290
-    # pos       : Contamination is inserted at position in read sequence, 
291
-    #               default is 20
292
-  
293
-    if(length(kIndex) != length(pos))
294
-        stop("'kIndex' and 'pos' must have equal length.")
295
-    
296
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
297
-    
298
-    
299
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
300
-    ## Fixed internal values
301
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
302
-    
303
-    # ksim        : k-mer sized used for simulation
304
-    ksim <- 6
305
-    # nksim     : number of k-mers per read (determines read length)
306
-    nksim <- 17
307
-    
308
-    
309
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
310
-    ## Write simulated fastq files for control group
311
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
312
-    
313
-    # ctrl_vec=leaf labels for control group
314
-    ctrl_vec <- 1:grSize
315
-    ctrl_files <- paste("sfq_i", ctrl_vec, "_ctrl.fq.gz", sep = "")
316
-    
317
-    for(i in ctrl_vec){
318
-        writeSimFastq(k=ksim, nk=nksim, nSeq=nSeq,
319
-                                        filename = ctrl_files[i])
320
-    }
321
-    
322
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
323
-    ## Prepare values for contamination and result
324
-    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
325
-    
326
-    # cont_vec=leaf labels for contamination group
327
-    # (must be nonoverlapping with ctrl_vec)
328
-    cont_vec <- ctrl_vec + 100
329
-    contamVecLen <- length(nContamVec)
330
-    nSim = nRep * contamVecLen
331
-    # Result data.frame
332
-    res <- data.frame(id=1:nSim, nContam=0, rep=0, sum=NA)
333
-    
334
-    m <- 1
335
-    for(i in 1:length(nContamVec))
336
-    {
337
-        nContam<-nContamVec[i]
338
-        for(j in 1:nRep)
339
-        {
340
-            message("[sim_fq] Simulation (",
341
-                                    format(m, width=3), "/", nSim, ").")
342
-            
343
-            cont_files <- paste("sfq_i", cont_vec, "_cont.fq.gz", sep="")
344
-            
345
-            for(i in 1:length(cont_vec))
346
-                writeSimContFastq(k=ksim, nk=nksim, nSeq=nSeq, pos=pos,
347
-                    nContam=nContam, kIndex=kIndex,
348
-                    filename=cont_files[i])
349
-
350
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
351
-            ## read Fastqq
352
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
353
-            ctrl_fqq <- fastqq(ctrl_files, k=k)
354
-            probeLabel(ctrl_fqq) <- ctrl_vec
355
-            cont_fqq <- fastqq(cont_files, k=k)
356
-            probeLabel(cont_fqq) <- cont_vec
357
-            
358
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
359
-            ## merge, dist, hclust
360
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
361
-            mrg <- mergeFastqq(ctrl_fqq, cont_fqq)
362
-            mtx <- cbDistMatrix(mrg)
363
-            hc <- hclust(as.dist(mtx))
364
-            
365
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
366
-            ## Calculate results and write into res
367
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
368
-            res$nContam[m] <- nContam
369
-            res$rep[m] <- j
370
-            
371
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
372
-            ## Calculate how many labels of the contamination group
373
-            ## are in the first half of leaf labels
374
-            ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
375
-            res$sum[m] <- sum(hc$order[ctrl_vec] > grSize)
376
-            m <- m + 1    
377
-        }
378
-    }
379
-    res$lat <- pmin(res$sum, grSize - res$sum) / grSize * 100
380
-    res$perc <- res$nContam / nSeq * 100
381
-    return(res)
382
-}
383
-
384
-
385
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
386
-## END OF FILE
387
-## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##