Browse code

add seqTools package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@95415 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 13/10/2014 19:02:34
Showing81 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+Package: seqTools
2
+Type: Package
3
+Title: Analysis of nucleotide, sequence and quality content on fastq
4
+        files.
5
+Version: 0.99.44
6
+Date: 2013-10-14
7
+Author: Wolfgang Kaisers
8
+Maintainer: Wolfgang Kaisers <kaisers@med.uni-duesseldorf.de>
9
+Description: Analyze read length, phred scores and alphabet frequency and DNA k-mers on uncompressed and compressed fastq files.
10
+biocViews: QualityControl,Sequencing
11
+License: Artistic-2.0
12
+Depends: methods,utils,zlibbioc
13
+LinkingTo: zlibbioc
14
+Suggests: RUnit, BiocGenerics
0 15
new file mode 100644
... ...
@@ -0,0 +1,65 @@
1
+#exportPattern("^[[:alpha:]]+")
2
+useDynLib(seqTools)
3
+import(methods)
4
+import(zlibbioc)
5
+importFrom(utils,head)
6
+importFrom(utils,tail)
7
+export(
8
+    ascii2char,
9
+    countDnaKmers,
10
+    countFastaKmers,
11
+    countGenomeKmers,
12
+    countSpliceKmers,
13
+    char2ascii,
14
+    kMerIndex,
15
+    phredTable,
16
+    fastqq,
17
+    fastqKmerLocs,
18
+    fastqKmerSubsetLocs,
19
+    phredDist,
20
+    plotPhredDist,
21
+    plotPhredQuant,
22
+    phredQuantiles,
23
+    revCountDnaKmers,
24
+    simFastqqRunTimes,
25
+    sim_fq,
26
+    trimFastq,
27
+    writeFai,
28
+    writeSimFastq,
29
+    writeSimContFastq
30
+)
31
+exportMethods(
32
+    cbDistMatrix,
33
+    collectTime,
34
+    collectDur,
35
+    gcContent,
36
+    gcContentMatrix,
37
+    fileNames,
38
+    getK,
39
+    kmerCount,
40
+    maxSeqLen,
41
+    meltDownK,
42
+    mergedPhred,
43
+    mergedPhredQuantiles,
44
+    mergeFastqq,
45
+    nFiles,
46
+    nNnucs,
47
+    nReads,
48
+    nucFreq,
49
+    phred,
50
+    plotGCcontent,
51
+    plotKmerCount,
52
+    plotMergedPhredQuant,
53
+    plotNucCount,
54
+    plotNucFreq,
55
+    probeLabel,
56
+    "probeLabel<-",
57
+    propPhred,
58
+    seqLen,
59
+    seqLenCount
60
+)
61
+exportClasses(
62
+    Fastqq
63
+)
64
+
65
+
0 66
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+CHANGES IN VERSION 0.99.42
2
+-------------------------
3
+
4
+NEW FEATURES
5
+
6
+    o (none)
7
+
8
+SIGNIFICANT USER-VISIBLE CHANGES
9
+
10
+    o (none)
11
+
12
+BUG FIXES
13
+
14
+    o trimFastq also includes record identifiers in output files
15
+	(=read title), e.g. SRR014849.1 EIXKN4201CFU84 length=93.
16
+
17
+
18
+CHANGES IN VERSION 0.99.40
19
+-------------------------
20
+
21
+NEW FEATURES
22
+
23
+    o Included this NEWS file
24
+
25
+
26
+SIGNIFICANT USER-VISIBLE CHANGES
27
+
28
+    o (none)
29
+
30
+BUG FIXES
31
+
32
+    o (none)
0 33
new file mode 100644
... ...
@@ -0,0 +1,35 @@
1
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
2
+##                                                                           ##
3
+##  Project   :   seqTools                                                   ##
4
+##  Created   :   26.August.2013                                             ##
5
+##  Author    :   W. Kaisers                                                 ##
6
+##  Version   :   0.9.0                                                      ##
7
+##  File      :   allClasses.r                                               ##
8
+##  Content   :   Declarations for all S4 classes in package                 ##
9
+##                                                                           ##
10
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
11
+
12
+.onUnload<-function(libpath) { library.dynam.unload("seqTools",libpath) }
13
+
14
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
15
+setClass("Fastqq",representation(
16
+    filenames="character",
17
+    probeLabel="character",
18
+    nFiles="integer",
19
+    k="integer",
20
+    maxSeqLen="integer",
21
+    kmer="matrix",
22
+    firstKmer="matrix",
23
+    nReads="integer",
24
+    nN="integer",
25
+    seqLenCount="matrix",
26
+    gcContent="matrix",
27
+    nac="list",
28
+    phred="list",
29
+    seqLen="matrix",
30
+    collectTime="list"))
31
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
32
+
33
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
34
+## END OF FILE
35
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
0 36
new file mode 100644
... ...
@@ -0,0 +1,105 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  File      :   allGenerics.r                                              ##
8
+##  Content   :   All static variables and (not directly object related )    ##
9
+##                function declarations                                      ##
10
+##                                                                           ##
11
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
12
+
13
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
14
+## Changing Fastqq object structure
15
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
16
+setGeneric("mergeFastqq", function(lhs, rhs) standardGeneric("mergeFastqq"))
17
+setGeneric("meltDownK", function(object, newK) standardGeneric("meltDownK"))
18
+
19
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
20
+## Preparation for Hierarchical clustering (HC)
21
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
22
+setGeneric("cbDistMatrix", function(object, nReadNorm = max(nReads(object))) 
23
+                  standardGeneric("cbDistMatrix"))
24
+
25
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
26
+## Slot accessors
27
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
28
+
29
+setGeneric("fileNames",   function(object) standardGeneric("fileNames"))
30
+setGeneric("collectTime", function(object) standardGeneric("collectTime"))
31
+setGeneric("collectDur", function(object) standardGeneric("collectDur"))
32
+setGeneric("getK", function(object) standardGeneric("getK"))
33
+setGeneric("nFiles", function(object) standardGeneric("nFiles"))
34
+setGeneric("nNnucs", function(object) standardGeneric("nNnucs"))
35
+setGeneric("nReads", function(object) standardGeneric("nReads"))
36
+setGeneric("maxSeqLen", function(object) standardGeneric("maxSeqLen"))
37
+setGeneric("seqLenCount", function(object) standardGeneric("seqLenCount"))
38
+setGeneric("nucFreq", function(object, i) standardGeneric("nucFreq"))
39
+setGeneric("gcContent", function(object, i) standardGeneric("gcContent"))
40
+
41
+setGeneric("gcContentMatrix", function(object)
42
+                                    standardGeneric("gcContentMatrix"))
43
+
44
+setGeneric("seqLen", function(object) standardGeneric("seqLen"))
45
+setGeneric("kmerCount", function(object) standardGeneric("kmerCount"))
46
+
47
+setGeneric("probeLabel", function(object, label)
48
+                                        standardGeneric("probeLabel"))
49
+
50
+setGeneric("probeLabel<-", function(object, value)
51
+                                        standardGeneric("probeLabel<-"))
52
+
53
+setGeneric("phred", function(object, i) standardGeneric("phred"))
54
+
55
+
56
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
57
+## Retrieving Phred distribution
58
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
59
+
60
+setGeneric("phredQuantiles", function(object, quantiles, i, ...)
61
+                                        standardGeneric("phredQuantiles"))
62
+
63
+setGeneric("plotPhredQuant", function(object, i, main, ...)
64
+                                        standardGeneric("plotPhredQuant"))
65
+
66
+## Global Phred content functions
67
+setGeneric("phredDist", function(object, i) standardGeneric("phredDist"))
68
+
69
+setGeneric("plotPhredDist", function(object, i, maxp=45, col, ...)
70
+                                        standardGeneric("plotPhredDist"))
71
+
72
+setGeneric("propPhred", function(object, greater=30, less=93)
73
+                                            standardGeneric("propPhred"))
74
+
75
+
76
+setGeneric("mergedPhred", function(object) standardGeneric("mergedPhred"))
77
+
78
+setGeneric("mergedPhredQuantiles", function(object, quantiles)
79
+                                    standardGeneric("mergedPhredQuantiles"))
80
+
81
+setGeneric("plotMergedPhredQuant", function(object, main, ...)
82
+                                    standardGeneric("plotMergedPhredQuant"))
83
+
84
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
85
+## Predefined Plot functions
86
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
87
+
88
+setGeneric("plotNucFreq", function(object, i, main, maxx, ...)
89
+                                            standardGeneric("plotNucFreq"))
90
+
91
+setGeneric("plotGCcontent", function(object,main,...)
92
+                                        standardGeneric("plotGCcontent"))
93
+
94
+setGeneric("plotNucCount", function(object, nucs=16, maxx, ...)
95
+                                        standardGeneric("plotNucCount"))
96
+
97
+setGeneric("plotKmerCount",
98
+                    function(object, index, mxey, main="K-mer count", ...)
99
+                                            standardGeneric("plotKmerCount"))
100
+
101
+
102
+
103
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
104
+## END OF FILE
105
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
0 106
new file mode 100644
... ...
@@ -0,0 +1,113 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  File      :   allStatics.r                                               ##
8
+##  Content   :   All static variables and (not directly object related )    ##
9
+##                function declarations                                      ##
10
+##                                                                           ##
11
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
12
+
13
+
14
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
15
+## Global variables:
16
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
17
+
18
+strandlevels <- c("+", "-", "*")
19
+iupac_chars <- c('a', 'c', 'g', 't', 'u', 'r', 'y', 's', 'w', 'k', 'm', 'b',
20
+                'd', 'h', 'v', 'n', '.', '-', '=')
21
+
22
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
23
+## Although static array sizes would allow larger k (e.g. 15: 8 GB RAM), 
24
+## exponential increase in run-time restricts usability to k values in 1:12
25
+## 4^12 = 16.777.216 possible combinations.
26
+## 'Hard' coded (in stat_defs.h: 15)
27
+max_k <- 12
28
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
29
+
30
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
31
+## End global variables
32
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
33
+
34
+
35
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
36
+## Some useful functions for work with phred's
37
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
38
+
39
+char2ascii <- function(c) { strtoi(charToRaw(c), base=16L) }
40
+
41
+ascii2char <- function(x, multiple=FALSE)
42
+                                { rawToChar(as.raw(x), multiple=multiple) }
43
+
44
+phredTable <- function(phred=0:93)
45
+{
46
+    if(!is.numeric(phred))
47
+        stop("phred must be numeric.")
48
+    
49
+    phred <- sort(unique(as.integer(phred)))
50
+    
51
+    if( (phred[1] < 0) || (max(phred) > 93) )
52
+        stop("Only phred values in 0:93 are allowed.")
53
+
54
+    ascii <- phred + 33
55
+    return(data.frame(ascii=ascii, phred=phred,
56
+                    char=ascii2char(ascii, multiple=TRUE)))
57
+}
58
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
59
+
60
+# unexported functions,  do not check for x
61
+rel_int  <- function(x) {return(.Call("rel_int", x, PACKAGE="seqTools"))}
62
+rel_real <- function(x) {return(.Call("rel_real", x, PACKAGE="seqTools"))}
63
+
64
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
65
+enlargeIntMat <- function(x, newDim){
66
+    if(!is.matrix(x))
67
+        stop("x must be matrix.")
68
+
69
+    if(!is.numeric(newDim))
70
+        stop("newDim must be numeric.")
71
+    
72
+    newDim <- as.integer(newDim)
73
+
74
+    if(length(newDim) != 2)
75
+        stop("newDim must have length 2.")
76
+
77
+    return(.Call("r_enlarge_int_mat", x, newDim, PACKAGE="seqTools"))
78
+}
79
+
80
+kMerIndex <- function(kMers, k=nchar(kMers)[1], base=1)
81
+{
82
+    if(!is.character(kMers))
83
+        stop("kMers must be character.")
84
+
85
+    if(!is.numeric(k))
86
+        stop("k must be numeric.")
87
+    k <- as.integer(k)
88
+
89
+    if(k<0 || k > max_k)
90
+        stop("k must be in range 0, ... , ", max_k, ".")
91
+
92
+    if(!all(nchar(kMers) == k))
93
+        stop("All kMers must have k characters!")
94
+
95
+    if(!is.numeric(base))
96
+        stop("base must be numeric")
97
+
98
+    if(length(base) > 1)
99
+        stop("base must have length 1.")
100
+    base<-as.integer(base)
101
+
102
+    if(!(base %in% 0:1))
103
+        stop("base must be 0 or 1.")
104
+
105
+    return(.Call("get_Kmer_Index", kMers, k, PACKAGE="seqTools") + base)
106
+}
107
+
108
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
109
+
110
+
111
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
112
+## END OF FILE
113
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
0 114
new file mode 100644
... ...
@@ -0,0 +1,53 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  File      :   fastaFunctions.r                                           ##
8
+##  Content   :   Functions which work on fastq and fastq and write output   ##
9
+##                files: writeFai,  countFastaKmers                          ##
10
+##                                                                           ##
11
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
12
+
13
+writeFai <- function(infiles, outfiles)
14
+{
15
+    if(!is.character(infiles))
16
+        stop("infiles must be character.")
17
+    
18
+    if(!is.character(outfiles))
19
+        stop("outfiles must be character.")
20
+    
21
+    if(length(infiles) != length(outfiles))
22
+        stop("infiles and outfiles must have equal length.")
23
+    
24
+    if(any( !file.exists(infiles) ) )
25
+        stop("Some infile(s) do not exist!\n")
26
+    
27
+    .Call("write_fai", infiles, outfiles, PACKAGE = "seqTools")
28
+    message("[write_fai] done.\n")
29
+    
30
+    return(invisible())
31
+}
32
+
33
+
34
+countFastaKmers <- function(filenames, k=4)
35
+{
36
+    if(!is.character(filenames))
37
+        stop("filenames must be character.")
38
+    
39
+    if(!is.numeric(k))
40
+        stop("k must be numeric.")
41
+    k <- as.integer(k)
42
+    
43
+    if( (k < 0) || (k > max_k) )    
44
+        stop("k must be in range 0,    ... , ", max_k, ".")
45
+    
46
+    res <- .Call("count_fasta_Kmers", filenames, k, PACKAGE="seqTools")
47
+    
48
+    return(res)
49
+}
50
+
51
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
52
+## END OF FILE
53
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
0 54
new file mode 100644
... ...
@@ -0,0 +1,250 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  File      :   kMer.r                                                     ##
8
+##  Content   :   Functionality for counting DNA k-mers                      ##
9
+##                (independent of fastq or fasta files)                      ##
10
+##                countKmers, countDnaKmers, revCountDnaKmers,               ##
11
+##                countGenomeKmers, countSpliceKmers                         ##
12
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
13
+
14
+
15
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
16
+## Counts DNA k-mers on specified regions inside single (character) sequence
17
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
18
+countDnaKmers <- function(dna, k, start=1, width=nchar(dna) - k + 1)
19
+{
20
+    if(!is.character(dna))
21
+        stop("'dna' must be character.")
22
+    
23
+    if(length(dna) != 1)
24
+        stop("'dna' must have length 1.")
25
+    
26
+    if(is.numeric(start))
27
+        start <- as.integer(start)
28
+    
29
+    if(is.numeric(width))
30
+        width <- as.integer(width)
31
+    
32
+    if(length(width) == 1)
33
+        width <- rep(width, length(start))
34
+    
35
+    if(is.numeric(k))
36
+        k <- as.integer(k)
37
+    
38
+    if(length(k)!=1)
39
+        stop("'k' must have length 1.")
40
+    
41
+    if(k < 1)
42
+        stop("'k' must be positive.")
43
+    
44
+    if(k > max_k)
45
+        stop("'k' must not exceed", max_k, ".")
46
+  
47
+    nc <- nchar(dna)
48
+    if(k > nc)
49
+        stop("'k' must be <= nchar(dna).")
50
+    if(any(start + width + k > nc + 2))
51
+        stop("Search region exceeds string end.")
52
+  
53
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
54
+    ## Counts N's
55
+    ## ToDo: Return value
56
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
57
+    nn <- integer(length(start))
58
+    return(.Call("count_dna_Kmers", 
59
+                            dna, start, width, k, nn, PACKAGE="seqTools"))
60
+    
61
+}
62
+
63
+
64
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
65
+## Counts DNA k-mers on specified regions inside single (character) sequence 
66
+## in reverse direction
67
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
68
+revCountDnaKmers <- function(dna, k, start, width)
69
+{
70
+    if(!is.character(dna))
71
+        stop("'dna' must be character.")
72
+    
73
+    if(length(dna) != 1)
74
+        stop("'dna' must have length 1.")
75
+    
76
+    if(is.numeric(start))
77
+        start <- as.integer(start)
78
+    
79
+    if(is.numeric(width))
80
+        width <- as.integer(width)
81
+    
82
+    if(length(width) == 1)
83
+        width <- rep(width, length(start))
84
+    
85
+    if(length(width) != length(start))
86
+        stop("'width' must have length 1 or the same length as 'start'.")
87
+    
88
+    if(is.numeric(k))
89
+        k <- as.integer(k)
90
+    
91
+    if(any(width + k > start))
92
+        stop("'width' must be <=  'start' - 'k'.")
93
+    
94
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
95
+    ## nn contains N counts
96
+    ## ToDo: Add value of nn to returned object
97
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
98
+    nn <- integer(length(start))
99
+    
100
+    return(.Call("rev_count_dna_Kmers",
101
+                            dna, start, width, k, nn, PACKAGE="seqTools"))
102
+}
103
+
104
+
105
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
106
+## Counts DNA k-mers on specified regions inside multiple (character) sequences
107
+## in possibly reversed direction (depending on strand)
108
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
109
+countGenomeKmers <- function(dna, seqid, start, width, strand, k)
110
+{
111
+    if(!is.character(dna))
112
+        stop("'dna' must be character.")
113
+    
114
+    if(!is.numeric(seqid))
115
+        seqid <- as.integer(seqid)
116
+    rg <- range(seqid)
117
+    
118
+    if(rg[1] < 0)
119
+        stop("Negative seqid's are not allowed.")
120
+    
121
+    if(rg[2] > length(dna))
122
+        stop("Out of range seqid's.")
123
+    
124
+    if(!is.numeric(start))
125
+        stop("'start' must be numeric.")
126
+    start <- as.integer(start)
127
+    
128
+    if(!is.numeric(width))
129
+        stop("'width' must be numeric")
130
+    width <- as.integer(width)
131
+    
132
+    if(is.factor(strand))
133
+        strand <- as.integer(strand)
134
+    else
135
+    {
136
+        if(!is.numeric("strand"))
137
+            strand <- as.integer(strand)        
138
+    }
139
+    
140
+    nStart <- length(start)
141
+    if( (length(seqid) != nStart) | (length(width) != nStart) |
142
+                (length(strand) != nStart) )
143
+        stop("'seqid', 'start', 'width' and 'strand' must have same length.")
144
+    
145
+    if(length(k) != 1)
146
+        stop("'k' must be a single value.")
147
+    
148
+    if(!is.numeric(k))
149
+        stop("'k' must be numeric.")
150
+    k<-as.integer(k)
151
+    
152
+    if(k <= 0)
153
+        stop("'k' must be >=1")
154
+    
155
+    if(k > max_k)
156
+        stop("'k' must not exceed", max_k, ".")
157
+    
158
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
159
+    ## Counts N's
160
+    ## ToDo: Return value
161
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
162
+    nn <- integer(length(start))
163
+    return(.Call("count_genome_Kmers", dna, seqid, start, width,
164
+                            strand, k, nn, PACKAGE="seqTools"))
165
+}
166
+
167
+
168
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
169
+## Counts DNA k-mers on each border of a splice-site defined by wLend and 
170
+## wRstart in range of size width
171
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
172
+
173
+countSpliceKmers <- function(dna, seqid, lEnd, rStart, width, strand, k)
174
+{
175
+    if(!is.character(dna))
176
+        stop("'dna' must be character.")
177
+    
178
+    if(!is.numeric(seqid))
179
+        stop("'seqid' must be numeric.")
180
+    seqid <- as.integer(seqid)
181
+    
182
+    rg <- range(seqid)
183
+    if(rg[1] < 0)
184
+        stop("Negative seqid's are not allowed.")
185
+    
186
+    if(rg[2] > length(dna))
187
+        stop("Out of range seqid's.")
188
+    
189
+    if(!is.numeric(lEnd))
190
+        stop("'lEnd' must be numeric.")
191
+    lEnd <- as.integer(lEnd)
192
+    
193
+    if(!is.numeric(rStart))
194
+        stop("'rStart' must be numeric.")
195
+    rStart <- as.integer(rStart)
196
+    
197
+    if(!is.numeric(width))
198
+        stop("'width' must be numeric.")
199
+    width <- as.integer(width)
200
+    
201
+    if(is.factor(strand))
202
+        strand <- as.integer(strand)
203
+    else
204
+    {
205
+        if(!is.numeric("strand"))
206
+            strand <- as.integer(strand)        
207
+    }
208
+    
209
+    nStart <- length(lEnd)
210
+    if(length(seqid) != nStart | length(rStart) != nStart | 
211
+             length(width) != nStart | length(strand) != nStart)
212
+    {
213
+        stop(paste("'seqid', 'lEnd', 'rStart', 'width'",
214
+                            "and 'strand' must have equal length."))
215
+    }
216
+    if(!is.numeric(k))
217
+        stop("'k' must be numeric.")
218
+    k <- as.integer(k)
219
+    
220
+    if(length(k) != 1)
221
+        stop("'k' must be a single value.")
222
+    
223
+    if(k > max_k)
224
+        stop("'k' must not exceed", max_k, ".")
225
+    
226
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
227
+    ## Plus strand
228
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
229
+    plus_strand<-strand == 1
230
+    
231
+    if(sum(plus_strand) > 0)
232
+    {
233
+        if(any((lEnd[plus_strand] - width[plus_strand] - k + 1) < 0))
234
+            stop("lEnd must be >= width+k-1 for all +-strand coordinates")
235
+    }
236
+    
237
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
238
+    ## Counts N's
239
+    ## ToDo: Return value
240
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
241
+    nn <- integer(length = nStart)
242
+    
243
+    return(.Call("count_splice_Kmers", dna, seqid, lEnd, rStart, width, 
244
+                            strand, k, nn, PACKAGE="seqTools")) 
245
+}
246
+
247
+
248
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
249
+## END OF FILE
250
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
0 251
new file mode 100644
... ...
@@ -0,0 +1,1123 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  Content   :   Doing some diagnostic and interventional tasks on fastq    ##
8
+##                and fasta                                                  ##
9
+##                esp. DNA k-mer counts.                                     ##
10
+##  Version   :   0.99.34                                                    ##
11
+##                                                                           ##
12
+##  Changelog :                                                              ##
13
+##  26.08.13  :   0.0.1 Project created                                      ##
14
+##  03.09.13  :   0.0.6 C-Code valgrind tested                               ##
15
+##  27.09.13  :   0.1.0 Added fastqDnaKmers                                  ##
16
+##  14.10.13  :   0.1.1 Added C-library for parsing gz fasta and fastq files ##
17
+##  17.10.13  :   0.1.3 C-Wrapper for fastq working.                         ##
18
+##  17.10.13  :   0.1.6 First version of R package                           ##
19
+##  21.10.13  :   0.3.0 New C library for fastq parsing                      ##
20
+##  28.10.13  :   0.4.0 Added fastq-loc functions                            ##
21
+##  29.10.13  :   0.4.4 seqTools C-code valgrind tested.                     ##
22
+##  01.11.13  :   0.5.0 Distance matrices implemented                        ##
23
+##  02.11.13  :   0.5.1 First working version with clustering based on       ##
24
+##                      K-mers                                               ##
25
+##  07.11.13  :   0.5.4 countFastaKmers now resizes arrays automatically     ##
26
+##  09.11.13  :   0.6.0 count_fastq now resizes arrays automatically         ##
27
+##  11.11.13  :   0.6.5 Added fastq simulation functions                     ##
28
+##  19.11.13  :   0.7.0 Added trimFastq function                             ##
29
+##  30.11.13  :   0.9.0 Separated R source files                             ##
30
+##  22.12.13  :   0.99.2 Added '['-operator for Fastqq class                 ##
31
+##  19.01.14  :   0.99.7 Added zlib version control for correction of        ##
32
+##                       gzBuffer                                            ##
33
+##                       error                                               ##
34
+##                        + checks: cran win-builder + valgrind              ##
35
+##  21.05.14  :   0.99.34 Corrected error in count_fasta_Kmers               ##
36
+##                        which freezed function                             ##
37
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
38
+
39
+.onUnload <- function(libpath) { library.dynam.unload("seqTools", libpath) }
40
+
41
+## see: http://bioconductor.org/developers/how-to/coding-style/
42
+
43
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
44
+## Data collection functions:
45
+## Fastqq,  fastqKmerLocs,  fastqKmerSubsetLocs
46
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
47
+
48
+fastqq <- function(filenames, k=6, probeLabel)
49
+{
50
+    k <- as.integer(k)
51
+    
52
+    tl <- list()
53
+    tl$start <- Sys.time()
54
+    filenames <- path.expand(filenames)
55
+    
56
+    res <- .Call("count_fastq", filenames, k, PACKAGE="seqTools")
57
+    
58
+    tl$end <- Sys.time()
59
+    res@collectTime <- tl
60
+    
61
+    # Correct minSeqLen when empty files are counted
62
+    if(any(res@nReads==0))
63
+        res@seqLen[1,res@nReads==0] <- 0
64
+    
65
+    if(!missing(probeLabel))
66
+    {
67
+        if(length(probeLabel) == res@nFiles)
68
+            res@probeLabel <- as.character(probeLabel) 
69
+        else{
70
+            warning("[Fastqq] probeLabel and filenames must have equal length.")
71
+            res@probeLabel <- as.character(1:res@nFiles)
72
+        }
73
+    }else{
74
+        res@probeLabel <- as.character(1:res@nFiles)
75
+    }
76
+    return(res)
77
+}
78
+
79
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
80
+## fastq K-mer locs
81
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
82
+
83
+fastqKmerLocs <- function(filenames, k=4)
84
+{
85
+    if(!is.numeric(k))
86
+        stop("'k' must be numeric.")
87
+    k <- as.integer(k)
88
+    
89
+    if( (k < 0) || (k > max_k) )
90
+        stop("'k' must be in range 0, ... , 16.")
91
+    
92
+    return(.Call("fastq_Kmer_locs", filenames, k, PACKAGE="seqTools"))
93
+}
94
+
95
+
96
+fastqKmerSubsetLocs <- function(filenames, k=4, kIndex)
97
+{
98
+    # Returns list with matrix elements.
99
+    if(!is.numeric(k))
100
+        stop("'k' must be numeric.")
101
+    
102
+    k <- as.integer(k)
103
+    
104
+    if( (k < 0) || (k > max_k) )
105
+        stop("'k' must be in range 0, ... , ", max_k, ".")
106
+    
107
+    if(!is.numeric(kIndex))
108
+        stop("'kIndex' must be numeric.")
109
+    kIndex <- as.integer(kIndex)
110
+    
111
+    if(any(kIndex < 0))
112
+        stop("No negative 'kIndex' values allowed.")
113
+    
114
+    if(any(kIndex > (4^k)) )
115
+        stop("'kIndex' out of range (>4^k).")
116
+    
117
+    return(.Call("fastq_KmerSubset_locs",
118
+                                filenames, k, kIndex, PACKAGE="seqTools"))
119
+}
120
+
121
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
122
+## End: Data collection functions.
123
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
124
+
125
+
126
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
127
+## Standard slot accessor functions
128
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
129
+
130
+
131
+setMethod("fileNames", "Fastqq", function(object)
132
+                                            {return(object@filenames)})
133
+
134
+setMethod("collectTime", "Fastqq", function(object)
135
+                                            {return(object@collectTime)})
136
+
137
+setMethod("collectDur", "Fastqq", function(object) {
138
+return(as.numeric(difftime(object@collectTime$end, object@collectTime$start,
139
+                                                    units = "secs")))
140
+})
141
+
142
+setMethod("getK", "Fastqq", function(object) {return(object@k)})
143
+
144
+setMethod("nFiles", "Fastqq", function(object) {return(object@nFiles)})
145
+
146
+setMethod("nNnucs", "Fastqq", function(object) {return(object@nN)})
147
+
148
+setMethod("nReads", "Fastqq", function(object) {return(object@nReads)})
149
+
150
+setMethod("maxSeqLen", "Fastqq", function(object) {return(object@maxSeqLen)})
151
+
152
+setMethod("seqLenCount", "Fastqq", function(object)
153
+{
154
+    res<-object@seqLenCount
155
+    colnames(res) <- object@probeLabel
156
+    return(res)
157
+})
158
+
159
+setMethod("nucFreq", "Fastqq", function(object, i)
160
+{
161
+    if(missing(i))
162
+        stop("Argument 'i' is not optional.")
163
+    
164
+    if(!is.numeric(i))
165
+        stop("'i' must be numeric.")
166
+    
167
+    i <- as.integer(i)
168
+    if( (i < 1) || (i > object@nFiles) )
169
+        stop("'i' must be >0 and < nFiles.")
170
+    
171
+    return(object@nac[[i]])
172
+})
173
+
174
+setMethod("gcContent", "Fastqq", function(object, i)
175
+{
176
+    if(missing(i))
177
+        stop("Argument 'i' is not optional.")
178
+    
179
+    if(!is.numeric(i))
180
+        stop("'i' must be numeric.")
181
+    
182
+    i <- as.integer(i)
183
+    if( (i < 1) || (i > object@nFiles))
184
+        stop("'i' must be >0 and < nFiles.")
185
+    
186
+    return(object@gcContent[, i])
187
+})
188
+
189
+
190
+setMethod("gcContentMatrix", "Fastqq", function(object)
191
+{
192
+    gcc <- object@gcContent
193
+    colnames(gcc) <- object@probeLabel
194
+    return(gcc)
195
+})
196
+
197
+setMethod("seqLen", "Fastqq", function(object)
198
+{
199
+    sql <- object@seqLen
200
+    colnames(sql) <- object@probeLabel
201
+    return(sql)
202
+})
203
+
204
+setMethod("kmerCount", "Fastqq", function(object)
205
+{
206
+    kmer <- object@kmer
207
+    colnames(kmer) <- object@probeLabel
208
+    return(kmer)
209
+})
210
+
211
+
212
+setMethod("probeLabel", "Fastqq", function(object){return(object@probeLabel)})
213
+setReplaceMethod("probeLabel", "Fastqq", function(object, value)
214
+{
215
+    if(length(value) != nFiles(object))
216
+        stop("'value' must have length ", nFiles(object), ".")
217
+    
218
+    val <- as.character(value)
219
+    if(any(table(val)) > 1)
220
+    {
221
+        warning("[probeLabel <- .Fastqq] probeLabel unique suffix added.")
222
+        val <- paste(1:nFiles(object), val, sep="_")
223
+    }
224
+    object@probeLabel <- val
225
+    
226
+    return(object)
227
+})
228
+
229
+setMethod("phred", signature="Fastqq", definition=function(object, i)
230
+{
231
+    if(missing(i))
232
+        stop("Argument 'i' is not optional.")
233
+    
234
+    if(!is.numeric(i))
235
+        stop("'i' must be numeric.")
236
+    
237
+    i <- as.integer(i)
238
+    if( (i < 1) || (i > object@nFiles) )
239
+        stop("'i' must be >0 and < nFiles.")
240
+    
241
+    return(object@phred[[i]])
242
+})
243
+
244
+
245
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
246
+## End: Standard slot accessor functions
247
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
248
+
249
+
250
+setMethod("show", "Fastqq", function(object)
251
+{
252
+    bm <- Sys.localeconv()[7]
253
+    w <- 20
254
+    r <- "right"
255
+    cat("Class       : ", format(class(object), w=w, j=r)                              , "\n", sep="")
256
+    cat("nFiles      : ", format(format(nFiles(object)          , big.m=bm), w=w, j=r) , "\n", sep="")
257
+    cat("maxSeqLen   : ", format(format(maxSeqLen(object)       , big.m=bm), w=w, j=r) , "\n", sep="")
258
+    cat("k (Kmer len): ", format(format(getK(object)            , big.m=bm), w=w, j=r) , "\n", sep="")
259
+    cat("\n")
260
+    cat("nReads      : ", format(format(sum(as.numeric(nReads(object)))    , big.m=bm), w=w, j=r)   , "\n", sep="")
261
+    cat("nr  N   nuc : ", format(format(sum(nNnucs(object))     , big.m=bm), w=w, j=r) , "\n", sep="")
262
+    cat("Min seq len : ", format(format(min(seqLen(object)[1, ]), big.m=bm), w=w, j=r) , "\n", sep="")
263
+    cat("Max seq len : ", format(format(max(seqLen(object)[2, ]), big.m=bm), w=w, j=r) , "\n", sep="")
264
+    return(invisible())
265
+})
266
+
267
+
268
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
269
+## Phred related functions
270
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
271
+
272
+setMethod("phredQuantiles", "Fastqq", function(object, quantiles, i, ...)
273
+{
274
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
275
+    # Checking arguments
276
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
277
+    
278
+    ## Check quantiles argument
279
+    if(missing(quantiles))
280
+        stop("'quantiles' argument is not optional")
281
+    
282
+    if(!is.numeric(quantiles))
283
+        stop("Quantiles must be numeric.")
284
+    
285
+    if(!(all(quantiles >= 0) & all(quantiles <= 1) ))
286
+        stop("All quantiles mustbe in [0, 1]")
287
+    
288
+    quantiles <- sort(unique(round(quantiles, 2)))
289
+    
290
+    ## Check 'i' argument
291
+    if(missing(i))
292
+        stop("'i' argument is not optional")
293
+    
294
+    if(!is.numeric(i))
295
+        stop("'i' must be numeric.")
296
+    
297
+    if(length(i) > 1)
298
+        stop("'i' must have length 1")
299
+    
300
+    i <- as.integer(i)
301
+    if( (i < 1) || (i > object@nFiles) )
302
+            stop("'i' must be >0 and <nFiles.")
303
+
304
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
305
+    # Count qual values for each sequence position
306
+    # Convert integer counts into column-wise relative values
307
+    # Maximum counted read length
308
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
309
+    vec <- 1:seqLen(object)[2, i]
310
+    qrel <- as.data.frame(apply(object@phred[[i]][, vec], 2, rel_int))
311
+    names(qrel) <- vec
312
+    
313
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
314
+    # Walk through each column and extract row number
315
+    # for given quantile values
316
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
317
+    res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools")
318
+    return(res)
319
+})
320
+
321
+
322
+setMethod("plotPhredQuant", "Fastqq", function(object, i, main, ...){
323
+    if(!is.numeric(i))
324
+        stop("'i' must be numeric.")
325
+    
326
+    i <- as.integer(i)
327
+    
328
+    if( (i < 1) || (i > object@nFiles) )
329
+        stop("'i' must be >0 and <nFiles.")
330
+    
331
+    quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
332
+    cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4") 
333
+    qq <- phredQuantiles(object, quant, i)
334
+    maxQ = floor(1.2*max(qq))
335
+    xv <- 1:ncol(qq)
336
+    
337
+    if(missing(main))
338
+    {
339
+        main <- paste("Position wise Phred Quantiles (", 
340
+                                probeLabel(object)[i], ")", sep="")
341
+    }
342
+
343
+    plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1,
344
+        ylab = "Phred score", xlab="Sequence position", main=main, ...)
345
+     
346
+    lines(xv, qq[1, ], col=cols[1], lty=2)
347
+    lines(xv, qq[2, ], col=cols[2], lty=1)
348
+    lines(xv, qq[3, ], col=cols[3], lwd=2)
349
+    lines(xv, qq[4, ], col=cols[4], lty=1)
350
+    lines(xv, qq[5, ], col=cols[5], lty=2)
351
+    
352
+    legend("top", ncol=6, lty=c(2, 1, 1, 1, 2),
353
+            lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5,
354
+            legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8)
355
+    return(invisible())
356
+})
357
+
358
+
359
+
360
+
361
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
362
+## Global Phred content functions
363
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
364
+setMethod("phredDist", "Fastqq", function(object, i){
365
+    idx <- 1:nFiles(object)
366
+    
367
+    if(missing(i))
368
+        i <- idx
369
+    else
370
+    {
371
+        if(!is.numeric(i))
372
+            stop("'i' must be numeric.")
373
+        
374
+        if(!all(is.element(i, idx)))
375
+            stop("'i' is out of range.")
376
+    }
377
+    
378
+    phred <- Reduce("+", object@phred[i])
379
+    phred <- matrix(as.numeric(phred), nrow=nrow(phred))
380
+    
381
+    phred_vals <- apply(phred, 1, sum)
382
+    phred_dist <- phred_vals/sum(phred_vals)
383
+    names(phred_dist) <- rownames(object@phred[[1]])
384
+    
385
+    return(phred_dist)    
386
+})
387
+
388
+
389
+setMethod("plotPhredDist", "Fastqq", function(object, i, maxp=45, col, ...)
390
+{
391
+    if(!is.numeric(maxp))
392
+        stop("maxp must be numeric")
393
+    
394
+    if(!is.integer(maxp))
395
+        maxp<-as.integer(maxp)
396
+    
397
+    if(maxp <= 0)
398
+        stop("maxp must be >=0")
399
+    
400
+    if(missing(col))
401
+        col <- topo.colors(10)[3]
402
+    
403
+    phred <- phredDist(object, i)
404
+    maxy <- ceiling(max(phred) * 5) / 5
405
+    x <- 1:maxp
406
+    xmax <- 10 * (ceiling(maxp / 10))
407
+    
408
+    plot(x, phred[x], ylim=c(0, maxy), xlim=c(0, xmax), type="l", lwd=2,
409
+            col=col, yaxt="n", bty="n", xlab="Phred value",
410
+            ylab="Content (%)", ...)
411
+    
412
+    ylb <- 0:(10 * maxy) / 10
413
+    axis(side=2, at=ylb, labels=100 * ylb, las=1)
414
+    return(invisible())
415
+})
416
+
417
+
418
+# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
419
+# Not exported:
420
+# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
421
+pd_l10 <- function(x){ x <- phredDist(x); return(sum(x[1:10]) / sum(x))}
422
+
423
+
424
+setMethod("propPhred","Fastqq",function(object, greater=30, less=93)
425
+{
426
+    if(!is.numeric(greater))
427
+        stop("'greater' must be numeric.")
428
+    
429
+    if(length(greater) != 1)
430
+        stop("'greater' must have length 1.")
431
+    
432
+    if(!is.numeric(less))
433
+        stop("'less' must be numeric.")
434
+    
435
+    if(length(less) != 1)
436
+        stop("'less must have length 1.")
437
+    
438
+    ## + + + + + + + + + + + + + + + + + + + + + + ##
439
+    ## greater and less shall be used as
440
+    ## array indices: increase greater
441
+    ## + + + + + + + + + + + + + + + + + + + + + + ##
442
+    greater<-as.integer(greater+1)
443
+    less<-as.integer(less)
444
+    
445
+    if(greater < 1)
446
+        stop("'greater' must be >=0.")
447
+    
448
+    if(less > 93)
449
+        stop("'less' must be < 94.")
450
+    
451
+    if(greater >= less)
452
+        stop("'greater' must be <= 'less'")
453
+    
454
+    n <- nFiles(object)
455
+    res <- numeric(n)
456
+    for(i in 1:n)
457
+    {
458
+        pd <- phredDist(object, i)
459
+        res[i] <- sum(pd[greater:less])
460
+    }
461
+    names(res) <- probeLabel(object)
462
+    return(res)
463
+})
464
+
465
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
466
+## End: Global Phred content functions
467
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
468
+
469
+setMethod("mergedPhred", "Fastqq", function(object){
470
+    sql <- seqLen(object)
471
+    maxSeqLen <- max(sql[2, ])
472
+    
473
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
474
+    ## as.numeric: Sum on integer is likely to exceed 
475
+    ##                 maximal 32-bit integer  values
476
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
477
+    
478
+    if(sql[2, 1] < maxSeqLen)
479
+    {
480
+        mp <- as.numeric(.Call("r_enlarge_int_mat", object@phred[[1]], 
481
+                c(nrow(object@phred[[1]]), maxSeqLen), PACKAGE="seqTools"))
482
+    }else{
483
+        mp <- as.numeric(object@phred[[1]])
484
+    }
485
+    
486
+    
487
+    n <- nFiles(object)
488
+    if(n > 1)
489
+    {
490
+        for(i in 2:n)
491
+        {
492
+            if(sql[2, i] < maxSeqLen)  
493
+            {
494
+                mp <- mp + as.numeric(.Call("r_enlarge_int_mat",
495
+                            object@phred[[i]],
496
+                            c(nrow(object@phred[[i]]), maxSeqLen), 
497
+                            PACKAGE="seqTools"))
498
+            }else{
499
+                mp <- mp + as.numeric(object@phred[[i]])
500
+            }
501
+        }
502
+    }
503
+    mp <- matrix(mp, ncol = maxSeqLen)
504
+    rownames(mp) <- rownames(object@phred[[1]])
505
+    colnames(mp) <- 1:maxSeqLen
506
+    return(mp)
507
+})
508
+
509
+setMethod("mergedPhredQuantiles", "Fastqq", function(object, quantiles)
510
+{
511
+    if(!(all(quantiles >= 0) & all(quantiles <= 1)) )
512
+        stop("[mergedPhredQuantiles.Fastqq] all quantiles mustbe in [0, 1]")
513
+    quantiles <- sort(unique(round(quantiles, 2)))
514
+  
515
+    sql <- seqLen(object)
516
+    maxSeqLen <- max(sql[2, ])
517
+    
518
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
519
+    ## Count qual values for each sequence position
520
+    ## Convert counts into column-wise relative values
521
+    ## Maximum counted read length
522
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
523
+    vec <- 1:maxSeqLen
524
+    mrg <- mergedPhred(object)
525
+    qrel <- as.data.frame(apply(mrg[, vec], 2, rel_real))
526
+    names(qrel)
527
+    names(qrel) <- vec
528
+    
529
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
530
+    ## Walk through each column and extract row number
531
+    ## for given quantile values
532
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
533
+    res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools")
534
+    return(res)
535
+})
536
+
537
+
538
+setMethod("plotMergedPhredQuant", "Fastqq", function(object, main, ...)
539
+{
540
+    quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
541
+    cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4")
542
+    qq <- mergedPhredQuantiles(object, quant)
543
+    maxQ = floor(1.2*max(qq))
544
+    xv <- 1:ncol(qq)
545
+    
546
+    if(missing(main))
547
+        main <- paste("Merged position wise Phred Quantiles.", sep = "")
548
+    
549
+    plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1,
550
+        ylab="Phred score", xlab="Sequence position", main=main, ...)
551
+    
552
+    
553
+    lines(xv, qq[1, ], col=cols[1], lty=2)
554
+    lines(xv, qq[2, ], col=cols[2], lty=1)
555
+    lines(xv, qq[3, ], col=cols[3], lwd=2)
556
+    lines(xv, qq[4, ], col=cols[4], lty=1)
557
+    lines(xv, qq[5, ], col=cols[5], lty=2)
558
+    
559
+    legend("top", ncol=6, lty=c(2, 1, 1, 1, 2), 
560
+            lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, 
561
+            legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8)
562
+    
563
+    return(invisible()) 
564
+})
565
+
566
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
567
+## End: Phred related functions
568
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
569
+
570
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
571
+## Nucleotide frequency related functions
572
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
573
+
574
+
575
+setMethod("plotNucFreq", "Fastqq", function(object, i, main, maxx, ...)
576
+{
577
+    if(!is.numeric(i))
578
+        stop("'i' must be numeric.")
579
+    
580
+    i <- as.integer(i)
581
+    
582
+    if( (i < 1) || (i > object@nFiles) )
583
+        stop("'i' must be >0 and <nFiles.")
584
+    
585
+    maxSeqlen <- max(seqLen(object)[2, ])
586
+    if(missing(maxx))
587
+    {
588
+        maxx <- maxSeqlen
589
+    }
590
+    else
591
+    {
592
+        if(!is.numeric(maxx))
593
+            stop("'maxx' must be numeric.")
594
+        maxx <- as.integer(maxx)
595
+        if(maxx < 1)
596
+            stop("'maxx' must be >0.")
597
+        if(maxx > maxSeqlen)
598
+            maxx <- maxSeqlen
599
+    }
600
+    
601
+    # Skip extra line
602
+    x <- 1:maxx
603
+    nac <- object@nac[[i]][1:4, x]
604
+    nacrel <- apply(nac, 2, rel_int)
605
+    maxY = round(1.4 * max(nacrel), 1)
606
+    
607
+    # Maximum counted read length
608
+    nacrel <- apply(nac, 2, rel_int)
609
+    cols <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00")
610
+    
611
+    if(missing(main))
612
+        main <- paste("Position wise Nucleotide frequency (",
613
+                                probeLabel(object)[i], ")", sep="")
614
+  
615
+    plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
616
+                ylab="Nucleotide fequency", xlab="sequence position",
617
+                main=main, ...)
618
+ 
619
+    lines(x, nacrel[1, ], col=cols[1], lwd=2)
620
+    lines(x, nacrel[2, ], col=cols[2], lwd=2)
621
+    lines(x, nacrel[3, ], col=cols[3], lwd=2)
622
+    lines(x, nacrel[4, ], col=cols[4], lwd=2)
623
+ 
624
+    legend("top", ncol=6, 
625
+            lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5, 
626
+            legend=c("A", "C", "G", "T"), bty="n", cex=0.8)
627
+    
628
+    return(invisible())
629
+})
630
+
631
+
632
+setMethod("plotGCcontent", "Fastqq", function(object,main,...)
633
+{
634
+    cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
635
+            "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
636
+    
637
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
638
+    ## Normalize matrix to colsum = 1
639
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
640
+    
641
+    gc <- apply(object@gcContent, 2, rel_int)
642
+    maxY = round(1.3*max(gc), 2)
643
+    nf <- nFiles(object)
644
+    x <- 1:nrow(gc)
645
+    
646
+    if(missing(main))
647
+        main<-"GC content"
648
+    
649
+    plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
650
+        ylab="Proportion of reads (%)", xlab="Relative GC content (%)", 
651
+        main=main, ...) 
652
+  
653
+    for(i in 1:nf)
654
+        lines(x, gc[, i], col=cols[i], lwd=2)
655
+  
656
+    legend("right", lwd=2, col=cols, legend=probeLabel(object),
657
+                                                bty="n", cex=0.8)
658
+  
659
+    return(invisible())
660
+})
661
+
662
+
663
+setMethod("plotNucCount", "Fastqq", function(object, nucs=16, maxx, ...)
664
+{
665
+  
666
+    # j = 16: N,    j = 2:3: gc
667
+    if(!is.numeric(nucs))
668
+        stop("'nucs' must be numeric.")
669
+    
670
+    nucs <- as.integer(nucs)
671
+    if(any(nucs < 1) || any(nucs > 19))
672
+        stop("'nucs' must be >0 and <20.")
673
+
674
+    maxSeqlen <- max(seqLen(object)[2, ])
675
+    
676
+    if(missing(maxx))
677
+    {
678
+        maxx <- maxSeqlen
679
+    }
680
+    else
681
+    {
682
+        if(!is.numeric(maxx))
683
+            stop("'maxx' must be numeric.")
684
+        
685
+        maxx <- as.integer(maxx)
686
+        
687
+        if(maxx<1)
688
+            stop("'maxx must be >0.")
689
+        
690
+        if(maxx > maxSeqlen)
691
+            maxx <- maxSeqlen
692
+    }
693
+    
694
+    n <- nFiles(object)
695
+    fvec <- 1:n
696
+    
697
+    i <- 1
698
+    
699
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
700
+    ## Skip extra line
701
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
702
+    x <- 1:maxx
703
+    nac <- object@nac[[i]][, x]
704
+    nacrel <- apply(nac, 2, rel_int)
705
+    
706
+    if(length(nucs) == 1){
707
+        dfr <- data.frame(a = nacrel[nucs, ])
708
+    }else{
709
+        dfr <- data.frame(a = apply(nacrel[nucs, ], 2, sum))
710
+    }
711
+    
712
+    if(n > 1)
713
+    {
714
+        for(i in 2:n)
715
+        {
716
+            nac <- object@nac[[i]][, x]
717
+            nacrel <- apply(nac, 2, rel_int)
718
+            if(length(nucs) == 1){
719
+                dfr[, i] <- nacrel[nucs, ]
720
+            }else{
721
+                dfr[, i] <- apply(nacrel[nucs, ], 2, sum)
722
+            }
723
+        }        
724
+    }
725
+    
726
+    maxY = round(1.4 * max(dfr), 3)
727
+    cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
728
+                    "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
729
+    
730
+    nv <- paste(iupac_chars[nucs], collapse = "")
731
+    
732
+    plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
733
+        ylab="Nucleotide fequency", xlab="sequence position",
734
+        main=paste("Position wise Nucleotide frequency:  '",
735
+                                            nv, "'", sep=""))
736
+  
737
+    for(i in fvec)
738
+        lines(x, dfr[, i], col=cols[i %% 10], lwd=2)
739
+    
740
+    legend("top", ncol=6, 
741
+        lwd=c(1, 1, 2, 1, 1), col=cols[fvec %% 10], xjust=0.5,
742
+        legend=probeLabel(object), bty="n", cex=0.8)
743
+    
744
+    return(invisible())
745
+})
746
+
747
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
748
+## End: Nucleotide frequency related functions
749
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
750
+
751
+
752
+
753
+setMethod("plotKmerCount", "Fastqq",
754
+                    function(object, index, mxey, main="K-mer count", ...)
755
+{
756
+    n <- nFiles(object)
757
+    if(missing(index))
758
+    {
759
+        index <- 1:n
760
+    }else{
761
+        if(!is.numeric(index))
762
+            stop("'index' must be numeric.")
763
+        
764
+        index <- sort(unique(as.integer(index)))
765
+        
766
+        if(any(index < 0) || any(index > n))
767
+                stop("'index' must be in 1, .., ", n, " ( = nFiles).")
768
+    }
769
+    
770
+    if(!missing(mxey))
771
+    {
772
+        if(!is.numeric(mxey))
773
+            stop("'mxey' must be numeric.")
774
+        mxey <- as.integer(mxey)
775
+        
776
+        if(mxey<1)
777
+            stop("'mxey' must be positive.")
778
+    }
779
+    
780
+    cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", 
781
+                "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")  
782
+    pk <- 6
783
+    if(object@k <= pk)
784
+    {
785
+        pk <- object@k
786
+        x <- 1:(4^pk)
787
+        
788
+        if(missing(mxey))
789
+            lg2y <- floor(log2(1.2 * (max(object@kmer))))
790
+        else
791
+            lg2y <- mxey
792
+        
793
+        maxY <- 2^lg2y
794
+
795
+        
796
+        plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
797
+             ylab="K-mer count", xlab="K-mer index", main=main,
798
+                                                axes=FALSE, ...)
799
+    
800
+    for(i in index)
801
+        lines(x, object@kmer[, i], col=cols[i], lwd=2)
802
+    }
803
+    else
804
+    {
805
+        x <- 1:(4^pk)
806
+        melt_factor <- as.integer(4^(object@k - pk))
807
+        
808
+        y_factor <- max(.Call("melt_vector", object@kmer[, 1], melt_factor,
809
+                        PACKAGE="seqTools")) / max(object@kmer[, 1])
810
+    
811
+        if(missing(mxey))
812
+            lg2y <- floor(log2(1.2 * (max(object@kmer)) * y_factor))
813
+        else
814
+            lg2y <- mxey
815
+        maxY <- 2^lg2y
816
+    
817
+        plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
818
+            ylab="K-mer count", xlab="K-mer index", 
819
+                            main=main, axes=FALSE, ...)
820
+    
821
+        for(i in index)
822
+        {
823
+            lines(x, .Call("melt_vector", object@kmer[, i], melt_factor,
824
+                        PACKAGE="seqTools"), col=cols[i], lwd=2)
825
+        }
826
+    }
827
+    axis(side=1, at=0:4 * 4^(pk - 1), labels=c("A", "C", "G", "T", ""))
828
+  
829
+    axis(side=2, at=c(0, maxY),
830
+                    labels=c(0, paste("2^", lg2y, sep="")), las=1)
831
+  
832
+    legend("right", lwd=2, col=cols[index], 
833
+                legend=probeLabel(object)[index], bty="n", cex=0.8)
834
+  
835
+    return(invisible())
836
+})
837
+
838
+
839
+
840
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
841
+## Merging and melting Fastqq objects
842
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
843
+
844
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
845
+setMethod("mergeFastqq", "Fastqq", function(lhs, rhs)
846
+{
847
+    
848
+    res <- new("Fastqq")
849
+    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
850
+    ## Simple concatenations
851
+    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
852
+    
853
+    res@filenames <- c(lhs@filenames, rhs@filenames)
854
+    res@nFiles <- lhs@nFiles+rhs@nFiles
855
+    res@nReads <- c(lhs@nReads, rhs@nReads)
856
+    res@nN <- c(lhs@nN, rhs@nN)
857
+    res@seqLenCount <- cbind(lhs@seqLenCount, rhs@seqLenCount)
858
+    res@gcContent <- cbind(gcContentMatrix(lhs), gcContentMatrix(rhs))
859
+    res@seqLen <- cbind(lhs@seqLen, rhs@seqLen)
860
+    
861
+    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
862
+    ## Singularize probeLabel
863
+    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
864
+    
865
+    res@probeLabel <- c(lhs@probeLabel, rhs@probeLabel)  
866
+    if(any(table(res@probeLabel) > 1))
867
+    {
868
+        message("[mergeFastqq] Singularizing probeLabel (new suffix).")
869
+        res@probeLabel <- paste(1:res@nFiles, res@probeLabel, sep="_")
870
+    }
871
+    
872
+    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
873
+    ## Eventually resize arrays when lhs and rhs have different maxSeqLen
874
+    ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
875
+    if(lhs@maxSeqLen > rhs@maxSeqLen)
876
+    {
877
+        message("[mergeFastqq] Resizing rhs.")
878
+        msl <- lhs@maxSeqLen
879
+        
880
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
881
+        ## nac
882
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
883
+        
884
+        new_dim <- as.integer(c(nrow(rhs@nac), msl))
885
+        rhs_nac <- .Call("r_enlarge_int_mat", rhs@nac, new_dim, 
886
+                                                    PACKAGE="seqTools")
887
+        res@nac <- c(lhs@nac, rhs_nac)
888
+        
889
+        # seqLenCount
890
+        new_dim <- as.integer(c(msl, rhs@nFiles))
891
+        rhs_seqLenCount <- .Call("r_enlarge_int_mat", 
892
+                                rhs@seqLenCount, new_dim, PACKAGE="seqTools")
893
+        
894
+        res@seqLenCount <- c(lhs@seqLenCount, rhs_seqLenCount)
895
+        
896
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
897
+        ## phred
898
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
899
+        
900
+        new_dim <- as.integer(nrow(rhs@phred), msl)
901
+        rhs_phred_list <- list()
902
+        
903
+        for(i in 1:nFiles(rhs)){
904
+                rhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", 
905
+                                rhs@phred[[i]], new_dim, PACKAGE="seqTools")
906
+        }
907
+        
908
+        res@phred <- c(lhs@phred, rhs_phred_list)
909
+        res@maxSeqLen <- lhs@maxSeqLen
910
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
911
+        
912
+    } else if(rhs@maxSeqLen > lhs@maxSeqLen)
913
+    {
914
+        
915
+        message("[mergeFastqq] Resizing lhs.")
916
+        msl <- rhs@maxSeqLen
917
+        
918
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
919
+        ## nac
920
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
921
+        
922
+        new_dim <- as.integer(c(nrow(lhs@nac), msl))
923
+        
924
+        lhs_nac <- .Call("r_enlarge_int_mat",
925
+                                lhs@nac, new_dim, PACKAGE="seqTools")
926
+        
927
+        res@nac <- c(rhs@nac, lhs_nac)
928
+        
929
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
930
+        ## seqLenCount
931
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
932
+        
933
+        new_dim <- as.integer(c(msl, lhs@nFiles))
934
+        
935
+        lhs_seqLenCount <- .Call("r_enlarge_int_mat",
936
+                    lhs@seqLenCount, new_dim, PACKAGE="seqTools")
937
+        
938
+        res@seqLenCount <- c(rhs@seqLenCount, lhs_seqLenCount)
939
+        
940
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
941
+        ## phred
942
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
943
+        
944
+        new_dim <- as.integer(nrow(lhs@phred), msl)
945
+        lhs_phred_list <- list()
946
+        
947
+        for(i in 1:nFiles(lhs))
948
+        {
949
+            lhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", 
950
+                    lhs@phred[[i]], new_dim, PACKAGE="seqTools")
951
+        }
952
+        res@phred <- c(rhs@phred, lhs_phred_list)
953
+        res@maxSeqLen <- rhs@maxSeqLen
954
+        
955
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
956
+        
957
+    } else { 
958
+        
959
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
960
+        ## rhs@maxSeqLen == lhs@maxSeqLen
961
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
962
+        
963
+        res@maxSeqLen = lhs@maxSeqLen
964
+        res@nac <- c(lhs@nac, rhs@nac)
965
+        res@phred <- c(lhs@phred, rhs@phred)
966
+        
967
+        ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
968
+        
969
+    }
970
+    
971
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
972
+    ## Eventually melt down k-mer count matrix
973
+    ## when lhs and rhs have different k
974
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
975
+    res@k <- pmin(lhs@k, rhs@k)
976
+    
977
+    kml <- kmerCount(lhs)
978
+    if(lhs@k > res@k)
979
+    {
980
+        kml <- .Call("melt_kmer_matrix",
981
+                    kml, c(lhs@k, res@k), PACKAGE="seqTools")
982
+    }
983
+    
984
+    kmr <- kmerCount(rhs)
985
+    if(rhs@k > res@k)
986
+    {
987
+        kmr <- .Call("melt_kmer_matrix",
988
+                    kmr, c(rhs@k, res@k), PACKAGE="seqTools")
989
+    }
990
+    res@kmer <- cbind(kml, kmr)
991
+    
992
+    fkml <- lhs@firstKmer
993
+    if(lhs@k > res@k)
994
+    {
995
+        fkml <- .Call("melt_kmer_matrix",
996
+                    fkml, c(lhs@k, res@k), PACKAGE="seqTools")
997
+    }
998
+    fkmr <- rhs@firstKmer
999
+    
1000
+    if(rhs@k > res@k)
1001
+    {
1002
+        fkmr <- .Call("melt_kmer_matrix",
1003
+                    fkmr, c(rhs@k, res@k), PACKAGE="seqTools")
1004
+    }
1005
+    res@firstKmer <- cbind(fkml, fkmr)
1006
+    
1007
+    return(res)
1008
+})
1009
+
1010
+
1011
+setMethod("meltDownK", "Fastqq", function(object, newK)
1012
+{
1013
+  
1014
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1015
+    if(!is.numeric(newK))
1016
+        stop("'newK' must be numeric")
1017
+    
1018
+    newK <- as.integer(newK)
1019
+    
1020
+    if(length(newK) != 1)
1021
+        stop("'newK' must have length 1.")
1022
+    
1023
+    if(newK < 1 || newK >=  getK(object))
1024
+        stop("'newK' must be >= 1 and <= ", getK(object), ".")
1025
+    
1026
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1027
+    res <- new("Fastqq")
1028
+    res@filenames <- object@filenames
1029
+    res@nFiles <- object@nFiles
1030
+    res@k <- newK
1031
+    res@maxSeqLen <- object@maxSeqLen
1032
+    
1033
+    res@kmer <- .Call("melt_kmer_matrix",
1034
+        object@kmer, c(getK(object), newK), PACKAGE="seqTools")
1035
+    
1036
+    res@firstKmer <- .Call("melt_kmer_matrix",
1037
+        object@firstKmer, c(getK(object), newK), PACKAGE="seqTools")
1038
+    
1039
+    res@nReads <- object@nReads
1040
+    res@nN <- object@nN
1041
+    res@seqLenCount <- object@seqLenCount
1042
+    res@gcContent <- object@gcContent
1043
+    res@nac <- object@nac
1044
+    res@phred <- object@phred
1045
+    res@seqLen <- object@seqLen
1046
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
1047
+  
1048
+    return(res)
1049
+})
1050
+
1051
+
1052
+listMelt <- function(x, oldK, newK)
1053
+{
1054
+    f <- function(x) .Call("melt_kmer_matrix", x, 
1055
+            as.integer(c(oldK, newK)), PACKAGE="seqTools")
1056
+  
1057
+    return(lapply(x, f))
1058
+}
1059
+
1060
+
1061
+
1062
+setMethod("[", "Fastqq", function(x, i, j, drop="missing")
1063
+{
1064
+  
1065
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
1066
+    res <- new("Fastqq")
1067
+    res@filenames <- x@filenames[i]
1068
+    res@probeLabel <- x@probeLabel[i]
1069
+    res@nFiles <- length(i)
1070
+    res@k <- x@k
1071
+    res@seqLen <- x@seqLen[, i, drop=FALSE]
1072
+    res@maxSeqLen <- max(res@seqLen[2, ])
1073
+    res@kmer <- x@kmer[, i, drop=FALSE]
1074
+    res@firstKmer <- x@firstKmer[, i, drop=FALSE]
1075
+    res@nN <- x@nN[i]
1076
+    res@seqLenCount <- x@seqLenCount[, i, drop=FALSE]
1077
+    res@gcContent <- x@gcContent[, i, drop=FALSE]
1078
+    res@nac <- x@nac[i]
1079
+    res@phred <- x@phred[i]
1080
+    res@nReads <- x@nReads[i]
1081
+    res@collectTime <- x@collectTime
1082
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
1083
+    
1084
+    return(res)
1085
+})
1086
+
1087
+
1088
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1089
+## Calculation of distance matrix based on Canberra distance
1090
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1091
+
1092
+setMethod("cbDistMatrix", "Fastqq", 
1093
+        function(object, nReadNorm=max(nReads(object)))
1094
+{
1095
+    if(!is.numeric(nReadNorm))
1096
+        stop("'nReadNorm' must be numeric.")
1097
+    nReadNorm <- as.integer(nReadNorm)
1098
+    
1099
+    if(nReadNorm < max(nReads(object)))
1100
+        stop("'nReadNorm' must be greater than all nRead.")
1101
+    
1102
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1103
+    ## Column-wise normalizing read counts (by upscaling) 
1104
+    ## so that column sums become nearly equal in order to
1105
+    ## compensate sequencing depth artifacts in 
1106
+    ## Canberra distance values
1107
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1108
+    
1109
+    scale <- nReadNorm/nReads(object)
1110
+    
1111
+    scaled <- .Call("scale_kmer_matrix",
1112
+            kmerCount(object), scale, PACKAGE="seqTools")
1113
+    
1114
+    colnames(scaled) <- object@probeLabel
1115
+    ## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1116
+  
1117
+    return(.Call("cb_distance_matrix", scaled, PACKAGE="seqTools"))
1118
+})
1119
+
1120
+
1121
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
1122
+## END OF FILE
1123
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
0 1124
new file mode 100644
... ...
@@ -0,0 +1,387 @@
1
+
2
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
3
+##                                                                           ##
4
+##  Project   :   seqTools                                                   ##
5
+##  Created   :   26.August.2013                                             ##
6
+##  Author    :   W. Kaisers                                                 ##
7
+##  Version   :   0.99.13                                                    ##
8
+##  File      :   simFunctions.r                                             ##
9
+##  Content   :   Functions which produce fastq files with simulated data    ##
10
+##                (Done for testing Fastqq and DNA k-mer counting)           ##
11
+##                writeSimFastq, writeSimContFastq, simFastqqRunTimes,       ##
12
+##                sim_fq                                                     ##
13
+##                                                                           ##
14
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
15
+
16
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
17
+## Create fastq files with simulated k-mers
18
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
19
+
20
+writeSimFastq<-function(k=6, nk=5, nSeq=10, filename="sim.fq.gz")
21
+{
22
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
23
+    # k
24
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
25
+    if(!is.numeric(k))
26
+        stop("'k' must be numeric.")
27
+    if(length(k) != 1)
28
+        stop("'k' must have length 1.")
29
+    k <- as.integer(k)    
30
+    if( (k < 1) || (k > max_k) )
31
+        stop("'k' out of range.")
32
+    
33
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
34
+    # nk
35
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
36
+    if(!is.numeric(nk))
37
+        stop("'nk' must be numeric.")
38
+    if(length(nk) != 1)
39
+        stop("'nk' must have length 1.")
40
+    nk<-as.integer(nk)
41
+    if( (nk < 1) || (nk > 1000) )
42
+        stop("nk must be positive and <1000.")
43
+    
44
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
45
+    # nSeq
46
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
47
+    if(!is.numeric(nSeq))
48
+        stop("'nSeq' must be numeric.")
49
+    if(nSeq < 1)
50
+        stop("'nSeq' must be positive.")
51
+    
52
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
53
+    # filename
54
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
55
+    if(!is.character(filename))
56
+        stop("'filename' must be string.")
57
+    
58
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
59
+    # Do the work
60
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
61
+    bm <- Sys.localeconv()[7]
62
+    val <- as.integer(c(k, nk, nSeq))
63
+
64
+    pseq <- .Call("sim_dna_k_mer", val, PACKAGE="seqTools")
65
+    res <- .Call("gzwrite_fastq_dna", val, pseq, filename, PACKAGE="seqTools")
66
+
67
+    message("[writeSimFastq] file '", basename(filename), "': ",
68
+            format(res,big.mark=bm), " Bytes written.")
69
+  
70
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
71
+    # Terminate function.
72
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
73
+    return(invisible())
74
+}
75
+
76
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
77
+## writeSimContFastq
78
+## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
79
+writeSimContFastq<-function(k=6, nk=5, nSeq=10, pos=1, 
80
+                kIndex=1, nContam=nSeq, filename="simc.fq.gz")
81
+{
82
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
83
+    # k
84
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
85
+    if(!is.numeric(k))
86
+        stop("k must be numeric.")
87
+    if(length(k) != 1)
88
+        stop("k must have length 1.")
89
+    k<-as.integer(k)    
90
+    if( (k < 1) || (k > max_k) )
91
+        stop("k out of range.")
92
+    
93
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
94
+    # nk
95
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
96
+    if(!is.numeric(nk))
97
+        stop("nk must be numeric.")
98
+    if(length(nk) != 1)
99
+        stop("nk must have length 1.")
100
+    nk<-as.integer(nk)
101
+    if( (nk < 1) || (nk > 1000) )
102
+        stop("nk must be positive and <1000.") 
103
+    
104
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
105
+    # nSeq
106
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
107
+    if(!is.numeric(nSeq))
108
+        stop("nSeq must be numeric.")
109
+    if(length(nSeq) != 1)
110
+        stop("nSeq must have length 1.")
111
+    if(nSeq < 1)
112
+        stop("nSeq must be positive.")
113
+    
114
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
115
+    # pos
116
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
117
+    if(!is.numeric(pos))
118
+        stop("pos must be numeric.")
119
+    pos<-as.integer(pos)
120
+    if( any(pos < 1) || any(pos > (k * (nk - 1) + 1)))
121
+        stop("pos out of range.")
122
+    
123
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
124
+    # writeSimContFastq should take 1-based positions
125
+    # while "set_dna_k_mer" takes    0-based positions
126
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
127
+    pos<-pos - 1L
128
+    
129
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
130
+    # kIndex
131
+    # + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
132
+    if(!is.numeric(kIndex))
133
+        stop("kIndex must be numeric.")
134