Browse code

Several updates -> new version 1.1.6

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/kebabs@98550 bc3139a8-67e5-0310-9ffc-ced21a209358

Ulrich Bodenhofer authored on 21/01/2015 12:59:19
Showing 18 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: kebabs
2 2
 Type: Package
3 3
 Title: Kernel-Based Analysis Of Biological Sequences
4
-Version: 1.1.5
5
-Date: 2014-01-12
4
+Version: 1.1.6
5
+Date: 2015-01-21
6 6
 Author: Johannes Palme
7 7
 Maintainer: Ulrich Bodenhofer <bodenhofer@bioinf.jku.at>
8 8
 Description: The package provides functionality for kernel-based analysis of
... ...
@@ -54,4 +54,4 @@ Imports: methods, stats, Rcpp (>= 0.11.2), Matrix, XVector (>= 0.5.8),
54 54
 LinkingTo: IRanges, XVector, Biostrings, Rcpp, S4Vectors
55 55
 Suggests: SparseM, apcluster, Biobase, BiocGenerics
56 56
 biocViews: SupportVectorMachine, Classification, Clustering, Regression
57
-Packaged: 2015-01-12 13:37:03 UTC; jo
57
+Packaged: 2015-01-21 11:10:57 UTC; jo
... ...
@@ -136,6 +136,53 @@ ersTransposedAsdgCMatrix <- function(from)
136 136
     return(dgC)
137 137
 }
138 138
 
139
+distWeightKernelToString <- function(distWeight)
140
+{
141
+    dwLength <- length(distWeight)
142
+    distWeightChar <- ""
143
+    
144
+    if (is.numeric(distWeight))
145
+    {
146
+        if (dwLength == 1)
147
+            distWeightChar <- distWeight
148
+        else if (dwLength < 7)
149
+        {
150
+            distWeightChar <-
151
+                paste("c(",paste(distWeight,collapse=","),")", sep="")
152
+        }
153
+        else
154
+        {
155
+            distWeightChar <-
156
+                paste("c(", paste(distWeight[1:3],collapse=","), " ... ",
157
+                      paste(distWeight[dwLength - 2:dwLength],collapse=","),
158
+                      ")", sep="")
159
+        }
160
+    }
161
+    else if (typeof(distWeight) == "closure")
162
+    {
163
+        func <- deparse(distWeight)[2]
164
+        index <- grep("(", strsplit(func, split="")[[1]], fixed=TRUE,
165
+                      value=FALSE)
166
+        
167
+        if (length(index) > 0)
168
+            distWeightChar <- substr(func, 1, index[1] - 1)
169
+        {
170
+            if (distWeightChar %in% c("linWeight", "gaussWeight",
171
+                                      "expWeight"))
172
+            {
173
+                sigma <- get("sigma", envir = environment(distWeight))
174
+                distWeightChar <- paste(distWeightChar, "(sigma=", sigma, ")",
175
+                                        sep="")
176
+            }
177
+            else if (distWeightChar == "swdWeight")
178
+                distWeightChar <- paste(distWeightChar, "()", sep="")
179
+            else
180
+                distWeightChar <- paste(distWeightChar, "( . . . )", sep="")
181
+        }
182
+    }
183
+    distWeightChar
184
+}
185
+
139 186
 #' @rdname sequenceKernel
140 187
 #' @aliases
141 188
 #' seqKernelAsChar
... ...
@@ -182,8 +229,8 @@ seqKernelAsChar <- function(from)
182 229
 
183 230
     if (length(kernelParameters(from)$distWeight) > 0)
184 231
     {
185
-        kChar <- paste(kChar, ", distWeight=",
186
-                       kernelParameters(from)$distWeight, sep="")
232
+        dwString <- distWeightKernelToString(kernelParameters(from)$distWeight)
233
+        kChar <- paste(kChar, ", distWeight=", dwString, sep="")
187 234
     }
188 235
 
189 236
     if (!kernelParameters(from)$normalized)
... ...
@@ -194,9 +194,21 @@ gappyPairKernel <- function(k=1, m=1, r=1, annSpec=FALSE, distWeight=numeric(0),
194 194
     if (!isTRUEorFALSE(annSpec))
195 195
         stop("'annSpec' must be TRUE or FALSE\n")
196 196
 
197
-    if (length(distWeight) > 0 &&
198
-        !(is.numeric(distWeight) || is.function(distWeight)))
199
-        stop("'distWeight' must be a numeric vector or a function\n")
197
+    if (length(distWeight) > 0)
198
+    {
199
+        if (!(is.numeric(distWeight) || is.function(distWeight)))
200
+            stop("'distWeight' must be a numeric vector or a function\n")
201
+    
202
+        if (is.function(distWeight))
203
+        {
204
+            func <- deparse(distWeight)[2]
205
+            index <- grep("(", strsplit(func, split="")[[1]], fixed=TRUE,
206
+                          value=FALSE)
207
+        
208
+            if (length(index) < 1)
209
+                stop("Missing parentheses in 'distWeight'\n")
210
+        }
211
+    }
200 212
 
201 213
     if (presence && length(distWeight) > 0)
202 214
     {
... ...
@@ -164,16 +164,14 @@ kbsvm.seqs <- function(x, y, kernel=NULL, pkg="auto", svm="C-svc",
164 164
 
165 165
                         if (ncol(exRepSV) == 0)
166 166
                             stop("explicit representaton is not available\n")
167
+                    }
168
+                    else
169
+                        exRepSV <- NULL
167 170
 
168
-                        model@featureWeights <-
169
-                            getFeatureWeights(model=model,
171
+                    model@featureWeights <-
172
+                        getFeatureWeights(model=model,
170 173
                                           exrep=exRepSV,
171 174
                                           weightLimit=model@svmInfo@weightLimit)
172
-                    }
173
-                    else
174
-                        stop("feature weights cannot be computed because\n",
175
-                             "        the selected kernel does not support\n",
176
-                             "        an explicit representation\n")
177 175
 
178 176
                     model@b <- getSVMSlotValue("b", model)
179 177
                 }
... ...
@@ -511,20 +509,31 @@ kbsvm.seqs <- function(x, y, kernel=NULL, pkg="auto", svm="C-svc",
511 509
 #' Training of biological sequences with a sequence kernel\cr
512 510
 #'
513 511
 #' Training is performed via the method \code{kbsvm} for classification and
514
-#' regression tasks. The user passes sequence data in \code{XStringSet} format,
515
-#' the label vector, a sequence kernel object and the requested SVM along with
516
-#' SVM parameters and receives the training results in the form of a KeBABS
517
-#' model object of class \code{\linkS4class{KBModel}}. The accessor
512
+#' regression tasks. The user passes sequence data, the response vector, a
513
+#' sequence kernel object and the requested SVM along with SVM parameters
514
+#' to \code{kbsvm} and receives the training results in the form of a
515
+#' KeBABS model object of class \code{\linkS4class{KBModel}}. The accessor
518 516
 #' \code{svmModel} allows to retrieve the SVM specific model from the KeBABS
519
-#' model object. However for regular operation a detailed look into the SVM
520
-#' specific model is usually not necessary. (When repeat regions are coded as
521
-#' lowercase characters and should be excluded from the analysis the sequence
522
-#' data can be passed as \code{\linkS4class{BioVector}} which also supports
523
-#' lowercase characters instead of \code{\linkS4class{XStringSet}} format.
524
-#' Please note that the classes derived from \code{\linkS4class{XStringSet}}
525
-#' are much more powerful than the \code{\linkS4class{BioVector}} derived
526
-#' classes and should be used in all cases where lowercase characters are not
527
-#' needed). Apart from SVM training \code{kbsvm} can be also used for cross
517
+#' model object. However, for regular operation a detailed look into the SVM
518
+#' specific model is usually not necessary.
519
+#'
520
+#' The standard data format for sequences in KeBABS are the
521
+#' \code{XStringSet}-derived classes \code{\linkS4class{DNAStringSet}},
522
+#' \code{\linkS4class{RNAStringSet}} and \code{\linkS4class{AAStringSet}}.
523
+#' (When repeat regions are coded as lowercase characters and should be
524
+#' excluded from the analysis the sequence data can be passed as
525
+#' \code{\linkS4class{BioVector}} which also supports lowercase characters
526
+#' instead of \code{\linkS4class{XStringSet}} format. Please note that the
527
+#' classes derived from \code{\linkS4class{XStringSet}} are much more
528
+#' powerful than the \code{\linkS4class{BioVector}} derived classes and
529
+#' should be used in all cases where lowercase characters are not needed).
530
+#'
531
+#' Instead of sequences also a precomputed explicit representation or
532
+#' a precomputed kernel matrix can be used for training. Examples for
533
+#' training with kernel matrix and explicit representation can be found on
534
+#' the help page for the prediction method \code{\link{predict}}.
535
+#'
536
+#' Apart from SVM training \code{kbsvm} can be also used for cross
528 537
 #' validation (see \link{crossValidation} and parameters \code{cross} and
529 538
 #' \code{noCross}), grid search for SVM- and kernel-parameter values (see
530 539
 #' \link{gridSearch}) and model selection (see \link{modelSelection} and
... ...
@@ -653,9 +662,9 @@ kbsvm.seqs <- function(x, y, kernel=NULL, pkg="auto", svm="C-svc",
653 662
 #' with the parameter \code{weightLimit} which defines the cutoff for
654 663
 #' small feature weights not stored in the model.
655 664
 #'
656
-#' Hint: For multiclass prediction is currently not performed via feature
657
-#' weights but native in the SVM. This means that for multiclass a pruned
658
-#' model cannot be used for prediction.\cr\cr
665
+#' Hint: For training with a precomputed kernel matrix feature weights are
666
+#' not available. For multiclass prediction is currently not performed via
667
+#' feature weights but native in the SVM.\cr\cr
659 668
 #'
660 669
 #' Cross validation, grid search and model selection\cr
661 670
 #'
... ...
@@ -754,6 +754,18 @@ predict.KBModel <- function(object, x, predictionType="response", sel=NULL,
754 754
 #' do not support the generation of an explicit representation, e.g. the
755 755
 #' position dependent kernel variants.\cr\cr
756 756
 #'
757
+#' Prediction with precomputed kernel matrix
758
+#'
759
+#' When training was performed with a precomputed kernel matrix also in
760
+#' prediction a precomputed kernel matrix must be passed to the \code{predict}
761
+#' method. In contrast to the quadratic and symmetric kernel matrix used
762
+#' in training the kernel matrix for prediction is rectangular and contains
763
+#' the similarities of test samples (rows) against support vectors (columns).
764
+#' support vector indices can be read from the model with the accessor SVindex.
765
+#' Please not that these indices refer to the sample subset used in training.
766
+#' An example for training and prediction via precomputed kernel matrix is
767
+#' shown below.
768
+#'
757 769
 #' Generation of prediction profiles
758 770
 #'
759 771
 #' The parameter \code{predProfiles} controls whether prediction profiles
... ...
@@ -791,15 +803,14 @@ predict.KBModel <- function(object, x, predictionType="response", sel=NULL,
791 803
 #'
792 804
 #' ## run training with explicit representation
793 805
 #' model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
794
-#'                pkg="LiblineaR", svm="C-svc", cost=10, explicit="yes",
795
-#'                featureWeights="yes")
806
+#'                pkg="LiblineaR", svm="C-svc", cost=10)
796 807
 #'
797 808
 #' ## show feature weights in KeBABS model
798 809
 #' featureWeights(model)[1:8]
799 810
 #'
800 811
 #' ## predict the test sequences
801 812
 #' pred <- predict(model, enhancerFB[test])
802
-#' evaluatePrediction(pred, yFB[test], allLabels=c(-1,1))
813
+#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
803 814
 #' pred[1:10]
804 815
 #'
805 816
 #' ## output decision values instead
... ...
@@ -807,11 +818,30 @@ predict.KBModel <- function(object, x, predictionType="response", sel=NULL,
807 818
 #' pred[1:10]
808 819
 #'
809 820
 #' \dontrun{
810
-#' ## computing of probability model via Platt scaling during training
821
+#' ## example for training and prediction via precomputed kernel matrix
822
+#'
823
+#' ## compute quadratic kernel matrix of training samples
824
+#' kmtrain <- getKernelMatrix(gappy, x=enhancerFB, selx=train)
825
+#'
826
+#' ## train model with kernel matrix
827
+#' model <- kbsvm(x=kmtrain, y=yFB[train], kernel=gappy,
828
+#'                pkg="kernlab", svm="C-svc", cost=10)
829
+#'
830
+#' ## compute rectangular kernel matrix of test samples versus
831
+#' ## support vectors
832
+#' kmtest <- getKernelMatrix(gappy, x=enhancerFB, y=enhancerFB,
833
+#'                           selx=test, sely=train[SVindex(model)])
834
+#'
835
+#' ## predict with kernel matrix
836
+#' pred <- predict(model, kmtest)
837
+#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
838
+#'
839
+#' ## example for probability model generation during training
840
+#'
841
+#' ## compute probability model via Platt scaling during training
811 842
 #' ## and predict class membership probabilities
812 843
 #' model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
813
-#'                pkg="e1071", svm="C-svc", cost=10, explicit="yes",
814
-#'                probModel=TRUE)
844
+#'                pkg="e1071", svm="C-svc", cost=10, probModel=TRUE)
815 845
 #'
816 846
 #' ## show parameters of the fitted probability model which are the parameters
817 847
 #' ## probA and probB for the fitted sigmoid function in case of classification
... ...
@@ -358,26 +358,8 @@ setMethod("show", signature(object="SpectrumKernel"),
358 358
             cat(paste(", annSpec=TRUE"))
359 359
         if (length(object@distWeight) > 0)
360 360
         {
361
-            if (is.numeric(object@distWeight))
362
-            {
363
-                if (length(object@distWeight) == 1)
364
-                {
365
-                    cat(paste(", distWeight=",
366
-                              object@distWeight, sep=""))
367
-                }
368
-                else
369
-                {
370
-                    cat(paste(", distWeight=",
371
-                              paste("c(",paste(object@distWeight,
372
-                                               collapse=","),")",
373
-                                    sep=""), sep=""))
374
-                }
375
-            }
376
-            else
377
-            {
378
-                cat(", \ndistWeight=")
379
-                cat(format(object@distWeight))
380
-            }
361
+             dwString <- distWeightKernelToString(object@distWeight)
362
+             cat(", distWeight=", dwString, sep="")
381 363
         }
382 364
         if (object@normalized == FALSE)
383 365
             cat(", normalized=FALSE")
... ...
@@ -170,9 +170,21 @@ spectrumKernel <- function(k=3, r=1, annSpec=FALSE, distWeight=numeric(0),
170 170
     if (!isTRUEorFALSE(annSpec))
171 171
         stop("'annSpec' must be TRUE or FALSE\n")
172 172
 
173
-    if (length(distWeight) > 0 &&
174
-        !(is.numeric(distWeight) || is.function(distWeight)))
175
-        stop("'distWeight' must be a numeric vector or a function\n")
173
+    if (length(distWeight) > 0)
174
+    {
175
+        if (!(is.numeric(distWeight) || is.function(distWeight)))
176
+            stop("'distWeight' must be a numeric vector or a function\n")
177
+        
178
+        if (is.function(distWeight))
179
+        {
180
+            func <- deparse(distWeight)[2]
181
+            index <- grep("(", strsplit(func, split="")[[1]], fixed=TRUE,
182
+                          value=FALSE)
183
+            
184
+            if (length(index) < 1)
185
+                stop("Missing parentheses in 'distWeight'\n")
186
+        }
187
+    }
176 188
 
177 189
     if (presence && length(distWeight) > 0)
178 190
     {
... ...
@@ -1,6 +1,19 @@
1 1
 Change history of package kebabs:
2 2
 ====================================
3 3
 
4
+Version 1.1.6
5
+- error correction for training with position specific kernel and
6
+  computation of feature weights
7
+- error correction in coercion of kernel to character for distance
8
+  weighting
9
+- error correction in spectrum, gappy pair and motif kernel
10
+  for kernel matrix - last feature was missing in kernel value
11
+  in rare situations
12
+- correction of Windows build problem in linearKernel
13
+- build warnings on Windows removed
14
+- minor changes in help pages
15
+- minor changes in vignette
16
+
4 17
 Version 1.1.5
5 18
 - new method heatmap to display heatmap of prediction profiles
6 19
 - extension of function linearKernel to optionally return
... ...
@@ -291,30 +291,43 @@ Training of biological sequences with a sequence kernel\cr
291 291
 
292 292
 Training is performed via the method \code{kbsvm} for
293 293
 classification and regression tasks. The user passes
294
-sequence data in \code{XStringSet} format, the label
295
-vector, a sequence kernel object and the requested SVM
296
-along with SVM parameters and receives the training results
297
-in the form of a KeBABS model object of class
294
+sequence data, the response vector, a sequence kernel
295
+object and the requested SVM along with SVM parameters to
296
+\code{kbsvm} and receives the training results in the form
297
+of a KeBABS model object of class
298 298
 \code{\linkS4class{KBModel}}. The accessor \code{svmModel}
299 299
 allows to retrieve the SVM specific model from the KeBABS
300
-model object. However for regular operation a detailed look
301
-into the SVM specific model is usually not necessary. (When
302
-repeat regions are coded as lowercase characters and should
303
-be excluded from the analysis the sequence data can be
304
-passed as \code{\linkS4class{BioVector}} which also
305
-supports lowercase characters instead of
300
+model object. However, for regular operation a detailed
301
+look into the SVM specific model is usually not necessary.
302
+
303
+The standard data format for sequences in KeBABS are the
304
+\code{XStringSet}-derived classes
305
+\code{\linkS4class{DNAStringSet}},
306
+\code{\linkS4class{RNAStringSet}} and
307
+\code{\linkS4class{AAStringSet}}. (When repeat regions are
308
+coded as lowercase characters and should be excluded from
309
+the analysis the sequence data can be passed as
310
+\code{\linkS4class{BioVector}} which also supports
311
+lowercase characters instead of
306 312
 \code{\linkS4class{XStringSet}} format. Please note that
307 313
 the classes derived from \code{\linkS4class{XStringSet}}
308 314
 are much more powerful than the
309 315
 \code{\linkS4class{BioVector}} derived classes and should
310 316
 be used in all cases where lowercase characters are not
311
-needed). Apart from SVM training \code{kbsvm} can be also
312
-used for cross validation (see \link{crossValidation} and
313
-parameters \code{cross} and \code{noCross}), grid search
314
-for SVM- and kernel-parameter values (see
315
-\link{gridSearch}) and model selection (see
316
-\link{modelSelection} and parameters \code{nestedCross} and
317
-\code{noNestedCross}).\cr\cr
317
+needed).
318
+
319
+Instead of sequences also a precomputed explicit
320
+representation or a precomputed kernel matrix can be used
321
+for training. Examples for training with kernel matrix and
322
+explicit representation can be found on the help page for
323
+the prediction method \code{\link{predict}}.
324
+
325
+Apart from SVM training \code{kbsvm} can be also used for
326
+cross validation (see \link{crossValidation} and parameters
327
+\code{cross} and \code{noCross}), grid search for SVM- and
328
+kernel-parameter values (see \link{gridSearch}) and model
329
+selection (see \link{modelSelection} and parameters
330
+\code{nestedCross} and \code{noNestedCross}).\cr\cr
318 331
 
319 332
 Package and SVM selection\cr
320 333
 
... ...
@@ -428,10 +441,10 @@ Pruning of feature weights can be achieved with the
428 441
 parameter \code{weightLimit} which defines the cutoff for
429 442
 small feature weights not stored in the model.
430 443
 
431
-Hint: For multiclass prediction is currently not performed
432
-via feature weights but native in the SVM. This means that
433
-for multiclass a pruned model cannot be used for
434
-prediction.\cr\cr
444
+Hint: For training with a precomputed kernel matrix feature
445
+weights are not available. For multiclass prediction is
446
+currently not performed via feature weights but native in
447
+the SVM.\cr\cr
435 448
 
436 449
 Cross validation, grid search and model selection\cr
437 450
 
... ...
@@ -107,6 +107,20 @@ variants which do not support the generation of an explicit
107 107
 representation, e.g. the position dependent kernel
108 108
 variants.\cr\cr
109 109
 
110
+Prediction with precomputed kernel matrix
111
+
112
+When training was performed with a precomputed kernel
113
+matrix also in prediction a precomputed kernel matrix must
114
+be passed to the \code{predict} method. In contrast to the
115
+quadratic and symmetric kernel matrix used in training the
116
+kernel matrix for prediction is rectangular and contains
117
+the similarities of test samples (rows) against support
118
+vectors (columns). support vector indices can be read from
119
+the model with the accessor SVindex. Please not that these
120
+indices refer to the sample subset used in training. An
121
+example for training and prediction via precomputed kernel
122
+matrix is shown below.
123
+
110 124
 Generation of prediction profiles
111 125
 
112 126
 The parameter \code{predProfiles} controls whether
... ...
@@ -132,15 +146,14 @@ gappy
132 146
 
133 147
 ## run training with explicit representation
134 148
 model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
135
-               pkg="LiblineaR", svm="C-svc", cost=10, explicit="yes",
136
-               featureWeights="yes")
149
+               pkg="LiblineaR", svm="C-svc", cost=10)
137 150
 
138 151
 ## show feature weights in KeBABS model
139 152
 featureWeights(model)[1:8]
140 153
 
141 154
 ## predict the test sequences
142 155
 pred <- predict(model, enhancerFB[test])
143
-evaluatePrediction(pred, yFB[test], allLabels=c(-1,1))
156
+evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
144 157
 pred[1:10]
145 158
 
146 159
 ## output decision values instead
... ...
@@ -148,11 +161,30 @@ pred <- predict(model, enhancerFB[test], predictionType="decision")
148 161
 pred[1:10]
149 162
 
150 163
 \dontrun{
151
-## computing of probability model via Platt scaling during training
164
+## example for training and prediction via precomputed kernel matrix
165
+
166
+## compute quadratic kernel matrix of training samples
167
+kmtrain <- getKernelMatrix(gappy, x=enhancerFB, selx=train)
168
+
169
+## train model with kernel matrix
170
+model <- kbsvm(x=kmtrain, y=yFB[train], kernel=gappy,
171
+               pkg="kernlab", svm="C-svc", cost=10)
172
+
173
+## compute rectangular kernel matrix of test samples versus
174
+## support vectors
175
+kmtest <- getKernelMatrix(gappy, x=enhancerFB, y=enhancerFB,
176
+                          selx=test, sely=train[SVindex(model)])
177
+
178
+## predict with kernel matrix
179
+pred <- predict(model, kmtest)
180
+evaluatePrediction(pred, yFB[test], allLabels=unique(yFB))
181
+
182
+## example for probability model generation during training
183
+
184
+## compute probability model via Platt scaling during training
152 185
 ## and predict class membership probabilities
153 186
 model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
154
-               pkg="e1071", svm="C-svc", cost=10, explicit="yes",
155
-               probModel=TRUE)
187
+               pkg="e1071", svm="C-svc", cost=10, probModel=TRUE)
156 188
 
157 189
 ## show parameters of the fitted probability model which are the parameters
158 190
 ## probA and probB for the fitted sigmoid function in case of classification
... ...
@@ -900,13 +900,14 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
900 900
                       int sizeX, int sizeY, IntegerVector selX, IntegerVector selY,
901 901
                       ByteStringVector annCharset, ByteStringVector annX, ByteStringVector annY,
902 902
                       bool unmapped, int k, int m, bool normalized, bool symmetric, bool presence,
903
-                      bool reverseComplement, int maxSeqLength, struct alphaInfo *alphaInf)
903
+                      bool reverseComplement, int maxSeqLength, uint64_t dimFeatureSpace,
904
+                      struct alphaInfo *alphaInf)
904 905
 {
905 906
     T featureIndex, annotIndex;
906 907
     int i, j, l, iX, iY, elemIndex, noSamples = sizeX;
907 908
     int freeNode, nodeLimit, currBlock, currIndex, newBlock, maxBlockIndex;
908 909
     int maxNoOfNodes, seqnchar, currStack, stack[8*k];
909
-    uint64_t maxNodesPerSequence, numAnnPow2K = 0;
910
+    uint64_t maxNodesPerSequence, maxNumFeatures, numAnnPow2K = 0;
910 911
     double kv;
911 912
     bool printWarning = TRUE;
912 913
     const char *seqptr, *annptr;
... ...
@@ -928,14 +929,17 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
928 929
         noSamples += sizeY;
929 930
 
930 931
     // add one for the sentinel
931
-    maxSeqLength += 1;
932
-
932
+    if (dimFeatureSpace < maxSeqLength)
933
+        maxNumFeatures = dimFeatureSpace + 1;
934
+    else
935
+        maxNumFeatures = (maxSeqLength - 2*k - 0.5*m + 1) * (m + 1) + 1;
936
+    
933 937
     // allocate arrays for sparse feature vectors with 32 or 64 bit index
934 938
     // store only unnormalized k-mer count to avoid double space usage
935 939
     // add one for the sentinel
936 940
     featVectorArrays featVectors;
937
-    featVectors.x   = (int32_t *) R_alloc(noSamples * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1), sizeof(int32_t));
938
-    featVectors.idx = (T *) R_alloc(noSamples * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1), sizeof(T));
941
+    featVectors.x   = (int32_t *) R_alloc(noSamples * maxNumFeatures, sizeof(int32_t));
942
+    featVectors.idx = (T *) R_alloc(noSamples * maxNumFeatures, sizeof(T));
939 943
 
940 944
     double *normValues   = (double *) R_alloc(noSamples, sizeof(double));
941 945
 
... ...
@@ -1016,12 +1020,13 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1016 1020
             currBlock = 0;
1017 1021
             currIndex = 0;
1018 1022
             currStack = -1;
1019
-            elemIndex = i * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1);
1023
+            elemIndex = i * maxNumFeatures;
1020 1024
             featureIndex = 0;
1021 1025
             annotIndex = 0;
1022 1026
 
1023 1027
             // for situation with no features set sentinel
1024 1028
             featVectors.idx[elemIndex] = maxUnSignedIndex;
1029
+            featVectors.x[elemIndex] = MAXINT32;
1025 1030
 
1026 1031
             while (currStack >= 0 || currIndex <= maxBlockIndex)
1027 1032
             {
... ...
@@ -1117,10 +1122,11 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1117 1122
             }
1118 1123
 
1119 1124
             featVectors.idx[elemIndex] = maxUnSignedIndex;
1125
+            featVectors.x[elemIndex] = MAXINT32;
1120 1126
         }
1121 1127
 
1122 1128
         computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues,
1123
-                            (maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1, sizeX, sizeY, normalized, symmetric);
1129
+                            maxNumFeatures, sizeX, sizeY, normalized, symmetric);
1124 1130
     }
1125 1131
     else // not symmetric
1126 1132
     {
... ...
@@ -1144,10 +1150,6 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1144 1150
             }
1145 1151
             else
1146 1152
             {
1147
-                // this statement causes problems in the optimization level -O2
1148
-//                iY = selY[i - sizeX];
1149
-//                Rprintf("i: %d  sizeX: %d  i-sizeX: %d\n", i, sizeX, i - sizeX);
1150
-
1151 1153
                 indexY++;
1152 1154
                 iY = selY[indexY];
1153 1155
 
... ...
@@ -1162,8 +1164,8 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1162 1164
 
1163 1165
             //create new tree for sequence
1164 1166
             kv = createTreeGappy((const char *) seqptr, seqnchar, annptr, k, m, &annotationIndexMap,
1165
-                                 presence, reverseComplement, pTree, maxNoOfNodes, &freeNode, &nullBlock, &printWarning,
1166
-                                 alphaInf);
1167
+                                 presence, reverseComplement, pTree, maxNoOfNodes, &freeNode,
1168
+                                 &nullBlock, &printWarning, alphaInf);
1167 1169
 
1168 1170
             if (kv == NA_REAL)
1169 1171
             {
... ...
@@ -1185,12 +1187,13 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1185 1187
             currBlock = 0;
1186 1188
             currIndex = 0;
1187 1189
             currStack = -1;
1188
-            elemIndex = i * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1);
1190
+            elemIndex = i * maxNumFeatures;
1189 1191
             featureIndex = 0;
1190 1192
             annotIndex = 0;
1191 1193
 
1192 1194
             // for situation with no features set sentinel
1193 1195
             featVectors.idx[elemIndex] = maxUnSignedIndex;
1196
+            featVectors.x[elemIndex] = MAXINT32;
1194 1197
 
1195 1198
             while (currStack >= 0 || currIndex <= maxBlockIndex)
1196 1199
             {
... ...
@@ -1286,10 +1289,11 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1286 1289
             }
1287 1290
 
1288 1291
             featVectors.idx[elemIndex] = maxUnSignedIndex;
1292
+            featVectors.x[elemIndex] = MAXINT32;
1289 1293
         }
1290 1294
 
1291 1295
         computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues,
1292
-                            (maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1, sizeX, sizeY, normalized, symmetric);
1296
+                            maxNumFeatures, sizeX, sizeY, normalized, symmetric);
1293 1297
     }
1294 1298
 
1295 1299
     return;
... ...
@@ -1303,9 +1307,9 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1303 1307
                        NumericVector distWeight, int maxSeqLength, struct alphaInfo *alphaInf)
1304 1308
 {
1305 1309
     T prevIndex;
1306
-    uint32_t km1, inew, invalidCharPos, elemIndex, tempIndex1, tempIndex2, fIndex;
1307
-    uint64_t featureIndex, fDimArray, numAlphaPowerK, numAlphaPowerK1;
1308
-    int i, j, l, offset, noSamples = sizeX;
1310
+    uint32_t km1, inew, elemIndex, tempIndex1, tempIndex2, fIndex;
1311
+    uint64_t featureIndex, fDimArray, invalidCharPos, numAlphaPowerK, numAlphaPowerK1, *featVectorsStart;
1312
+    int i, j, l, offset, maxNumPatternsPerPos, maxFeaturesPerSample, noSamples = sizeX;
1309 1313
     int patLength, seqnchar, index, iold, iX, iY, upper;
1310 1314
     const char *seqptr;
1311 1315
     double kv, *normValues;
... ...
@@ -1321,19 +1325,25 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1321 1325
         noSamples += sizeY;
1322 1326
 
1323 1327
     km1 = k + m + 1;
1324
-    T *oldIndex = (T *) R_alloc(k, sizeof(uint64_t));
1325
-    T *newIndex = (T *) R_alloc(km1, sizeof(uint64_t));
1328
+    uint64_t *oldIndex = (uint64_t *) R_alloc(k, sizeof(uint64_t));
1329
+    uint64_t *newIndex = (uint64_t *) R_alloc(km1, sizeof(uint64_t));
1326 1330
     numAlphaPowerK1 = ipow64(alphaInf->numAlphabetChars, k - 1);
1327 1331
     numAlphaPowerK = numAlphaPowerK1 * alphaInf->numAlphabetChars;
1328 1332
 
1329 1333
     // allocate arrays for feature vectors with 8, 16, 32 or 64 bit index
1330
-    fDimArray = (maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1;
1334
+    fDimArray = (maxSeqLength - 2*k - 0.5*m + 1) * (m + 1);
1331 1335
     featVectorArrays featVectors;
1332 1336
 
1333 1337
     featVectors.x   = (int32_t *) R_alloc(noSamples * fDimArray, sizeof(int32_t));
1334 1338
     featVectors.idx = (T *) R_alloc(noSamples * fDimArray, sizeof(T));
1339
+    featVectorsStart = (uint64_t *) R_alloc(noSamples + 1, sizeof(uint64_t));
1335 1340
     normValues = (double *) R_alloc(noSamples, sizeof(double));
1341
+    maxNumPatternsPerPos = m + 1;
1336 1342
 
1343
+    featVectorsStart[0] = 0;
1344
+    maxFeaturesPerSample = 0;
1345
+    elemIndex = 0;
1346
+    
1337 1347
     // walk along sequence and create sparse feature vector
1338 1348
     for (i = 0; i < noSamples; i++)
1339 1349
     {
... ...
@@ -1362,18 +1372,17 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1362 1372
 
1363 1373
         patLength = 0;
1364 1374
         featureIndex = 0;
1365
-        elemIndex = i * fDimArray;
1366 1375
         iold = 0;
1367 1376
         inew = 0;
1368 1377
         kv = 0;
1369
-        invalidCharPos = maxUnSignedIndex;
1378
+        invalidCharPos = MAXUINT64;
1370 1379
 
1371 1380
         for (l = 0; l < seqnchar - 2 * k + 1; l++)
1372 1381
         {
1373
-            if (invalidCharPos != maxUnSignedIndex && l <= (int)invalidCharPos)
1382
+            if (invalidCharPos != MAXUINT64 && l <= (int)invalidCharPos)
1374 1383
                 continue;
1375 1384
             else
1376
-                invalidCharPos = maxUnSignedIndex;
1385
+                invalidCharPos = MAXUINT64;
1377 1386
 
1378 1387
             // look ahead and fill newIndex buffer to process
1379 1388
             // each of the m + 1 patterns at a position immediately
... ...
@@ -1386,9 +1395,9 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1386 1395
             {
1387 1396
                 for (j = 0; j < upper; j++)
1388 1397
                 {
1389
-                    if (invalidCharPos != maxUnSignedIndex)
1398
+                    if (invalidCharPos != MAXUINT64)
1390 1399
                     {
1391
-                        newIndex[inew++] = maxUnSignedIndex;
1400
+                        newIndex[inew++] = MAXUINT64;
1392 1401
 
1393 1402
                         if (inew == km1)
1394 1403
                             inew = 0;
... ...
@@ -1438,7 +1447,7 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1438 1447
                     else
1439 1448
                     {
1440 1449
                         //store max index to buffer for not valid pattern
1441
-                        newIndex[inew++] = maxUnSignedIndex;
1450
+                        newIndex[inew++] = MAXUINT64;
1442 1451
 
1443 1452
                         if (inew == km1)
1444 1453
                             inew = 0;
... ...
@@ -1452,7 +1461,7 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1452 1461
                 }
1453 1462
             }
1454 1463
 
1455
-            if (l + 2 * k + m - 1 < seqnchar && invalidCharPos == maxUnSignedIndex)
1464
+            if (l + 2 * k + m - 1 < seqnchar && invalidCharPos == MAXUINT64)
1456 1465
             {
1457 1466
                 //calc next look ahead feature index and store to buffer
1458 1467
                 index = alphaInf->seqIndexMap[(int) seqptr[l + 2 * k + m - 1]];
... ...
@@ -1475,7 +1484,7 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1475 1484
                 else
1476 1485
                 {
1477 1486
                     //store max index to buffer for not valid pattern
1478
-                    newIndex[inew++] = maxUnSignedIndex;
1487
+                    newIndex[inew++] = MAXUINT64;
1479 1488
 
1480 1489
                     if (inew == km1)
1481 1490
                         inew = 0;
... ...
@@ -1499,13 +1508,13 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1499 1508
             else
1500 1509
                 upper = seqnchar - 2 * k - l;
1501 1510
 
1502
-            if (invalidCharPos != maxUnSignedIndex && upper > (int) invalidCharPos - l - 2 * k)
1511
+            if (invalidCharPos != MAXUINT64 && upper > (int) invalidCharPos - l - 2 * k)
1503 1512
                 upper = invalidCharPos - l - 2 * k;
1504 1513
 
1505 1514
             for (j = 0; j <= upper; j++)
1506 1515
             {
1507
-                if (newIndex[inew] == maxUnSignedIndex ||
1508
-                    newIndex[(inew + k + j) % km1] == maxUnSignedIndex)
1516
+                if (newIndex[inew] == MAXUINT64 ||
1517
+                    newIndex[(inew + k + j) % km1] == MAXUINT64)
1509 1518
                     continue;
1510 1519
 
1511 1520
                 if (reverseComplement)
... ...
@@ -1537,11 +1546,14 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1537 1546
                 kv++;
1538 1547
             }
1539 1548
 
1540
-            if (invalidCharPos != maxUnSignedIndex)
1549
+            if (invalidCharPos != MAXUINT64)
1541 1550
                 inew = 0;
1542 1551
         }
1543 1552
 
1544
-        featVectors.idx[elemIndex] = maxUnSignedIndex;
1553
+        featVectorsStart[i + 1] = elemIndex;
1554
+
1555
+        if (maxFeaturesPerSample < (featVectorsStart[i + 1] - featVectorsStart[i]))
1556
+            maxFeaturesPerSample = featVectorsStart[i + 1] - featVectorsStart[i];
1545 1557
 
1546 1558
         // for dist weighting the kernel value is determined in computeKernelMatrixPos
1547 1559
         if (distWeight.length() == 0)
... ...
@@ -1553,8 +1565,9 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1553 1565
         }
1554 1566
     }
1555 1567
 
1556
-    computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, fDimArray,
1557
-                           m + 1, sizeX, sizeY, normalized, symmetric, FALSE, distWeight);
1568
+    computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, featVectorsStart, km,
1569
+                           normValues, maxFeaturesPerSample, maxNumPatternsPerPos, sizeX, sizeY,
1570
+                           normalized, symmetric, FALSE, distWeight);
1558 1571
 
1559 1572
     return;
1560 1573
 }
... ...
@@ -3464,7 +3477,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3464 3477
             {
3465 3478
                 getKMStdAnnGappy(maxUIndex8, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3466 3479
                                  unmapped, k, m, normalized, symmetric, presence, reverseComplement,
3467
-                                 maxSeqLength, &alphaInf);
3480
+                                 maxSeqLength, dimFeatureSpace, &alphaInf);
3468 3481
             }
3469 3482
 
3470 3483
             break;
... ...
@@ -3482,7 +3495,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3482 3495
             {
3483 3496
                 getKMStdAnnGappy(maxUIndex16, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3484 3497
                                  unmapped, k, m, normalized, symmetric, presence, reverseComplement,
3485
-                                 maxSeqLength, &alphaInf);
3498
+                                 maxSeqLength, dimFeatureSpace, &alphaInf);
3486 3499
             }
3487 3500
 
3488 3501
             break;
... ...
@@ -3501,7 +3514,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3501 3514
             {
3502 3515
                 getKMStdAnnGappy(maxUIndex32, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3503 3516
                                  unmapped, k, m, normalized, symmetric, presence, reverseComplement,
3504
-                                 maxSeqLength, &alphaInf);
3517
+                                 maxSeqLength, dimFeatureSpace, &alphaInf);
3505 3518
             }
3506 3519
 
3507 3520
             break;
... ...
@@ -3519,7 +3532,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3519 3532
             {
3520 3533
                 getKMStdAnnGappy(maxUIndex64, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3521 3534
                                  unmapped, k, m, normalized, symmetric, presence, reverseComplement,
3522
-                                 maxSeqLength, &alphaInf);
3535
+                                 maxSeqLength, dimFeatureSpace, &alphaInf);
3523 3536
             }
3524 3537
 
3525 3538
             break;
... ...
@@ -3536,7 +3549,7 @@ uint64_t * featureNamesToIndexGappyPair(SEXP featureNames, int numFeatures, Byte
3536 3549
                                         struct alphaInfo *alphaInf)
3537 3550
 {
3538 3551
     int i, j, l, numDots;
3539
-    uint64_t featureIndex, *featIndex, annIndex, numAnnPow2K, tempIndex, fIndex;
3552
+    uint64_t featureIndex, *featIndex, annIndex, tempIndex, fIndex, numAnnPow2K = 0;
3540 3553
     const char *pattern;
3541 3554
 
3542 3555
     featIndex = (uint64_t *) R_alloc(numFeatures, sizeof(uint64_t));
... ...
@@ -3600,7 +3613,7 @@ RcppExport void featuresToHashmapGappyPair(NumericMatrix featureWeights, int svm
3600 3613
                                            IntegerVector annotationIndexMap)
3601 3614
 {
3602 3615
     int i, j, numFeatures, result, numDots;
3603
-    uint64_t featureIndex, annotIndex, numAnnPow2K;
3616
+    uint64_t featureIndex, annotIndex, numAnnPow2K = 0;
3604 3617
     const char *pattern;
3605 3618
     SEXP dimNames, colNames;
3606 3619
     khiter_t iter;
... ...
@@ -18,6 +18,7 @@
18 18
 #define CHAR_UNMAPPED        FALSE
19 19
 #define MAXUINT8             0xFF
20 20
 #define MAXUINT16            0xFFFF
21
+#define MAXINT32             0x7FFFFFFF
21 22
 #define MAXUINT32            0xFFFFFFFF
22 23
 #define MAXUINT64            0xFFFFFFFFFFFFFFFF
23 24
 #define MAXINDEX32           MAXUINT32 - 1
... ...
@@ -237,11 +237,11 @@ void computeKernelMatrix(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVe
237 237
 
238 238
                 while (ix < endx && iy < endy)
239 239
                 {
240
-                    if (featVectorIndex[ix] == maxUnSignedIndex ||
241
-                        featVectorIndex[iy] == maxUnSignedIndex)
242
-                    {
240
+                    if ((featVectorIndex[ix] == maxUnSignedIndex &&
241
+                         featVectorValue[ix] == MAXINT32)||
242
+                        (featVectorIndex[iy] == maxUnSignedIndex &&
243
+                         featVectorValue[iy] == MAXINT32))
243 244
                         break;
244
-                    }
245 245
 
246 246
                     if (featVectorIndex[ix] < featVectorIndex[iy])
247 247
                         ix++;
... ...
@@ -285,8 +285,10 @@ void computeKernelMatrix(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVe
285 285
 
286 286
                 while (ix < endx && iy < endy)
287 287
                 {
288
-                    if (featVectorIndex[ix] == maxUnSignedIndex ||
289
-                        featVectorIndex[iy] == maxUnSignedIndex)
288
+                    if ((featVectorIndex[ix] == maxUnSignedIndex &&
289
+                         featVectorValue[ix] == MAXINT32)||
290
+                        (featVectorIndex[iy] == maxUnSignedIndex &&
291
+                         featVectorValue[iy] == MAXINT32))
290 292
                         break;
291 293
 
292 294
                     if (featVectorIndex[ix] < featVectorIndex[iy])
... ...
@@ -317,9 +319,9 @@ void computeKernelMatrix(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVe
317 319
 
318 320
 template<typename T>
319 321
 void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVectorValue,
320
-                            Rcpp::NumericMatrix km, double *normValues, uint32_t fDim, int maxNumPatterns,
321
-                            int sizeX, int sizeY, bool normalized, bool symmetric, bool computePosition,
322
-                            Rcpp::NumericVector distWeight)
322
+                            uint64_t *featVectorStart, Rcpp::NumericMatrix km, double *normValues,
323
+                            uint32_t fDim, int maxNumPatterns, int sizeX, int sizeY, bool normalized,
324
+                            bool symmetric, bool computePosition, Rcpp::NumericVector distWeight)
323 325
 {
324 326
     uint32_t endx, endy, ix, iy, prevIndex;
325 327
     int i, j, j1, j2, posX, posY, distWeightLength, startIndex, numSamples, yOffset, numPatterns1, numPatterns2;
... ...
@@ -360,10 +362,10 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
360 362
 
361 363
             for (j = startIndex; j < sizeY; j++)
362 364
             {
363
-                ix = i * fDim;
364
-                iy = (yOffset + j) * fDim;
365
-                endx = ix + fDim;
366
-                endy = iy + fDim;
365
+                ix = featVectorStart[i];
366
+                iy = featVectorStart[yOffset + j];
367
+                endx = featVectorStart[i + 1];
368
+                endy = featVectorStart[yOffset + j + 1];
367 369
                 prevIndex = 0;
368 370
                 kv = 0;
369 371
 
... ...
@@ -378,10 +380,6 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
378 380
 
379 381
                     while (ix < endx && iy < endy)
380 382
                     {
381
-                        if (featVectorIndex[ix] == maxUnSignedIndex ||
382
-                            featVectorIndex[iy] == maxUnSignedIndex)
383
-                            break;
384
-
385 383
                         if (featVectorIndex[ix] == featVectorIndex[iy])
386 384
                             kv++;
387 385
 
... ...
@@ -393,10 +391,6 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
393 391
                 {
394 392
                     while (ix < endx && iy < endy)
395 393
                     {
396
-                        if (featVectorIndex[ix] == maxUnSignedIndex ||
397
-                            featVectorIndex[iy] == maxUnSignedIndex)
398
-                            break;
399
-
400 394
                         if (featVectorValue[ix] < featVectorValue[iy])
401 395
                             ix++;
402 396
                         else if (featVectorValue[ix] > featVectorValue[iy])
... ...
@@ -418,7 +412,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
418 412
 
419 413
                                 for (j1 = 0; j1 < maxNumPatterns; j1++)
420 414
                                 {
421
-                                    if (featVectorIndex[ix + j1] == maxUnSignedIndex)
415
+                                    if (ix + j1 >= endx)
422 416
                                         break;
423 417
 
424 418
                                     if (featVectorValue[ix + j1] != featVectorValue[ix])
... ...
@@ -428,7 +422,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
428 422
 
429 423
                                     for (j2 = 0; j2 < maxNumPatterns; j2++)
430 424
                                     {
431
-                                        if (featVectorIndex[ix + j2] == maxUnSignedIndex)
425
+                                        if (iy + j2 >= endy)
432 426
                                             break;
433 427
 
434 428
                                         if (featVectorValue[iy + j2] != featVectorValue[ix])
... ...
@@ -467,7 +461,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
467 461
     else // distance weighting
468 462
     {
469 463
         // sort feature vectors
470
-        sort2Arrays(maxUnSignedIndex, featVectorIndex, featVectorValue, numSamples, fDim, NULL);
464
+        sort2Arrays(maxUnSignedIndex, featVectorIndex, featVectorValue, numSamples, fDim, featVectorStart);
471 465
 
472 466
         distWeightLength = distWeight.size();
473 467
 
... ...
@@ -476,18 +470,14 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
476 470
         {
477 471
             R_CheckUserInterrupt();
478 472
 
479
-            ix = i * fDim;
480
-            iy = i * fDim;
473
+            ix = featVectorStart[i];
474
+            iy = featVectorStart[i];
481 475
             prevIndex = 0;
482
-            endx = ix + fDim;
476
+            endx = featVectorStart[i + 1];
483 477
             kv = 0;
484 478
 
485 479
             while (ix < endx && iy < endx)
486 480
             {
487
-                if (featVectorIndex[ix] == maxUnSignedIndex ||
488
-                    featVectorIndex[iy] == maxUnSignedIndex)
489
-                    break;
490
-
491 481
                 if (featVectorIndex[ix] < featVectorIndex[iy])
492 482
                     ix++;
493 483
                 else if (featVectorIndex[ix] > featVectorIndex[iy])
... ...
@@ -496,7 +486,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
496 486
                 {
497 487
                     prevIndex = iy;
498 488
 
499
-                    while (featVectorIndex[ix] == featVectorIndex[iy])
489
+                    while (iy < endx && featVectorIndex[ix] == featVectorIndex[iy])
500 490
                     {
501 491
                         distance = abs(featVectorValue[iy++] - featVectorValue[ix]);
502 492
 
... ...
@@ -538,19 +528,15 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
538 528
 
539 529
             for (j = startIndex; j < sizeY; j++)
540 530
             {
541
-                ix = i * fDim;
542
-                endx = ix + fDim;
543
-                iy = (yOffset + j) * fDim;
544
-                endy = (yOffset + j + 1) * fDim;
531
+                ix = featVectorStart[i];
532
+                endx = featVectorStart[i + 1];
533
+                iy = featVectorStart[yOffset + j];
534
+                endy = featVectorStart[yOffset + j + 1];
545 535
                 prevIndex = 0;
546 536
                 kv = 0;
547 537
 
548 538
                 while (ix < endx && iy < endy)
549 539
                 {
550
-                    if (featVectorIndex[ix] == maxUnSignedIndex ||
551
-                        featVectorIndex[iy] == maxUnSignedIndex)
552
-                        break;
553
-
554 540
                     if (featVectorIndex[ix] < featVectorIndex[iy])
555 541
                         ix++;
556 542
                     else if (featVectorIndex[ix] > featVectorIndex[iy])
... ...
@@ -559,7 +545,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea
559 545
                     {
560 546
                         prevIndex = iy;
561 547
 
562
-                        while (featVectorIndex[ix] == featVectorIndex[iy])
548
+                        while (iy < endy && featVectorIndex[ix] == featVectorIndex[iy])
563 549
                         {
564 550
                             distance = abs(featVectorValue[iy++] - featVectorValue[ix]);
565 551
 
... ...
@@ -972,6 +972,7 @@ bool getIndexMap(ByteStringVector x, int sizeX, IntegerVector selX, bool unmappe
972 972
     featureCounts = NULL;
973 973
     featuresInSample = NULL;
974 974
     vmax = NULL;
975
+    fchmap = NULL;
975 976
 
976 977
     // cyclic buffers for old index
977 978
     uint64_t *oldIndex = (uint64_t *) R_alloc(k, sizeof(uint64_t));
... ...
@@ -1666,7 +1667,7 @@ bool getERSMismatch(ByteStringVector x, int sizeX, IntegerVector selX, int maxSe
1666 1667
                     double *normValues)
1667 1668
 {
1668 1669
     int i, iX, freeNode, maxNoOfNodes, jIndex;
1669
-    uint64_t nodeLimit, maxNodesPerSequence;
1670
+    uint64_t nodeLimit;
1670 1671
     double kernelValue;
1671 1672
     bool printWarning = TRUE;
1672 1673
     struct prefTree *pTree;
... ...
@@ -1674,18 +1675,12 @@ bool getERSMismatch(ByteStringVector x, int sizeX, IntegerVector selX, int maxSe
1674 1675
 
1675 1676
     // alloc mem for prefix tree
1676 1677
     // consider mismatch subtree
1677
-    nodeLimit = ((pow(alphaInf->numAlphabetChars, k + 1) - 1) / (alphaInf->numAlphabetChars - 1)) *
1678
-                pow(alphaInf->numAlphabetChars, k) + 1;
1679
-
1680
-    maxNodesPerSequence = pow(alphaInf->numAlphabetChars, m) * k * (maxSeqLength - k + 1) + 1;
1681
-
1682
-    if (maxNodesPerSequence < (uint64_t) nodeLimit)
1683
-        nodeLimit = (int) maxNodesPerSequence;
1684
-
1685
-    maxNoOfNodes = MAX_BLOCK;
1678
+    nodeLimit = (pow(alphaInf->numAlphabetChars, k + 1) - 1) / (alphaInf->numAlphabetChars - 1);
1686 1679
 
1687 1680
     if (nodeLimit < (uint64_t) MAX_BLOCK)
1688 1681
         maxNoOfNodes = (int) nodeLimit;
1682
+    else
1683
+        maxNoOfNodes = MAX_BLOCK;
1689 1684
 
1690 1685
     pTree = (struct prefTree *) R_alloc(maxNoOfNodes, sizeof(struct treeNode));
1691 1686
 
... ...
@@ -847,7 +847,7 @@ void setFeatureIndex(struct prefTreeMotif *pTree, int maxMotifLength, int maxPat
847 847
                 }
848 848
             }
849 849
 
850
-            free(pKeys);
850
+            Free(pKeys);
851 851
             pKeys = NULL;
852 852
         }
853 853
     }
... ...
@@ -1629,8 +1629,6 @@ void findMotifsForPos(struct intfFindMotifs *intf)
1629 1629
 
1630 1630
     descendOnBranchPos(0, intf->seqnchar, 0, &motifBegin, intf);
1631 1631
 
1632
-    intf->pFeatVecIndex[intf->elemIndex] = MAXUINT32;
1633
-
1634 1632
     return;
1635 1633
 }
1636 1634
 
... ...
@@ -1728,6 +1726,7 @@ void getNonzeroMotifs(bool annSpec, int32_t *featVectorValue, uint32_t *featVect
1728 1726
         }
1729 1727
 
1730 1728
         featVectorIndex[elemIndex] = MAXUINT32;
1729
+        featVectorValue[elemIndex] = MAXINT32;
1731 1730
     }
1732 1731
     else
1733 1732
     {
... ...
@@ -1768,6 +1767,7 @@ void getNonzeroMotifs(bool annSpec, int32_t *featVectorValue, uint32_t *featVect
1768 1767
             }
1769 1768
 
1770 1769
             featVectorIndex[elemIndex] = MAXUINT32;
1770
+            featVectorValue[elemIndex] = MAXINT32;
1771 1771
         }
1772 1772
     }
1773 1773
 
... ...
@@ -1869,6 +1869,7 @@ void getNonzeroMotifsERS(bool annSpec, struct prefTreeMotif *pTree, khash_t(fim)
1869 1869
         if (curr > 0)
1870 1870
         {
1871 1871
             featVectorIndex[curr] = MAXUINT32;
1872
+            featVectorValue[curr] = MAXINT32;
1872 1873
             sort2Arrays(MAXUINT32, featVectorIndex, featVectorValue, 1, numUsedFeatures, NULL);
1873 1874
 
1874 1875
             curr = 0;
... ...
@@ -2151,13 +2152,12 @@ void getKMStdAnnMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y,
2151 2152
 
2152 2153
     if (pKeys != NULL)
2153 2154
     {
2154
-        free(pKeys);
2155
+        Free(pKeys);
2155 2156
         pKeys = NULL;
2156 2157
     }
2157 2158
 
2158 2159
     // sort feature vectors
2159
-    if (annX.length > 0)
2160
-        sort2Arrays(MAXUINT32, featVectorIndex, featVectorValue, numSamples, fDim, NULL);
2160
+    sort2Arrays(MAXUINT32, featVectorIndex, featVectorValue, numSamples, fDim, NULL);
2161 2161
 
2162 2162
     computeKernelMatrix(MAXUINT32, featVectorIndex, featVectorValue, km, normValues, fDim,
2163 2163
                         sizeX, sizeY, normalized, symmetric);
... ...
@@ -2174,7 +2174,8 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y,
2174 2174
 {
2175 2175
     uint32_t fDim, *featVectorIndex;
2176 2176
     int32_t *featVectorValue;
2177
-    int i, freeNode, maxNoOfNodes, iX, iY, numSamples;
2177
+    uint64_t *featVectorsStart;
2178
+    int i, freeNode, maxNoOfNodes, iX, iY, numSamples, maxFeaturesPerSample;
2178 2179
     bool printWarning = TRUE;
2179 2180
     struct prefTreeMotif *pTree;
2180 2181
     struct indexBlock nullBlock;
... ...
@@ -2197,6 +2198,7 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y,
2197 2198
     fDim = 2 * maxSeqLength + 1;
2198 2199
     featVectorValue = (int32_t *) R_alloc(numSamples * fDim, sizeof(int32_t));
2199 2200
     featVectorIndex = (uint32_t *) R_alloc(numSamples * fDim, sizeof(uint32_t));
2201
+    featVectorsStart = (uint64_t *) R_alloc(numSamples + 1, sizeof(uint64_t));
2200 2202
 
2201 2203
     // alloc mem for prefix tree
2202 2204
     maxNoOfNodes = MAX_BLOCK;
... ...
@@ -2238,12 +2240,15 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y,
2238 2240
     intf.fDim = fDim;
2239 2241
     intf.allIndexMaps = allIndexMaps;
2240 2242
 
2243
+    featVectorsStart[0] = 0;
2244
+    maxFeaturesPerSample = 0;
2245
+    intf.elemIndex = 0;
2246
+
2241 2247
     // calculate kernel matrix
2242 2248
     for (i = 0; i < numSamples; i++)
2243 2249
     {
2244 2250
         R_CheckUserInterrupt();
2245 2251
         intf.rowIndex = i;
2246
-        intf.elemIndex = intf.rowIndex * intf.fDim;
2247 2252
         intf.offset = 0;
2248 2253
 
2249 2254
         if (i < sizeX)
... ...
@@ -2273,14 +2278,20 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y,
2273 2278
             return;
2274 2279
         }
2275 2280
 
2281
+        featVectorsStart[i + 1] = intf.elemIndex;
2282
+
2283
+        if (maxFeaturesPerSample < (featVectorsStart[i + 1] - featVectorsStart[i]))
2284
+            maxFeaturesPerSample = featVectorsStart[i + 1] - featVectorsStart[i];
2285
+
2276 2286
         if (normalized)
2277 2287
             normValues[i] = sqrt(intf.kernelValue);
2278 2288
         else
2279 2289
             normValues[i] = intf.kernelValue;
2280 2290
     }
2281 2291
 
2282
-    computeKernelMatrixPos(MAXUINT32, featVectorIndex, featVectorValue, km, normValues, fDim, motifs.length,
2283
-                           sizeX, sizeY, normalized, symmetric, FALSE, distWeight);
2292
+    computeKernelMatrixPos(MAXUINT32, featVectorIndex, featVectorValue, featVectorsStart, km,
2293
+                           normValues, maxFeaturesPerSample, motifs.length, sizeX, sizeY, normalized, symmetric,
2294
+                           FALSE, distWeight);
2284 2295
 
2285 2296
     return;
2286 2297
 }
... ...
@@ -581,12 +581,13 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B
581 581
                      int sizeX, int sizeY, IntegerVector selX, IntegerVector selY,
582 582
                      ByteStringVector annCharset, ByteStringVector annX, ByteStringVector annY,
583 583
                      bool unmapped, int k, bool normalized,  bool symmetric, bool presence,
584
-                     bool reverseComplement, int maxSeqLength, struct alphaInfo *alphaInf)
584
+                     bool reverseComplement, int maxSeqLength, uint64_t dimFeatureSpace,
585
+                     struct alphaInfo *alphaInf)
585 586
 {
586 587
     T featureIndex, annotIndex;
587 588
     int i, j, l, currStack, stack[4*k], iX, iY, twoK, elemIndex, maxNoOfNodes, noSamples = sizeX;
588 589
     int freeNode, nodeLimit, currBlock, currIndex, newBlock, lastBlockSize, maxBlockIndex;
589
-    uint64_t numAnnPowK, maxNodesPerSequence;
590
+    uint64_t numAnnPowK, maxNodesPerSequence, maxNumFeatures;
590 591
     double kv;
591 592
     bool printWarning = TRUE;
592 593
     const char *annptr;
... ...
@@ -608,13 +609,16 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B
608 609
         noSamples += sizeY;
609 610
 
610 611
     // add one for the sentinel
611
-    maxSeqLength += 1;
612
+    if (dimFeatureSpace < maxSeqLength)
613
+        maxNumFeatures = dimFeatureSpace + 1;
614
+    else
615
+        maxNumFeatures = maxSeqLength + 1;
612 616
 
613 617
     // allocate arrays for sparse feature vectors with 32 or 64 bit index
614 618
     // store only unnormalized k-mer counts to avoid double space usage
615 619
     featVectorArrays featVectors;
616
-    featVectors.x   = (int32_t *) R_alloc(noSamples * maxSeqLength, sizeof(int32_t));
617
-    featVectors.idx = (T *) R_alloc(noSamples * maxSeqLength, sizeof(T));
620
+    featVectors.x   = (int32_t *) R_alloc(noSamples * maxNumFeatures, sizeof(int32_t));
621
+    featVectors.idx = (T *) R_alloc(noSamples * maxNumFeatures, sizeof(T));
618 622
 
619 623
     double *normValues   = (double *) R_alloc(noSamples, sizeof(double));
620 624
 
... ...
@@ -701,12 +705,13 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B
701 705
             currBlock = 0;
702 706
             currIndex = 0;
703 707
             currStack = -1;
704
-            elemIndex = i * maxSeqLength;
708
+            elemIndex = i * maxNumFeatures;
705 709
             featureIndex = 0;
706 710
             annotIndex = 0;
707 711
 
708 712
             // for situation with no features set sentinel
709 713
             featVectors.idx[elemIndex] = maxUnSignedIndex;
714
+            featVectors.x[elemIndex] = MAXINT32;
710 715
 
711 716
             while (currStack >= 0 || currIndex <= maxBlockIndex)
712 717
             {
... ...
@@ -793,9 +798,10 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B
793 798
             }
794 799
 
795 800
             featVectors.idx[elemIndex] = maxUnSignedIndex;
801
+            featVectors.x[elemIndex] = MAXINT32;
796 802
         }
797 803
 
798
-        computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxSeqLength,
804
+        computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxNumFeatures,
799 805
                             sizeX, sizeY, normalized, symmetric);
800 806
     }
801 807
     else // not symmetric
... ...
@@ -856,12 +862,13 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B
856 862
             currBlock = 0;
857 863
             currIndex = 0;
858 864
             currStack = -1;
859
-            elemIndex = i * maxSeqLength;
865
+            elemIndex = i * maxNumFeatures;
860 866
             featureIndex = 0;
861 867
             annotIndex = 0;
862 868
 
863 869
             // for situation with no features set sentinel
864 870
             featVectors.idx[elemIndex] = maxUnSignedIndex;
871
+            featVectors.x[elemIndex] = MAXINT32;
865 872
 
866 873
             while (currStack >= 0 || currIndex <= maxBlockIndex)
867 874
             {
... ...
@@ -947,9 +954,10 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B
947 954
             }
948 955
 
949 956
             featVectors.idx[elemIndex] = maxUnSignedIndex;
957
+            featVectors.x[elemIndex] = MAXINT32;
950 958
         }
951 959
 
952
-        computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxSeqLength,
960
+        computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxNumFeatures,
953 961
                             sizeX, sizeY, normalized, symmetric);
954 962
     }
955 963
 
... ...
@@ -964,11 +972,12 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
964 972
                       NumericVector distWeight, int maxSeqLength, struct alphaInfo *alphaInf)
965 973
 {
966 974
     T prevIndex, featureIndex, tempIndex, fIndex;
967
-    uint64_t elemIndex, fDimArray;
968
-    int i, j, l, offset, patLength, seqnchar, index, iold, iX, iY, maxNumPatterns, noSamples = sizeX;
975
+    uint64_t elemIndex, fDimArray, *featVectorsStart;
976
+    int i, j, l, offset, patLength, seqnchar, index, iold, iX, iY;
977
+    int maxNumPatternsPerPos, maxFeaturesPerSample, noSamples = sizeX;
969 978
     bool distWeighting;
970 979
     const char *seqptr;
971
-    double kv;
980
+    double kv, *normValues;
972 981
 
973 982
     struct featVectorArrays {
974 983
         int32_t    *x;        // value array
... ...
@@ -984,7 +993,7 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
984 993
 
985 994
     // allocate arrays for feature vectors with 8, 16, 32 or 64 bit index
986 995
     // without dist weighting use implicit position to save storage
987
-    fDimArray = maxSeqLength - k + 2;
996
+    fDimArray = maxSeqLength - k + 1;
988 997
     featVectorArrays featVectors;
989 998
 
990 999
     if (distWeighting)
... ...
@@ -993,11 +1002,14 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
993 1002
         // just for the offset of each sample
994 1003
         featVectors.x   = (int32_t *) R_alloc(noSamples, sizeof(int32_t));
995 1004
 
996
-
997 1005
     featVectors.idx = (T *) R_alloc(noSamples * fDimArray, sizeof(T));
998
-    maxNumPatterns = 1;
1006
+    featVectorsStart = (uint64_t *) R_alloc(noSamples + 1, sizeof(uint64_t));
1007
+    normValues   = (double *) R_alloc(noSamples, sizeof(double));
1008
+    maxNumPatternsPerPos = 1;
999 1009
 
1000
-    double *normValues   = (double *) R_alloc(noSamples, sizeof(double));
1010
+    featVectorsStart[0] = 0;
1011
+    maxFeaturesPerSample = 0;
1012
+    elemIndex = 0;
1001 1013
 
1002 1014
     // walk along sequence and create sparse feature vector
1003 1015
     for (i = 0; i < noSamples; i++)
... ...
@@ -1029,7 +1041,6 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1029 1041
 
1030 1042
         patLength = 0;
1031 1043
         featureIndex = 0;
1032
-        elemIndex = (uint64_t) i * fDimArray;
1033 1044
         iold = 0;
1034 1045
         kv = 0;
1035 1046
 
... ...
@@ -1118,7 +1129,10 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1118 1129
             }
1119 1130
         }
1120 1131
 
1121
-        featVectors.idx[elemIndex] = maxUnSignedIndex;
1132
+        featVectorsStart[i + 1] = elemIndex;
1133
+
1134
+        if (maxFeaturesPerSample < (featVectorsStart[i + 1] - featVectorsStart[i]))
1135
+            maxFeaturesPerSample = featVectorsStart[i + 1] - featVectorsStart[i];
1122 1136
 
1123 1137
         // for dist weighting the kernel value is determined in computeKernelMatrixPos
1124 1138
         if (!distWeighting)
... ...
@@ -1130,8 +1144,9 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x,
1130 1144
         }
1131 1145
     }
1132 1146
 
1133
-    computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, fDimArray,
1134
-                           maxNumPatterns, sizeX, sizeY, normalized, symmetric, !distWeighting, distWeight);
1147
+    computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, featVectorsStart, km,
1148
+                           normValues, maxFeaturesPerSample, maxNumPatternsPerPos, sizeX, sizeY,
1149
+                           normalized, symmetric, !distWeighting, distWeight);
1135 1150
 
1136 1151
     return;
1137 1152
 }
... ...
@@ -3211,7 +3226,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3211 3226
             {
3212 3227
                 getKMStdAnnSpec(maxUIndex8, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3213 3228
                                 unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength,
3214
-                                &alphaInf);
3229
+                                dimFeatureSpace, &alphaInf);
3215 3230
             }
3216 3231
 
3217 3232
             break;
... ...
@@ -3229,7 +3244,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3229 3244
             {
3230 3245
                 getKMStdAnnSpec(maxUIndex16, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3231 3246
                                 unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength,
3232
-                                &alphaInf);
3247
+                                dimFeatureSpace, &alphaInf);
3233 3248
             }
3234 3249
 
3235 3250
             break;
... ...
@@ -3248,7 +3263,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3248 3263
             {
3249 3264
                 getKMStdAnnSpec(maxUIndex32, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3250 3265
                                 unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength,
3251
-                                &alphaInf);
3266
+                                dimFeatureSpace, &alphaInf);
3252 3267
             }
3253 3268
 
3254 3269
             break;
... ...
@@ -3266,7 +3281,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR,
3266 3281
             {
3267 3282
                 getKMStdAnnSpec(maxUIndex64, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY,
3268 3283
                                 unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength,
3269
-                                &alphaInf);
3284
+                                dimFeatureSpace, &alphaInf);
3270 3285
             }
3271 3286
 
3272 3287
             break;
... ...
@@ -3333,7 +3348,7 @@ RcppExport void featuresToHashmapSpectrum(NumericMatrix featureWeights, int svmI
3333 3348
                                           IntegerVector annotationIndexMap)
3334 3349
 {
3335 3350
     int i, j, numFeatures, result;
3336
-    uint64_t featureIndex, annotIndex, numAnnPowK;
3351
+    uint64_t featureIndex, annotIndex, numAnnPowK = 0;
3337 3352
     const char *pattern;
3338 3353
     SEXP dimNames, colNames;
3339 3354
     khiter_t iter;
... ...
@@ -167,7 +167,6 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR,
167 167
     ptrI = iptr;
168 168
     ptrX = xptr;
169 169
     
170
-    // do processing
171 170
     nextFree = 0;
172 171
 
173 172
     if (symmetric)
... ...
@@ -203,7 +202,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR,
203 202
                 if (kv > lowerLimit)
204 203
                 {
205 204
                     // new element - check if enough space
206
-                    if (nextFree + 1 > vectorSize)
205
+                    if (((uint64_t) nextFree + 1) > vectorSize)
207 206
                     {
208 207
                         // realloc arrays i and x
209 208
                         vectorSize *= growBy;
... ...
@@ -258,7 +257,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR,
258 257
                 if (kv > lowerLimit)
259 258
                 {
260 259
                     // new element - check if enough space
261
-                    if (nextFree + 1 > vectorSize)
260
+                    if (((uint64_t) nextFree + 1) > vectorSize)
262 261
                     {
263 262
                         // realloc arrays i and x
264 263
                         vectorSize = (uint64_t) (vectorSize * growBy);
... ...
@@ -306,13 +305,13 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR,
306 305
     else
307 306
         SET_VECTOR_ELT(dimnames, 1, R_NilValue);
308 307
     
309
-    slot_p = PROTECT(Rf_allocVector(INTSXP, sizeY+1));
308
+    slot_p = PROTECT(Rf_allocVector(INTSXP, sizeY + 1));
310 309
     SET_SLOT(spm, Rf_mkChar("p"), slot_p);
311 310
     
312 311
     for (i=0; i < sizeY + 1; i++)
313 312
         INTEGER(slot_p)[i] = pptr[i];
314 313
     
315
-    free(pptr);
314
+    Free(pptr);
316 315
     ptrP = NULL;
317 316
     
318 317
     slot_i = PROTECT(Rf_allocVector(INTSXP, nextFree));
... ...
@@ -321,7 +320,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR,
321 320
     for (i=0; i < nextFree; i++)
322 321
         INTEGER(slot_i)[i] = iptr[i];
323 322
     
324
-    free(iptr);
323
+    Free(iptr);
325 324
     ptrI = NULL;
326 325
     
327 326
     slot_x = PROTECT(Rf_allocVector(REALSXP, nextFree));
... ...
@@ -330,7 +329,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR,
330 329
     for (i=0; i < nextFree; i++)
331 330
         REAL(slot_x)[i] = xptr[i];
332 331
     
333
-    free(xptr);
332
+    Free(xptr);
334 333
     ptrX = NULL;
335 334
     
336 335
     numProtect += 3;
... ...
@@ -59,7 +59,7 @@ kebabsDate <- paste(month.name[kebabsDateMonth], " ",
59