git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/kebabs@98550 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: kebabs |
2 | 2 |
Type: Package |
3 | 3 |
Title: Kernel-Based Analysis Of Biological Sequences |
4 |
-Version: 1.1.5 |
|
5 |
-Date: 2014-01-12 |
|
4 |
+Version: 1.1.6 |
|
5 |
+Date: 2015-01-21 |
|
6 | 6 |
Author: Johannes Palme |
7 | 7 |
Maintainer: Ulrich Bodenhofer <bodenhofer@bioinf.jku.at> |
8 | 8 |
Description: The package provides functionality for kernel-based analysis of |
... | ... |
@@ -54,4 +54,4 @@ Imports: methods, stats, Rcpp (>= 0.11.2), Matrix, XVector (>= 0.5.8), |
54 | 54 |
LinkingTo: IRanges, XVector, Biostrings, Rcpp, S4Vectors |
55 | 55 |
Suggests: SparseM, apcluster, Biobase, BiocGenerics |
56 | 56 |
biocViews: SupportVectorMachine, Classification, Clustering, Regression |
57 |
-Packaged: 2015-01-12 13:37:03 UTC; jo |
|
57 |
+Packaged: 2015-01-21 11:10:57 UTC; jo |
... | ... |
@@ -136,6 +136,53 @@ ersTransposedAsdgCMatrix <- function(from) |
136 | 136 |
return(dgC) |
137 | 137 |
} |
138 | 138 |
|
139 |
+distWeightKernelToString <- function(distWeight) |
|
140 |
+{ |
|
141 |
+ dwLength <- length(distWeight) |
|
142 |
+ distWeightChar <- "" |
|
143 |
+ |
|
144 |
+ if (is.numeric(distWeight)) |
|
145 |
+ { |
|
146 |
+ if (dwLength == 1) |
|
147 |
+ distWeightChar <- distWeight |
|
148 |
+ else if (dwLength < 7) |
|
149 |
+ { |
|
150 |
+ distWeightChar <- |
|
151 |
+ paste("c(",paste(distWeight,collapse=","),")", sep="") |
|
152 |
+ } |
|
153 |
+ else |
|
154 |
+ { |
|
155 |
+ distWeightChar <- |
|
156 |
+ paste("c(", paste(distWeight[1:3],collapse=","), " ... ", |
|
157 |
+ paste(distWeight[dwLength - 2:dwLength],collapse=","), |
|
158 |
+ ")", sep="") |
|
159 |
+ } |
|
160 |
+ } |
|
161 |
+ else if (typeof(distWeight) == "closure") |
|
162 |
+ { |
|
163 |
+ func <- deparse(distWeight)[2] |
|
164 |
+ index <- grep("(", strsplit(func, split="")[[1]], fixed=TRUE, |
|
165 |
+ value=FALSE) |
|
166 |
+ |
|
167 |
+ if (length(index) > 0) |
|
168 |
+ distWeightChar <- substr(func, 1, index[1] - 1) |
|
169 |
+ { |
|
170 |
+ if (distWeightChar %in% c("linWeight", "gaussWeight", |
|
171 |
+ "expWeight")) |
|
172 |
+ { |
|
173 |
+ sigma <- get("sigma", envir = environment(distWeight)) |
|
174 |
+ distWeightChar <- paste(distWeightChar, "(sigma=", sigma, ")", |
|
175 |
+ sep="") |
|
176 |
+ } |
|
177 |
+ else if (distWeightChar == "swdWeight") |
|
178 |
+ distWeightChar <- paste(distWeightChar, "()", sep="") |
|
179 |
+ else |
|
180 |
+ distWeightChar <- paste(distWeightChar, "( . . . )", sep="") |
|
181 |
+ } |
|
182 |
+ } |
|
183 |
+ distWeightChar |
|
184 |
+} |
|
185 |
+ |
|
139 | 186 |
#' @rdname sequenceKernel |
140 | 187 |
#' @aliases |
141 | 188 |
#' seqKernelAsChar |
... | ... |
@@ -182,8 +229,8 @@ seqKernelAsChar <- function(from) |
182 | 229 |
|
183 | 230 |
if (length(kernelParameters(from)$distWeight) > 0) |
184 | 231 |
{ |
185 |
- kChar <- paste(kChar, ", distWeight=", |
|
186 |
- kernelParameters(from)$distWeight, sep="") |
|
232 |
+ dwString <- distWeightKernelToString(kernelParameters(from)$distWeight) |
|
233 |
+ kChar <- paste(kChar, ", distWeight=", dwString, sep="") |
|
187 | 234 |
} |
188 | 235 |
|
189 | 236 |
if (!kernelParameters(from)$normalized) |
... | ... |
@@ -194,9 +194,21 @@ gappyPairKernel <- function(k=1, m=1, r=1, annSpec=FALSE, distWeight=numeric(0), |
194 | 194 |
if (!isTRUEorFALSE(annSpec)) |
195 | 195 |
stop("'annSpec' must be TRUE or FALSE\n") |
196 | 196 |
|
197 |
- if (length(distWeight) > 0 && |
|
198 |
- !(is.numeric(distWeight) || is.function(distWeight))) |
|
199 |
- stop("'distWeight' must be a numeric vector or a function\n") |
|
197 |
+ if (length(distWeight) > 0) |
|
198 |
+ { |
|
199 |
+ if (!(is.numeric(distWeight) || is.function(distWeight))) |
|
200 |
+ stop("'distWeight' must be a numeric vector or a function\n") |
|
201 |
+ |
|
202 |
+ if (is.function(distWeight)) |
|
203 |
+ { |
|
204 |
+ func <- deparse(distWeight)[2] |
|
205 |
+ index <- grep("(", strsplit(func, split="")[[1]], fixed=TRUE, |
|
206 |
+ value=FALSE) |
|
207 |
+ |
|
208 |
+ if (length(index) < 1) |
|
209 |
+ stop("Missing parentheses in 'distWeight'\n") |
|
210 |
+ } |
|
211 |
+ } |
|
200 | 212 |
|
201 | 213 |
if (presence && length(distWeight) > 0) |
202 | 214 |
{ |
... | ... |
@@ -164,16 +164,14 @@ kbsvm.seqs <- function(x, y, kernel=NULL, pkg="auto", svm="C-svc", |
164 | 164 |
|
165 | 165 |
if (ncol(exRepSV) == 0) |
166 | 166 |
stop("explicit representaton is not available\n") |
167 |
+ } |
|
168 |
+ else |
|
169 |
+ exRepSV <- NULL |
|
167 | 170 |
|
168 |
- model@featureWeights <- |
|
169 |
- getFeatureWeights(model=model, |
|
171 |
+ model@featureWeights <- |
|
172 |
+ getFeatureWeights(model=model, |
|
170 | 173 |
exrep=exRepSV, |
171 | 174 |
weightLimit=model@svmInfo@weightLimit) |
172 |
- } |
|
173 |
- else |
|
174 |
- stop("feature weights cannot be computed because\n", |
|
175 |
- " the selected kernel does not support\n", |
|
176 |
- " an explicit representation\n") |
|
177 | 175 |
|
178 | 176 |
model@b <- getSVMSlotValue("b", model) |
179 | 177 |
} |
... | ... |
@@ -511,20 +509,31 @@ kbsvm.seqs <- function(x, y, kernel=NULL, pkg="auto", svm="C-svc", |
511 | 509 |
#' Training of biological sequences with a sequence kernel\cr |
512 | 510 |
#' |
513 | 511 |
#' Training is performed via the method \code{kbsvm} for classification and |
514 |
-#' regression tasks. The user passes sequence data in \code{XStringSet} format, |
|
515 |
-#' the label vector, a sequence kernel object and the requested SVM along with |
|
516 |
-#' SVM parameters and receives the training results in the form of a KeBABS |
|
517 |
-#' model object of class \code{\linkS4class{KBModel}}. The accessor |
|
512 |
+#' regression tasks. The user passes sequence data, the response vector, a |
|
513 |
+#' sequence kernel object and the requested SVM along with SVM parameters |
|
514 |
+#' to \code{kbsvm} and receives the training results in the form of a |
|
515 |
+#' KeBABS model object of class \code{\linkS4class{KBModel}}. The accessor |
|
518 | 516 |
#' \code{svmModel} allows to retrieve the SVM specific model from the KeBABS |
519 |
-#' model object. However for regular operation a detailed look into the SVM |
|
520 |
-#' specific model is usually not necessary. (When repeat regions are coded as |
|
521 |
-#' lowercase characters and should be excluded from the analysis the sequence |
|
522 |
-#' data can be passed as \code{\linkS4class{BioVector}} which also supports |
|
523 |
-#' lowercase characters instead of \code{\linkS4class{XStringSet}} format. |
|
524 |
-#' Please note that the classes derived from \code{\linkS4class{XStringSet}} |
|
525 |
-#' are much more powerful than the \code{\linkS4class{BioVector}} derived |
|
526 |
-#' classes and should be used in all cases where lowercase characters are not |
|
527 |
-#' needed). Apart from SVM training \code{kbsvm} can be also used for cross |
|
517 |
+#' model object. However, for regular operation a detailed look into the SVM |
|
518 |
+#' specific model is usually not necessary. |
|
519 |
+#' |
|
520 |
+#' The standard data format for sequences in KeBABS are the |
|
521 |
+#' \code{XStringSet}-derived classes \code{\linkS4class{DNAStringSet}}, |
|
522 |
+#' \code{\linkS4class{RNAStringSet}} and \code{\linkS4class{AAStringSet}}. |
|
523 |
+#' (When repeat regions are coded as lowercase characters and should be |
|
524 |
+#' excluded from the analysis the sequence data can be passed as |
|
525 |
+#' \code{\linkS4class{BioVector}} which also supports lowercase characters |
|
526 |
+#' instead of \code{\linkS4class{XStringSet}} format. Please note that the |
|
527 |
+#' classes derived from \code{\linkS4class{XStringSet}} are much more |
|
528 |
+#' powerful than the \code{\linkS4class{BioVector}} derived classes and |
|
529 |
+#' should be used in all cases where lowercase characters are not needed). |
|
530 |
+#' |
|
531 |
+#' Instead of sequences also a precomputed explicit representation or |
|
532 |
+#' a precomputed kernel matrix can be used for training. Examples for |
|
533 |
+#' training with kernel matrix and explicit representation can be found on |
|
534 |
+#' the help page for the prediction method \code{\link{predict}}. |
|
535 |
+#' |
|
536 |
+#' Apart from SVM training \code{kbsvm} can be also used for cross |
|
528 | 537 |
#' validation (see \link{crossValidation} and parameters \code{cross} and |
529 | 538 |
#' \code{noCross}), grid search for SVM- and kernel-parameter values (see |
530 | 539 |
#' \link{gridSearch}) and model selection (see \link{modelSelection} and |
... | ... |
@@ -653,9 +662,9 @@ kbsvm.seqs <- function(x, y, kernel=NULL, pkg="auto", svm="C-svc", |
653 | 662 |
#' with the parameter \code{weightLimit} which defines the cutoff for |
654 | 663 |
#' small feature weights not stored in the model. |
655 | 664 |
#' |
656 |
-#' Hint: For multiclass prediction is currently not performed via feature |
|
657 |
-#' weights but native in the SVM. This means that for multiclass a pruned |
|
658 |
-#' model cannot be used for prediction.\cr\cr |
|
665 |
+#' Hint: For training with a precomputed kernel matrix feature weights are |
|
666 |
+#' not available. For multiclass prediction is currently not performed via |
|
667 |
+#' feature weights but native in the SVM.\cr\cr |
|
659 | 668 |
#' |
660 | 669 |
#' Cross validation, grid search and model selection\cr |
661 | 670 |
#' |
... | ... |
@@ -754,6 +754,18 @@ predict.KBModel <- function(object, x, predictionType="response", sel=NULL, |
754 | 754 |
#' do not support the generation of an explicit representation, e.g. the |
755 | 755 |
#' position dependent kernel variants.\cr\cr |
756 | 756 |
#' |
757 |
+#' Prediction with precomputed kernel matrix |
|
758 |
+#' |
|
759 |
+#' When training was performed with a precomputed kernel matrix also in |
|
760 |
+#' prediction a precomputed kernel matrix must be passed to the \code{predict} |
|
761 |
+#' method. In contrast to the quadratic and symmetric kernel matrix used |
|
762 |
+#' in training the kernel matrix for prediction is rectangular and contains |
|
763 |
+#' the similarities of test samples (rows) against support vectors (columns). |
|
764 |
+#' support vector indices can be read from the model with the accessor SVindex. |
|
765 |
+#' Please not that these indices refer to the sample subset used in training. |
|
766 |
+#' An example for training and prediction via precomputed kernel matrix is |
|
767 |
+#' shown below. |
|
768 |
+#' |
|
757 | 769 |
#' Generation of prediction profiles |
758 | 770 |
#' |
759 | 771 |
#' The parameter \code{predProfiles} controls whether prediction profiles |
... | ... |
@@ -791,15 +803,14 @@ predict.KBModel <- function(object, x, predictionType="response", sel=NULL, |
791 | 803 |
#' |
792 | 804 |
#' ## run training with explicit representation |
793 | 805 |
#' model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy, |
794 |
-#' pkg="LiblineaR", svm="C-svc", cost=10, explicit="yes", |
|
795 |
-#' featureWeights="yes") |
|
806 |
+#' pkg="LiblineaR", svm="C-svc", cost=10) |
|
796 | 807 |
#' |
797 | 808 |
#' ## show feature weights in KeBABS model |
798 | 809 |
#' featureWeights(model)[1:8] |
799 | 810 |
#' |
800 | 811 |
#' ## predict the test sequences |
801 | 812 |
#' pred <- predict(model, enhancerFB[test]) |
802 |
-#' evaluatePrediction(pred, yFB[test], allLabels=c(-1,1)) |
|
813 |
+#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB)) |
|
803 | 814 |
#' pred[1:10] |
804 | 815 |
#' |
805 | 816 |
#' ## output decision values instead |
... | ... |
@@ -807,11 +818,30 @@ predict.KBModel <- function(object, x, predictionType="response", sel=NULL, |
807 | 818 |
#' pred[1:10] |
808 | 819 |
#' |
809 | 820 |
#' \dontrun{ |
810 |
-#' ## computing of probability model via Platt scaling during training |
|
821 |
+#' ## example for training and prediction via precomputed kernel matrix |
|
822 |
+#' |
|
823 |
+#' ## compute quadratic kernel matrix of training samples |
|
824 |
+#' kmtrain <- getKernelMatrix(gappy, x=enhancerFB, selx=train) |
|
825 |
+#' |
|
826 |
+#' ## train model with kernel matrix |
|
827 |
+#' model <- kbsvm(x=kmtrain, y=yFB[train], kernel=gappy, |
|
828 |
+#' pkg="kernlab", svm="C-svc", cost=10) |
|
829 |
+#' |
|
830 |
+#' ## compute rectangular kernel matrix of test samples versus |
|
831 |
+#' ## support vectors |
|
832 |
+#' kmtest <- getKernelMatrix(gappy, x=enhancerFB, y=enhancerFB, |
|
833 |
+#' selx=test, sely=train[SVindex(model)]) |
|
834 |
+#' |
|
835 |
+#' ## predict with kernel matrix |
|
836 |
+#' pred <- predict(model, kmtest) |
|
837 |
+#' evaluatePrediction(pred, yFB[test], allLabels=unique(yFB)) |
|
838 |
+#' |
|
839 |
+#' ## example for probability model generation during training |
|
840 |
+#' |
|
841 |
+#' ## compute probability model via Platt scaling during training |
|
811 | 842 |
#' ## and predict class membership probabilities |
812 | 843 |
#' model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy, |
813 |
-#' pkg="e1071", svm="C-svc", cost=10, explicit="yes", |
|
814 |
-#' probModel=TRUE) |
|
844 |
+#' pkg="e1071", svm="C-svc", cost=10, probModel=TRUE) |
|
815 | 845 |
#' |
816 | 846 |
#' ## show parameters of the fitted probability model which are the parameters |
817 | 847 |
#' ## probA and probB for the fitted sigmoid function in case of classification |
... | ... |
@@ -358,26 +358,8 @@ setMethod("show", signature(object="SpectrumKernel"), |
358 | 358 |
cat(paste(", annSpec=TRUE")) |
359 | 359 |
if (length(object@distWeight) > 0) |
360 | 360 |
{ |
361 |
- if (is.numeric(object@distWeight)) |
|
362 |
- { |
|
363 |
- if (length(object@distWeight) == 1) |
|
364 |
- { |
|
365 |
- cat(paste(", distWeight=", |
|
366 |
- object@distWeight, sep="")) |
|
367 |
- } |
|
368 |
- else |
|
369 |
- { |
|
370 |
- cat(paste(", distWeight=", |
|
371 |
- paste("c(",paste(object@distWeight, |
|
372 |
- collapse=","),")", |
|
373 |
- sep=""), sep="")) |
|
374 |
- } |
|
375 |
- } |
|
376 |
- else |
|
377 |
- { |
|
378 |
- cat(", \ndistWeight=") |
|
379 |
- cat(format(object@distWeight)) |
|
380 |
- } |
|
361 |
+ dwString <- distWeightKernelToString(object@distWeight) |
|
362 |
+ cat(", distWeight=", dwString, sep="") |
|
381 | 363 |
} |
382 | 364 |
if (object@normalized == FALSE) |
383 | 365 |
cat(", normalized=FALSE") |
... | ... |
@@ -170,9 +170,21 @@ spectrumKernel <- function(k=3, r=1, annSpec=FALSE, distWeight=numeric(0), |
170 | 170 |
if (!isTRUEorFALSE(annSpec)) |
171 | 171 |
stop("'annSpec' must be TRUE or FALSE\n") |
172 | 172 |
|
173 |
- if (length(distWeight) > 0 && |
|
174 |
- !(is.numeric(distWeight) || is.function(distWeight))) |
|
175 |
- stop("'distWeight' must be a numeric vector or a function\n") |
|
173 |
+ if (length(distWeight) > 0) |
|
174 |
+ { |
|
175 |
+ if (!(is.numeric(distWeight) || is.function(distWeight))) |
|
176 |
+ stop("'distWeight' must be a numeric vector or a function\n") |
|
177 |
+ |
|
178 |
+ if (is.function(distWeight)) |
|
179 |
+ { |
|
180 |
+ func <- deparse(distWeight)[2] |
|
181 |
+ index <- grep("(", strsplit(func, split="")[[1]], fixed=TRUE, |
|
182 |
+ value=FALSE) |
|
183 |
+ |
|
184 |
+ if (length(index) < 1) |
|
185 |
+ stop("Missing parentheses in 'distWeight'\n") |
|
186 |
+ } |
|
187 |
+ } |
|
176 | 188 |
|
177 | 189 |
if (presence && length(distWeight) > 0) |
178 | 190 |
{ |
... | ... |
@@ -1,6 +1,19 @@ |
1 | 1 |
Change history of package kebabs: |
2 | 2 |
==================================== |
3 | 3 |
|
4 |
+Version 1.1.6 |
|
5 |
+- error correction for training with position specific kernel and |
|
6 |
+ computation of feature weights |
|
7 |
+- error correction in coercion of kernel to character for distance |
|
8 |
+ weighting |
|
9 |
+- error correction in spectrum, gappy pair and motif kernel |
|
10 |
+ for kernel matrix - last feature was missing in kernel value |
|
11 |
+ in rare situations |
|
12 |
+- correction of Windows build problem in linearKernel |
|
13 |
+- build warnings on Windows removed |
|
14 |
+- minor changes in help pages |
|
15 |
+- minor changes in vignette |
|
16 |
+ |
|
4 | 17 |
Version 1.1.5 |
5 | 18 |
- new method heatmap to display heatmap of prediction profiles |
6 | 19 |
- extension of function linearKernel to optionally return |
... | ... |
@@ -291,30 +291,43 @@ Training of biological sequences with a sequence kernel\cr |
291 | 291 |
|
292 | 292 |
Training is performed via the method \code{kbsvm} for |
293 | 293 |
classification and regression tasks. The user passes |
294 |
-sequence data in \code{XStringSet} format, the label |
|
295 |
-vector, a sequence kernel object and the requested SVM |
|
296 |
-along with SVM parameters and receives the training results |
|
297 |
-in the form of a KeBABS model object of class |
|
294 |
+sequence data, the response vector, a sequence kernel |
|
295 |
+object and the requested SVM along with SVM parameters to |
|
296 |
+\code{kbsvm} and receives the training results in the form |
|
297 |
+of a KeBABS model object of class |
|
298 | 298 |
\code{\linkS4class{KBModel}}. The accessor \code{svmModel} |
299 | 299 |
allows to retrieve the SVM specific model from the KeBABS |
300 |
-model object. However for regular operation a detailed look |
|
301 |
-into the SVM specific model is usually not necessary. (When |
|
302 |
-repeat regions are coded as lowercase characters and should |
|
303 |
-be excluded from the analysis the sequence data can be |
|
304 |
-passed as \code{\linkS4class{BioVector}} which also |
|
305 |
-supports lowercase characters instead of |
|
300 |
+model object. However, for regular operation a detailed |
|
301 |
+look into the SVM specific model is usually not necessary. |
|
302 |
+ |
|
303 |
+The standard data format for sequences in KeBABS are the |
|
304 |
+\code{XStringSet}-derived classes |
|
305 |
+\code{\linkS4class{DNAStringSet}}, |
|
306 |
+\code{\linkS4class{RNAStringSet}} and |
|
307 |
+\code{\linkS4class{AAStringSet}}. (When repeat regions are |
|
308 |
+coded as lowercase characters and should be excluded from |
|
309 |
+the analysis the sequence data can be passed as |
|
310 |
+\code{\linkS4class{BioVector}} which also supports |
|
311 |
+lowercase characters instead of |
|
306 | 312 |
\code{\linkS4class{XStringSet}} format. Please note that |
307 | 313 |
the classes derived from \code{\linkS4class{XStringSet}} |
308 | 314 |
are much more powerful than the |
309 | 315 |
\code{\linkS4class{BioVector}} derived classes and should |
310 | 316 |
be used in all cases where lowercase characters are not |
311 |
-needed). Apart from SVM training \code{kbsvm} can be also |
|
312 |
-used for cross validation (see \link{crossValidation} and |
|
313 |
-parameters \code{cross} and \code{noCross}), grid search |
|
314 |
-for SVM- and kernel-parameter values (see |
|
315 |
-\link{gridSearch}) and model selection (see |
|
316 |
-\link{modelSelection} and parameters \code{nestedCross} and |
|
317 |
-\code{noNestedCross}).\cr\cr |
|
317 |
+needed). |
|
318 |
+ |
|
319 |
+Instead of sequences also a precomputed explicit |
|
320 |
+representation or a precomputed kernel matrix can be used |
|
321 |
+for training. Examples for training with kernel matrix and |
|
322 |
+explicit representation can be found on the help page for |
|
323 |
+the prediction method \code{\link{predict}}. |
|
324 |
+ |
|
325 |
+Apart from SVM training \code{kbsvm} can be also used for |
|
326 |
+cross validation (see \link{crossValidation} and parameters |
|
327 |
+\code{cross} and \code{noCross}), grid search for SVM- and |
|
328 |
+kernel-parameter values (see \link{gridSearch}) and model |
|
329 |
+selection (see \link{modelSelection} and parameters |
|
330 |
+\code{nestedCross} and \code{noNestedCross}).\cr\cr |
|
318 | 331 |
|
319 | 332 |
Package and SVM selection\cr |
320 | 333 |
|
... | ... |
@@ -428,10 +441,10 @@ Pruning of feature weights can be achieved with the |
428 | 441 |
parameter \code{weightLimit} which defines the cutoff for |
429 | 442 |
small feature weights not stored in the model. |
430 | 443 |
|
431 |
-Hint: For multiclass prediction is currently not performed |
|
432 |
-via feature weights but native in the SVM. This means that |
|
433 |
-for multiclass a pruned model cannot be used for |
|
434 |
-prediction.\cr\cr |
|
444 |
+Hint: For training with a precomputed kernel matrix feature |
|
445 |
+weights are not available. For multiclass prediction is |
|
446 |
+currently not performed via feature weights but native in |
|
447 |
+the SVM.\cr\cr |
|
435 | 448 |
|
436 | 449 |
Cross validation, grid search and model selection\cr |
437 | 450 |
|
... | ... |
@@ -107,6 +107,20 @@ variants which do not support the generation of an explicit |
107 | 107 |
representation, e.g. the position dependent kernel |
108 | 108 |
variants.\cr\cr |
109 | 109 |
|
110 |
+Prediction with precomputed kernel matrix |
|
111 |
+ |
|
112 |
+When training was performed with a precomputed kernel |
|
113 |
+matrix also in prediction a precomputed kernel matrix must |
|
114 |
+be passed to the \code{predict} method. In contrast to the |
|
115 |
+quadratic and symmetric kernel matrix used in training the |
|
116 |
+kernel matrix for prediction is rectangular and contains |
|
117 |
+the similarities of test samples (rows) against support |
|
118 |
+vectors (columns). support vector indices can be read from |
|
119 |
+the model with the accessor SVindex. Please not that these |
|
120 |
+indices refer to the sample subset used in training. An |
|
121 |
+example for training and prediction via precomputed kernel |
|
122 |
+matrix is shown below. |
|
123 |
+ |
|
110 | 124 |
Generation of prediction profiles |
111 | 125 |
|
112 | 126 |
The parameter \code{predProfiles} controls whether |
... | ... |
@@ -132,15 +146,14 @@ gappy |
132 | 146 |
|
133 | 147 |
## run training with explicit representation |
134 | 148 |
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy, |
135 |
- pkg="LiblineaR", svm="C-svc", cost=10, explicit="yes", |
|
136 |
- featureWeights="yes") |
|
149 |
+ pkg="LiblineaR", svm="C-svc", cost=10) |
|
137 | 150 |
|
138 | 151 |
## show feature weights in KeBABS model |
139 | 152 |
featureWeights(model)[1:8] |
140 | 153 |
|
141 | 154 |
## predict the test sequences |
142 | 155 |
pred <- predict(model, enhancerFB[test]) |
143 |
-evaluatePrediction(pred, yFB[test], allLabels=c(-1,1)) |
|
156 |
+evaluatePrediction(pred, yFB[test], allLabels=unique(yFB)) |
|
144 | 157 |
pred[1:10] |
145 | 158 |
|
146 | 159 |
## output decision values instead |
... | ... |
@@ -148,11 +161,30 @@ pred <- predict(model, enhancerFB[test], predictionType="decision") |
148 | 161 |
pred[1:10] |
149 | 162 |
|
150 | 163 |
\dontrun{ |
151 |
-## computing of probability model via Platt scaling during training |
|
164 |
+## example for training and prediction via precomputed kernel matrix |
|
165 |
+ |
|
166 |
+## compute quadratic kernel matrix of training samples |
|
167 |
+kmtrain <- getKernelMatrix(gappy, x=enhancerFB, selx=train) |
|
168 |
+ |
|
169 |
+## train model with kernel matrix |
|
170 |
+model <- kbsvm(x=kmtrain, y=yFB[train], kernel=gappy, |
|
171 |
+ pkg="kernlab", svm="C-svc", cost=10) |
|
172 |
+ |
|
173 |
+## compute rectangular kernel matrix of test samples versus |
|
174 |
+## support vectors |
|
175 |
+kmtest <- getKernelMatrix(gappy, x=enhancerFB, y=enhancerFB, |
|
176 |
+ selx=test, sely=train[SVindex(model)]) |
|
177 |
+ |
|
178 |
+## predict with kernel matrix |
|
179 |
+pred <- predict(model, kmtest) |
|
180 |
+evaluatePrediction(pred, yFB[test], allLabels=unique(yFB)) |
|
181 |
+ |
|
182 |
+## example for probability model generation during training |
|
183 |
+ |
|
184 |
+## compute probability model via Platt scaling during training |
|
152 | 185 |
## and predict class membership probabilities |
153 | 186 |
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy, |
154 |
- pkg="e1071", svm="C-svc", cost=10, explicit="yes", |
|
155 |
- probModel=TRUE) |
|
187 |
+ pkg="e1071", svm="C-svc", cost=10, probModel=TRUE) |
|
156 | 188 |
|
157 | 189 |
## show parameters of the fitted probability model which are the parameters |
158 | 190 |
## probA and probB for the fitted sigmoid function in case of classification |
... | ... |
@@ -900,13 +900,14 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
900 | 900 |
int sizeX, int sizeY, IntegerVector selX, IntegerVector selY, |
901 | 901 |
ByteStringVector annCharset, ByteStringVector annX, ByteStringVector annY, |
902 | 902 |
bool unmapped, int k, int m, bool normalized, bool symmetric, bool presence, |
903 |
- bool reverseComplement, int maxSeqLength, struct alphaInfo *alphaInf) |
|
903 |
+ bool reverseComplement, int maxSeqLength, uint64_t dimFeatureSpace, |
|
904 |
+ struct alphaInfo *alphaInf) |
|
904 | 905 |
{ |
905 | 906 |
T featureIndex, annotIndex; |
906 | 907 |
int i, j, l, iX, iY, elemIndex, noSamples = sizeX; |
907 | 908 |
int freeNode, nodeLimit, currBlock, currIndex, newBlock, maxBlockIndex; |
908 | 909 |
int maxNoOfNodes, seqnchar, currStack, stack[8*k]; |
909 |
- uint64_t maxNodesPerSequence, numAnnPow2K = 0; |
|
910 |
+ uint64_t maxNodesPerSequence, maxNumFeatures, numAnnPow2K = 0; |
|
910 | 911 |
double kv; |
911 | 912 |
bool printWarning = TRUE; |
912 | 913 |
const char *seqptr, *annptr; |
... | ... |
@@ -928,14 +929,17 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
928 | 929 |
noSamples += sizeY; |
929 | 930 |
|
930 | 931 |
// add one for the sentinel |
931 |
- maxSeqLength += 1; |
|
932 |
- |
|
932 |
+ if (dimFeatureSpace < maxSeqLength) |
|
933 |
+ maxNumFeatures = dimFeatureSpace + 1; |
|
934 |
+ else |
|
935 |
+ maxNumFeatures = (maxSeqLength - 2*k - 0.5*m + 1) * (m + 1) + 1; |
|
936 |
+ |
|
933 | 937 |
// allocate arrays for sparse feature vectors with 32 or 64 bit index |
934 | 938 |
// store only unnormalized k-mer count to avoid double space usage |
935 | 939 |
// add one for the sentinel |
936 | 940 |
featVectorArrays featVectors; |
937 |
- featVectors.x = (int32_t *) R_alloc(noSamples * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1), sizeof(int32_t)); |
|
938 |
- featVectors.idx = (T *) R_alloc(noSamples * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1), sizeof(T)); |
|
941 |
+ featVectors.x = (int32_t *) R_alloc(noSamples * maxNumFeatures, sizeof(int32_t)); |
|
942 |
+ featVectors.idx = (T *) R_alloc(noSamples * maxNumFeatures, sizeof(T)); |
|
939 | 943 |
|
940 | 944 |
double *normValues = (double *) R_alloc(noSamples, sizeof(double)); |
941 | 945 |
|
... | ... |
@@ -1016,12 +1020,13 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1016 | 1020 |
currBlock = 0; |
1017 | 1021 |
currIndex = 0; |
1018 | 1022 |
currStack = -1; |
1019 |
- elemIndex = i * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1); |
|
1023 |
+ elemIndex = i * maxNumFeatures; |
|
1020 | 1024 |
featureIndex = 0; |
1021 | 1025 |
annotIndex = 0; |
1022 | 1026 |
|
1023 | 1027 |
// for situation with no features set sentinel |
1024 | 1028 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
1029 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
1025 | 1030 |
|
1026 | 1031 |
while (currStack >= 0 || currIndex <= maxBlockIndex) |
1027 | 1032 |
{ |
... | ... |
@@ -1117,10 +1122,11 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1117 | 1122 |
} |
1118 | 1123 |
|
1119 | 1124 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
1125 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
1120 | 1126 |
} |
1121 | 1127 |
|
1122 | 1128 |
computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, |
1123 |
- (maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1, sizeX, sizeY, normalized, symmetric); |
|
1129 |
+ maxNumFeatures, sizeX, sizeY, normalized, symmetric); |
|
1124 | 1130 |
} |
1125 | 1131 |
else // not symmetric |
1126 | 1132 |
{ |
... | ... |
@@ -1144,10 +1150,6 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1144 | 1150 |
} |
1145 | 1151 |
else |
1146 | 1152 |
{ |
1147 |
- // this statement causes problems in the optimization level -O2 |
|
1148 |
-// iY = selY[i - sizeX]; |
|
1149 |
-// Rprintf("i: %d sizeX: %d i-sizeX: %d\n", i, sizeX, i - sizeX); |
|
1150 |
- |
|
1151 | 1153 |
indexY++; |
1152 | 1154 |
iY = selY[indexY]; |
1153 | 1155 |
|
... | ... |
@@ -1162,8 +1164,8 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1162 | 1164 |
|
1163 | 1165 |
//create new tree for sequence |
1164 | 1166 |
kv = createTreeGappy((const char *) seqptr, seqnchar, annptr, k, m, &annotationIndexMap, |
1165 |
- presence, reverseComplement, pTree, maxNoOfNodes, &freeNode, &nullBlock, &printWarning, |
|
1166 |
- alphaInf); |
|
1167 |
+ presence, reverseComplement, pTree, maxNoOfNodes, &freeNode, |
|
1168 |
+ &nullBlock, &printWarning, alphaInf); |
|
1167 | 1169 |
|
1168 | 1170 |
if (kv == NA_REAL) |
1169 | 1171 |
{ |
... | ... |
@@ -1185,12 +1187,13 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1185 | 1187 |
currBlock = 0; |
1186 | 1188 |
currIndex = 0; |
1187 | 1189 |
currStack = -1; |
1188 |
- elemIndex = i * ((maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1); |
|
1190 |
+ elemIndex = i * maxNumFeatures; |
|
1189 | 1191 |
featureIndex = 0; |
1190 | 1192 |
annotIndex = 0; |
1191 | 1193 |
|
1192 | 1194 |
// for situation with no features set sentinel |
1193 | 1195 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
1196 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
1194 | 1197 |
|
1195 | 1198 |
while (currStack >= 0 || currIndex <= maxBlockIndex) |
1196 | 1199 |
{ |
... | ... |
@@ -1286,10 +1289,11 @@ void getKMStdAnnGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1286 | 1289 |
} |
1287 | 1290 |
|
1288 | 1291 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
1292 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
1289 | 1293 |
} |
1290 | 1294 |
|
1291 | 1295 |
computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, |
1292 |
- (maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1, sizeX, sizeY, normalized, symmetric); |
|
1296 |
+ maxNumFeatures, sizeX, sizeY, normalized, symmetric); |
|
1293 | 1297 |
} |
1294 | 1298 |
|
1295 | 1299 |
return; |
... | ... |
@@ -1303,9 +1307,9 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1303 | 1307 |
NumericVector distWeight, int maxSeqLength, struct alphaInfo *alphaInf) |
1304 | 1308 |
{ |
1305 | 1309 |
T prevIndex; |
1306 |
- uint32_t km1, inew, invalidCharPos, elemIndex, tempIndex1, tempIndex2, fIndex; |
|
1307 |
- uint64_t featureIndex, fDimArray, numAlphaPowerK, numAlphaPowerK1; |
|
1308 |
- int i, j, l, offset, noSamples = sizeX; |
|
1310 |
+ uint32_t km1, inew, elemIndex, tempIndex1, tempIndex2, fIndex; |
|
1311 |
+ uint64_t featureIndex, fDimArray, invalidCharPos, numAlphaPowerK, numAlphaPowerK1, *featVectorsStart; |
|
1312 |
+ int i, j, l, offset, maxNumPatternsPerPos, maxFeaturesPerSample, noSamples = sizeX; |
|
1309 | 1313 |
int patLength, seqnchar, index, iold, iX, iY, upper; |
1310 | 1314 |
const char *seqptr; |
1311 | 1315 |
double kv, *normValues; |
... | ... |
@@ -1321,19 +1325,25 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1321 | 1325 |
noSamples += sizeY; |
1322 | 1326 |
|
1323 | 1327 |
km1 = k + m + 1; |
1324 |
- T *oldIndex = (T *) R_alloc(k, sizeof(uint64_t)); |
|
1325 |
- T *newIndex = (T *) R_alloc(km1, sizeof(uint64_t)); |
|
1328 |
+ uint64_t *oldIndex = (uint64_t *) R_alloc(k, sizeof(uint64_t)); |
|
1329 |
+ uint64_t *newIndex = (uint64_t *) R_alloc(km1, sizeof(uint64_t)); |
|
1326 | 1330 |
numAlphaPowerK1 = ipow64(alphaInf->numAlphabetChars, k - 1); |
1327 | 1331 |
numAlphaPowerK = numAlphaPowerK1 * alphaInf->numAlphabetChars; |
1328 | 1332 |
|
1329 | 1333 |
// allocate arrays for feature vectors with 8, 16, 32 or 64 bit index |
1330 |
- fDimArray = (maxSeqLength - 2*k + 0.5*m) * (m + 1) + 1; |
|
1334 |
+ fDimArray = (maxSeqLength - 2*k - 0.5*m + 1) * (m + 1); |
|
1331 | 1335 |
featVectorArrays featVectors; |
1332 | 1336 |
|
1333 | 1337 |
featVectors.x = (int32_t *) R_alloc(noSamples * fDimArray, sizeof(int32_t)); |
1334 | 1338 |
featVectors.idx = (T *) R_alloc(noSamples * fDimArray, sizeof(T)); |
1339 |
+ featVectorsStart = (uint64_t *) R_alloc(noSamples + 1, sizeof(uint64_t)); |
|
1335 | 1340 |
normValues = (double *) R_alloc(noSamples, sizeof(double)); |
1341 |
+ maxNumPatternsPerPos = m + 1; |
|
1336 | 1342 |
|
1343 |
+ featVectorsStart[0] = 0; |
|
1344 |
+ maxFeaturesPerSample = 0; |
|
1345 |
+ elemIndex = 0; |
|
1346 |
+ |
|
1337 | 1347 |
// walk along sequence and create sparse feature vector |
1338 | 1348 |
for (i = 0; i < noSamples; i++) |
1339 | 1349 |
{ |
... | ... |
@@ -1362,18 +1372,17 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1362 | 1372 |
|
1363 | 1373 |
patLength = 0; |
1364 | 1374 |
featureIndex = 0; |
1365 |
- elemIndex = i * fDimArray; |
|
1366 | 1375 |
iold = 0; |
1367 | 1376 |
inew = 0; |
1368 | 1377 |
kv = 0; |
1369 |
- invalidCharPos = maxUnSignedIndex; |
|
1378 |
+ invalidCharPos = MAXUINT64; |
|
1370 | 1379 |
|
1371 | 1380 |
for (l = 0; l < seqnchar - 2 * k + 1; l++) |
1372 | 1381 |
{ |
1373 |
- if (invalidCharPos != maxUnSignedIndex && l <= (int)invalidCharPos) |
|
1382 |
+ if (invalidCharPos != MAXUINT64 && l <= (int)invalidCharPos) |
|
1374 | 1383 |
continue; |
1375 | 1384 |
else |
1376 |
- invalidCharPos = maxUnSignedIndex; |
|
1385 |
+ invalidCharPos = MAXUINT64; |
|
1377 | 1386 |
|
1378 | 1387 |
// look ahead and fill newIndex buffer to process |
1379 | 1388 |
// each of the m + 1 patterns at a position immediately |
... | ... |
@@ -1386,9 +1395,9 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1386 | 1395 |
{ |
1387 | 1396 |
for (j = 0; j < upper; j++) |
1388 | 1397 |
{ |
1389 |
- if (invalidCharPos != maxUnSignedIndex) |
|
1398 |
+ if (invalidCharPos != MAXUINT64) |
|
1390 | 1399 |
{ |
1391 |
- newIndex[inew++] = maxUnSignedIndex; |
|
1400 |
+ newIndex[inew++] = MAXUINT64; |
|
1392 | 1401 |
|
1393 | 1402 |
if (inew == km1) |
1394 | 1403 |
inew = 0; |
... | ... |
@@ -1438,7 +1447,7 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1438 | 1447 |
else |
1439 | 1448 |
{ |
1440 | 1449 |
//store max index to buffer for not valid pattern |
1441 |
- newIndex[inew++] = maxUnSignedIndex; |
|
1450 |
+ newIndex[inew++] = MAXUINT64; |
|
1442 | 1451 |
|
1443 | 1452 |
if (inew == km1) |
1444 | 1453 |
inew = 0; |
... | ... |
@@ -1452,7 +1461,7 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1452 | 1461 |
} |
1453 | 1462 |
} |
1454 | 1463 |
|
1455 |
- if (l + 2 * k + m - 1 < seqnchar && invalidCharPos == maxUnSignedIndex) |
|
1464 |
+ if (l + 2 * k + m - 1 < seqnchar && invalidCharPos == MAXUINT64) |
|
1456 | 1465 |
{ |
1457 | 1466 |
//calc next look ahead feature index and store to buffer |
1458 | 1467 |
index = alphaInf->seqIndexMap[(int) seqptr[l + 2 * k + m - 1]]; |
... | ... |
@@ -1475,7 +1484,7 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1475 | 1484 |
else |
1476 | 1485 |
{ |
1477 | 1486 |
//store max index to buffer for not valid pattern |
1478 |
- newIndex[inew++] = maxUnSignedIndex; |
|
1487 |
+ newIndex[inew++] = MAXUINT64; |
|
1479 | 1488 |
|
1480 | 1489 |
if (inew == km1) |
1481 | 1490 |
inew = 0; |
... | ... |
@@ -1499,13 +1508,13 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1499 | 1508 |
else |
1500 | 1509 |
upper = seqnchar - 2 * k - l; |
1501 | 1510 |
|
1502 |
- if (invalidCharPos != maxUnSignedIndex && upper > (int) invalidCharPos - l - 2 * k) |
|
1511 |
+ if (invalidCharPos != MAXUINT64 && upper > (int) invalidCharPos - l - 2 * k) |
|
1503 | 1512 |
upper = invalidCharPos - l - 2 * k; |
1504 | 1513 |
|
1505 | 1514 |
for (j = 0; j <= upper; j++) |
1506 | 1515 |
{ |
1507 |
- if (newIndex[inew] == maxUnSignedIndex || |
|
1508 |
- newIndex[(inew + k + j) % km1] == maxUnSignedIndex) |
|
1516 |
+ if (newIndex[inew] == MAXUINT64 || |
|
1517 |
+ newIndex[(inew + k + j) % km1] == MAXUINT64) |
|
1509 | 1518 |
continue; |
1510 | 1519 |
|
1511 | 1520 |
if (reverseComplement) |
... | ... |
@@ -1537,11 +1546,14 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1537 | 1546 |
kv++; |
1538 | 1547 |
} |
1539 | 1548 |
|
1540 |
- if (invalidCharPos != maxUnSignedIndex) |
|
1549 |
+ if (invalidCharPos != MAXUINT64) |
|
1541 | 1550 |
inew = 0; |
1542 | 1551 |
} |
1543 | 1552 |
|
1544 |
- featVectors.idx[elemIndex] = maxUnSignedIndex; |
|
1553 |
+ featVectorsStart[i + 1] = elemIndex; |
|
1554 |
+ |
|
1555 |
+ if (maxFeaturesPerSample < (featVectorsStart[i + 1] - featVectorsStart[i])) |
|
1556 |
+ maxFeaturesPerSample = featVectorsStart[i + 1] - featVectorsStart[i]; |
|
1545 | 1557 |
|
1546 | 1558 |
// for dist weighting the kernel value is determined in computeKernelMatrixPos |
1547 | 1559 |
if (distWeight.length() == 0) |
... | ... |
@@ -1553,8 +1565,9 @@ void getKMPosDistGappy(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1553 | 1565 |
} |
1554 | 1566 |
} |
1555 | 1567 |
|
1556 |
- computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, fDimArray, |
|
1557 |
- m + 1, sizeX, sizeY, normalized, symmetric, FALSE, distWeight); |
|
1568 |
+ computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, featVectorsStart, km, |
|
1569 |
+ normValues, maxFeaturesPerSample, maxNumPatternsPerPos, sizeX, sizeY, |
|
1570 |
+ normalized, symmetric, FALSE, distWeight); |
|
1558 | 1571 |
|
1559 | 1572 |
return; |
1560 | 1573 |
} |
... | ... |
@@ -3464,7 +3477,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3464 | 3477 |
{ |
3465 | 3478 |
getKMStdAnnGappy(maxUIndex8, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3466 | 3479 |
unmapped, k, m, normalized, symmetric, presence, reverseComplement, |
3467 |
- maxSeqLength, &alphaInf); |
|
3480 |
+ maxSeqLength, dimFeatureSpace, &alphaInf); |
|
3468 | 3481 |
} |
3469 | 3482 |
|
3470 | 3483 |
break; |
... | ... |
@@ -3482,7 +3495,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3482 | 3495 |
{ |
3483 | 3496 |
getKMStdAnnGappy(maxUIndex16, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3484 | 3497 |
unmapped, k, m, normalized, symmetric, presence, reverseComplement, |
3485 |
- maxSeqLength, &alphaInf); |
|
3498 |
+ maxSeqLength, dimFeatureSpace, &alphaInf); |
|
3486 | 3499 |
} |
3487 | 3500 |
|
3488 | 3501 |
break; |
... | ... |
@@ -3501,7 +3514,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3501 | 3514 |
{ |
3502 | 3515 |
getKMStdAnnGappy(maxUIndex32, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3503 | 3516 |
unmapped, k, m, normalized, symmetric, presence, reverseComplement, |
3504 |
- maxSeqLength, &alphaInf); |
|
3517 |
+ maxSeqLength, dimFeatureSpace, &alphaInf); |
|
3505 | 3518 |
} |
3506 | 3519 |
|
3507 | 3520 |
break; |
... | ... |
@@ -3519,7 +3532,7 @@ RcppExport SEXP gappyPairKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3519 | 3532 |
{ |
3520 | 3533 |
getKMStdAnnGappy(maxUIndex64, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3521 | 3534 |
unmapped, k, m, normalized, symmetric, presence, reverseComplement, |
3522 |
- maxSeqLength, &alphaInf); |
|
3535 |
+ maxSeqLength, dimFeatureSpace, &alphaInf); |
|
3523 | 3536 |
} |
3524 | 3537 |
|
3525 | 3538 |
break; |
... | ... |
@@ -3536,7 +3549,7 @@ uint64_t * featureNamesToIndexGappyPair(SEXP featureNames, int numFeatures, Byte |
3536 | 3549 |
struct alphaInfo *alphaInf) |
3537 | 3550 |
{ |
3538 | 3551 |
int i, j, l, numDots; |
3539 |
- uint64_t featureIndex, *featIndex, annIndex, numAnnPow2K, tempIndex, fIndex; |
|
3552 |
+ uint64_t featureIndex, *featIndex, annIndex, tempIndex, fIndex, numAnnPow2K = 0; |
|
3540 | 3553 |
const char *pattern; |
3541 | 3554 |
|
3542 | 3555 |
featIndex = (uint64_t *) R_alloc(numFeatures, sizeof(uint64_t)); |
... | ... |
@@ -3600,7 +3613,7 @@ RcppExport void featuresToHashmapGappyPair(NumericMatrix featureWeights, int svm |
3600 | 3613 |
IntegerVector annotationIndexMap) |
3601 | 3614 |
{ |
3602 | 3615 |
int i, j, numFeatures, result, numDots; |
3603 |
- uint64_t featureIndex, annotIndex, numAnnPow2K; |
|
3616 |
+ uint64_t featureIndex, annotIndex, numAnnPow2K = 0; |
|
3604 | 3617 |
const char *pattern; |
3605 | 3618 |
SEXP dimNames, colNames; |
3606 | 3619 |
khiter_t iter; |
... | ... |
@@ -237,11 +237,11 @@ void computeKernelMatrix(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVe |
237 | 237 |
|
238 | 238 |
while (ix < endx && iy < endy) |
239 | 239 |
{ |
240 |
- if (featVectorIndex[ix] == maxUnSignedIndex || |
|
241 |
- featVectorIndex[iy] == maxUnSignedIndex) |
|
242 |
- { |
|
240 |
+ if ((featVectorIndex[ix] == maxUnSignedIndex && |
|
241 |
+ featVectorValue[ix] == MAXINT32)|| |
|
242 |
+ (featVectorIndex[iy] == maxUnSignedIndex && |
|
243 |
+ featVectorValue[iy] == MAXINT32)) |
|
243 | 244 |
break; |
244 |
- } |
|
245 | 245 |
|
246 | 246 |
if (featVectorIndex[ix] < featVectorIndex[iy]) |
247 | 247 |
ix++; |
... | ... |
@@ -285,8 +285,10 @@ void computeKernelMatrix(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVe |
285 | 285 |
|
286 | 286 |
while (ix < endx && iy < endy) |
287 | 287 |
{ |
288 |
- if (featVectorIndex[ix] == maxUnSignedIndex || |
|
289 |
- featVectorIndex[iy] == maxUnSignedIndex) |
|
288 |
+ if ((featVectorIndex[ix] == maxUnSignedIndex && |
|
289 |
+ featVectorValue[ix] == MAXINT32)|| |
|
290 |
+ (featVectorIndex[iy] == maxUnSignedIndex && |
|
291 |
+ featVectorValue[iy] == MAXINT32)) |
|
290 | 292 |
break; |
291 | 293 |
|
292 | 294 |
if (featVectorIndex[ix] < featVectorIndex[iy]) |
... | ... |
@@ -317,9 +319,9 @@ void computeKernelMatrix(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVe |
317 | 319 |
|
318 | 320 |
template<typename T> |
319 | 321 |
void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *featVectorValue, |
320 |
- Rcpp::NumericMatrix km, double *normValues, uint32_t fDim, int maxNumPatterns, |
|
321 |
- int sizeX, int sizeY, bool normalized, bool symmetric, bool computePosition, |
|
322 |
- Rcpp::NumericVector distWeight) |
|
322 |
+ uint64_t *featVectorStart, Rcpp::NumericMatrix km, double *normValues, |
|
323 |
+ uint32_t fDim, int maxNumPatterns, int sizeX, int sizeY, bool normalized, |
|
324 |
+ bool symmetric, bool computePosition, Rcpp::NumericVector distWeight) |
|
323 | 325 |
{ |
324 | 326 |
uint32_t endx, endy, ix, iy, prevIndex; |
325 | 327 |
int i, j, j1, j2, posX, posY, distWeightLength, startIndex, numSamples, yOffset, numPatterns1, numPatterns2; |
... | ... |
@@ -360,10 +362,10 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
360 | 362 |
|
361 | 363 |
for (j = startIndex; j < sizeY; j++) |
362 | 364 |
{ |
363 |
- ix = i * fDim; |
|
364 |
- iy = (yOffset + j) * fDim; |
|
365 |
- endx = ix + fDim; |
|
366 |
- endy = iy + fDim; |
|
365 |
+ ix = featVectorStart[i]; |
|
366 |
+ iy = featVectorStart[yOffset + j]; |
|
367 |
+ endx = featVectorStart[i + 1]; |
|
368 |
+ endy = featVectorStart[yOffset + j + 1]; |
|
367 | 369 |
prevIndex = 0; |
368 | 370 |
kv = 0; |
369 | 371 |
|
... | ... |
@@ -378,10 +380,6 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
378 | 380 |
|
379 | 381 |
while (ix < endx && iy < endy) |
380 | 382 |
{ |
381 |
- if (featVectorIndex[ix] == maxUnSignedIndex || |
|
382 |
- featVectorIndex[iy] == maxUnSignedIndex) |
|
383 |
- break; |
|
384 |
- |
|
385 | 383 |
if (featVectorIndex[ix] == featVectorIndex[iy]) |
386 | 384 |
kv++; |
387 | 385 |
|
... | ... |
@@ -393,10 +391,6 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
393 | 391 |
{ |
394 | 392 |
while (ix < endx && iy < endy) |
395 | 393 |
{ |
396 |
- if (featVectorIndex[ix] == maxUnSignedIndex || |
|
397 |
- featVectorIndex[iy] == maxUnSignedIndex) |
|
398 |
- break; |
|
399 |
- |
|
400 | 394 |
if (featVectorValue[ix] < featVectorValue[iy]) |
401 | 395 |
ix++; |
402 | 396 |
else if (featVectorValue[ix] > featVectorValue[iy]) |
... | ... |
@@ -418,7 +412,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
418 | 412 |
|
419 | 413 |
for (j1 = 0; j1 < maxNumPatterns; j1++) |
420 | 414 |
{ |
421 |
- if (featVectorIndex[ix + j1] == maxUnSignedIndex) |
|
415 |
+ if (ix + j1 >= endx) |
|
422 | 416 |
break; |
423 | 417 |
|
424 | 418 |
if (featVectorValue[ix + j1] != featVectorValue[ix]) |
... | ... |
@@ -428,7 +422,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
428 | 422 |
|
429 | 423 |
for (j2 = 0; j2 < maxNumPatterns; j2++) |
430 | 424 |
{ |
431 |
- if (featVectorIndex[ix + j2] == maxUnSignedIndex) |
|
425 |
+ if (iy + j2 >= endy) |
|
432 | 426 |
break; |
433 | 427 |
|
434 | 428 |
if (featVectorValue[iy + j2] != featVectorValue[ix]) |
... | ... |
@@ -467,7 +461,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
467 | 461 |
else // distance weighting |
468 | 462 |
{ |
469 | 463 |
// sort feature vectors |
470 |
- sort2Arrays(maxUnSignedIndex, featVectorIndex, featVectorValue, numSamples, fDim, NULL); |
|
464 |
+ sort2Arrays(maxUnSignedIndex, featVectorIndex, featVectorValue, numSamples, fDim, featVectorStart); |
|
471 | 465 |
|
472 | 466 |
distWeightLength = distWeight.size(); |
473 | 467 |
|
... | ... |
@@ -476,18 +470,14 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
476 | 470 |
{ |
477 | 471 |
R_CheckUserInterrupt(); |
478 | 472 |
|
479 |
- ix = i * fDim; |
|
480 |
- iy = i * fDim; |
|
473 |
+ ix = featVectorStart[i]; |
|
474 |
+ iy = featVectorStart[i]; |
|
481 | 475 |
prevIndex = 0; |
482 |
- endx = ix + fDim; |
|
476 |
+ endx = featVectorStart[i + 1]; |
|
483 | 477 |
kv = 0; |
484 | 478 |
|
485 | 479 |
while (ix < endx && iy < endx) |
486 | 480 |
{ |
487 |
- if (featVectorIndex[ix] == maxUnSignedIndex || |
|
488 |
- featVectorIndex[iy] == maxUnSignedIndex) |
|
489 |
- break; |
|
490 |
- |
|
491 | 481 |
if (featVectorIndex[ix] < featVectorIndex[iy]) |
492 | 482 |
ix++; |
493 | 483 |
else if (featVectorIndex[ix] > featVectorIndex[iy]) |
... | ... |
@@ -496,7 +486,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
496 | 486 |
{ |
497 | 487 |
prevIndex = iy; |
498 | 488 |
|
499 |
- while (featVectorIndex[ix] == featVectorIndex[iy]) |
|
489 |
+ while (iy < endx && featVectorIndex[ix] == featVectorIndex[iy]) |
|
500 | 490 |
{ |
501 | 491 |
distance = abs(featVectorValue[iy++] - featVectorValue[ix]); |
502 | 492 |
|
... | ... |
@@ -538,19 +528,15 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
538 | 528 |
|
539 | 529 |
for (j = startIndex; j < sizeY; j++) |
540 | 530 |
{ |
541 |
- ix = i * fDim; |
|
542 |
- endx = ix + fDim; |
|
543 |
- iy = (yOffset + j) * fDim; |
|
544 |
- endy = (yOffset + j + 1) * fDim; |
|
531 |
+ ix = featVectorStart[i]; |
|
532 |
+ endx = featVectorStart[i + 1]; |
|
533 |
+ iy = featVectorStart[yOffset + j]; |
|
534 |
+ endy = featVectorStart[yOffset + j + 1]; |
|
545 | 535 |
prevIndex = 0; |
546 | 536 |
kv = 0; |
547 | 537 |
|
548 | 538 |
while (ix < endx && iy < endy) |
549 | 539 |
{ |
550 |
- if (featVectorIndex[ix] == maxUnSignedIndex || |
|
551 |
- featVectorIndex[iy] == maxUnSignedIndex) |
|
552 |
- break; |
|
553 |
- |
|
554 | 540 |
if (featVectorIndex[ix] < featVectorIndex[iy]) |
555 | 541 |
ix++; |
556 | 542 |
else if (featVectorIndex[ix] > featVectorIndex[iy]) |
... | ... |
@@ -559,7 +545,7 @@ void computeKernelMatrixPos(T maxUnSignedIndex, T *featVectorIndex, int32_t *fea |
559 | 545 |
{ |
560 | 546 |
prevIndex = iy; |
561 | 547 |
|
562 |
- while (featVectorIndex[ix] == featVectorIndex[iy]) |
|
548 |
+ while (iy < endy && featVectorIndex[ix] == featVectorIndex[iy]) |
|
563 | 549 |
{ |
564 | 550 |
distance = abs(featVectorValue[iy++] - featVectorValue[ix]); |
565 | 551 |
|
... | ... |
@@ -972,6 +972,7 @@ bool getIndexMap(ByteStringVector x, int sizeX, IntegerVector selX, bool unmappe |
972 | 972 |
featureCounts = NULL; |
973 | 973 |
featuresInSample = NULL; |
974 | 974 |
vmax = NULL; |
975 |
+ fchmap = NULL; |
|
975 | 976 |
|
976 | 977 |
// cyclic buffers for old index |
977 | 978 |
uint64_t *oldIndex = (uint64_t *) R_alloc(k, sizeof(uint64_t)); |
... | ... |
@@ -1666,7 +1667,7 @@ bool getERSMismatch(ByteStringVector x, int sizeX, IntegerVector selX, int maxSe |
1666 | 1667 |
double *normValues) |
1667 | 1668 |
{ |
1668 | 1669 |
int i, iX, freeNode, maxNoOfNodes, jIndex; |
1669 |
- uint64_t nodeLimit, maxNodesPerSequence; |
|
1670 |
+ uint64_t nodeLimit; |
|
1670 | 1671 |
double kernelValue; |
1671 | 1672 |
bool printWarning = TRUE; |
1672 | 1673 |
struct prefTree *pTree; |
... | ... |
@@ -1674,18 +1675,12 @@ bool getERSMismatch(ByteStringVector x, int sizeX, IntegerVector selX, int maxSe |
1674 | 1675 |
|
1675 | 1676 |
// alloc mem for prefix tree |
1676 | 1677 |
// consider mismatch subtree |
1677 |
- nodeLimit = ((pow(alphaInf->numAlphabetChars, k + 1) - 1) / (alphaInf->numAlphabetChars - 1)) * |
|
1678 |
- pow(alphaInf->numAlphabetChars, k) + 1; |
|
1679 |
- |
|
1680 |
- maxNodesPerSequence = pow(alphaInf->numAlphabetChars, m) * k * (maxSeqLength - k + 1) + 1; |
|
1681 |
- |
|
1682 |
- if (maxNodesPerSequence < (uint64_t) nodeLimit) |
|
1683 |
- nodeLimit = (int) maxNodesPerSequence; |
|
1684 |
- |
|
1685 |
- maxNoOfNodes = MAX_BLOCK; |
|
1678 |
+ nodeLimit = (pow(alphaInf->numAlphabetChars, k + 1) - 1) / (alphaInf->numAlphabetChars - 1); |
|
1686 | 1679 |
|
1687 | 1680 |
if (nodeLimit < (uint64_t) MAX_BLOCK) |
1688 | 1681 |
maxNoOfNodes = (int) nodeLimit; |
1682 |
+ else |
|
1683 |
+ maxNoOfNodes = MAX_BLOCK; |
|
1689 | 1684 |
|
1690 | 1685 |
pTree = (struct prefTree *) R_alloc(maxNoOfNodes, sizeof(struct treeNode)); |
1691 | 1686 |
|
... | ... |
@@ -847,7 +847,7 @@ void setFeatureIndex(struct prefTreeMotif *pTree, int maxMotifLength, int maxPat |
847 | 847 |
} |
848 | 848 |
} |
849 | 849 |
|
850 |
- free(pKeys); |
|
850 |
+ Free(pKeys); |
|
851 | 851 |
pKeys = NULL; |
852 | 852 |
} |
853 | 853 |
} |
... | ... |
@@ -1629,8 +1629,6 @@ void findMotifsForPos(struct intfFindMotifs *intf) |
1629 | 1629 |
|
1630 | 1630 |
descendOnBranchPos(0, intf->seqnchar, 0, &motifBegin, intf); |
1631 | 1631 |
|
1632 |
- intf->pFeatVecIndex[intf->elemIndex] = MAXUINT32; |
|
1633 |
- |
|
1634 | 1632 |
return; |
1635 | 1633 |
} |
1636 | 1634 |
|
... | ... |
@@ -1728,6 +1726,7 @@ void getNonzeroMotifs(bool annSpec, int32_t *featVectorValue, uint32_t *featVect |
1728 | 1726 |
} |
1729 | 1727 |
|
1730 | 1728 |
featVectorIndex[elemIndex] = MAXUINT32; |
1729 |
+ featVectorValue[elemIndex] = MAXINT32; |
|
1731 | 1730 |
} |
1732 | 1731 |
else |
1733 | 1732 |
{ |
... | ... |
@@ -1768,6 +1767,7 @@ void getNonzeroMotifs(bool annSpec, int32_t *featVectorValue, uint32_t *featVect |
1768 | 1767 |
} |
1769 | 1768 |
|
1770 | 1769 |
featVectorIndex[elemIndex] = MAXUINT32; |
1770 |
+ featVectorValue[elemIndex] = MAXINT32; |
|
1771 | 1771 |
} |
1772 | 1772 |
} |
1773 | 1773 |
|
... | ... |
@@ -1869,6 +1869,7 @@ void getNonzeroMotifsERS(bool annSpec, struct prefTreeMotif *pTree, khash_t(fim) |
1869 | 1869 |
if (curr > 0) |
1870 | 1870 |
{ |
1871 | 1871 |
featVectorIndex[curr] = MAXUINT32; |
1872 |
+ featVectorValue[curr] = MAXINT32; |
|
1872 | 1873 |
sort2Arrays(MAXUINT32, featVectorIndex, featVectorValue, 1, numUsedFeatures, NULL); |
1873 | 1874 |
|
1874 | 1875 |
curr = 0; |
... | ... |
@@ -2151,13 +2152,12 @@ void getKMStdAnnMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y, |
2151 | 2152 |
|
2152 | 2153 |
if (pKeys != NULL) |
2153 | 2154 |
{ |
2154 |
- free(pKeys); |
|
2155 |
+ Free(pKeys); |
|
2155 | 2156 |
pKeys = NULL; |
2156 | 2157 |
} |
2157 | 2158 |
|
2158 | 2159 |
// sort feature vectors |
2159 |
- if (annX.length > 0) |
|
2160 |
- sort2Arrays(MAXUINT32, featVectorIndex, featVectorValue, numSamples, fDim, NULL); |
|
2160 |
+ sort2Arrays(MAXUINT32, featVectorIndex, featVectorValue, numSamples, fDim, NULL); |
|
2161 | 2161 |
|
2162 | 2162 |
computeKernelMatrix(MAXUINT32, featVectorIndex, featVectorValue, km, normValues, fDim, |
2163 | 2163 |
sizeX, sizeY, normalized, symmetric); |
... | ... |
@@ -2174,7 +2174,8 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y, |
2174 | 2174 |
{ |
2175 | 2175 |
uint32_t fDim, *featVectorIndex; |
2176 | 2176 |
int32_t *featVectorValue; |
2177 |
- int i, freeNode, maxNoOfNodes, iX, iY, numSamples; |
|
2177 |
+ uint64_t *featVectorsStart; |
|
2178 |
+ int i, freeNode, maxNoOfNodes, iX, iY, numSamples, maxFeaturesPerSample; |
|
2178 | 2179 |
bool printWarning = TRUE; |
2179 | 2180 |
struct prefTreeMotif *pTree; |
2180 | 2181 |
struct indexBlock nullBlock; |
... | ... |
@@ -2197,6 +2198,7 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y, |
2197 | 2198 |
fDim = 2 * maxSeqLength + 1; |
2198 | 2199 |
featVectorValue = (int32_t *) R_alloc(numSamples * fDim, sizeof(int32_t)); |
2199 | 2200 |
featVectorIndex = (uint32_t *) R_alloc(numSamples * fDim, sizeof(uint32_t)); |
2201 |
+ featVectorsStart = (uint64_t *) R_alloc(numSamples + 1, sizeof(uint64_t)); |
|
2200 | 2202 |
|
2201 | 2203 |
// alloc mem for prefix tree |
2202 | 2204 |
maxNoOfNodes = MAX_BLOCK; |
... | ... |
@@ -2238,12 +2240,15 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y, |
2238 | 2240 |
intf.fDim = fDim; |
2239 | 2241 |
intf.allIndexMaps = allIndexMaps; |
2240 | 2242 |
|
2243 |
+ featVectorsStart[0] = 0; |
|
2244 |
+ maxFeaturesPerSample = 0; |
|
2245 |
+ intf.elemIndex = 0; |
|
2246 |
+ |
|
2241 | 2247 |
// calculate kernel matrix |
2242 | 2248 |
for (i = 0; i < numSamples; i++) |
2243 | 2249 |
{ |
2244 | 2250 |
R_CheckUserInterrupt(); |
2245 | 2251 |
intf.rowIndex = i; |
2246 |
- intf.elemIndex = intf.rowIndex * intf.fDim; |
|
2247 | 2252 |
intf.offset = 0; |
2248 | 2253 |
|
2249 | 2254 |
if (i < sizeX) |
... | ... |
@@ -2273,14 +2278,20 @@ void getKMPosDistMotif(NumericMatrix km, ByteStringVector x, ByteStringVector y, |
2273 | 2278 |
return; |
2274 | 2279 |
} |
2275 | 2280 |
|
2281 |
+ featVectorsStart[i + 1] = intf.elemIndex; |
|
2282 |
+ |
|
2283 |
+ if (maxFeaturesPerSample < (featVectorsStart[i + 1] - featVectorsStart[i])) |
|
2284 |
+ maxFeaturesPerSample = featVectorsStart[i + 1] - featVectorsStart[i]; |
|
2285 |
+ |
|
2276 | 2286 |
if (normalized) |
2277 | 2287 |
normValues[i] = sqrt(intf.kernelValue); |
2278 | 2288 |
else |
2279 | 2289 |
normValues[i] = intf.kernelValue; |
2280 | 2290 |
} |
2281 | 2291 |
|
2282 |
- computeKernelMatrixPos(MAXUINT32, featVectorIndex, featVectorValue, km, normValues, fDim, motifs.length, |
|
2283 |
- sizeX, sizeY, normalized, symmetric, FALSE, distWeight); |
|
2292 |
+ computeKernelMatrixPos(MAXUINT32, featVectorIndex, featVectorValue, featVectorsStart, km, |
|
2293 |
+ normValues, maxFeaturesPerSample, motifs.length, sizeX, sizeY, normalized, symmetric, |
|
2294 |
+ FALSE, distWeight); |
|
2284 | 2295 |
|
2285 | 2296 |
return; |
2286 | 2297 |
} |
... | ... |
@@ -581,12 +581,13 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B |
581 | 581 |
int sizeX, int sizeY, IntegerVector selX, IntegerVector selY, |
582 | 582 |
ByteStringVector annCharset, ByteStringVector annX, ByteStringVector annY, |
583 | 583 |
bool unmapped, int k, bool normalized, bool symmetric, bool presence, |
584 |
- bool reverseComplement, int maxSeqLength, struct alphaInfo *alphaInf) |
|
584 |
+ bool reverseComplement, int maxSeqLength, uint64_t dimFeatureSpace, |
|
585 |
+ struct alphaInfo *alphaInf) |
|
585 | 586 |
{ |
586 | 587 |
T featureIndex, annotIndex; |
587 | 588 |
int i, j, l, currStack, stack[4*k], iX, iY, twoK, elemIndex, maxNoOfNodes, noSamples = sizeX; |
588 | 589 |
int freeNode, nodeLimit, currBlock, currIndex, newBlock, lastBlockSize, maxBlockIndex; |
589 |
- uint64_t numAnnPowK, maxNodesPerSequence; |
|
590 |
+ uint64_t numAnnPowK, maxNodesPerSequence, maxNumFeatures; |
|
590 | 591 |
double kv; |
591 | 592 |
bool printWarning = TRUE; |
592 | 593 |
const char *annptr; |
... | ... |
@@ -608,13 +609,16 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B |
608 | 609 |
noSamples += sizeY; |
609 | 610 |
|
610 | 611 |
// add one for the sentinel |
611 |
- maxSeqLength += 1; |
|
612 |
+ if (dimFeatureSpace < maxSeqLength) |
|
613 |
+ maxNumFeatures = dimFeatureSpace + 1; |
|
614 |
+ else |
|
615 |
+ maxNumFeatures = maxSeqLength + 1; |
|
612 | 616 |
|
613 | 617 |
// allocate arrays for sparse feature vectors with 32 or 64 bit index |
614 | 618 |
// store only unnormalized k-mer counts to avoid double space usage |
615 | 619 |
featVectorArrays featVectors; |
616 |
- featVectors.x = (int32_t *) R_alloc(noSamples * maxSeqLength, sizeof(int32_t)); |
|
617 |
- featVectors.idx = (T *) R_alloc(noSamples * maxSeqLength, sizeof(T)); |
|
620 |
+ featVectors.x = (int32_t *) R_alloc(noSamples * maxNumFeatures, sizeof(int32_t)); |
|
621 |
+ featVectors.idx = (T *) R_alloc(noSamples * maxNumFeatures, sizeof(T)); |
|
618 | 622 |
|
619 | 623 |
double *normValues = (double *) R_alloc(noSamples, sizeof(double)); |
620 | 624 |
|
... | ... |
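The signature of getKMStdAnnSpec gains dimFeatureSpace, and the per-sample buffers are now sized with maxNumFeatures rather than maxSeqLength: a sample contributes at most one feature per k-mer position (so never more than maxSeqLength entries) and can never hold more distinct features than the feature space provides, so min(dimFeatureSpace, maxSeqLength) + 1, with the extra slot reserved for the sentinel, bounds the allocation. A hedged standalone sketch of that bound; the helper name is made up:

    // Sketch: upper bound for the per-sample sparse vector length (illustrative only).
    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // at most one feature per k-mer position, never more distinct features than the
    // feature space provides; + 1 reserves the sentinel slot (see the hunks below)
    static uint64_t boundFeaturesPerSample(uint64_t maxSeqLength, uint64_t dimFeatureSpace)
    {
        return std::min(dimFeatureSpace, maxSeqLength) + 1;
    }

    int main()
    {
        // DNA, k = 12: 4^12 = 16777216 possible k-mers dwarf a 200 bp read,
        // so the sequence length is the binding constraint
        std::cout << boundFeaturesPerSample(200, 16777216) << "\n";   // 201
        // DNA, k = 2: only 16 possible k-mers, so the feature space dimension binds
        std::cout << boundFeaturesPerSample(200, 16) << "\n";         // 17
        return 0;
    }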
@@ -701,12 +705,13 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B |
701 | 705 |
currBlock = 0; |
702 | 706 |
currIndex = 0; |
703 | 707 |
currStack = -1; |
704 |
- elemIndex = i * maxSeqLength; |
|
708 |
+ elemIndex = i * maxNumFeatures; |
|
705 | 709 |
featureIndex = 0; |
706 | 710 |
annotIndex = 0; |
707 | 711 |
|
708 | 712 |
// for situation with no features set sentinel |
709 | 713 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
714 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
710 | 715 |
|
711 | 716 |
while (currStack >= 0 || currIndex <= maxBlockIndex) |
712 | 717 |
{ |
... | ... |
@@ -793,9 +798,10 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B |
793 | 798 |
} |
794 | 799 |
|
795 | 800 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
801 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
796 | 802 |
} |
797 | 803 |
|
798 |
- computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxSeqLength, |
|
804 |
+ computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxNumFeatures, |
|
799 | 805 |
sizeX, sizeY, normalized, symmetric); |
800 | 806 |
} |
801 | 807 |
else // not symmetric |
... | ... |
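Each per-sample vector is still terminated by writing maxUnSignedIndex into the index slot; the added line now also stamps MAXINT32 into the value slot, so the sentinel is recognisable in both arrays even for a sample with no features at all (the non-symmetric branch below repeats the same change). A toy sketch of consuming such a sentinel-terminated vector; the scan loop is illustrative and not the package's computeKernelMatrix:

    // Sketch: reading one sentinel-terminated sparse feature vector (toy data).
    #include <cstdint>
    #include <iostream>

    int main()
    {
        const uint32_t sentinelIdx = UINT32_MAX;   // maxUnSignedIndex for 32-bit indices
        const int32_t  sentinelVal = INT32_MAX;    // MAXINT32, newly set in this change

        // feature indices / counts of one sample, closed by the sentinel pair
        uint32_t idx[] = {4, 9, 27, sentinelIdx};
        int32_t  x[]   = {2, 1, 5,  sentinelVal};

        double selfSimilarity = 0;
        for (int j = 0; idx[j] != sentinelIdx; j++)   // stop at the sentinel
            selfSimilarity += (double) x[j] * x[j];

        std::cout << "unnormalized self similarity: " << selfSimilarity << "\n";  // 30
        return 0;
    }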
@@ -856,12 +862,13 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B |
856 | 862 |
currBlock = 0; |
857 | 863 |
currIndex = 0; |
858 | 864 |
currStack = -1; |
859 |
- elemIndex = i * maxSeqLength; |
|
865 |
+ elemIndex = i * maxNumFeatures; |
|
860 | 866 |
featureIndex = 0; |
861 | 867 |
annotIndex = 0; |
862 | 868 |
|
863 | 869 |
// for situation with no features set sentinel |
864 | 870 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
871 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
865 | 872 |
|
866 | 873 |
while (currStack >= 0 || currIndex <= maxBlockIndex) |
867 | 874 |
{ |
... | ... |
@@ -947,9 +954,10 @@ void getKMStdAnnSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, B |
947 | 954 |
} |
948 | 955 |
|
949 | 956 |
featVectors.idx[elemIndex] = maxUnSignedIndex; |
957 |
+ featVectors.x[elemIndex] = MAXINT32; |
|
950 | 958 |
} |
951 | 959 |
|
952 |
- computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxSeqLength, |
|
960 |
+ computeKernelMatrix(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, maxNumFeatures, |
|
953 | 961 |
sizeX, sizeY, normalized, symmetric); |
954 | 962 |
} |
955 | 963 |
|
... | ... |
@@ -964,11 +972,12 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
964 | 972 |
NumericVector distWeight, int maxSeqLength, struct alphaInfo *alphaInf) |
965 | 973 |
{ |
966 | 974 |
T prevIndex, featureIndex, tempIndex, fIndex; |
967 |
- uint64_t elemIndex, fDimArray; |
|
968 |
- int i, j, l, offset, patLength, seqnchar, index, iold, iX, iY, maxNumPatterns, noSamples = sizeX; |
|
975 |
+ uint64_t elemIndex, fDimArray, *featVectorsStart; |
|
976 |
+ int i, j, l, offset, patLength, seqnchar, index, iold, iX, iY; |
|
977 |
+ int maxNumPatternsPerPos, maxFeaturesPerSample, noSamples = sizeX; |
|
969 | 978 |
bool distWeighting; |
970 | 979 |
const char *seqptr; |
971 |
- double kv; |
|
980 |
+ double kv, *normValues; |
|
972 | 981 |
|
973 | 982 |
struct featVectorArrays { |
974 | 983 |
int32_t *x; // value array |
... | ... |
@@ -984,7 +993,7 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
984 | 993 |
|
985 | 994 |
// allocate arrays for feature vectors with 8, 16, 32 or 64 bit index |
986 | 995 |
// without dist weighting use implicit position to save storage |
987 |
- fDimArray = maxSeqLength - k + 2; |
|
996 |
+ fDimArray = maxSeqLength - k + 1; |
|
988 | 997 |
featVectorArrays featVectors; |
989 | 998 |
|
990 | 999 |
if (distWeighting) |
... | ... |
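The array stride drops from maxSeqLength - k + 2 to maxSeqLength - k + 1: a sequence of length L has exactly L - k + 1 k-mer start positions (for L = 10 and k = 3 these are positions 1 through 8), and with the explicit featVectorsStart offsets introduced in the next hunk the extra per-sample slot apparently no longer needs to be reserved here. A trivial sketch of the count, with an assumed helper name:

    // Sketch: number of k-mer start positions in a sequence of length L (illustrative).
    #include <iostream>

    static long kmerPositions(long L, long k) { return L - k + 1; }

    int main()
    {
        std::cout << kmerPositions(10, 3) << "\n";   // 8 (positions 1..8)
        std::cout << kmerPositions(10, 10) << "\n";  // 1 (the whole sequence)
        return 0;
    }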
@@ -993,11 +1002,14 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
993 | 1002 |
// just for the offset of each sample |
994 | 1003 |
featVectors.x = (int32_t *) R_alloc(noSamples, sizeof(int32_t)); |
995 | 1004 |
|
996 |
- |
|
997 | 1005 |
featVectors.idx = (T *) R_alloc(noSamples * fDimArray, sizeof(T)); |
998 |
- maxNumPatterns = 1; |
|
1006 |
+ featVectorsStart = (uint64_t *) R_alloc(noSamples + 1, sizeof(uint64_t)); |
|
1007 |
+ normValues = (double *) R_alloc(noSamples, sizeof(double)); |
|
1008 |
+ maxNumPatternsPerPos = 1; |
|
999 | 1009 |
|
1000 |
- double *normValues = (double *) R_alloc(noSamples, sizeof(double)); |
|
1010 |
+ featVectorsStart[0] = 0; |
|
1011 |
+ maxFeaturesPerSample = 0; |
|
1012 |
+ elemIndex = 0; |
|
1001 | 1013 |
|
1002 | 1014 |
// walk along sequence and create sparse feature vector |
1003 | 1015 |
for (i = 0; i < noSamples; i++) |
... | ... |
@@ -1029,7 +1041,6 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1029 | 1041 |
|
1030 | 1042 |
patLength = 0; |
1031 | 1043 |
featureIndex = 0; |
1032 |
- elemIndex = (uint64_t) i * fDimArray; |
|
1033 | 1044 |
iold = 0; |
1034 | 1045 |
kv = 0; |
1035 | 1046 |
|
... | ... |
@@ -1118,7 +1129,10 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1118 | 1129 |
} |
1119 | 1130 |
} |
1120 | 1131 |
|
1121 |
- featVectors.idx[elemIndex] = maxUnSignedIndex; |
|
1132 |
+ featVectorsStart[i + 1] = elemIndex; |
|
1133 |
+ |
|
1134 |
+ if (maxFeaturesPerSample < (featVectorsStart[i + 1] - featVectorsStart[i])) |
|
1135 |
+ maxFeaturesPerSample = featVectorsStart[i + 1] - featVectorsStart[i]; |
|
1122 | 1136 |
|
1123 | 1137 |
// for dist weighting the kernel value is determined in computeKernelMatrixPos |
1124 | 1138 |
if (!distWeighting) |
... | ... |
@@ -1130,8 +1144,9 @@ void getKMPosDistSpec(T maxUnSignedIndex, NumericMatrix km, ByteStringVector x, |
1130 | 1144 |
} |
1131 | 1145 |
} |
1132 | 1146 |
|
1133 |
- computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, km, normValues, fDimArray, |
|
1134 |
- maxNumPatterns, sizeX, sizeY, normalized, symmetric, !distWeighting, distWeight); |
|
1147 |
+ computeKernelMatrixPos(maxUnSignedIndex, featVectors.idx, featVectors.x, featVectorsStart, km, |
|
1148 |
+ normValues, maxFeaturesPerSample, maxNumPatternsPerPos, sizeX, sizeY, |
|
1149 |
+ normalized, symmetric, !distWeighting, distWeight); |
|
1135 | 1150 |
|
1136 | 1151 |
return; |
1137 | 1152 |
} |
... | ... |
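As in the motif kernel above, the position-dependent spectrum kernel now passes the featVectorsStart offsets plus maxFeaturesPerSample and the per-position pattern count (maxNumPatternsPerPos) to computeKernelMatrixPos, instead of a fixed row stride fDimArray and maxNumPatterns, so the matrix routine can locate each sample's entries without assuming equally long rows. A small consumer-side sketch of recovering the per-sample ranges from those offsets; toy data and a hypothetical loop, not the actual routine:

    // Sketch: addressing variable-length per-sample ranges via start offsets.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
        // offsets as the extraction loop in the hunks above would fill them for three samples
        std::vector<uint64_t> featVectorsStart = {0, 4, 5, 9};
        std::vector<uint32_t> featVectorIndex  = {3, 7, 7, 12,  1,  5, 5, 9, 11};

        for (std::size_t i = 0; i + 1 < featVectorsStart.size(); i++)
        {
            std::cout << "sample " << i << ":";
            for (uint64_t j = featVectorsStart[i]; j < featVectorsStart[i + 1]; j++)
                std::cout << " " << featVectorIndex[j];
            std::cout << "\n";
        }
        return 0;
    }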
@@ -3211,7 +3226,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3211 | 3226 |
{ |
3212 | 3227 |
getKMStdAnnSpec(maxUIndex8, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3213 | 3228 |
unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength, |
3214 |
- &alphaInf); |
|
3229 |
+ dimFeatureSpace, &alphaInf); |
|
3215 | 3230 |
} |
3216 | 3231 |
|
3217 | 3232 |
break; |
... | ... |
@@ -3229,7 +3244,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3229 | 3244 |
{ |
3230 | 3245 |
getKMStdAnnSpec(maxUIndex16, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3231 | 3246 |
unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength, |
3232 |
- &alphaInf); |
|
3247 |
+ dimFeatureSpace, &alphaInf); |
|
3233 | 3248 |
} |
3234 | 3249 |
|
3235 | 3250 |
break; |
... | ... |
@@ -3248,7 +3263,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3248 | 3263 |
{ |
3249 | 3264 |
getKMStdAnnSpec(maxUIndex32, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3250 | 3265 |
unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength, |
3251 |
- &alphaInf); |
|
3266 |
+ dimFeatureSpace, &alphaInf); |
|
3252 | 3267 |
} |
3253 | 3268 |
|
3254 | 3269 |
break; |
... | ... |
@@ -3266,7 +3281,7 @@ RcppExport SEXP spectrumKernelMatrixC(SEXP xR, SEXP yR, SEXP selXR, SEXP selYR, |
3266 | 3281 |
{ |
3267 | 3282 |
getKMStdAnnSpec(maxUIndex64, km, x, y, sizeX, sizeY, selX, selY, annCharset, annX, annY, |
3268 | 3283 |
unmapped, k, normalized, symmetric, presence, reverseComplement, maxSeqLength, |
3269 |
- &alphaInf); |
|
3284 |
+ dimFeatureSpace, &alphaInf); |
|
3270 | 3285 |
} |
3271 | 3286 |
|
3272 | 3287 |
break; |
... | ... |
@@ -3333,7 +3348,7 @@ RcppExport void featuresToHashmapSpectrum(NumericMatrix featureWeights, int svmI |
3333 | 3348 |
IntegerVector annotationIndexMap) |
3334 | 3349 |
{ |
3335 | 3350 |
int i, j, numFeatures, result; |
3336 |
- uint64_t featureIndex, annotIndex, numAnnPowK; |
|
3351 |
+ uint64_t featureIndex, annotIndex, numAnnPowK = 0; |
|
3337 | 3352 |
const char *pattern; |
3338 | 3353 |
SEXP dimNames, colNames; |
3339 | 3354 |
khiter_t iter; |
... | ... |
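numAnnPowK is now initialized at its declaration; the variable is only assigned on the annotation-specific path, so a defined default avoids reading an indeterminate value (and the corresponding compiler warning) when no annotation is used. A minimal illustration of the pattern; the condition is a stand-in, not the package code:

    // Sketch: give conditionally assigned variables a defined default.
    #include <cstdint>
    #include <iostream>

    int main(int argc, char **)
    {
        uint64_t numAnnPowK = 0;            // defined even if the branch below is skipped
        bool annotationUsed = (argc > 1);   // stand-in for "annotation present"
        if (annotationUsed)
            numAnnPowK = 64;                // e.g. |annotation charset| ^ k
        std::cout << numAnnPowK << "\n";    // never reads an indeterminate value
        return 0;
    }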
@@ -167,7 +167,6 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR, |
167 | 167 |
ptrI = iptr; |
168 | 168 |
ptrX = xptr; |
169 | 169 |
|
170 |
- // do processing |
|
171 | 170 |
nextFree = 0; |
172 | 171 |
|
173 | 172 |
if (symmetric) |
... | ... |
@@ -203,7 +202,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR, |
203 | 202 |
if (kv > lowerLimit) |
204 | 203 |
{ |
205 | 204 |
// new element - check if enough space |
206 |
- if (nextFree + 1 > vectorSize) |
|
205 |
+ if (((uint64_t) nextFree + 1) > vectorSize) |
|
207 | 206 |
{ |
208 | 207 |
// realloc arrays i and x |
209 | 208 |
vectorSize *= growBy; |
... | ... |
@@ -258,7 +257,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR, |
258 | 257 |
if (kv > lowerLimit) |
259 | 258 |
{ |
260 | 259 |
// new element - check if enough space |
261 |
- if (nextFree + 1 > vectorSize) |
|
260 |
+ if (((uint64_t) nextFree + 1) > vectorSize) |
|
262 | 261 |
{ |
263 | 262 |
// realloc arrays i and x |
264 | 263 |
vectorSize = (uint64_t) (vectorSize * growBy); |
... | ... |
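In both realloc checks nextFree is apparently a signed int counter while vectorSize is a uint64_t; casting nextFree before adding keeps the addition and the comparison in 64-bit unsigned arithmetic, so nextFree + 1 can neither overflow int nor trigger a signed/unsigned comparison warning. A standalone sketch of the edge case the cast guards against; the values are deliberately extreme:

    // Sketch: why the size check is done in 64-bit arithmetic (illustrative only).
    #include <climits>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        int      nextFree   = INT_MAX;                 // counter at the edge of int
        uint64_t vectorSize = (uint64_t) INT_MAX + 10; // capacity beyond int range

        // nextFree + 1 in plain int would overflow (undefined behaviour) before the
        // comparison; promoting to uint64_t first keeps everything well defined
        bool needsGrow = ((uint64_t) nextFree + 1) > vectorSize;
        std::cout << (needsGrow ? "grow" : "fits") << "\n";   // prints "fits"
        return 0;
    }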
@@ -306,13 +305,13 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR, |
306 | 305 |
else |
307 | 306 |
SET_VECTOR_ELT(dimnames, 1, R_NilValue); |
308 | 307 |
|
309 |
- slot_p = PROTECT(Rf_allocVector(INTSXP, sizeY+1)); |
|
308 |
+ slot_p = PROTECT(Rf_allocVector(INTSXP, sizeY + 1)); |
|
310 | 309 |
SET_SLOT(spm, Rf_mkChar("p"), slot_p); |
311 | 310 |
|
312 | 311 |
for (i=0; i < sizeY + 1; i++) |
313 | 312 |
INTEGER(slot_p)[i] = pptr[i]; |
314 | 313 |
|
315 |
- free(pptr); |
|
314 |
+ Free(pptr); |
|
316 | 315 |
ptrP = NULL; |
317 | 316 |
|
318 | 317 |
slot_i = PROTECT(Rf_allocVector(INTSXP, nextFree)); |
... | ... |
@@ -321,7 +320,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR, |
321 | 320 |
for (i=0; i < nextFree; i++) |
322 | 321 |
INTEGER(slot_i)[i] = iptr[i]; |
323 | 322 |
|
324 |
- free(iptr); |
|
323 |
+ Free(iptr); |
|
325 | 324 |
ptrI = NULL; |
326 | 325 |
|
327 | 326 |
slot_x = PROTECT(Rf_allocVector(REALSXP, nextFree)); |
... | ... |
@@ -330,7 +329,7 @@ RcppExport SEXP linearKernelSparseKMdgRMatrixC(SEXP sizeXR, SEXP pXR, SEXP jXR, |
330 | 329 |
for (i=0; i < nextFree; i++) |
331 | 330 |
REAL(slot_x)[i] = xptr[i]; |
332 | 331 |
|
333 |
- free(xptr); |
|
332 |
+ Free(xptr); |
|
334 | 333 |
ptrX = NULL; |
335 | 334 |
|
336 | 335 |
numProtect += 3; |
... | ... |
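The three result buffers of the sparse kernel matrix are now released with R's Free instead of the C library free: memory obtained through R's Calloc/Realloc interface (R_ext/RS.h) must be returned through the matching Free, since mixing R's allocator with the system allocator is not safe. A minimal sketch of the pairing, assuming a translation unit compiled against R (e.g. via R CMD SHLIB); it will not build standalone:

    // Sketch: memory from R's Calloc is released with Free, not free().
    #include <R.h>                   // brings in Calloc()/Free() from R_ext/RS.h

    void sketchCallocFreePairing(int n)
    {
        int *buf = Calloc(n, int);   // R-managed allocation (errors out instead of returning NULL)
        for (int i = 0; i < n; i++)
            buf[i] = i;
        Free(buf);                   // matching release; plain free(buf) would be wrong here
    }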
@@ -59,7 +59,7 @@ kebabsDate <- paste(month.name[kebabsDateMonth], " ", |
59 |