...
@@ -62,10 +62,9 @@ blacklist <- function(files, assembly, bins, min.mapq=10, pairedEndReads=FALSE)
 #' @inheritParams bam2GRanges
 #' @inheritParams bed2GRanges
 #' @return A \code{\link{GRanges}} object with reads.
-#' @examples
-#' files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
-#' reads <- mergeStrandseqFiles(files, assembly='hg38')
-#'
+#' @examples
+#' #files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
+#' #reads <- mergeStrandseqFiles(files, assembly='hg38')
 mergeStrandseqFiles <- function(files, assembly, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000) {

 ## Determine format
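
The example above is commented out because it points at a machine-specific folder. For the record, a portable sketch of the same call, assuming (as the binReads() examples elsewhere in the package suggest) that AneuFinderData ships gzipped BED files under extdata; the file pattern and assembly here are illustrative, not from the patch:

    ## Hypothetical portable variant of the commented-out example
    bedfiles <- list.files(system.file("extdata", package="AneuFinderData"),
                           pattern="bed.gz$", full.names=TRUE)
    ## reads <- mergeStrandseqFiles(bedfiles, assembly="mm10")
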
...
@@ -15,7 +15,7 @@
 #'@examples
 #'## Get an example BED file with single-cell-sequencing reads
 #'bedfile <- system.file("extdata", "KK150311_VI_07.bam.bed.gz", package="AneuFinderData")
-#'## Bin the BAM file into bin size 1Mp
+#'## Bin the data into bin size 1Mb
 #'binned <- binReads(bedfile, assembly='mm10', binsize=1e6,
 #'                   chromosomes=c(1:19,'X','Y'))
 #'## Fit the Hidden Markov Model
...
@@ -26,7 +26,7 @@
 findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1, max.iter=1000, num.trials=15, eps.try=10*eps, num.threads=1, count.cutoff.quantile=0.999, strand='*', states=c("zero-inflation",paste0(0:10,"-somy")), most.frequent.state="2-somy", method="HMM", algorithm="EM", initial.params=NULL) {

     ## Intercept user input
-    binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
+    binned.data <- loadFromFiles(binned.data, check.class=c('GRanges', 'GRangesList'))[[1]]
     if (is.null(ID)) {
         ID <- attr(binned.data, 'ID')
     }
...
@@ -63,7 +63,7 @@ findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard", max.time=-1
 #'
 #' \code{univariate.findCNVs} classifies the binned read counts into several states which represent copy-number-variation.
 #'
-#' @param binned.data A \link{GRanges} object with binned read counts.
+#' @param binned.data A \code{\link{GRanges}} object with binned read counts. Alternatively, a \code{\link{GRangesList}} object with binned read counts at several offsets.
 #' @param ID An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.
 #' @param eps Convergence threshold for the Baum-Welch algorithm.
 #' @param init One of the following initialization procedures:
...
@@ -90,10 +90,16 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
     on.exit(.C("C_univariate_cleanup", PACKAGE = 'AneuFinder'))

     ## Intercept user input
-    binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
+    binned.data <- loadFromFiles(binned.data, check.class=c('GRanges', 'GRangesList'))[[1]]
     if (is.null(ID)) {
         ID <- attr(binned.data, 'ID')
     }
+    if (class(binned.data) == 'GRangesList') {
+        binned.data.list <- binned.data
+        binned.data <- binned.data.list[[1]]
+    } else if (class(binned.data) == 'GRanges') {
+        binned.data.list <- GRangesList('0'=binned.data)
+    }
     if (check.positive(eps)!=0) stop("argument 'eps' expects a positive numeric")
     if (check.integer(max.time)!=0) stop("argument 'max.time' expects an integer")
     if (check.integer(max.iter)!=0) stop("argument 'max.iter' expects an integer")
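
The dispatch above accepts either a single GRanges binning or a GRangesList of offset binnings; a plain GRanges is wrapped into a one-element GRangesList named '0' (zero offset). A minimal sketch of the two accepted inputs (coordinates, counts and offset names are made up for illustration):

    library(GenomicRanges)
    ## One binning at offset 0 and one shifted by half a bin width
    bins0 <- GRanges('chr1', IRanges(start=seq(1, 9e6, by=1e6), width=1e6))
    bins0$counts <- rpois(length(bins0), 50)
    bins5e5 <- shift(bins0[-length(bins0)], 5e5)
    bins5e5$counts <- rpois(length(bins5e5), 50)
    ## model <- univariate.findCNVs(bins0)                                   # plain GRanges
    ## model <- univariate.findCNVs(GRangesList('0'=bins0, '5e5'=bins5e5))   # offset list
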
...
@@ -125,6 +131,17 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
     warlist <- list()
     if (num.trials==1) eps.try <- eps

+    ### Make return object
+    result <- list()
+    class(result) <- class.univariate.hmm
+    result$ID <- ID
+    result$bins <- binned.data.list
+    ## Quality info
+    result$qualityInfo <- as.list(getQC(binned.data.list))
+    ## Convergence info
+    convergenceInfo <- list(eps=eps, loglik=NA, loglik.delta=NA, num.iterations=NA, time.sec=NA, error=NA)
+    result$convergenceInfo <- convergenceInfo
+
     ## Assign variables
     inistates <- initializeStates(states)
     state.labels <- inistates$states
...
@@ -132,8 +149,6 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
     multiplicity <- inistates$multiplicity
     dependent.states.mask <- (state.labels != 'zero-inflation') & (state.labels != '0-somy')
     numstates <- length(states)
-    numbins <- length(binned.data)
-    iniproc <- which(init==c("standard","random")) # transform to int
     if (strand=='+') {
         select <- 'pcounts'
     } else if (strand=='-') {
...
@@ -141,321 +156,391 @@ univariate.findCNVs <- function(binned.data, ID=NULL, eps=0.1, init="standard",
     } else if (strand=='*') {
         select <- 'counts'
     }
-    counts <- mcols(binned.data)[,select]
     algorithm <- factor(algorithm, levels=c('baumWelch','viterbi','EM'))
-
-    ### Make return object
-    result <- list()
-    class(result) <- class.univariate.hmm
-    result$ID <- ID
-    result$bins <- binned.data
-    ## Quality info
-    result$qualityInfo <- as.list(getQC(binned.data))
-    ## Convergence info
-    convergenceInfo <- list(eps=eps, loglik=NA, loglik.delta=NA, num.iterations=NA, time.sec=NA, error=NA)
-    result$convergenceInfo <- convergenceInfo
-
-    # Check if there are counts in the data, otherwise HMM will blow up
-    if (any(is.na(counts))) {
-        stop(paste0("ID = ",ID,": NAs found in reads."))
-    }
-    if (!any(counts!=0)) {
-        warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
-        result$warnings <- warlist
-        return(result)
-    } else if (any(counts<0)) {
-        warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
-        result$warnings <- warlist
-        return(result)
-    }
-
-
-    # Filter high counts out, makes HMM faster
-    count.cutoff <- quantile(counts, count.cutoff.quantile)
-    names.count.cutoff <- names(count.cutoff)
-    count.cutoff <- ceiling(count.cutoff)
-    mask <- counts > count.cutoff
-    counts[mask] <- count.cutoff
-    numfiltered <- length(which(mask))
-    if (numfiltered > 0) {
-        message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to enhance performance."))
-    }
 
-    ## Call univariate in a for loop to enable multiple trials
-    modellist <- list()
-    for (i_try in 1:num.trials) {
-        message(paste0("Trial ",i_try," / ",num.trials))
-
-        ## Initial parameters
-        if (init == 'initial.params') {
-            A.initial <- initial.params$transitionProbs
-            proba.initial <- initial.params$startProbs
-            size.initial <- initial.params$distributions[,'size']
-            prob.initial <- initial.params$distributions[,'prob']
-            size.initial[is.na(size.initial)] <- 0
-            prob.initial[is.na(prob.initial)] <- 0
-        } else if (init == 'random') {
-            A.initial <- matrix(stats::runif(numstates^2), ncol=numstates)
-            A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")
-            proba.initial <- stats::runif(numstates)
-            # Distributions for dependent states
-            size.initial <- stats::runif(1, min=0, max=100) * cumsum(dependent.states.mask)
-            prob.initial <- stats::runif(1) * dependent.states.mask
-            # Assign initials for the 0-somy distribution
-            index <- which('0-somy'==state.labels)
-            size.initial[index] <- 1
-            prob.initial[index] <- 0.5
-        } else if (init == 'standard') {
-            A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
-            for (irow in 1:numstates) {
-                for (icol in 1:numstates) {
-                    if (irow==icol) { A.initial[irow,icol] <- 0.9 }
-                    else { A.initial[irow,icol] <- 0.1/(numstates-1) }
-                }
-            }
-            proba.initial <- rep(1/numstates, numstates)
-            ## Set initial mean of most.frequent.state distribution to max of count histogram
-            max.counts <- as.integer(names(which.max(table(counts[counts>0]))))
-            divf <- max(multiplicity[most.frequent.state], 1)
-            mean.initial.monosomy <- max.counts/divf
-            var.initial.monosomy <- mean.initial.monosomy * 2
-#             mean.initial.monosomy <- mean(counts[counts>0])/divf
-#             var.initial.monosomy <- var(counts[counts>0])/divf
-            if (is.na(mean.initial.monosomy)) {
-                mean.initial.monosomy <- 1
-            }
-            if (is.na(var.initial.monosomy)) {
-                var.initial.monosomy <- mean.initial.monosomy + 1
-            }
-            if (mean.initial.monosomy >= var.initial.monosomy) {
-                mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
-                var.initial <- (mean.initial.monosomy+1) * cumsum(dependent.states.mask)
-                size.initial <- rep(0,numstates)
-                prob.initial <- rep(0,numstates)
-                mask <- dependent.states.mask
-                size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
-                prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
-            } else {
-                mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
-                var.initial <- var.initial.monosomy * cumsum(dependent.states.mask)
-                size.initial <- rep(0,numstates)
-                prob.initial <- rep(0,numstates)
-                mask <- dependent.states.mask
-                size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
-                prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
-            }
-            # Assign initials for the 0-somy distribution
-            index <- which('0-somy'==state.labels)
-            size.initial[index] <- 1
-            prob.initial[index] <- 0.5
-        }
-
-        hmm <- .C("C_univariate_hmm",
-            counts = as.integer(counts), # int* O
-            num.bins = as.integer(numbins), # int* T
-            num.states = as.integer(numstates), # int* N
-            state.labels = as.integer(state.labels), # int* state_labels
-            size = double(length=numstates), # double* size
-            prob = double(length=numstates), # double* prob
-            num.iterations = as.integer(max.iter), # int* maxiter
-            time.sec = as.integer(max.time), # double* maxtime
-            loglik.delta = as.double(eps.try), # double* eps
-            states = integer(length=numbins), # int* states
-            A = double(length=numstates*numstates), # double* A
-            proba = double(length=numstates), # double* proba
-            loglik = double(length=1), # double* loglik
-            weights = double(length=numstates), # double* weights
-            distr.type = as.integer(state.distributions), # int* distr_type
-            size.initial = as.vector(size.initial), # double* initial_size
-            prob.initial = as.vector(prob.initial), # double* initial_prob
-            A.initial = as.vector(A.initial), # double* initial_A
-            proba.initial = as.vector(proba.initial), # double* initial_proba
-            use.initial.params = as.logical(1), # bool* use_initial_params
-            num.threads = as.integer(num.threads), # int* num_threads
-            error = as.integer(0), # int* error (error handling)
-            count.cutoff = as.integer(count.cutoff), # int* count.cutoff
-            algorithm = as.integer(algorithm), # int* algorithm
-            PACKAGE = 'AneuFinder'
-        )
-
-        hmm$eps <- eps.try
-        if (num.trials > 1) {
-            if (hmm$loglik.delta > hmm$eps) {
-                warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n"))
-            }
-            # Store model in list
-            hmm$counts <- NULL
-            modellist[[as.character(i_try)]] <- hmm
-            init <- 'random'
-        } else if (num.trials == 1) {
-            if (hmm$loglik.delta > eps) {
-                warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
-            }
-        }
-    }
-
-    if (num.trials > 1) {
-
-        # Mathematically we should select the fit with highest loglikelihood. If we think the fit with the highest loglikelihood is incorrect, we should change the underlying model. However, this is very complex and we choose to select a fit that we think is (more) correct, although it has not the highest support given our (imperfect) model.
-        if (length(modellist)>1) {
-#             ## Get some metrics
-#             num.segs <- sapply(modellist, function(x) { length(rle(x$states)$values) })
-#             logliks <- sapply(modellist,'[[','loglik')
-#             bhattacharyya <- numeric(length=length(modellist))
-#             sos <- numeric(length=length(modellist))
-#             diff.most.frequent <- numeric(length=length(modellist))
-#             ind.1somy <- which(state.labels=='1-somy')
-#             for (i1 in 1:length(modellist)) {
-#                 hmm <- modellist[[i1]]
-#                 x <- 0:max(max(counts), 500)
-#                 bhattacharyya[i1] <- -log(sum(sqrt(stats::dnbinom(x, size=hmm$size[ind.1somy], prob=hmm$prob[ind.1somy]) * stats::dnbinom(x, size=hmm$size[ind.1somy+1], prob=hmm$prob[ind.1somy+1]))))
-#                 sos[i1] <- sum( (counts - dnbinom.mean(size=hmm$size, prob=hmm$prob)[hmm$states]) ^ 2 )
-#                 diff.most.frequent[i1] <- 1 - hmm$weights[state.labels==most.frequent.state]
-#             }
-#             names(bhattacharyya) <- 1:length(modellist)
-#             names(sos) <- 1:length(modellist)
-#             names(diff.most.frequent) <- 1:length(modellist)
-#             # Diff to most.frequen.state
-#             df.select <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
-#             ## Get probability of finding value
-#             num.segs <- ecdf(num.segs)(num.segs)
-#             logliks <- 1-ecdf(logliks)(logliks)
-#             bhattacharyya <- 1-ecdf(bhattacharyya)(bhattacharyya)
-#             sos <- ecdf(sos)(sos)
-#             diff.most.frequent <- ecdf(diff.most.frequent)(diff.most.frequent)
-#             df.order <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
-#             ## Get sum for each tuple
-#             ind <- rowSums(df.order)
-#             ## Select the one that has lowest sum
-#             index2use <- which.min(ind)
-#             result$dforder <- df.order
-#             result$dfselect <- df.select
-#             result$dfi <- index2use
-
-            ## Select models where weight of most.frequent.state is at least half of that of actual most frequent state, then select model with highest loglik
-            logliks <- sapply(modellist,'[[','loglik')
-            df.weight <- as.data.frame(lapply(modellist, '[[', 'weights'))
-            names(df.weight) <- 1:length(modellist)
-            rownames(df.weight) <- state.labels
-            models2use <- df.weight[most.frequent.state,] / apply(df.weight, 2, max) > 0.5
-            models2use[is.na(models2use)] <- FALSE
-            if (any(models2use)) {
-                index2use <- names(which.max(logliks[models2use]))
-            } else {
-                index2use <- names(which.max(logliks))
-            }
-        } else {
-            index2use <- 1
-        }
-        hmm <- modellist[[index2use]]
-
-        # Check if size and prob parameter are correct
-        if (any(is.na(hmm$size) | is.nan(hmm$size) | is.infinite(hmm$size) | is.na(hmm$prob) | is.nan(hmm$prob) | is.infinite(hmm$prob))) {
-            hmm$error <- 3
-        } else {
-
-            # Rerun the HMM with different epsilon and initial parameters from trial run
-            message(paste0("Rerunning trial ",index2use," with eps = ",eps))
-            hmm <- .C("C_univariate_hmm",
-                counts = as.integer(counts), # int* O
-                num.bins = as.integer(numbins), # int* T
-                num.states = as.integer(numstates), # int* N
-                state.labels = as.integer(state.labels), # int* state_labels
-                size = double(length=numstates), # double* size
-                prob = double(length=numstates), # double* prob
-                num.iterations = as.integer(max.iter), # int* maxiter
-                time.sec = as.integer(max.time), # double* maxtime
-                loglik.delta = as.double(eps), # double* eps
-                states = integer(length=numbins), # int* states
-                A = double(length=numstates*numstates), # double* A
-                proba = double(length=numstates), # double* proba
-                loglik = double(length=1), # double* loglik
-                weights = double(length=numstates), # double* weights
-                distr.type = as.integer(state.distributions), # int* distr_type
-                size.initial = as.vector(hmm$size), # double* initial_size
-                prob.initial = as.vector(hmm$prob), # double* initial_prob
-                A.initial = as.vector(hmm$A), # double* initial_A
-                proba.initial = as.vector(hmm$proba), # double* initial_proba
-                use.initial.params = as.logical(1), # bool* use_initial_params
-                num.threads = as.integer(num.threads), # int* num_threads
-                error = as.integer(0), # int* error (error handling)
-                count.cutoff = as.integer(count.cutoff), # int* count.cutoff
-                algorithm = as.integer(algorithm), # int* algorithm
-                PACKAGE = 'AneuFinder'
-            )
-        }
-
-    }
+    ### Arrays for finding maximum posterior for each bin between offsets
+    ## Make bins with offset
+    ptm <- startTimedMessage("Making bins with offsets ...")
+    if (length(binned.data.list) > 1) {
+        stepbins <- disjoin(unlist(binned.data.list, use.names=FALSE))
+    } else {
+        stepbins <- binned.data.list[[1]]
+        mcols(stepbins) <- NULL
+    }
+    amaxPosterior.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset'))) # to store maximum posterior for current and max-of-previous offsets
+    astates.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset'))) # to store states for current and max-of-previous offsets
+    stopTimedMessage(ptm)
+
+    ### Loop over offsets ###
+    for (istep in 1:length(binned.data.list)) {
+        binned.data <- binned.data.list[[istep]]
+        numbins <- length(binned.data)
+        counts <- mcols(binned.data)[,select]
+        if (istep > 1) {
+            ptm.offset <- startTimedMessage("Obtaining states for step = ", istep, " ...")
+            ## Run only one iteration (no updating) if we are already past istep==1
+            initial.params <- result
+            init <- 'initial.params'
+            algorithm <- factor('baumWelch', levels=c('baumWelch','viterbi','EM'))
+            num.trials <- 1
+        }
 
+        # Check if there are counts in the data, otherwise HMM will blow up
+        if (any(is.na(counts))) {
+            stop(paste0("ID = ",ID,": NAs found in reads."))
+        }
+        if (!any(counts!=0)) {
+            warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": All counts in data are zero. No HMM done."))
+            result$warnings <- warlist
+            return(result)
+        } else if (any(counts<0)) {
+            warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": Some counts in data are negative. No HMM done."))
+            result$warnings <- warlist
+            return(result)
+        }
+
+        # Filter high counts out, makes HMM faster
+        count.cutoff <- quantile(counts, count.cutoff.quantile)
+        names.count.cutoff <- names(count.cutoff)
+        count.cutoff <- ceiling(count.cutoff)
+        mask <- counts > count.cutoff
+        counts[mask] <- count.cutoff
+        numfiltered <- length(which(mask))
+        if (numfiltered > 0 & istep == 1) {
+            message(paste0("Replaced read counts > ",count.cutoff," (",names.count.cutoff," quantile) by ",count.cutoff," in ",numfiltered," bins. Set option 'count.cutoff.quantile=1' to disable this filtering. This filtering was done to enhance performance."))
+        }
+
+        ## Call univariate in a for loop to enable multiple trials
+        modellist <- list()
+        for (i_try in 1:num.trials) {
+            message(paste0("Trial ",i_try," / ",num.trials))
+
+            ## Initial parameters
+            if (init == 'initial.params') {
+                A.initial <- initial.params$transitionProbs
+                proba.initial <- initial.params$startProbs
+                size.initial <- initial.params$distributions[,'size']
+                prob.initial <- initial.params$distributions[,'prob']
+                size.initial[is.na(size.initial)] <- 0
+                prob.initial[is.na(prob.initial)] <- 0
+            } else if (init == 'random') {
+                A.initial <- matrix(stats::runif(numstates^2), ncol=numstates)
+                A.initial <- sweep(A.initial, 1, rowSums(A.initial), "/")
+                proba.initial <- stats::runif(numstates)
+                # Distributions for dependent states
+                size.initial <- stats::runif(1, min=0, max=100) * cumsum(dependent.states.mask)
+                prob.initial <- stats::runif(1) * dependent.states.mask
+                # Assign initials for the 0-somy distribution
+                index <- which('0-somy'==state.labels)
+                size.initial[index] <- 1
+                prob.initial[index] <- 0.5
+            } else if (init == 'standard') {
+                A.initial <- matrix(NA, ncol=numstates, nrow=numstates)
+                for (irow in 1:numstates) {
+                    for (icol in 1:numstates) {
+                        if (irow==icol) { A.initial[irow,icol] <- 0.9 }
+                        else { A.initial[irow,icol] <- 0.1/(numstates-1) }
+                    }
+                }
+                proba.initial <- rep(1/numstates, numstates)
+                ## Set initial mean of most.frequent.state distribution to max of count histogram
+                max.counts <- as.integer(names(which.max(table(counts[counts>0]))))
+                divf <- max(multiplicity[most.frequent.state], 1)
+                mean.initial.monosomy <- max.counts/divf
+                var.initial.monosomy <- mean.initial.monosomy * 2
+                # mean.initial.monosomy <- mean(counts[counts>0])/divf
+                # var.initial.monosomy <- var(counts[counts>0])/divf
+                if (is.na(mean.initial.monosomy)) {
+                    mean.initial.monosomy <- 1
+                }
+                if (is.na(var.initial.monosomy)) {
+                    var.initial.monosomy <- mean.initial.monosomy + 1
+                }
+                if (mean.initial.monosomy >= var.initial.monosomy) {
+                    mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
+                    var.initial <- (mean.initial.monosomy+1) * cumsum(dependent.states.mask)
+                    size.initial <- rep(0,numstates)
+                    prob.initial <- rep(0,numstates)
+                    mask <- dependent.states.mask
+                    size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
+                    prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
+                } else {
+                    mean.initial <- mean.initial.monosomy * cumsum(dependent.states.mask)
+                    var.initial <- var.initial.monosomy * cumsum(dependent.states.mask)
+                    size.initial <- rep(0,numstates)
+                    prob.initial <- rep(0,numstates)
+                    mask <- dependent.states.mask
+                    size.initial[mask] <- dnbinom.size(mean.initial[mask], var.initial[mask])
+                    prob.initial[mask] <- dnbinom.prob(mean.initial[mask], var.initial[mask])
+                }
+                # Assign initials for the 0-somy distribution
+                index <- which('0-somy'==state.labels)
+                size.initial[index] <- 1
+                prob.initial[index] <- 0.5
+            }
+
+            hmm <- .C("C_univariate_hmm",
+                counts = as.integer(counts), # int* O
+                num.bins = as.integer(numbins), # int* T
+                num.states = as.integer(numstates), # int* N
+                state.labels = as.integer(state.labels), # int* state_labels
+                size = double(length=numstates), # double* size
+                prob = double(length=numstates), # double* prob
+                num.iterations = as.integer(max.iter), # int* maxiter
+                time.sec = as.integer(max.time), # double* maxtime
+                loglik.delta = as.double(eps.try), # double* eps
+                maxPosterior = double(length=numbins), # double* maxPosterior
+                states = integer(length=numbins), # int* states
+                A = double(length=numstates*numstates), # double* A
+                proba = double(length=numstates), # double* proba
+                loglik = double(length=1), # double* loglik
+                weights = double(length=numstates), # double* weights
+                distr.type = as.integer(state.distributions), # int* distr_type
+                size.initial = as.vector(size.initial), # double* initial_size
+                prob.initial = as.vector(prob.initial), # double* initial_prob
+                A.initial = as.vector(A.initial), # double* initial_A
+                proba.initial = as.vector(proba.initial), # double* initial_proba
+                use.initial.params = as.logical(1), # bool* use_initial_params
+                num.threads = as.integer(num.threads), # int* num_threads
+                error = as.integer(0), # int* error (error handling)
+                count.cutoff = as.integer(count.cutoff), # int* count.cutoff
+                algorithm = as.integer(algorithm), # int* algorithm
+                PACKAGE = 'AneuFinder'
+            )
+
+            hmm$eps <- eps.try
+            if (num.trials > 1) {
+                if (hmm$loglik.delta > hmm$eps & istep == 1) {
+                    warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge in trial run ",i_try,"!\n"))
+                }
+                # Store model in list
+                hmm$counts <- NULL
+                modellist[[as.character(i_try)]] <- hmm
+                init <- 'random'
+            } else if (num.trials == 1) {
+                if (hmm$loglik.delta > eps & istep == 1) {
+                    warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": HMM did not converge!\n"))
+                }
+            }
+        }
+
+        if (num.trials > 1) {
+
+            # Mathematically we should select the fit with highest loglikelihood. If we think the fit with the highest loglikelihood is incorrect, we should change the underlying model. However, this is very complex and we choose to select a fit that we think is (more) correct, although it has not the highest support given our (imperfect) model.
+            if (length(modellist)>1) {
+                # ## Get some metrics
+                # num.segs <- sapply(modellist, function(x) { length(rle(x$states)$values) })
+                # logliks <- sapply(modellist,'[[','loglik')
+                # bhattacharyya <- numeric(length=length(modellist))
+                # sos <- numeric(length=length(modellist))
+                # diff.most.frequent <- numeric(length=length(modellist))
+                # ind.1somy <- which(state.labels=='1-somy')
+                # for (i1 in 1:length(modellist)) {
+                #     hmm <- modellist[[i1]]
+                #     x <- 0:max(max(counts), 500)
+                #     bhattacharyya[i1] <- -log(sum(sqrt(stats::dnbinom(x, size=hmm$size[ind.1somy], prob=hmm$prob[ind.1somy]) * stats::dnbinom(x, size=hmm$size[ind.1somy+1], prob=hmm$prob[ind.1somy+1]))))
+                #     sos[i1] <- sum( (counts - dnbinom.mean(size=hmm$size, prob=hmm$prob)[hmm$states]) ^ 2 )
+                #     diff.most.frequent[i1] <- 1 - hmm$weights[state.labels==most.frequent.state]
+                # }
+                # names(bhattacharyya) <- 1:length(modellist)
+                # names(sos) <- 1:length(modellist)
+                # names(diff.most.frequent) <- 1:length(modellist)
+                # # Diff to most.frequen.state
+                # df.select <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
+                # ## Get probability of finding value
+                # num.segs <- ecdf(num.segs)(num.segs)
+                # logliks <- 1-ecdf(logliks)(logliks)
+                # bhattacharyya <- 1-ecdf(bhattacharyya)(bhattacharyya)
+                # sos <- ecdf(sos)(sos)
+                # diff.most.frequent <- ecdf(diff.most.frequent)(diff.most.frequent)
+                # df.order <- data.frame(num.seg=num.segs, loglik=logliks, bhattacharyya=bhattacharyya, sos=sos, diff.most.frequent=diff.most.frequent)
+                # ## Get sum for each tuple
+                # ind <- rowSums(df.order)
+                # ## Select the one that has lowest sum
+                # index2use <- which.min(ind)
+                # result$dforder <- df.order
+                # result$dfselect <- df.select
+                # result$dfi <- index2use
+
+                ## Select models where weight of most.frequent.state is at least half of that of actual most frequent state, then select model with highest loglik
+                logliks <- sapply(modellist,'[[','loglik')
+                df.weight <- as.data.frame(lapply(modellist, '[[', 'weights'))
+                names(df.weight) <- 1:length(modellist)
+                rownames(df.weight) <- state.labels
+                models2use <- df.weight[most.frequent.state,] / apply(df.weight, 2, max) > 0.5
+                models2use[is.na(models2use)] <- FALSE
+                if (any(models2use)) {
+                    index2use <- names(which.max(logliks[models2use]))
+                } else {
+                    index2use <- names(which.max(logliks))
+                }
+            } else {
+                index2use <- 1
+            }
+            hmm <- modellist[[index2use]]
+
+            # Check if size and prob parameter are correct
+            if (any(is.na(hmm$size) | is.nan(hmm$size) | is.infinite(hmm$size) | is.na(hmm$prob) | is.nan(hmm$prob) | is.infinite(hmm$prob))) {
+                hmm$error <- 3
+            } else {
+
+                # Rerun the HMM with different epsilon and initial parameters from trial run
+                message(paste0("Rerunning trial ",index2use," with eps = ",eps))
+                hmm <- .C("C_univariate_hmm",
+                    counts = as.integer(counts), # int* O
+                    num.bins = as.integer(numbins), # int* T
+                    num.states = as.integer(numstates), # int* N
+                    state.labels = as.integer(state.labels), # int* state_labels
+                    size = double(length=numstates), # double* size
+                    prob = double(length=numstates), # double* prob
+                    num.iterations = as.integer(max.iter), # int* maxiter
+                    time.sec = as.integer(max.time), # double* maxtime
+                    loglik.delta = as.double(eps), # double* eps
+                    maxPosterior = double(length=numbins), # double* maxPosterior
+                    states = integer(length=numbins), # int* states
+                    A = double(length=numstates*numstates), # double* A
+                    proba = double(length=numstates), # double* proba
+                    loglik = double(length=1), # double* loglik
+                    weights = double(length=numstates), # double* weights
+                    distr.type = as.integer(state.distributions), # int* distr_type
+                    size.initial = as.vector(hmm$size), # double* initial_size
+                    prob.initial = as.vector(hmm$prob), # double* initial_prob
+                    A.initial = as.vector(hmm$A), # double* initial_A
+                    proba.initial = as.vector(hmm$proba), # double* initial_proba
+                    use.initial.params = as.logical(1), # bool* use_initial_params
+                    num.threads = as.integer(num.threads), # int* num_threads
+                    error = as.integer(0), # int* error (error handling)
+                    count.cutoff = as.integer(count.cutoff), # int* count.cutoff
+                    algorithm = as.integer(algorithm), # int* algorithm
+                    PACKAGE = 'AneuFinder'
+                )
+            }
+
+        } # if (num.trials > 1)
+
+        if (istep == 1) {
+            ### Make return object ###
+            ## Check for errors
+            if (hmm$error == 0) {
+                ## Bin coordinates and states ###
+                result$bins <- binned.data
+                result$bins$state <- state.labels[hmm$states]
+                result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
+                ## Parameters
+                # Weights
+                result$weights <- hmm$weights
+                names(result$weights) <- state.labels
+                # Transition matrices
+                transitionProbs <- matrix(hmm$A, ncol=hmm$num.states)
+                rownames(transitionProbs) <- state.labels
+                colnames(transitionProbs) <- state.labels
+                result$transitionProbs <- transitionProbs
+                transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states)
+                rownames(transitionProbs.initial) <- state.labels
+                colnames(transitionProbs.initial) <- state.labels
+                result$transitionProbs.initial <- transitionProbs.initial
+                # Initial probs
+                result$startProbs <- hmm$proba
+                names(result$startProbs) <- state.labels
+                result$startProbs.initial <- hmm$proba.initial
+                names(result$startProbs.initial) <- state.labels
+                # Distributions
+                distributions <- data.frame()
+                distributions.initial <- data.frame()
+                for (idistr in 1:length(hmm$distr.type)) {
+                    distr <- levels(state.distributions)[hmm$distr.type[idistr]]
+                    if (distr == 'dnbinom') {
+                        distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
+                        distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
+                    } else if (distr == 'dgeom') {
+                        distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
+                        distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
+                    } else if (distr == 'delta') {
+                        distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
+                        distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
+                    } else if (distr == 'dbinom') {
+                        distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
+                        distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
+                    }
+                }
+                rownames(distributions) <- state.labels
+                rownames(distributions.initial) <- state.labels
+                result$distributions <- distributions
+                result$distributions.initial <- distributions.initial
+                ## Convergence info
+                convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error)
+                result$convergenceInfo <- convergenceInfo
+
+            } else if (hmm$error == 1) {
+                warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
+            } else if (hmm$error == 2) {
+                warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
+            } else if (hmm$error == 3) {
+                warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": NA/NaN/Inf in 'size' or 'prob' parameter detected. This is probably because your binned data contains too few bins."))
+            }
+            if (hmm$error != 0) {
+                result$warlist <- warlist
+                return(result)
+            }
+        } # if (istep == 1)
+
+        if (istep == 1) { ptm <- startTimedMessage("Collecting counts and posteriors ...") }
+
+        ## Inflate posteriors, states, counts to new offset
+        ind <- findOverlaps(stepbins, binned.data)
+        astates.step[ind@from, 'currentOffset'] <- hmm$states[ind@to]
+        amaxPosterior.step[ind@from, 'currentOffset'] <- hmm$maxPosterior[ind@to]
+
+        ## Find offset that maximizes the posteriors for each bin
+        ##-- Start stuff to call C code
+        # Work with changing dimensions to avoid copies being made
+        dim_amaxPosterior.step <- dim(amaxPosterior.step)
+        dimnames_amaxPosterior.step <- dimnames(amaxPosterior.step)
+        dim(amaxPosterior.step) <- NULL
+        z <- .C("C_array2D_which_max",
+            array2D = amaxPosterior.step,
+            dim = as.integer(dim_amaxPosterior.step),
+            ind_max = integer(dim_amaxPosterior.step[1]),
+            value_max = double(dim_amaxPosterior.step[1]))
+        dim(amaxPosterior.step) <- dim_amaxPosterior.step
+        dimnames(amaxPosterior.step) <- dimnames_amaxPosterior.step
+        ind <- z$ind_max
+        ##-- End stuff to call C code
+        for (i1 in 1:2) {
+            mask <- ind == i1
+            astates.step[mask, 'previousOffsets'] <- astates.step[mask,i1, drop=FALSE]
+            amaxPosterior.step[mask, 'previousOffsets'] <- amaxPosterior.step[mask,i1, drop=FALSE]
+        }
+        if (istep == 1) { stopTimedMessage(ptm) }
+
+        if (istep > 1) {
+            stopTimedMessage(ptm.offset)
+        }
+
+        rm(hmm, ind)
+    } # loop over offsets
+    states.step <- astates.step[, 'previousOffsets']
+    rm(amaxPosterior.step, astates.step); gc()
+
     ### Make return object ###
-    ## Check for errors
-    if (hmm$error == 0) {
-        ## Bin coordinates and states ###
-        result$bins$state <- state.labels[hmm$states]
-        result$bins$copy.number <- multiplicity[as.character(result$bins$state)]
-        ## Segmentation
-        message("Making segmentation ...", appendLF=FALSE)
-        ptm <- proc.time()
-        suppressMessages(
-            result$segments <- as(collapseBins(as.data.frame(result$bins), column2collapseBy='state', columns2drop='width', columns2average=c('counts','mcounts','pcounts')), 'GRanges')
-        )
-        seqlevels(result$segments) <- seqlevels(result$bins) # correct order from as()
-        seqlengths(result$segments) <- seqlengths(binned.data)[names(seqlengths(result$segments))]
-        time <- proc.time() - ptm
-        message(" ",round(time[3],2),"s")
-        ## Parameters
-        # Weights
-        result$weights <- hmm$weights
-        names(result$weights) <- state.labels
-        # Transition matrices
-        transitionProbs <- matrix(hmm$A, ncol=hmm$num.states)
-        rownames(transitionProbs) <- state.labels
-        colnames(transitionProbs) <- state.labels
-        result$transitionProbs <- transitionProbs
-        transitionProbs.initial <- matrix(hmm$A.initial, ncol=hmm$num.states)
-        rownames(transitionProbs.initial) <- state.labels
-        colnames(transitionProbs.initial) <- state.labels
-        result$transitionProbs.initial <- transitionProbs.initial
-        # Initial probs
-        result$startProbs <- hmm$proba
-        names(result$startProbs) <- state.labels
-        result$startProbs.initial <- hmm$proba.initial
-        names(result$startProbs.initial) <- state.labels
-        # Distributions
-        distributions <- data.frame()
-        distributions.initial <- data.frame()
-        for (idistr in 1:length(hmm$distr.type)) {
-            distr <- levels(state.distributions)[hmm$distr.type[idistr]]
-            if (distr == 'dnbinom') {
-                distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dnbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dnbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
-                distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dnbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dnbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
-            } else if (distr == 'dgeom') {
-                distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=hmm$prob[idistr], mu=dgeom.mean(hmm$prob[idistr]), variance=dgeom.variance(hmm$prob[idistr])))
-                distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=hmm$prob.initial[idistr], mu=dgeom.mean(hmm$prob.initial[idistr]), variance=dgeom.variance(hmm$prob.initial[idistr])))
-            } else if (distr == 'delta') {
-                distributions <- rbind(distributions, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
-                distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=NA, prob=NA, mu=0, variance=0))
-            } else if (distr == 'dbinom') {
-                distributions <- rbind(distributions, data.frame(type=distr, size=hmm$size[idistr], prob=hmm$prob[idistr], mu=dbinom.mean(hmm$size[idistr],hmm$prob[idistr]), variance=dbinom.variance(hmm$size[idistr],hmm$prob[idistr])))
-                distributions.initial <- rbind(distributions.initial, data.frame(type=distr, size=hmm$size.initial[idistr], prob=hmm$prob.initial[idistr], mu=dbinom.mean(hmm$size.initial[idistr],hmm$prob.initial[idistr]), variance=dbinom.variance(hmm$size.initial[idistr],hmm$prob.initial[idistr])))
-            }
-        }
-        rownames(distributions) <- state.labels
-        rownames(distributions.initial) <- state.labels
-        result$distributions <- distributions
-        result$distributions.initial <- distributions.initial
-        ## Convergence info
-        convergenceInfo <- list(eps=eps, loglik=hmm$loglik, loglik.delta=hmm$loglik.delta, num.iterations=hmm$num.iterations, time.sec=hmm$time.sec, error=hmm$error)
-        result$convergenceInfo <- convergenceInfo
-        ## Quality info
-        result$qualityInfo <- as.list(getQC(result))
-    } else if (hmm$error == 1) {
-        warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": A NaN occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library! The following factors are known to cause this error: 1) Your read counts contain very high numbers. Try again with a lower value for 'count.cutoff.quantile'. 2) Your library contains too few reads in each bin. 3) Your library contains reads for a different genome than it was aligned to."))
-    } else if (hmm$error == 2) {
-        warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": An error occurred during the Baum-Welch! Parameter estimation terminated prematurely. Check your library"))
-    } else if (hmm$error == 3) {
-        warlist[[length(warlist)+1]] <- warning(paste0("ID = ",ID,": NA/NaN/Inf in 'size' or 'prob' parameter detected. This is probably because your binned data contains too few bins."))
-    }
+    ## Bin coordinates and states ###
+    result$stepbins <- stepbins
+    result$stepbins$state <- state.labels[states.step]
+    result$stepbins$copy.number <- multiplicity[as.character(result$stepbins$state)]
+    ## Segmentation
+    message("Making segmentation ...", appendLF=FALSE)
+    ptm <- proc.time()
+    suppressMessages(
+        result$segments <- as(collapseBins(as.data.frame(result$stepbins), column2collapseBy='state', columns2drop='width', columns2average=c('counts','mcounts','pcounts')), 'GRanges')
+    )
+    seqlevels(result$segments) <- seqlevels(result$stepbins) # correct order from as()
+    seqlengths(result$segments) <- seqlengths(binned.data)[names(seqlengths(result$segments))]
+    time <- proc.time() - ptm
+    message(" ",round(time[3],2),"s")
+    ## Counts
+    result$counts <- binned.data.list
+    ## Quality info
+    result$qualityInfo <- as.list(getQC(result))

     ## Issue warnings
     result$warnings <- warlist
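
In short, the rewritten function fits the HMM once on the first binning, re-decodes every further offset binning starting from those fitted parameters, and then keeps, for every disjoint step bin, the state coming from the offset with the highest posterior. A pure-R sketch of that per-bin reduction, which C_array2D_which_max performs in C (all numbers are made up):

    ## Two offsets, five step bins: decoded state and its posterior per bin
    amaxPosterior <- cbind(previousOffsets=c(0.9, 0.4, 0.7, 0.6, 0.8),
                           currentOffset=c(0.5, 0.8, 0.6, 0.9, 0.7))
    astates <- cbind(previousOffsets=c(2L, 1L, 3L, 2L, 2L),
                     currentOffset=c(2L, 2L, 3L, 3L, 2L))
    best <- apply(amaxPosterior, 1, which.max)   # row-wise argmax over offsets
    states.step <- astates[cbind(seq_len(nrow(astates)), best)]
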
...
@@ -911,7 +996,11 @@ DNAcopy.findCNVs <- function(binned.data, ID=NULL, CNgrid.start=1.5, count.cutof
         }
     }
     ## Intercept user input
-    binned.data <- loadFromFiles(binned.data, check.class='GRanges')[[1]]
+    binned.data <- loadFromFiles(binned.data, check.class=c('GRanges', 'GRangesList'))[[1]]
+    if (class(binned.data) == 'GRangesList') {
+        binned.data.list <- binned.data
+        binned.data <- binned.data.list[[1]]
+    }
     if (is.null(ID)) {
         ID <- attr(binned.data, 'ID')
     }
...
@@ -2,9 +2,9 @@
 #'
 #' Wrapper to load \pkg{\link{AneuFinder}} objects from file and check the class of the loaded objects.
 #'
-#' @param files A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.
-#' @param check.class Any combination of \code{c('GRanges', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.
-#' @return A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
+#' @param files A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.
+#' @param check.class Any combination of \code{c('GRanges', 'GRangesList', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.
+#' @return A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
 #' @export
 #' @examples
 #'## Get some files that you want to load
...
@@ -14,15 +14,15 @@
 #'hmms <- loadFromFiles(files[1:10])
 #'lapply(hmms, plot, type='profile')
 #'
-loadFromFiles <- function(files, check.class=c('GRanges', 'aneuHMM', 'aneuBiHMM')) {
+loadFromFiles <- function(files, check.class=c('GRanges', 'GRangesList', 'aneuHMM', 'aneuBiHMM')) {

     # ptm <- startTimedMessage("Loading data from files ...")
     if (is.null(files)) {
         # stopTimedMessage(ptm)
         return(files)
     }
-    if (any(! check.class %in% c('GRanges', class.univariate.hmm, class.bivariate.hmm))) {
-        stop("Argument 'check.class' must contain any combination of c('", paste0(c('GRanges', class.univariate.hmm, class.bivariate.hmm), collapse="', '"), "').")
+    if (any(! check.class %in% c('GRanges', 'GRangesList', class.univariate.hmm, class.bivariate.hmm))) {
+        stop("Argument 'check.class' must contain any combination of c('", paste0(c('GRanges', 'GRangesList', class.univariate.hmm, class.bivariate.hmm), collapse="', '"), "').")
     }
     modellist <- list()
     if (is.character(files)) {
...
@@ -114,7 +114,7 @@ getQC <- function(models) {
             return(x)
         }
     }
-    models <- suppressMessages( loadFromFiles(models, check.class=c('GRanges', class.univariate.hmm, class.bivariate.hmm)) )
+    models <- suppressMessages( loadFromFiles(models, check.class=c('GRanges', 'GRangesList', class.univariate.hmm, class.bivariate.hmm)) )
     qframe <- list()
     for (i1 in 1:length(models)) {
         model <- models[[i1]]
...
@@ -144,6 +144,19 @@ getQC <- function(models) {
                                     bhattacharyya=NA,
                                     sos=NA
                                     )
+        } else if (class(model) == 'GRangesList') {
+            bins <- model[[1]]
+            qframe[[i1]] <- data.frame( total.read.count=sum(bins$counts),
+                                        avg.binsize=mean(width(bins)),
+                                        avg.read.count=mean(bins$counts),
+                                        spikiness=qc.spikiness(bins$counts),
+                                        entropy=qc.entropy(bins$counts),
+                                        complexity=null2na(attr(model,'qualityInfo')$complexity[1]),
+                                        loglik=NA,
+                                        num.segments=NA,
+                                        bhattacharyya=NA,
+                                        sos=NA
+                                        )
         }
     }
     names(qframe) <- names(models)
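
For a GRangesList the QC metrics are computed from the first (unshifted) binning only; the model-based columns stay NA because no HMM fit is attached. qc.spikiness() and qc.entropy() are internal helpers; hedged stand-ins with the definitions these names usually denote (the package's own implementations may differ in detail):

    ## Assumed definitions, for illustration only
    spikiness.sketch <- function(counts) sum(abs(diff(counts))) / sum(counts)
    entropy.sketch <- function(counts) {
        p <- counts / sum(counts)
        -sum(p * log(p), na.rm=TRUE)  # Shannon entropy of the bin-count distribution
    }
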
...
@@ -11,7 +11,7 @@ bivariate.findCNVs(binned.data, ID = NULL, eps = 0.1, init = "standard",
   most.frequent.state = "1-somy", algorithm = "EM", initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively, a \code{\link{GRangesList}} object with binned read counts at several offsets.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

...
@@ -12,7 +12,7 @@ findCNVs(binned.data, ID = NULL, eps = 0.1, init = "standard",
   initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively, a \code{\link{GRangesList}} object with binned read counts at several offsets.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

...
@@ -60,7 +60,7 @@ An \code{\link{aneuHMM}} object.
 \examples{
 ## Get an example BED file with single-cell-sequencing reads
 bedfile <- system.file("extdata", "KK150311_VI_07.bam.bed.gz", package="AneuFinderData")
-## Bin the BAM file into bin size 1Mp
+## Bin the data into bin size 1Mb
 binned <- binReads(bedfile, assembly='mm10', binsize=1e6,
                    chromosomes=c(1:19,'X','Y'))
 ## Fit the Hidden Markov Model
...
@@ -12,7 +12,7 @@ findCNVs.strandseq(binned.data, ID = NULL, eps = 0.1, init = "standard",
   initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively, a \code{\link{GRangesList}} object with binned read counts at several offsets.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

...
@@ -4,15 +4,16 @@
 \alias{loadFromFiles}
 \title{Load \pkg{AneuFinder} objects from file}
 \usage{
-loadFromFiles(files, check.class = c("GRanges", "aneuHMM", "aneuBiHMM"))
+loadFromFiles(files, check.class = c("GRanges", "GRangesList", "aneuHMM",
+  "aneuBiHMM"))
 }
 \arguments{
-\item{files}{A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.}
+\item{files}{A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects or a character vector with files that contain such objects.}

-\item{check.class}{Any combination of \code{c('GRanges', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.}
+\item{check.class}{Any combination of \code{c('GRanges', 'GRangesList', 'aneuHMM', 'aneuBiHMM')}. If any of the loaded objects does not belong to the specified class, an error is thrown.}
 }
 \value{
-A list of \code{\link{GRanges}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
+A list of \code{\link{GRanges}}, \code{\link{GRangesList}}, \code{\link{aneuHMM}} or \code{\link{aneuBiHMM}} objects.
 }
 \description{
 Wrapper to load \pkg{\link{AneuFinder}} objects from file and check the class of the loaded objects.
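
A quick round-trip showing that a stored offset binning now passes the class check; this assumes, as the function's character-vector handling suggests, that objects are stored as .RData via save() (object and file names are throwaway):

    library(GenomicRanges)
    binned <- GRangesList('0'=GRanges('chr1', IRanges(1, 1e6), counts=42L))
    f <- tempfile(fileext=".RData")
    save(binned, file=f)
    ## with AneuFinder loaded:
    ## loaded <- loadFromFiles(f, check.class='GRangesList')
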
...
@@ -30,8 +30,7 @@ A \code{\link{GRanges}} object with reads.
 Merge strand libraries to generate a high-coverage Strand-seq library.
 }
 \examples{
-files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
-reads <- mergeStrandseqFiles(files, assembly='hg38')
-
+#files <- list.files('~/work_ERIBA/test/aneufinder_files/DH161028_WT', full.names = TRUE)
+#reads <- mergeStrandseqFiles(files, assembly='hg38')
 }

...
@@ -11,7 +11,7 @@ univariate.findCNVs(binned.data, ID = NULL, eps = 0.1, init = "standard",
   most.frequent.state = "2-somy", algorithm = "EM", initial.params = NULL)
 }
 \arguments{
-\item{binned.data}{A \link{GRanges} object with binned read counts.}
+\item{binned.data}{A \code{\link{GRanges}} object with binned read counts. Alternatively, a \code{\link{GRangesList}} object with binned read counts at several offsets.}

 \item{ID}{An identifier that will be used to identify this sample in various downstream functions. Could be the file name of the \code{binned.data} for example.}

...
@@ -9,7 +9,7 @@ static double** multiD;
 // ===================================================================================================================================================
 // This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the EM and returns the result to R.
 // ===================================================================================================================================================
-void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm)
+void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm)
 {

 // Define logging level
...
@@ -137,16 +137,16 @@ void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, dou
         else { *error = 2; }
     }

-// // Compute the posteriors and save results directly to the R pointer
-// //FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
-// #pragma omp parallel for
-// for (int iN=0; iN<*N; iN++)
-// {
-//     for (int t=0; t<*T; t++)
-//     {
-//         posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
-//     }
-// }
+    // // Compute the posteriors and save results directly to the R pointer
+    // //FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
+    // #pragma omp parallel for
+    // for (int iN=0; iN<*N; iN++)
+    // {
+    //     for (int t=0; t<*T; t++)
+    //     {
+    //         posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
+    //     }
+    // }

     // Compute the states from posteriors
     //FILE_LOG(logDEBUG1) << "Computing states from posteriors";
...
@@ -160,6 +160,7 @@ void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, dou
         }
         ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
         states[t] = state_labels[ind_max];
+        maxPosterior[t] = posterior_per_t[ind_max];
     }

     //FILE_LOG(logDEBUG1) << "Return parameters";
...
@@ -365,3 +366,23 @@ void multivariate_cleanup(int* N)
     FreeDoubleMatrix(multiD, *N);
 }

+
+// ====================================================================
+// C version of apply(array2D, 1, which.max) and apply(array2D, 1, max)
+// ====================================================================
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max)
+{
+    // array2D is actually a vector, but is intended to originate from a 2D array in R
+    std::vector<double> value_per_i0(dim[1]);
+    for (int i0=0; i0<dim[0]; i0++)
+    {
+        for (int i1=0; i1<dim[1]; i1++)
+        {
+            value_per_i0[i1] = array2D[i1 * dim[0] + i0];
+            // Rprintf("i0=%d, i1=%d, value_per_i0[%d] = %g\n", i0, i1, i1, value_per_i0[i1]);
+        }
+        ind_max[i0] = 1 + std::distance(value_per_i0.begin(), std::max_element(value_per_i0.begin(), value_per_i0.end()));
+        value_max[i0] = *std::max_element(value_per_i0.begin(), value_per_i0.end());
+    }
+
+}
\ No newline at end of file
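
As the header comment says, this mirrors R's row-wise which.max/max over a column-major array, returning 1-based indices so the result can be used directly for R subsetting. A quick R-side check of the equivalence, runnable once the package's compiled code is loaded (matrix values are arbitrary):

    a <- matrix(c(0.2, 0.9, 0.5, 0.8, 0.1, 0.6), nrow=3)  # 3 bins x 2 offsets
    z <- .C("C_array2D_which_max",
            array2D = as.double(a),         # column-major, as R stores matrices
            dim = as.integer(dim(a)),
            ind_max = integer(nrow(a)),
            value_max = double(nrow(a)),
            PACKAGE = 'AneuFinder')
    stopifnot(identical(z$ind_max, apply(a, 1, which.max)))  # 1-based indices
    stopifnot(isTRUE(all.equal(z$value_max, apply(a, 1, max))))
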
...
@@ -13,7 +13,7 @@
 // #endif

 extern "C"
-void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm);
+void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm);

 extern "C"
 void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm);
...
@@ -24,3 +24,5 @@ void univariate_cleanup();
 extern "C"
 void multivariate_cleanup(int* N);

+extern "C"
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max);
...
@@ -3,15 +3,17 @@
 #include "R_interface.h"


-R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
+R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
 R_NativePrimitiveArgType arg2[] = {REALSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
 R_NativePrimitiveArgType arg4[] = {INTSXP};
+R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP, REALSXP};

 static const R_CMethodDef CEntries[] = {
-    {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 24, arg1},
+    {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 25, arg1},
     {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 18, arg2},
     {"C_univariate_cleanup", (DL_FUNC) &univariate_cleanup, 0, NULL},
     {"C_multivariate_cleanup", (DL_FUNC) &multivariate_cleanup, 1, arg4},
+    {"C_array2D_which_max", (DL_FUNC) &array2D_which_max, 4, arg5},
     {NULL, NULL, 0, NULL}
 };
