Browse code

more memory efficient stepsize calculation

chakalakka authored on 03/05/2017 15:39:22
Showing 6 changed files

... ...
@@ -208,6 +208,7 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
208 208
 
209 209
     ### Define cleanup behaviour ###
210 210
     nummod <- dim(bins$counts)[2]
211
+    offsets <- dimnames(bins$counts)[[3]]
211 212
     on.exit(.C("C_multivariate_cleanup", as.integer(nummod)))
212 213
 
213 214
     # Prepare input for C function
... ...
@@ -229,9 +230,23 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
229 230
         startProbs.initial <- rep(1/length(comb.states), length(comb.states))
230 231
     }
231 232
 
232
-    offsets <- dimnames(bins$counts)[[3]]
233
-    states.list <- list()
234
-    aposteriors <- array(NA, dim = c(length(bins), length(comb.states), length(offsets)), dimnames = list(bin=NULL, state=comb.states, offset=offsets))
233
+    ### Arrays for finding maximum posterior for each bin between offsets
234
+    ## Make bins with offset
235
+    ptm <- startTimedMessage("Making bins with offsets ...")
236
+    if (length(offsets) > 1) {
237
+        stepbins <- suppressMessages( fixedWidthBins(chrom.lengths = seqlengths(bins), binsizes = as.numeric(offsets[2]))[[1]] )
238
+    } else {
239
+        stepbins <- bins
240
+        mcols(stepbins) <- NULL
241
+    }
242
+    sbins <- bins
243
+    mcols(sbins) <- NULL
244
+    aposteriors.step <- array(0, dim = c(length(stepbins), length(comb.states), 2), dimnames = list(bin=NULL, state=comb.states, offset=c('previousOffsets', 'currentOffset')))
245
+    acounts.step <- array(0, dim = c(length(stepbins), nummod, 2), dimnames = list(bin=NULL, track=info$ID, offset=c('previousOffsets', 'currentOffset')))
246
+    amaxPosterior.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset')))
247
+    astates.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset')))
248
+    
249
+    ### Loop over offsets ###
235 250
     for (ioffset in 1:length(offsets)) {
236 251
       
237 252
         offset <- offsets[ioffset]
... ...
@@ -239,8 +254,8 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
239 254
             messageU("Obtaining states for step size = ", offset, overline = '-', underline = NULL)
240 255
             ## Run only one iteration (no updating) if we are already over ioffset=1
241 256
             max.iter <- 1
242
-            transitionProbs.initial <- hmm$A
243
-            startProbs.initial <- hmm$proba
257
+            transitionProbs.initial <- hmm.A
258
+            startProbs.initial <- hmm.proba
244 259
             verbosity <- 0
245 260
         }
246 261
       
... ...
@@ -264,6 +279,7 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
264 279
             densities = double(length=lenDensities), # double* densities
265 280
             keep.densities = as.logical(keep.densities), # bool* keep_densities
266 281
             states = integer(length=length(bins)), # int* states
282
+            maxPosterior = double(length=length(bins)), # double* maxPosterior
267 283
             A = double(length=length(comb.states)*length(comb.states)), # double* A
268 284
             proba = double(length=length(comb.states)), # double* proba
269 285
             loglik = double(length=1), # double* loglik
... ...
@@ -328,134 +344,121 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
328 344
                 war <- warning("HMM did not converge!\n")
329 345
             }
330 346
         }
331
-        ## Store states and posteriors in list
332
-        states.list[[offset]] <- hmm$states
333
-        aposteriors[,, offset] <- hmm$posteriors
347
+        
348
+        ## Store counts and posteriors in list
349
+        dim(hmm$posteriors) <- c(length(bins), length(comb.states))
350
+        dimnames(hmm$posteriors) <- list(bin=NULL, state=comb.states)
351
+        dim(hmm$counts) <- c(length(bins), nummod)
352
+        dimnames(hmm$counts) <- list(bin=NULL, track=info$ID)
353
+        if (keep.densities) {
354
+            densities <- hmm$densities
355
+        }
356
+        hmm.A <- hmm$A
357
+        hmm.proba <- hmm$proba
358
+        
359
+        ## Inflate posteriors to new offset
360
+        bins.shift <- suppressWarnings( shift(sbins, shift = as.numeric(offset)) )
361
+        ind <- findOverlaps(stepbins, bins.shift)
362
+        aposteriors.step[ind@from, , 'currentOffset'] <- hmm$posteriors[ind@to, , drop=FALSE]
363
+        acounts.step[ind@from, , 'currentOffset'] <- hmm$counts[ind@to, , drop=FALSE]
364
+        astates.step[ind@from, 'currentOffset'] <- hmm$states[ind@to]
365
+        amaxPosterior.step[ind@from, 'currentOffset'] <- hmm$maxPosterior[ind@to]
366
+        
367
+        ## Sum counts
368
+        for (i3 in 1:dim(acounts.step)[2]) {
369
+            acounts.step[, i3, 'previousOffsets'] <- acounts.step[, i3, 'previousOffsets'] + acounts.step[, i3, 'currentOffset']
370
+        }
371
+        
372
+        ## Find offset that maximizes the posteriors for each bin
373
+        # Start stuff to call C code
374
+            # Work with changing dimensions to avoid copies being made
375
+            dim_amaxPosterior.step <- dim(amaxPosterior.step)
376
+            dimnames_amaxPosterior.step <- dimnames(amaxPosterior.step)
377
+            dim(amaxPosterior.step) <- NULL
378
+            z <- .C("C_array2D_which_max",
379
+                    array2D = amaxPosterior.step,
380
+                    dim = as.integer(dim_amaxPosterior.step),
381
+                    ind_max = integer(dim_amaxPosterior.step[1]),
382
+                    value_max = double(dim_amaxPosterior.step[1]))
383
+            dim(amaxPosterior.step) <- dim_amaxPosterior.step
384
+            dimnames(amaxPosterior.step) <- dimnames_amaxPosterior.step
385
+            ind <- z$ind_max
386
+        # End stuff to call C code
387
+        for (i1 in 1:2) {
388
+            mask <- ind == i1
389
+            aposteriors.step[mask, , 'previousOffsets'] <- aposteriors.step[mask,,i1, drop=FALSE]
390
+            astates.step[mask, 'previousOffsets'] <- astates.step[mask,i1, drop=FALSE]
391
+            amaxPosterior.step[mask, 'previousOffsets'] <- amaxPosterior.step[mask,i1, drop=FALSE]
392
+        }
334 393
         
335 394
         message("Time spent in multivariate HMM: ", appendLF=FALSE)
336 395
         stopTimedMessage(ptm)
337 396
         ptm <- proc.time()
338 397
     
398
+        rm(hmm)
339 399
     } # loop over offsets
400
+    states.step <- astates.step[, 'previousOffsets']
401
+    rm(amaxPosterior.step, astates.step)
340 402
 
341
-    ### Find maximum posterior for each bin between offsets
342
-    ## Make bins with offset
343
-    ptm <- startTimedMessage("Making bins with offsets ...")
344
-    if (length(offsets) > 1) {
345
-        stepbins <- suppressMessages( fixedWidthBins(chrom.lengths = seqlengths(bins), binsizes = as.numeric(offsets[2]))[[1]] )
403
+    # Average and normalize counts to RPKM
404
+    ptm <- startTimedMessage("Collecting counts and posteriors over offsets ...")
405
+    counts.step <- acounts.step[,, 'previousOffsets'] / dim(acounts.step)[2]
406
+    rm(acounts.step)
407
+    counts.step <- rpkm.matrix(counts.step, binsize = width(bins)[1])
408
+    stepbins$posteriors <- aposteriors.step[,,'previousOffsets']
409
+    stopTimedMessage(ptm)
410
+    
411
+    ## Counts ##
412
+    result$bincounts <- bins
413
+    result$bincounts$state <- NULL
414
+    
415
+    ## Bin coordinates, posteriors and states ##
416
+    result$bins <- GRanges(seqnames=seqnames(stepbins), ranges=ranges(stepbins))
417
+    result$bins$counts.rpkm <- counts.step
418
+    if (!is.null(use.states$state)) {
419
+        state.levels <- levels(use.states$state)
346 420
     } else {
347
-        stepbins <- bins
348
-        mcols(stepbins) <- NULL
421
+        state.levels <- comb.states
349 422
     }
350
-    aposteriors.step <- array(0, dim = c(length(stepbins), length(comb.states), length(offsets)), dimnames = list(bin=NULL, state=comb.states, offset=offsets))
351
-    acounts.step <- array(0, dim = c(length(stepbins), nummod, length(offsets)), dimnames = list(bin=NULL, track=result$info$ID, offset=offsets))
352
-    astates.step <- array(0, dim = c(length(stepbins), length(offsets)), dimnames = list(bin=NULL, offset=offsets))
353
-    ## Inflate posteriors to new offset
354
-    sbins <- bins
355
-    mcols(sbins) <- NULL
356
-    for (offset in offsets) {
357
-        bins.shift <- suppressWarnings( shift(sbins, shift = as.numeric(offset)) )
358
-        ind <- findOverlaps(stepbins, bins.shift)
359
-        aposteriors.step[ind@from, , offset] <- aposteriors[ind@to, , offset, drop=FALSE]
360
-        acounts.step[ind@from, , offset] <- bins$counts[ind@to, , offset, drop=FALSE]
361
-        astates.step[ind@from, offset] <- states.list[[offset]][ind@to]
423
+    result$bins$state <- factor(states.step, levels=state.levels)
424
+    if (get.posteriors) {
425
+        ptm <- startTimedMessage("Transforming posteriors to `per sample` representation ...")
426
+        binstates <- dec2bin(comb.states, ndigits=nummod)
427
+        post.per.track <- stepbins$posteriors %*% binstates
428
+        colnames(post.per.track) <- result$info$ID
429
+        result$bins$posteriors <- post.per.track
430
+        result$bins$peakScores <- getPeakScores(result$bins)
431
+        result$bins$differential.score <- differentialScoreSum(result$bins$peakScores, result$info)
432
+        stopTimedMessage(ptm)
433
+    }
434
+    if (keep.densities) {
435
+        result$bincounts$densities <- matrix(densities, ncol=length(comb.states))
436
+        colnames(result$bincounts$densities) <- comb.states
362 437
     }
363
-    rm(aposteriors)
364
-    stopTimedMessage(ptm)
365
-    
366
-    # Average and normalize counts to RPKM
367
-    ptm <- startTimedMessage("Averaging counts between offsets ...")
368
-    # Start stuff to call C code
369
-        # Work with changing dimensions to avoid copies being made
370
-        dim_acounts.step <- dim(acounts.step)
371
-        dimnames_acounts.step <- dimnames(acounts.step)
372
-        dim(acounts.step) <- NULL
373
-        z <- .C("C_array3D_mean",
374
-                array3D = acounts.step,
375
-                dim = as.integer(dim_acounts.step),
376
-                mean = double(dim_acounts.step[1]*dim_acounts.step[2]))
377
-        # dim(acounts.step) <- dim_acounts.step
378
-        rm(acounts.step)
379
-        dim(z$mean) <- dim_acounts.step[1:2]
380
-        dimnames(z$mean) <- dimnames_acounts.step[1:2]
381
-        counts.step <- z$mean
382
-        counts.step <- rpkm.matrix(counts.step, binsize = width(bins)[1])
383
-    # End stuff to call C code
384
-    stopTimedMessage(ptm)
385 438
     
386
-    ## Find offset that maximizes the posteriors for each bin
387
-    ptm <- startTimedMessage("Finding maximum posterior between offsets ...")
388
-    # Start stuff to call C code
389
-        # Work with changing dimensions to avoid copies being made
390
-        dim_aposteriors.step <- dim(aposteriors.step)
391
-        dimnames_aposteriors.step <- dimnames(aposteriors.step)
392
-        dim(aposteriors.step) <- NULL
393
-        z <- .C("C_array3D_which_max",
394
-                array3D = aposteriors.step,
395
-                dim = as.integer(dim_aposteriors.step),
396
-                ind_max = integer(dim_aposteriors.step[1]))
397
-        dim(aposteriors.step) <- dim_aposteriors.step
398
-        ind <- z$ind_max
399
-    # End stuff to call C code
400
-    numstates <- length(comb.states)
401
-    ind <- ceiling(ind / numstates)
402
-    posteriors.step <- array(0, dim = c(length(stepbins), numstates), dimnames = list(bin=NULL, state=comb.states))
403
-    states.step <- numeric(length=length(stepbins))
404
-    for (i1 in 1:length(offsets)) {
405
-        mask <- ind == i1
406
-        posteriors.step[mask,] <- aposteriors.step[mask,,i1, drop=FALSE]
407
-        states.step[mask] <- astates.step[mask, i1, drop=FALSE]
439
+    ## Add combinations ##
440
+    if (!is.null(use.states)) {
441
+        result$bins$combination <- factor(mapping[as.character(result$bins$state)], levels=levels(use.states$combination))
442
+    } else {
443
+        result$bins$combination <- result$bins$state
408 444
     }
409
-    stepbins$posteriors <- posteriors.step
410
-    stopTimedMessage(ptm)
411 445
     
412
-    ## Counts
413
-        result$bincounts <- bins
414
-        result$bincounts$state <- NULL
415
-    ## Bin coordinates, posteriors and states
416
-        result$bins <- GRanges(seqnames=seqnames(stepbins), ranges=ranges(stepbins))
417
-        result$bins$counts.rpkm <- counts.step
418
-        if (!is.null(use.states$state)) {
419
-            state.levels <- levels(use.states$state)
420
-        } else {
421
-            state.levels <- hmm$comb.states
422
-        }
423
-        result$bins$state <- factor(states.step, levels=state.levels)
424
-        if (get.posteriors) {
425
-            ptm <- startTimedMessage("Transforming posteriors to `per sample` representation ...")
426
-            binstates <- dec2bin(hmm$comb.states, ndigits=hmm$num.modifications)
427
-            post.per.track <- stepbins$posteriors %*% binstates
428
-            colnames(post.per.track) <- result$info$ID
429
-            result$bins$posteriors <- post.per.track
430
-            result$bins$peakScores <- getPeakScores(result$bins)
431
-            result$bins$differential.score <- differentialScoreSum(result$bins$peakScores, result$info)
432
-            stopTimedMessage(ptm)
433
-        }
434
-        if (keep.densities) {
435
-            result$bins$densities <- matrix(hmm$densities, ncol=hmm$max.states)
436
-            colnames(result$bins$densities) <- hmm$comb.states
437
-        }
438
-    ## Add combinations
439
-        if (!is.null(use.states)) {
440
-            result$bins$combination <- factor(mapping[as.character(result$bins$state)], levels=levels(use.states$combination))
441
-        } else {
442
-            result$bins$combination <- result$bins$state
443
-        }
444
-    ## Segmentation
445
-        result$segments <- multivariateSegmentation(result$bins, column2collapseBy='state')
446
-        if (!keep.posteriors) {
447
-            result$bins$posteriors <- NULL
448
-        }
449
-    ## Peaks
450
-        result$peaks <- list()
451
-        for (i1 in 1:ncol(result$segments$peakScores)) {
452
-            mask <- result$segments$peakScores[,i1] > 0
453
-            peaks <- result$segments[mask]
454
-            mcols(peaks) <- NULL
455
-            peaks$peakScores <- result$segments$peakScores[mask,i1]
456
-            result$peaks[[i1]] <- peaks
457
-        }
458
-        names(result$peaks) <- colnames(result$segments$peakScores)
446
+    ## Segmentation ##
447
+    result$segments <- multivariateSegmentation(result$bins, column2collapseBy='state')
448
+    if (!keep.posteriors) {
449
+        result$bins$posteriors <- NULL
450
+    }
451
+        
452
+    ## Peaks ##
453
+    result$peaks <- list()
454
+    for (i1 in 1:ncol(result$segments$peakScores)) {
455
+        mask <- result$segments$peakScores[,i1] > 0
456
+        peaks <- result$segments[mask]
457
+        mcols(peaks) <- NULL
458
+        peaks$peakScores <- result$segments$peakScores[mask,i1]
459
+        result$peaks[[i1]] <- peaks
460
+    }
461
+    names(result$peaks) <- colnames(result$segments$peakScores)
459 462
         
460 463
     return(result)
461 464
 
... ...
@@ -170,6 +170,7 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
170 170
     }
171 171
     numstates <- length(state.labels)
172 172
     iniproc <- which(init==c("standard","random","empiric")) # transform to int
173
+    offsets <- dimnames(binned.data$counts)[[2]]
173 174
 
174 175
     ### Assign initial parameters ###
175 176
     if (class(binned.data) == class.univariate.hmm) {
... ...
@@ -199,9 +200,23 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
199 200
     binsize <- width(binned.data)[1]
200 201
     if (keep.densities) { lenDensities <- numbins * numstates } else { lenDensities <- 1 }
201 202
     
202
-    offsets <- dimnames(binned.data$counts)[[2]]
203
-    counts.list <- list()
204
-    aposteriors <- array(NA, dim = c(numbins, numstates, length(offsets)), dimnames = list(bin=NULL, state=state.labels, offset=offsets))
203
+    ### Arrays for finding maximum posterior for each bin between offsets
204
+    ## Make bins with offset
205
+    ptm <- startTimedMessage("Making bins with offsets ...")
206
+    if (length(offsets) > 1) {
207
+        stepbins <- suppressMessages( fixedWidthBins(chrom.lengths = seqlengths(binned.data), binsizes = as.numeric(offsets[2]))[[1]] )
208
+    } else {
209
+        stepbins <- binned.data
210
+        mcols(stepbins) <- NULL
211
+    }
212
+    bins <- binned.data
213
+    mcols(bins) <- NULL
214
+    aposteriors.step <- array(0, dim = c(length(stepbins), numstates, 2), dimnames = list(bin=NULL, state=state.labels, offset=c('previousOffsets', 'currentOffset')))
215
+    acounts.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset')))
216
+    amaxPosterior.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset')))
217
+    astates.step <- array(0, dim = c(length(stepbins), 2), dimnames = list(bin=NULL, offset=c('previousOffsets', 'currentOffset')))
218
+    
219
+    ### Loop over offsets ###
205 220
     for (ioffset in 1:length(offsets)) {
206 221
         offset <- offsets[ioffset]
207 222
         counts <- binned.data$counts[,offset, drop=FALSE]
... ...
@@ -302,6 +317,8 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
302 317
                 posteriors = double(length=numbins * numstates), # double* posteriors
303 318
                 densities = double(length=lenDensities), # double* densities
304 319
                 keep.densities = as.logical(keep.densities), # bool* keep_densities
320
+                states = integer(length=numbins), # int* states
321
+                maxPosterior = double(length=numbins), # double* maxPosterior
305 322
                 A = double(length=numstates*numstates), # double* A
306 323
                 proba = double(length=numstates), # double* proba
307 324
                 loglik = double(length=1), # double* loglik
... ...
@@ -357,6 +374,8 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
357 374
                 posteriors = double(length=numbins * numstates), # double* posteriors
358 375
                 densities = double(length=lenDensities), # double* densities
359 376
                 keep.densities = as.logical(keep.densities), # bool* keep_densities
377
+                states = integer(length=numbins), # int* states
378
+                maxPosterior = double(length=numbins), # double* maxPosterior
360 379
                 A = double(length=numstates*numstates), # double* A
361 380
                 proba = double(length=numstates), # double* proba
362 381
                 loglik = double(length=1), # double* loglik
... ...
@@ -426,128 +445,114 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
426 445
             ## Add class
427 446
                 class(result) <- class.univariate.hmm
428 447
         }
448
+        
449
+        ptm <- startTimedMessage("Maximizing over offsets ...")
429 450
         ## Store counts and posteriors in list
430
-        counts.list[[offset]] <- hmm$counts
431
-        aposteriors[,, offset] <- hmm$posteriors
451
+        dim(hmm$posteriors) <- c(numbins, numstates)
452
+        dimnames(hmm$posteriors) <- list(bin=NULL, state=state.labels)
453
+        binned.data$counts[,offset] <- hmm$counts
454
+        if (keep.densities) {
455
+            densities <- hmm$densities
456
+        }
432 457
         
458
+        ## Inflate posteriors to new offset
459
+        bins.shift <- suppressWarnings( shift(bins, shift = as.numeric(offset)) )
460
+        ind <- findOverlaps(stepbins, bins.shift)
461
+        aposteriors.step[ind@from, , 'currentOffset'] <- hmm$posteriors[ind@to, , drop=FALSE]
462
+        acounts.step[ind@from, 'currentOffset'] <- hmm$counts[ind@to]
463
+        astates.step[ind@from, 'currentOffset'] <- hmm$states[ind@to]
464
+        amaxPosterior.step[ind@from, 'currentOffset'] <- hmm$maxPosterior[ind@to]
465
+        
466
+        ## Sum counts
467
+        acounts.step[, 'previousOffsets'] <- rowSums(acounts.step)
433 468
         
469
+        ## Find offset that maximizes the posteriors for each bin
470
+        # Start stuff to call C code
471
+            # Work with changing dimensions to avoid copies being made
472
+            dim_amaxPosterior.step <- dim(amaxPosterior.step)
473
+            dimnames_amaxPosterior.step <- dimnames(amaxPosterior.step)
474
+            dim(amaxPosterior.step) <- NULL
475
+            z <- .C("C_array2D_which_max",
476
+                    array2D = amaxPosterior.step,
477
+                    dim = as.integer(dim_amaxPosterior.step),
478
+                    ind_max = integer(dim_amaxPosterior.step[1]),
479
+                    value_max = double(dim_amaxPosterior.step[1]))
480
+            dim(amaxPosterior.step) <- dim_amaxPosterior.step
481
+            dimnames(amaxPosterior.step) <- dimnames_amaxPosterior.step
482
+            ind <- z$ind_max
483
+        # End stuff to call C code
484
+        for (i1 in 1:2) {
485
+            mask <- ind == i1
486
+            aposteriors.step[mask, , 'previousOffsets'] <- aposteriors.step[mask,,i1, drop=FALSE]
487
+            astates.step[mask, 'previousOffsets'] <- astates.step[mask,i1, drop=FALSE]
488
+            amaxPosterior.step[mask, 'previousOffsets'] <- amaxPosterior.step[mask,i1, drop=FALSE]
489
+        }
490
+        stopTimedMessage(ptm)
491
+        
492
+        rm(hmm)
434 493
     } # loop over offsets
494
+    rm(amaxPosterior.step, astates.step)
435 495
         
436
-    ### Find maximum posterior for each bin between offsets
437
-    ## Make bins with offset
438
-    ptm <- startTimedMessage("Making bins with offsets ...")
439
-    if (length(offsets) > 1) {
440
-        stepbins <- suppressMessages( fixedWidthBins(chrom.lengths = seqlengths(binned.data), binsizes = as.numeric(offsets[2]))[[1]] )
496
+    # Average and normalize counts to RPKM
497
+    ptm <- startTimedMessage("Collecting counts and posteriors over offsets ...")
498
+    counts.step <- acounts.step[, 'previousOffsets'] / ncol(acounts.step)
499
+    rm(acounts.step)
500
+    counts.step <- rpkm.vector(counts.step, binsize = binsize)
501
+    stepbins$posteriors <- aposteriors.step[,,'previousOffsets']
502
+    rm(aposteriors.step)
503
+    stopTimedMessage(ptm)
504
+    
505
+    ## Get states ##
506
+    ptm <- startTimedMessage("Calculating states from posteriors ...")
507
+    posteriors <- stepbins$posteriors
508
+    threshold <- 1-post.cutoff
509
+    if (control) {
510
+        states <- rep(1, ncol(posteriors))
511
+        states[ posteriors[,2] >= posteriors[,1] ] <- 2
441 512
     } else {
442
-        stepbins <- binned.data
443
-        mcols(stepbins) <- NULL
513
+        states <- rep(NA, ncol(posteriors))
514
+        states[ posteriors[,3]<threshold & posteriors[,2]<=posteriors[,1] ] <- 1
515
+        states[ posteriors[,3]<threshold & posteriors[,2]>posteriors[,1] ] <- 2
516
+        states[ posteriors[,3]>=threshold ] <- 3
444 517
     }
445
-    aposteriors.step <- array(0, dim = c(length(stepbins), numstates, length(offsets)), dimnames = list(bin=NULL, state=state.labels, offset=offsets))
446
-    acounts.step <- array(0, dim = c(length(stepbins), length(offsets)), dimnames = list(bin=NULL, offset=offsets))
447
-    ## Inflate posteriors to new offset
448
-    bins <- binned.data
449
-    mcols(bins) <- NULL
450
-    for (offset in offsets) {
451
-        bins.shift <- suppressWarnings( shift(bins, shift = as.numeric(offset)) )
452
-        ind <- findOverlaps(stepbins, bins.shift)
453
-        aposteriors.step[ind@from, , offset] <- aposteriors[ind@to, , offset, drop=FALSE]
454
-        acounts.step[ind@from, offset] <- counts.list[[offset]][ind@to]
518
+    states <- state.labels[states]
519
+    
520
+    ## Counts ##
521
+    result$bincounts <- binned.data
522
+    
523
+    ## Bin coordinates, posteriors and states ##
524
+    result$bins <- GRanges(seqnames=seqnames(stepbins), ranges=ranges(stepbins))
525
+    result$bins$counts.rpkm <- counts.step
526
+    mcols(result$bins)[, 'state'] <- states
527
+    result$bins$posteriors <- posteriors
528
+    if (!control) {
529
+        result$bins$posterior.modified <- posteriors[,'modified']
455 530
     }
456
-    rm(aposteriors)
531
+    if (keep.densities) {
532
+        result$bincounts$densities <- matrix(densities, ncol=numstates)
533
+        colnames(result$bincounts$densities) <- state.labels
534
+    }
535
+    seqlengths(result$bins) <- seqlengths(stepbins)
457 536
     stopTimedMessage(ptm)
458 537
     
459
-    # Average and normalize counts to RPKM
460
-    ptm <- startTimedMessage("Averaging counts between offsets ...")
461
-    # Start stuff to call C code
462
-        # Work with changing dimensions to avoid copies being made
463
-        dim_acounts.step <- dim(acounts.step)
464
-        dimnames_acounts.step <- dimnames(acounts.step)
465
-        dim(acounts.step) <- NULL
466
-        z <- .C("C_array2D_mean",
467
-                array2D = acounts.step,
468
-                dim = as.integer(dim_acounts.step),
469
-                mean = double(dim_acounts.step[1]))
470
-        # dim(acounts.step) <- dim_acounts.step
471
-        rm(acounts.step)
472
-        counts.step <- z$mean
473
-        counts.step <- rpkm.vector(counts.step, binsize = binsize)
474
-    # End stuff to call C code
475
-    stopTimedMessage(ptm)
538
+    ## Peak score as maximum posterior in that peak ##
539
+    result$bins$peakScores <- getPeakScore.univariate(result$bins)
476 540
     
477
-    ## Find offset that maximizes the posteriors for each bin
478
-    ptm <- startTimedMessage("Finding maximum posterior between offsets ...")
479
-    # Start stuff to call C code
480
-        # Work with changing dimensions to avoid copies being made
481
-        dim_aposteriors.step <- dim(aposteriors.step)
482
-        dimnames_aposteriors.step <- dimnames(aposteriors.step)
483
-        dim(aposteriors.step) <- NULL
484
-        z <- .C("C_array3D_which_max",
485
-                array3D = aposteriors.step,
486
-                dim = as.integer(dim_aposteriors.step),
487
-                ind_max = integer(dim_aposteriors.step[1]))
488
-        dim(aposteriors.step) <- dim_aposteriors.step
489
-        ind <- z$ind_max
490
-    # End stuff to call C code
491
-    ind <- ceiling(ind / numstates)
492
-    posteriors.step <- array(0, dim = c(length(stepbins), numstates), dimnames = list(bin=NULL, state=state.labels))
493
-    for (i1 in 1:length(offsets)) {
494
-        mask <- ind == i1
495
-        posteriors.step[mask,] <- aposteriors.step[mask,,i1, drop=FALSE]
541
+    ## Segmentation ##
542
+    ptm <- startTimedMessage("Making segmentation ...")
543
+    df <- as.data.frame(result$bins)
544
+    red.df <- suppressMessages(collapseBins(df, column2collapseBy='state', columns2drop=c('width',grep('posterior', names(df), value=TRUE), 'counts.rpkm')))
545
+    result$segments <- methods::as(red.df, 'GRanges')
546
+    seqlengths(result$segments) <- seqlengths(binned.data)[seqlevels(result$segments)]
547
+    if (!keep.posteriors) {
548
+        result$bins$posteriors <- NULL
496 549
     }
497
-    rm(aposteriors.step)
498
-    stepbins$posteriors <- posteriors.step
499 550
     stopTimedMessage(ptm)
500 551
     
501
-    ## Get states
502
-        ptm <- startTimedMessage("Calculating states from posteriors ...")
503
-        hmm$posteriors <- stepbins$posteriors
504
-        threshold <- 1-post.cutoff
505
-        if (control) {
506
-            states <- rep(1, ncol(hmm$posteriors))
507
-            states[ hmm$posteriors[,2] >= hmm$posteriors[,1] ] <- 2
508
-        } else {
509
-            states <- rep(NA, ncol(hmm$posteriors))
510
-            states[ hmm$posteriors[,3]<threshold & hmm$posteriors[,2]<=hmm$posteriors[,1] ] <- 1
511
-            states[ hmm$posteriors[,3]<threshold & hmm$posteriors[,2]>hmm$posteriors[,1] ] <- 2
512
-            states[ hmm$posteriors[,3]>=threshold ] <- 3
513
-        }
514
-        states <- state.labels[states]
515
-    ## Counts
516
-        result$bincounts <- binned.data
517
-        for (offset in offsets) {
518
-            result$bincounts$counts[, offset] <- counts.list[[offset]]
519
-        }
520
-        rm(counts.list)
521
-        seqlengths(result$bincounts) <- seqlengths(binned.data)
522
-    ## Bin coordinates, posteriors and states
523
-        result$bins <- GRanges(seqnames=seqnames(stepbins), ranges=ranges(stepbins))
524
-        result$bins$counts.rpkm <- counts.step
525
-        mcols(result$bins)[, 'state'] <- states
526
-        result$bins$posteriors <- hmm$posteriors
527
-        if (!control) {
528
-            result$bins$posterior.modified <- hmm$posteriors[,'modified']
529
-        }
530
-        if (keep.densities) {
531
-            result$bins$densities <- matrix(hmm$densities, ncol=hmm$num.states)
532
-        }
533
-        seqlengths(result$bins) <- seqlengths(stepbins)
534
-        stopTimedMessage(ptm)
535
-    ## Peak score as maximum posterior in that peak
536
-        result$bins$peakScores <- getPeakScore.univariate(result$bins)
537
-    ## Segmentation
538
-        ptm <- startTimedMessage("Making segmentation ...")
539
-        df <- as.data.frame(result$bins)
540
-        red.df <- suppressMessages(collapseBins(df, column2collapseBy='state', columns2drop=c('width',grep('posterior', names(df), value=TRUE), 'counts.rpkm')))
541
-        result$segments <- methods::as(red.df, 'GRanges')
542
-        seqlengths(result$segments) <- seqlengths(binned.data)[seqlevels(result$segments)]
543
-        if (!keep.posteriors) {
544
-            result$bins$posteriors <- NULL
545
-        }
546
-        stopTimedMessage(ptm)
547
-    ## Peaks
548
-        result$peaks <- result$segments[result$segments$state == 'modified']
549
-        result$peaks$state <- NULL
550
-        result$segments <- NULL
552
+    ## Peaks ##
553
+    result$peaks <- result$segments[result$segments$state == 'modified']
554
+    result$peaks$state <- NULL
555
+    result$segments <- NULL
551 556
         
552 557
       
553 558
     # Return results
... ...
@@ -45,4 +45,4 @@ rpkm.matrix <- function(counts, binsize) {
45 45
     rpkm[is.nan(rpkm)] <- 0
46 46
     rpkm <- rpkm * 1e6 * 1000 / binsize
47 47
     return(rpkm)
48
-}
49 48
\ No newline at end of file
49
+}
... ...
@@ -6,7 +6,7 @@ static int** multiO;
6 6
 // ===================================================================================================================================================
7 7
 // This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R.
8 8
 // ===================================================================================================================================================
9
-void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* densities, bool* keep_densities, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* verbosity)
9
+void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* verbosity)
10 10
 {
11 11
 
12 12
 	// Define logging level
... ...
@@ -220,6 +220,21 @@ void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* max
220 220
 		}
221 221
 	}
222 222
 
223
+	// Compute the states from posteriors
224
+	//FILE_LOG(logDEBUG1) << "Computing states from posteriors";
225
+	int ind_max;
226
+	std::vector<double> posterior_per_t(*N);
227
+	for (int t=0; t<*T; t++)
228
+	{
229
+		for (int iN=0; iN<*N; iN++)
230
+		{
231
+			posterior_per_t[iN] = hmm->get_posterior(iN, t);
232
+		}
233
+		ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
234
+		states[t] = ind_max + 1;
235
+		maxPosterior[t] = posterior_per_t[ind_max];
236
+	}
237
+		
223 238
 	//FILE_LOG(logDEBUG1) << "Return parameters";
224 239
 	// also return the estimated transition matrix and the initial probs
225 240
 	for (int i=0; i<*N; i++)
... ...
@@ -258,7 +273,7 @@ void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* max
258 273
 // =====================================================================================================================================================
259 274
 // This function takes parameters from R, creates a multivariate HMM object, creates the distributions, runs the Baum-Welch and returns the result to R.
260 275
 // =====================================================================================================================================================
261
-void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity)
276
+void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity)
262 277
 {
263 278
 
264 279
 	// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}
... ...
@@ -432,6 +447,7 @@ void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, do
432 447
 			}
433 448
 			ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
434 449
 			states[t] = comb_states[ind_max];
450
+			maxPosterior[t] = posterior_per_t[ind_max];
435 451
 		}
436 452
 // 	}
437 453
 // 	else
... ...
@@ -506,46 +522,22 @@ void array3D_which_max(double* array3D, int* dim, int* ind_max)
506 522
 	
507 523
 }
508 524
 
509
-
510
-// ====================================================================================
511
-// C version of apply(array2D, MARGIN = 1, FUN = mean)
512
-// ====================================================================================
513
-void array2D_mean(double* array2D, int* dim, double* mean)
525
+// ====================================================================
526
+// C version of apply(array2D, 1, which.max) and apply(array2D, 1, max)
527
+// ====================================================================
528
+void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max)
514 529
 {
515
-  // array2D is actually a vector, but is intended to originate from a matrix in R
516
-  double sum=0;
530
+  // array2D is actually a vector, but is intended to originate from a 2D array in R
531
+	std::vector<double> value_per_i0(dim[1]);
517 532
   for (int i0=0; i0<dim[0]; i0++)
518 533
   {
519
-    sum = 0;
520 534
     for (int i1=0; i1<dim[1]; i1++)
521 535
     {
522
-      sum += array2D[i1*dim[0] + i0];
523
-    }
524
-    mean[i0] = sum / dim[1];
525
-  }
526
-	
527
-}
528
-
529
-
530
-// ====================================================================================
531
-// C version of apply(array3D, MARGIN = c(1,2), FUN = mean)
532
-// ====================================================================================
533
-void array3D_mean(double* array3D, int* dim, double* mean)
534
-{
535
-  // array3D is actually a vector, but is intended to originate from a 3D array in R
536
-  double sum=0;
537
-  for (int i0=0; i0<dim[0]; i0++)
538
-  {
539
-    for (int i1=0; i1<dim[1]; i1++)
540
-    {
541
-      sum = 0;
542
-      for (int i2=0; i2<dim[2]; i2++)
543
-      {
544
-        sum += array3D[(i2*dim[1] + i1)*dim[0] + i0];
545
-        // Rprintf("i0=%d, i1=%d, i2=%d, array3D[(i2*dim[1] + i1)*dim[0] + i0 = %d] = %g\n", i0, i1, i2, (i2*dim[1] + i1)*dim[0] + i0, array3D[(i2*dim[1] + i1)*dim[0] + i0]);
546
-      }
547
-      mean[i1*dim[0] + i0] = sum / dim[2];
536
+			value_per_i0[i1] = array2D[i1 * dim[0] + i0];
537
+      // Rprintf("i0=%d, i1=%d, value_per_i0[%d] = %g\n", i0, i1, i1, value_per_i0[i1]);
548 538
     }
539
+		ind_max[i0] = 1 + std::distance(value_per_i0.begin(), std::max_element(value_per_i0.begin(), value_per_i0.end()));
540
+    value_max[i0] = *std::max_element(value_per_i0.begin(), value_per_i0.end());
549 541
   }
550 542
 	
551 543
 }
552 544
\ No newline at end of file
... ...
@@ -17,10 +17,10 @@
17 17
 // #endif
18 18
 
19 19
 extern "C"
20
-void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* densities, bool* keep_densities, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* verbosity);
20
+void univariate_hmm(int* O, int* T, int* N, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* weights, int* iniproc, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* verbosity);
21 21
 
22 22
 extern "C"
23
-void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity);
23
+void multivariate_hmm(int* O, int* T, int* N, int *Nmod, double* comb_states, double* size, double* prob, double* w, double* cor_matrix_inv, double* det, int* maxiter, int* maxtime, double* eps, double* posteriors, bool* keep_posteriors, double* densities, bool* keep_densities, int* states, double* maxPosterior, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* verbosity);
24 24
 
25 25
 extern "C"
26 26
 void univariate_cleanup();
... ...
@@ -32,7 +32,4 @@ extern "C"
32 32
 void array3D_which_max(double* array3D, int* dim, int* ind_max);
33 33
 
34 34
 extern "C"
35
-void array2D_mean(double* array2D, int* dim, double* mean);
36
-
37
-extern "C"
38
-void array3D_mean(double* array3D, int* dim, double* mean);
39 35
\ No newline at end of file
36
+void array2D_which_max(double* array3D, int* dim, int* ind_max, double* value_max);
40 37
\ No newline at end of file
... ...
@@ -3,21 +3,19 @@
3 3
 #include "R_interface.h"
4 4
 
5 5
 
6
-R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, LGLSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
7
-R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
6
+R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
7
+R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
8 8
 R_NativePrimitiveArgType arg4[] = {INTSXP};
9 9
 R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP};
10
-R_NativePrimitiveArgType arg6[] = {REALSXP, INTSXP, REALSXP};
11
-R_NativePrimitiveArgType arg7[] = {REALSXP, INTSXP, REALSXP};
10
+R_NativePrimitiveArgType arg6[] = {REALSXP, INTSXP, INTSXP, REALSXP};
12 11
 
13 12
 static const R_CMethodDef CEntries[]  = {
14
-    {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 25, arg1},
15
-    {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 27, arg2},
13
+    {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 27, arg1},
14
+    {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 28, arg2},
16 15
     {"C_univariate_cleanup", (DL_FUNC) &univariate_cleanup, 0, NULL},
17 16
     {"C_multivariate_cleanup", (DL_FUNC) &multivariate_cleanup, 1, arg4},
18 17
     {"C_array3D_which_max", (DL_FUNC) &array3D_which_max, 3, arg5},
19
-    {"C_array2D_mean", (DL_FUNC) &array2D_mean, 3, arg6},
20
-    {"C_array3D_mean", (DL_FUNC) &array3D_mean, 3, arg7},
18
+    {"C_array2D_which_max", (DL_FUNC) &array2D_which_max, 4, arg6},
21 19
     {NULL, NULL, 0, NULL}
22 20
 };
23 21