Browse code

C helper functions for time-intensive calculations in stepsize posterior selection

chakalakka authored on 09/04/2017 12:54:46
Showing 6 changed files

... ...
@@ -338,8 +338,8 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
338 338
     } # loop over offsets
339 339
 
340 340
     ### Find maximum posterior for each bin between offsets
341
-    ptm <- startTimedMessage("Finding maximum posterior between offsets ...")
342 341
     ## Make bins with offset
342
+    ptm <- startTimedMessage("Making bins with offsets ...")
343 343
     if (length(offsets) > 1) {
344 344
         stepbins <- suppressMessages( fixedWidthBins(chrom.lengths = seqlengths(bins), binsizes = as.numeric(offsets[2]))[[1]] )
345 345
     } else {
... ...
@@ -360,13 +360,37 @@ runMultivariate <- function(bins, info, comb.states, use.states, distributions,
360 360
         astates.step[ind@from, offset] <- states.list[[offset]][ind@to]
361 361
     }
362 362
     rm(aposteriors)
363
+    stopTimedMessage(ptm)
364
+    
363 365
     # Average and normalize counts to RPKM
364
-    counts.step <- apply(X = acounts.step, MARGIN = c(1,2), FUN = mean, na.rm=TRUE)
365
-    counts.step <- rpkm.matrix(counts.step, binsize = width(bins)[1])
366
-    rm(acounts.step)
366
+    ptm <- startTimedMessage("Averaging counts between offsets ...")
367
+    # Start stuff to call C code
368
+        dim_acounts.step <- dim(acounts.step)
369
+        dimnames_acounts.step <- dimnames(acounts.step)
370
+        dim(acounts.step) <- NULL
371
+        z <- .C("C_array3D_mean",
372
+                array3D = acounts.step,
373
+                dim = as.integer(dim_acounts.step),
374
+                mean = double(dim_acounts.step[1]*dim_acounts.step[2]))
375
+        # dim(acounts.step) <- dim_acounts.step
376
+        rm(acounts.step)
377
+        dim(z$mean) <- dim_acounts.step[1:2]
378
+        counts.step <- z$mean
379
+        counts.step <- rpkm.matrix(counts.step, binsize = width(bins)[1])
380
+    # End stuff to call C code
381
+    stopTimedMessage(ptm)
367 382
     
368 383
     ## Find offset that maximizes the posteriors for each bin
369
-    ind <- apply(aposteriors.step, 1, which.max)
384
+    ptm <- startTimedMessage("Finding maximum posterior between offsets ...")
385
+    dim_aposteriors.step <- dim(aposteriors.step)
386
+    dimnames_aposteriors.step <- dimnames(aposteriors.step)
387
+    dim(aposteriors.step) <- NULL
388
+    z <- .C("C_array3D_which_max",
389
+            array3D = aposteriors.step,
390
+            dim = as.integer(dim_aposteriors.step),
391
+            ind_max = integer(dim_aposteriors.step[1]))
392
+    ind <- z$ind_max
393
+    dim(aposteriors.step) <- dim_aposteriors.step
370 394
     numstates <- length(comb.states)
371 395
     ind <- ceiling(ind / numstates)
372 396
     posteriors.step <- array(0, dim = c(length(stepbins), numstates), dimnames = list(bin=NULL, state=comb.states))
... ...
@@ -346,7 +346,7 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
346 346
             # Rerun the HMM with different epsilon and initial parameters from trial run
347 347
             if (verbosity>=1) message("------------------------- Rerunning try ",indexmax," with eps = ",eps," -------------------------")
348 348
             hmm <- .C("C_univariate_hmm",
349
-                counts = as.integer(counts), # double* O
349
+                counts = as.integer(counts), # int* O
350 350
                 num.bins = as.integer(numbins), # int* T
351 351
                 num.states = as.integer(numstates), # int* N
352 352
                 size = double(length=numstates), # double* size
... ...
@@ -434,8 +434,8 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
434 434
     } # loop over offsets
435 435
         
436 436
     ### Find maximum posterior for each bin between offsets
437
-    ptm <- startTimedMessage("Finding maximum posterior between offsets ...")
438 437
     ## Make bins with offset
438
+    ptm <- startTimedMessage("Making bins with offsets ...")
439 439
     if (length(offsets) > 1) {
440 440
         stepbins <- suppressMessages( fixedWidthBins(chrom.lengths = seqlengths(binned.data), binsizes = as.numeric(offsets[2]))[[1]] )
441 441
     } else {
... ...
@@ -454,19 +454,45 @@ callPeaksUnivariateAllChr <- function(binned.data, input.data=NULL, eps=0.01, in
454 454
         acounts.step[ind@from, offset] <- counts.list[[offset]][ind@to]
455 455
     }
456 456
     rm(aposteriors)
457
+    stopTimedMessage(ptm)
458
+    
457 459
     # Average and normalize counts to RPKM
458
-    counts.step <- apply(X = acounts.step, MARGIN = 1, FUN = mean, na.rm=TRUE, drop=FALSE)
459
-    counts.step <- rpkm.vector(counts.step, binsize = binsize)
460
-    rm(acounts.step)
460
+    ptm <- startTimedMessage("Averaging counts between offsets ...")
461
+    # Start stuff to call C code
462
+        dim_acounts.step <- dim(acounts.step)
463
+        dimnames_acounts.step <- dimnames(acounts.step)
464
+        dim(acounts.step) <- NULL
465
+        z <- .C("C_array2D_mean",
466
+                array2D = acounts.step,
467
+                dim = as.integer(dim_acounts.step),
468
+                mean = double(dim_acounts.step[1]))
469
+        # dim(acounts.step) <- dim_acounts.step
470
+        rm(acounts.step)
471
+        counts.step <- z$mean
472
+        counts.step <- rpkm.vector(counts.step, binsize = binsize)
473
+    # End stuff to call C code
474
+    stopTimedMessage(ptm)
461 475
     
462 476
     ## Find offset that maximizes the posteriors for each bin
463
-    ind <- apply(aposteriors.step, 1, which.max)
477
+    ptm <- startTimedMessage("Finding maximum posterior between offsets ...")
478
+    # Start stuff to call C code
479
+        dim_aposteriors.step <- dim(aposteriors.step)
480
+        dimnames_aposteriors.step <- dimnames(aposteriors.step)
481
+        dim(aposteriors.step) <- NULL
482
+        z <- .C("C_array3D_which_max",
483
+                array3D = aposteriors.step,
484
+                dim = as.integer(dim_aposteriors.step),
485
+                ind_max = integer(dim_aposteriors.step[1]))
486
+        dim(aposteriors.step) <- dim_aposteriors.step
487
+        ind <- z$ind_max
488
+    # End stuff to call C code
464 489
     ind <- ceiling(ind / numstates)
465 490
     posteriors.step <- array(0, dim = c(length(stepbins), numstates), dimnames = list(bin=NULL, state=state.labels))
466 491
     for (i1 in 1:length(offsets)) {
467 492
         mask <- ind == i1
468 493
         posteriors.step[mask,] <- aposteriors.step[mask,,i1, drop=FALSE]
469 494
     }
495
+    rm(aposteriors.step)
470 496
     stepbins$posteriors <- posteriors.step
471 497
     stopTimedMessage(ptm)
472 498
     
... ...
@@ -36,11 +36,13 @@ dnbinom.variance <- function(size, prob) {
36 36
 
37 37
 rpkm.vector <- function(counts, binsize) {
38 38
     rpkm <- counts / sum(as.numeric(counts)) * 1e6 * 1000 / binsize
39
+    rpkm[is.nan(rpkm)] <- 0
39 40
     return(rpkm)
40 41
 }
41 42
 
42 43
 rpkm.matrix <- function(counts, binsize) {
43 44
     rpkm <- sweep(counts, MARGIN = 2, STATS = colSums(counts), FUN = '/')
45
+    rpkm[is.nan(rpkm)] <- 0
44 46
     rpkm <- rpkm * 1e6 * 1000 / binsize
45 47
     return(rpkm)
46 48
 }
47 49
\ No newline at end of file
... ...
@@ -483,3 +483,69 @@ void multivariate_cleanup(int* Nmod)
483 483
 	FreeIntMatrix(multiO, *Nmod);
484 484
 }
485 485
 
486
+
487
+// =======================================================
488
+// C version of apply(array3D, 1, which.max)
489
+// =======================================================
490
+void array3D_which_max(double* array3D, int* dim, int* ind_max)
491
+{
492
+  // array3D is actually a vector, but is intended to originate from a 3D array in R
493
+	std::vector<double> value_per_i0(dim[1] * dim[2]);
494
+  for (int i0=0; i0<dim[0]; i0++)
495
+  {
496
+    for (int i1=0; i1<dim[1]; i1++)
497
+    {
498
+      for (int i2=0; i2<dim[2]; i2++)
499
+      {
500
+  			value_per_i0[i1*dim[2]+i2] = array3D[(i1*dim[2]+i2) * dim[0] + i0];
501
+        // Rprintf("i0=%d, i1=%d, i2=%d, value_per_i0[%d] = %g\n", i0, i1, i2, i1*dim[2]+i2, value_per_i0[i1*dim[2]+i2]);
502
+      }
503
+    }
504
+		ind_max[i0] = 1 + std::distance(value_per_i0.begin(), std::max_element(value_per_i0.begin(), value_per_i0.end()));
505
+  }
506
+	
507
+}
508
+
509
+
510
+// ====================================================================================
511
+// C version of apply(array2D, MARGIN = 1, FUN = mean)
512
+// ====================================================================================
513
+void array2D_mean(double* array2D, int* dim, double* mean)
514
+{
515
+  // array2D is actually a vector, but is intended to originate from a matrix in R
516
+  double sum=0;
517
+  for (int i0=0; i0<dim[0]; i0++)
518
+  {
519
+    sum = 0;
520
+    for (int i1=0; i1<dim[1]; i1++)
521
+    {
522
+      sum += array2D[i1*dim[0] + i0];
523
+    }
524
+    mean[i0] = sum / dim[1];
525
+  }
526
+	
527
+}
528
+
529
+
530
+// ====================================================================================
531
+// C version of apply(array3D, MARGIN = c(1,2), FUN = mean)
532
+// ====================================================================================
533
+void array3D_mean(double* array3D, int* dim, double* mean)
534
+{
535
+  // array3D is actually a vector, but is intended to originate from a 3D array in R
536
+  double sum=0;
537
+  for (int i0=0; i0<dim[0]; i0++)
538
+  {
539
+    for (int i1=0; i1<dim[1]; i1++)
540
+    {
541
+      sum = 0;
542
+      for (int i2=0; i2<dim[2]; i2++)
543
+      {
544
+        sum += array3D[(i2*dim[1] + i1)*dim[0] + i0];
545
+        // Rprintf("i0=%d, i1=%d, i2=%d, array3D[(i2*dim[1] + i1)*dim[0] + i0 = %d] = %g\n", i0, i1, i2, (i2*dim[1] + i1)*dim[0] + i0, array3D[(i2*dim[1] + i1)*dim[0] + i0]);
546
+      }
547
+      mean[i1*dim[0] + i0] = sum / dim[2];
548
+    }
549
+  }
550
+	
551
+}
486 552
\ No newline at end of file
... ...
@@ -28,3 +28,11 @@ void univariate_cleanup();
28 28
 extern "C"
29 29
 void multivariate_cleanup(int* Nmod);
30 30
 
31
+extern "C"
32
+void array3D_which_max(double* array3D, int* dim, int* ind_max);
33
+
34
+extern "C"
35
+void array2D_mean(double* array2D, int* dim, double* mean);
36
+
37
+extern "C"
38
+void array3D_mean(double* array3D, int* dim, double* mean);
31 39
\ No newline at end of file
... ...
@@ -6,12 +6,18 @@
6 6
 R_NativePrimitiveArgType arg1[] = {INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, LGLSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP, INTSXP};
7 7
 R_NativePrimitiveArgType arg2[] = {INTSXP, INTSXP, INTSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, INTSXP, REALSXP, REALSXP, LGLSXP, REALSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, REALSXP, LGLSXP, INTSXP, INTSXP, INTSXP};
8 8
 R_NativePrimitiveArgType arg4[] = {INTSXP};
9
+R_NativePrimitiveArgType arg5[] = {REALSXP, INTSXP, INTSXP};
10
+R_NativePrimitiveArgType arg6[] = {REALSXP, INTSXP, REALSXP};
11
+R_NativePrimitiveArgType arg7[] = {REALSXP, INTSXP, REALSXP};
9 12
 
10 13
 static const R_CMethodDef CEntries[]  = {
11 14
     {"C_univariate_hmm", (DL_FUNC) &univariate_hmm, 25, arg1},
12 15
     {"C_multivariate_hmm", (DL_FUNC) &multivariate_hmm, 27, arg2},
13 16
     {"C_univariate_cleanup", (DL_FUNC) &univariate_cleanup, 0, NULL},
14 17
     {"C_multivariate_cleanup", (DL_FUNC) &multivariate_cleanup, 1, arg4},
18
+    {"C_array3D_which_max", (DL_FUNC) &array3D_which_max, 3, arg5},
19
+    {"C_array2D_mean", (DL_FUNC) &array2D_mean, 3, arg6},
20
+    {"C_array3D_mean", (DL_FUNC) &array3D_mean, 3, arg7},
15 21
     {NULL, NULL, 0, NULL}
16 22
 };
17 23