Browse code

Transition to beachmat version 2

Peter Hickey authored on 23/04/2019 01:03:45
Showing 1 changed files
... ...
@@ -1,5 +1,10 @@
1 1
 #include "BSseq.h"
2 2
 
3
+#include "beachmat/integer_matrix.h"
4
+#include "beachmat/numeric_matrix.h"
5
+
6
+#include "utils.h"
7
+
3 8
 // NOTE: Returning Rcpp::CharacterVector rather than throwing an error because
4 9
 //       this function is used within a validity method.
5 10
 
Browse code

Work in progress: refactoring bsseq

- BSseq objects can once again use ordinary matrix objects as assays.
- Reimplement `BSmooth()` more-or-less from scratch:
- Switch from 'parallel' to 'BiocParallel' for parallelization. This brings some notable improvements:
- Smoothed results can now be written directly to an on-disk realization backend by the worker. This dramatically reduces memory usage compared to previous versions of 'bsseq' that required all results be retained in-memory.
- Parallelization is now supported on Windows through the use of a 'SnowParam' object as the value of `BPPARAM`.
- Improved error handling makes it possible to gracefully resume `BSmooth()` jobs that encountered errors by re-doing only the necessary tasks.
- Detailed and extensive job logging facilities.
- Fix bug in `BSmooth()` with the `maxGap` parameter.
- Re-factor BSseq() constructor and add fast, internal .BSseq() constructor.
- Re-factor collapseBSseq() and combine(). Should be much more performant.
- Use beachmat to implement fast validity checking of 'M' and 'Cov' matrices.
- Resave BS.chr22 (supplied data) using integer for storage mode of assays to reduce size.
- Switch from RUnit to testthat. testthat has better integration with code coverage tools that help when refactoring.

Peter Hickey authored on 28/05/2018 23:42:18
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,107 @@
1
+#include "BSseq.h"
2
+
3
+// NOTE: Returning Rcpp::CharacterVector rather than throwing an error because
4
+//       this function is used within a validity method.
5
+
6
+template <class M_column_class, class Cov_column_class, class M_class,
7
+          class Cov_class>
8
+Rcpp::RObject check_M_and_Cov_internal(M_class M_bm, Cov_class Cov_bm) {
9
+    BEGIN_RCPP
10
+
11
+    // Get the dimensions of 'M' and 'Cov' and check these are compatible.
12
+    const size_t M_nrow = M_bm->get_nrow();
13
+    const size_t Cov_nrow = Cov_bm->get_nrow();
14
+    if (M_nrow != Cov_nrow) {
15
+        return Rcpp::CharacterVector(
16
+            "'M' and 'Cov' must have the same number of rows.");
17
+    }
18
+    const size_t M_ncol = M_bm->get_ncol();
19
+    const size_t Cov_ncol = Cov_bm->get_ncol();
20
+    if (M_ncol != Cov_ncol) {
21
+        return Rcpp::CharacterVector(
22
+            "'M' and 'Cov' must have the same number of columns.");
23
+    }
24
+
25
+    // Simultaneously loop over columns of 'M' and 'Cov', checking that
26
+    // `all(0 <= M <= Cov) && !anyNA(M) && !anyNA(Cov)` && all(is.finite(Cov)).
27
+    M_column_class M_column(M_nrow);
28
+    Cov_column_class Cov_column(Cov_nrow);
29
+    for (size_t j = 0; j < M_ncol; ++j) {
30
+        // Copy the j-th column of M to M_column and the j-th column of Cov to
31
+        // Cov_column
32
+        M_bm->get_col(j, M_column.begin());
33
+        Cov_bm->get_col(j, Cov_column.begin());
34
+        // Construct iterators
35
+        // NOTE: Iterators constructed outside of loop because they may be of
36
+        //       different type, which is not supported within a for loop
37
+        //       constructor.
38
+        auto M_column_it = M_column.begin();
39
+        auto Cov_column_it = Cov_column.begin();
40
+        for (M_column_it = M_column.begin(), Cov_column_it = Cov_column.begin();
41
+             M_column_it != M_column.end();
42
+             ++M_column_it, ++Cov_column_it) {
43
+            if (isNA(*M_column_it)) {
44
+                return Rcpp::CharacterVector("'M' must not contain NAs.");
45
+            }
46
+            if (isNA(*Cov_column_it)) {
47
+                return Rcpp::CharacterVector("'Cov' must not contain NAs.");
48
+            }
49
+            if (*M_column_it < 0) {
50
+                return Rcpp::CharacterVector(
51
+                    "'M' must not contain negative values.");
52
+            }
53
+            if (*M_column_it > *Cov_column_it) {
54
+                return Rcpp::CharacterVector(
55
+                    "All values of 'M' must be less than or equal to the corresponding value of 'Cov'.");
56
+            }
57
+            if (!R_FINITE(*Cov_column_it)) {
58
+                return Rcpp::CharacterVector("All values of 'Cov' must be finite.");
59
+            }
60
+        }
61
+    }
62
+
63
+    return R_NilValue;
64
+    END_RCPP
65
+}
66
+
67
+SEXP check_M_and_Cov(SEXP M, SEXP Cov) {
68
+    BEGIN_RCPP
69
+
70
+    // Get the type of 'M' and 'Cov',
71
+    int M_type = beachmat::find_sexp_type(M);
72
+    int Cov_type = beachmat::find_sexp_type(Cov);
73
+    if (M_type == INTSXP && Cov_type == INTSXP) {
74
+        auto M_bm = beachmat::create_integer_matrix(M);
75
+        auto Cov_bm = beachmat::create_integer_matrix(Cov);
76
+        return check_M_and_Cov_internal<
77
+            Rcpp::IntegerVector, Rcpp::IntegerVector>(M_bm.get(), Cov_bm.get());
78
+    } else if (M_type == REALSXP && Cov_type == REALSXP) {
79
+        auto M_bm = beachmat::create_numeric_matrix(M);
80
+        auto Cov_bm = beachmat::create_numeric_matrix(Cov);
81
+        return check_M_and_Cov_internal<
82
+            Rcpp::NumericVector, Rcpp::NumericVector>(M_bm.get(), Cov_bm.get());
83
+    } else if (M_type == INTSXP && Cov_type == REALSXP) {
84
+        auto M_bm = beachmat::create_integer_matrix(M);
85
+        auto Cov_bm = beachmat::create_numeric_matrix(Cov);
86
+        return check_M_and_Cov_internal<
87
+            Rcpp::IntegerVector, Rcpp::NumericVector>(M_bm.get(), Cov_bm.get());
88
+    } else if (M_type == REALSXP && Cov_type == INTSXP) {
89
+        auto M_bm = beachmat::create_numeric_matrix(M);
90
+        auto Cov_bm = beachmat::create_integer_matrix(Cov);
91
+        return check_M_and_Cov_internal<
92
+            Rcpp::NumericVector, Rcpp::IntegerVector>(M_bm.get(), Cov_bm.get());
93
+    }
94
+    else {
95
+        return Rcpp::CharacterVector(
96
+            "'M' and 'Cov' must contain integer or numeric values.");
97
+    }
98
+    END_RCPP
99
+}
100
+// TODOs -----------------------------------------------------------------------
101
+
102
+// TODO: Add code path to process ordinary R vectors (for use within
103
+//       read.bismark() funcionality)?
104
+// TODO: Wasn't able to figure out how to use get_const_col. See
105
+//       https://gist.github.com/PeteHaitch/cf6cffd40d4f8bd7082eec1b0f330082
106
+//       and note that `check_M_and_Cov_using_const_columns` never fails when
107
+//       matrix input contains NA