Browse code

[FIX] Fixes for next version

Giulia Pais authored on 04/05/2022 12:58:45
Showing 9 changed files

... ...
@@ -2,6 +2,20 @@
2 2
 title: "NEWS"
3 3
 output: github_document
4 4
 ---
5
+# ISAnalytics <version>
6
+
7
+## BUG FIXES AND MINOR CHANGES
8
+
9
+* Fixed minor issue in `compute_near_integrations()` - function errored when
10
+`report_path` argument was set to `NULL`
11
+* Fixed dplyr warning in `integration_alluvial_plot()` internals
12
+* Fixed issue with report of VISPA2 stats - report failed due to minor error
13
+in rmd fragment
14
+* Internals of `remove_collisions()` use again dplyr internally for joining
15
+and grouping operations - needed because of performance issues with data.table
16
+* `fisher_scatterplot()` has 2 new arguments that allow the disabling of 
17
+highlighting for some genes even if their p-value is under the threshold
18
+
5 19
 # ISAnalytics 1.5.4 (2022-04-20)
6 20
 
7 21
 ## MAJOR CHANGES
... ...
@@ -146,11 +146,14 @@ remove_collisions <- function(x,
146 146
     replicate_n_col <- req_tag_cols[eval(sym("tag")) ==
147 147
         "pcr_replicate"][["names"]]
148 148
     pool_col <- req_tag_cols[eval(sym("tag")) == "pool_id"][["names"]]
149
-    joined <- association_file[pre_process, on = pcr_col]
150
-    joined <- joined[, mget(c(
151
-        colnames(pre_process), date_col,
152
-        replicate_n_col, independent_sample_id, pool_col
153
-    ))]
149
+    joined <- pre_process %>%
150
+      dplyr::left_join(association_file, by = pcr_col) %>%
151
+      dplyr::select(dplyr::all_of(
152
+        c(
153
+          colnames(pre_process), date_col,
154
+          replicate_n_col, independent_sample_id, pool_col
155
+        )
156
+      ))
154 157
     if (verbose) {
155 158
         rlang::inform("Identifying collisions...")
156 159
     }
... ...
@@ -2893,7 +2893,8 @@
2893 2893
     distinct_is <- purrr::map2_int(matrices, correct, ~ {
2894 2894
         if (.y) {
2895 2895
             .x %>%
2896
-                dplyr::distinct(dplyr::across(dplyr::all_of(mandatory_IS_vars()))) %>%
2896
+                dplyr::distinct(dplyr::across(
2897
+                  dplyr::all_of(mandatory_IS_vars()))) %>%
2897 2898
                 nrow()
2898 2899
         } else {
2899 2900
             NA_integer_
... ...
@@ -3102,17 +3103,19 @@
3102 3103
 #' @importFrom rlang .data sym
3103 3104
 .check_same_info <- function(association_file, df, req_tag_cols,
3104 3105
     indep_sample_id) {
3105
-    association_file <- data.table::setDT(association_file)
3106
-    df <- data.table::setDT(df)
3107
-    req_tag_cols <- data.table::setDT(req_tag_cols)
3108
-    joined <- df[association_file, on = pcr_id_column()]
3109
-    predicate <- purrr::map(mandatory_IS_vars(), ~ {
3110
-        rlang::expr(!is.na(!!sym(.x)))
3111
-    }) %>% purrr::reduce(~ rlang::expr(!!.x & !!.y))
3106
+    data.table::setDT(req_tag_cols)
3107
+    joined <- association_file %>%
3108
+      dplyr::left_join(df, by = pcr_id_column())
3112 3109
     proj_col_name <- req_tag_cols[eval(rlang::expr(
3113 3110
         !!sym("tag") == "project_id"
3114 3111
     ))][["names"]]
3115
-    projects <- unique(joined[eval(predicate), ][[proj_col_name]])
3112
+    projects <- joined %>%
3113
+      dplyr::filter(!dplyr::if_all(
3114
+        .cols = dplyr::all_of(mandatory_IS_vars()),
3115
+        .fns = is.na
3116
+      )) %>%
3117
+      dplyr::distinct(.data[[proj_col_name]]) %>%
3118
+      dplyr::pull(.data[[proj_col_name]])
3116 3119
     wanted_tags <- c(
3117 3120
         "subject", "tissue", "cell_marker", "tp_days"
3118 3121
     )
... ...
@@ -3121,15 +3124,22 @@
3121 3124
     wanted <- wanted[eval(sym("tag")) %in% wanted_tags &
3122 3125
         eval(sym("names")) %in% colnames(association_file), ]
3123 3126
     wanted <- data.table::rbindlist(list(req_tag_cols, wanted))
3124
-    predicate <- purrr::map(mandatory_IS_vars(), ~ {
3125
-        rlang::expr(is.na(!!sym(.x)))
3126
-    }) %>% purrr::reduce(~ rlang::expr(!!.x & !!.y))
3127
-    predicate <- rlang::expr(!!predicate & !!sym(proj_col_name) %in% projects)
3128
-    missing_in_df <- unique(joined[
3129
-        eval(predicate),
3130
-        mget(unique(c(wanted$names, indep_sample_id)))
3131
-    ])
3132
-    reduced_af <- association_file[eval(sym(proj_col_name)) %in% projects]
3127
+    missing_in_df <- joined %>%
3128
+      dplyr::filter(
3129
+        dplyr::if_all(
3130
+          .cols = dplyr::all_of(mandatory_IS_vars()),
3131
+          .fns = is.na
3132
+          ) & .data[[proj_col_name]] %in% projects
3133
+      ) %>%
3134
+      dplyr::distinct(dplyr::across(
3135
+        dplyr::all_of(c(wanted$names, indep_sample_id))
3136
+        ))
3137
+    reduced_af <- association_file %>%
3138
+      dplyr::filter(
3139
+        .data[[proj_col_name]] %in% projects
3140
+      )
3141
+    data.table::setDT(reduced_af)
3142
+    data.table::setDT(missing_in_df)
3133 3143
     list(miss = missing_in_df, reduced_af = reduced_af)
3134 3144
 }
3135 3145
 
... ...
@@ -3141,14 +3151,21 @@
3141 3151
 # @return A named list containing the splitted joined_df for collisions and
3142 3152
 # non-collisions
3143 3153
 .identify_independent_samples <- function(joined, indep_sample_id) {
3144
-    temp <- joined[, mget(c(mandatory_IS_vars(), indep_sample_id))]
3145
-    temp <- temp[, unique(.SD, by = indep_sample_id),
3146
-        by = eval(mandatory_IS_vars())
3147
-    ]
3148
-    temp <- temp[, .N, by = eval(mandatory_IS_vars())]
3149
-    temp <- temp[eval(sym("N")) > 1, mget(mandatory_IS_vars())]
3150
-    non_collisions <- joined[!temp, on = mandatory_IS_vars()]
3151
-    collisions <- joined[temp, on = mandatory_IS_vars()]
3154
+    indep_syms <- purrr::map(indep_sample_id, rlang::sym)
3155
+    temp <- joined %>%
3156
+      dplyr::select(dplyr::all_of(
3157
+        c(mandatory_IS_vars(), indep_sample_id)
3158
+      )) %>%
3159
+      dplyr::group_by(dplyr::across(dplyr::all_of(mandatory_IS_vars()))) %>%
3160
+      dplyr::summarise(N = dplyr::n_distinct(!!!indep_syms),
3161
+                       .groups = "drop") %>%
3162
+      dplyr::filter(.data$N > 1)
3163
+    non_collisions <- joined %>%
3164
+      dplyr::anti_join(temp, by = mandatory_IS_vars())
3165
+    collisions <- joined %>%
3166
+      dplyr::semi_join(temp, by = mandatory_IS_vars())
3167
+    data.table::setDT(non_collisions)
3168
+    data.table::setDT(collisions)
3152 3169
     list(collisions = collisions, non_collisions = non_collisions)
3153 3170
 }
3154 3171
 
... ...
@@ -3183,7 +3200,9 @@
3183 3200
     }
3184 3201
     winning_sample <- x[1, mget(ind_sample_key)]
3185 3202
     # Filter the winning rows
3186
-    x <- x[winning_sample, on = ind_sample_key]
3203
+    x <- x %>%
3204
+      dplyr::semi_join(winning_sample, by = ind_sample_key)
3205
+    data.table::setDT(x)
3187 3206
     return(list(data = x, check = TRUE))
3188 3207
 }
3189 3208
 
... ...
@@ -3207,14 +3226,18 @@
3207 3226
 # * $check: a logical value indicating whether the analysis was successful or
3208 3227
 # not (and therefore there is the need to perform the next step)
3209 3228
 .discriminate_by_replicate <- function(x, repl_col, ind_sample_key) {
3210
-    temp <- x[, .N, by = eval(ind_sample_key)]
3211
-    temp <- dplyr::arrange(temp, dplyr::desc(.data$N))
3212
-
3229
+    temp <- x %>%
3230
+      dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_key))) %>%
3231
+      dplyr::summarise(N = dplyr::n(), .groups = "drop") %>%
3232
+      dplyr::arrange(dplyr::desc(.data$N))
3233
+    data.table::setDT(temp)
3213 3234
     if (length(temp$N) != 1 & !temp$N[1] > temp$N[2]) {
3214 3235
         return(list(data = x, check = FALSE))
3215 3236
     }
3216 3237
     temp <- temp[1, mget(ind_sample_key)]
3217
-    x <- x[temp, on = ind_sample_key]
3238
+    x <- x %>%
3239
+      dplyr::semi_join(temp, by = ind_sample_key)
3240
+    data.table::setDT(x)
3218 3241
     return(list(data = x, check = TRUE))
3219 3242
 }
3220 3243
 
... ...
@@ -3243,14 +3266,19 @@
3243 3266
     reads_ratio,
3244 3267
     seqCount_col,
3245 3268
     ind_sample_key) {
3246
-    temp <- x[, list(sum = sum(.SD[[seqCount_col]])), by = eval(ind_sample_key)]
3247
-    temp <- dplyr::arrange(temp, dplyr::desc(.data$sum))
3269
+    temp <- x %>%
3270
+      dplyr::group_by(dplyr::across(dplyr::all_of(ind_sample_key))) %>%
3271
+      dplyr::summarise(sum = sum(.data[[seqCount_col]]), .groups = "drop") %>%
3272
+      dplyr::arrange(dplyr::desc(.data$sum))
3273
+    data.table::setDT(temp)
3248 3274
     ratio <- temp$sum[1] / temp$sum[2]
3249 3275
     if (!ratio > reads_ratio) {
3250 3276
         return(list(data = x, check = FALSE))
3251 3277
     }
3252 3278
     temp <- temp[1, mget(ind_sample_key)]
3253
-    x <- x[temp, on = ind_sample_key]
3279
+    x <- x %>%
3280
+      dplyr::semi_join(temp, by = ind_sample_key)
3281
+    data.table::setDT(x)
3254 3282
     return(list(data = x, check = TRUE))
3255 3283
 }
3256 3284
 
... ...
@@ -137,7 +137,12 @@ outlier_filter <- function(metadata,
137 137
         call_test <- function(test, vargs, df, test_name) {
138 138
             test_args_names <- do.call(rlang::fn_fmls_names, args = list(test))
139 139
             test_args_names <- test_args_names[test_args_names != "metadata"]
140
-            test_args <- vargs[names(vargs) %in% test_args_names]
140
+            test_args <- if ("report_path" %in% test_args_names) {
141
+              append(vargs[names(vargs) %in% test_args_names],
142
+                     list(report_path = report_path))
143
+            } else {
144
+              vargs[names(vargs) %in% test_args_names]
145
+            }
141 146
             f_call <- rlang::call2(test, metadata = metadata, !!!test_args)
142 147
             result <- rlang::eval_tidy(f_call)
143 148
             .validate_outlier_output_format(
... ...
@@ -281,6 +281,31 @@ clinical_relevant_suspicious_genes <- function() {
281 281
 #' Plots results of Fisher's exact test on gene frequency obtained via
282 282
 #' `gene_frequency_fisher()` as a scatterplot.
283 283
 #'
284
+#' @details
285
+#' ## Specifying genes to avoid highlighting
286
+#'
287
+#' In some cases, users might want to avoid highlighting certain genes
288
+#' even if their p-value is below the threshold. To do so, use the
289
+#' argument `do_not_highlight`: character vectors are appropriate for specific
290
+#' genes that are to be excluded, expressions or lambdas allow a finer control.
291
+#' For example we can supply:
292
+#' ```{r eval=FALSE}
293
+#' expr <- rlang::expr(!stringr::str_starts(GeneName, "MIR") &
294
+#'                       average_TxLen_1 >= 300)
295
+#' ```
296
+#'
297
+#' with this expression, genes that have a p-value < threshold and start with
298
+#' "MIR" or have an average_TxLen_1 lower than 300 are excluded from the
299
+#' highlighted points.
300
+#' NOTE: keep in mind that expressions are evaluated inside a `dplyr::filter`
301
+#' context.
302
+#'
303
+#' Similarly, lambdas are passed to the filtering function but only operate
304
+#' on the column containing the gene symbol.
305
+#' ```{r eval=FALSE}
306
+#' lambda <- ~ stringr::str_starts(.x, "MIR")
307
+#' ```
308
+#'
284 309
 #' @param fisher_df Test results obtained via `gene_frequency_fisher()`
285 310
 #' @param p_value_col Name of the column containing the p-value to consider
286 311
 #' @param annot_threshold Annotate with a different color if a point is below
... ...
@@ -288,6 +313,14 @@ clinical_relevant_suspicious_genes <- function() {
288 313
 #' @param annot_color The color in which points below the threshold should be
289 314
 #' annotated
290 315
 #' @param gene_sym_col The name of the column containing the gene symbol
316
+#' @param do_not_highlight Either `NULL`, a character vector, an expression
317
+#' or a purrr-style lambda. Tells the function to ignore the highlighting
318
+#' and labeling of these genes even if their p-value is below the threshold.
319
+#' See details.
320
+#' @param keep_not_highlighted If present, how should not highlighted genes
321
+#' be treated? If set to `TRUE` points are plotted and colored with the
322
+#' chosen color scale. If set to `FALSE` the points are removed entirely from
323
+#' the plot.
291 324
 #'
292 325
 #' @family Plotting functions
293 326
 #' @return A plot
... ...
@@ -310,11 +343,51 @@ fisher_scatterplot <- function(fisher_df,
310 343
     p_value_col = "Fisher_p_value_fdr",
311 344
     annot_threshold = 0.05,
312 345
     annot_color = "red",
313
-    gene_sym_col = "GeneName") {
314
-    below_threshold <- fisher_df %>%
346
+    gene_sym_col = "GeneName",
347
+    do_not_highlight = NULL,
348
+    keep_not_highlighted = TRUE) {
349
+    stopifnot(is.null(do_not_highlight) || is.character(do_not_highlight)
350
+              || rlang::is_expression(do_not_highlight))
351
+    if (is.null(do_not_highlight)) {
352
+      below_threshold <- fisher_df %>%
315 353
         dplyr::filter(.data[[p_value_col]] < annot_threshold)
316
-    above_threshold <- fisher_df %>%
354
+      above_threshold <- fisher_df %>%
317 355
         dplyr::filter(.data[[p_value_col]] >= annot_threshold)
356
+    } else if (is.character(do_not_highlight)) {
357
+      below_threshold <- fisher_df %>%
358
+        dplyr::filter(.data[[p_value_col]] < annot_threshold &
359
+                        !.data[[gene_sym_col]] %in% do_not_highlight)
360
+      if (keep_not_highlighted) {
361
+        above_threshold <- fisher_df %>%
362
+          dplyr::anti_join(below_threshold, by = gene_sym_col)
363
+      } else {
364
+        above_threshold <- fisher_df %>%
365
+          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
366
+      }
367
+    } else if (rlang::is_formula(do_not_highlight)) {
368
+      below_threshold <- fisher_df %>%
369
+        dplyr::filter(.data[[p_value_col]] < annot_threshold &
370
+                        !rlang::as_function(do_not_highlight)(
371
+                          .data[[gene_sym_col]]))
372
+      if (keep_not_highlighted) {
373
+        above_threshold <- fisher_df %>%
374
+          dplyr::anti_join(below_threshold, by = gene_sym_col)
375
+      } else {
376
+        above_threshold <- fisher_df %>%
377
+          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
378
+      }
379
+    } else {
380
+      below_threshold <- fisher_df %>%
381
+        dplyr::filter(.data[[p_value_col]] < annot_threshold &
382
+                        (rlang::eval_tidy(do_not_highlight)))
383
+      if (keep_not_highlighted) {
384
+        above_threshold <- fisher_df %>%
385
+          dplyr::anti_join(below_threshold, by = gene_sym_col)
386
+      } else {
387
+        above_threshold <- fisher_df %>%
388
+          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
389
+      }
390
+    }
318 391
     plot <- ggplot2::ggplot(
319 392
         above_threshold,
320 393
         ggplot2::aes_(
... ...
@@ -323,7 +396,7 @@ fisher_scatterplot <- function(fisher_df,
323 396
             color = ggplot2::sym(p_value_col)
324 397
         )
325 398
     ) +
326
-        ggplot2::geom_point() +
399
+        ggplot2::geom_point(alpha = 0.65) +
327 400
         ggplot2::geom_point(data = below_threshold, color = annot_color) +
328 401
         ggplot2::theme_bw() +
329 402
         ggplot2::labs(
... ...
@@ -879,9 +952,7 @@ top_abund_tableGrob <- function(df,
879 952
     tops_by_x <- purrr::map(distinct_x, function(x) {
880 953
         tmp <- top %>%
881 954
             dplyr::filter(
882
-                dplyr::across(
883
-                    dplyr::all_of(by), ~ .x == x
884
-                )
955
+              dplyr::if_all(.cols = dplyr::all_of(by), .fns = ~ .x == x)
885 956
             ) %>%
886 957
             dplyr::select(-dplyr::all_of(by))
887 958
         if (nrow(tmp) < top_n) {
... ...
@@ -324,7 +324,7 @@ compute_near_integrations <- function(x,
324 324
             maps <- data.table::rbindlist(list(maps, map_fine))
325 325
         }
326 326
     }
327
-    if (map_as_file) {
327
+    if (map_as_file & !is.null(file_path)) {
328 328
         ### Manage file
329 329
         withCallingHandlers(
330 330
             {
... ...
@@ -6,7 +6,6 @@ output: html_document
6 6
 ```{r include=FALSE}
7 7
 knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
8 8
 options(DT.warn.size = FALSE)
9
-utils::globalVariables("where")
10 9
 ```
11 10
 
12 11
 VISPA2 stats info {data-orientation=rows}
... ...
@@ -62,7 +61,8 @@ cat("*Nothing to report*")
62 61
 
63 62
 ```{r eval=!is.null(iss_stats_miss) && nrow(iss_stats_miss) > 0, echo=FALSE}
64 63
 iss_stats_miss <- iss_stats_miss %>%
65
-  dplyr::mutate(dplyr::across(where(is.character), as.factor))
64
+  dplyr::mutate(dplyr::across(
65
+    tidyselect::vars_select_helpers$where(is.character), as.factor))
66 66
 
67 67
 datatable(iss_stats_miss,
68 68
           class = "stripe",
... ...
@@ -9,7 +9,9 @@ fisher_scatterplot(
9 9
   p_value_col = "Fisher_p_value_fdr",
10 10
   annot_threshold = 0.05,
11 11
   annot_color = "red",
12
-  gene_sym_col = "GeneName"
12
+  gene_sym_col = "GeneName",
13
+  do_not_highlight = NULL,
14
+  keep_not_highlighted = TRUE
13 15
 )
14 16
 }
15 17
 \arguments{
... ...
@@ -24,6 +26,16 @@ the significance threshold. Single numerical value.}
24 26
 annotated}
25 27
 
26 28
 \item{gene_sym_col}{The name of the column containing the gene symbol}
29
+
30
+\item{do_not_highlight}{Either \code{NULL}, a character vector, an expression
31
+or a purrr-style lambda. Tells the function to ignore the highlighting
32
+and labeling of these genes even if their p-value is below the threshold.
33
+See details.}
34
+
35
+\item{keep_not_highlighted}{If present, how should not highlighted genes
36
+be treated? If set to \code{TRUE} points are plotted and colored with the
37
+chosen color scale. If set to \code{FALSE} the points are removed entirely from
38
+the plot.}
27 39
 }
28 40
 \value{
29 41
 A plot
... ...
@@ -33,6 +45,28 @@ A plot
33 45
 Plots results of Fisher's exact test on gene frequency obtained via
34 46
 \code{gene_frequency_fisher()} as a scatterplot.
35 47
 }
48
+\details{
49
+\subsection{Specifying genes to avoid highlighting}{
50
+
51
+In some cases, users might want to avoid highlighting certain genes
52
+even if their p-value is below the threshold. To do so, use the
53
+argument \code{do_not_highlight}: character vectors are appropriate for specific
54
+genes that are to be excluded, expressions or lambdas allow a finer control.
55
+For example we can supply:\if{html}{\out{<div class="sourceCode r">}}\preformatted{expr <- rlang::expr(!stringr::str_starts(GeneName, "MIR") &
56
+                      average_TxLen_1 >= 300)
57
+}\if{html}{\out{</div>}}
58
+
59
+with this expression, genes that have a p-value < threshold and start with
60
+"MIR" or have an average_TxLen_1 lower than 300 are excluded from the
61
+highlighted points.
62
+NOTE: keep in mind that expressions are evaluated inside a \code{dplyr::filter}
63
+context.
64
+
65
+Similarly, lambdas are passed to the filtering function but only operate
66
+on the column containing the gene symbol.\if{html}{\out{<div class="sourceCode r">}}\preformatted{lambda <- ~ stringr::str_starts(.x, "MIR")
67
+}\if{html}{\out{</div>}}
68
+}
69
+}
36 70
 \examples{
37 71
 data("integration_matrices", package = "ISAnalytics")
38 72
 data("association_file", package = "ISAnalytics")
... ...
@@ -113,13 +113,13 @@ test_that(".check_same_info reports only projects of interest", {
113 113
 # Tests .identify_independent_samples
114 114
 #------------------------------------------------------------------------------#
115 115
 test_that(".identify_independent_samples splits joined_df", {
116
-    joined <- minimal_test_coll_meta[minimal_test_coll,
117
-        on = "CompleteAmplificationID"
118
-    ]
119
-    joined <- joined[, mget(c(
116
+    joined <- minimal_test_coll %>%
117
+      dplyr::left_join(minimal_test_coll_meta,
118
+                       by = "CompleteAmplificationID") %>%
119
+      dplyr::select(dplyr::all_of(c(
120 120
         colnames(minimal_test_coll), "SequencingDate",
121 121
         "ReplicateNumber", "ProjectID", "SubjectID"
122
-    ))]
122
+      )))
123 123
 
124 124
     split <- .identify_independent_samples(joined,
125 125
         indep_sample_id = c(
... ...
@@ -795,7 +795,8 @@ test_that(".collisions_check_input_af works as expected", {
795 795
     expect_error(
796 796
         {
797 797
             checks <- .collisions_check_input_af(minimal_test_coll_meta %>%
798
-                dplyr::mutate(SequencingDate = as.character(.data$SequencingDate)),
798
+                dplyr::mutate(SequencingDate = as.character(
799
+                  .data$SequencingDate)),
799 800
             date_col = "SequencingDate",
800 801
             independent_sample_id = c("SubjectID")
801 802
             )