Browse code

[U] For next version - updates to aggregate_metadata and others

Giulia Pais authored on 14/12/2021 12:20:39
Showing 11 changed files

... ...
@@ -134,7 +134,13 @@ default_meta_agg <- function() {
134 134
         "BWA_MAPPED_OVERALL", ~ sum(.x, na.rm = TRUE),
135 135
         NA, "{.col}_sum",
136 136
         "ISS_MAPPED_PP", ~ sum(.x, na.rm = TRUE),
137
-        NA, "{.col}_sum"
137
+        NA, "{.col}_sum",
138
+        "PCRMethod", ~ paste0(unique(.x), collapse = "|"),
139
+        NA, "{.col}",
140
+        "NGSTechnology", ~ paste0(unique(.x), collapse = "|"),
141
+        NA, "{.col}",
142
+        "DNAnumber", ~ paste0(unique(.x), collapse = "|"),
143
+        NA, "{.col}"
138 144
     )
139 145
 }
140 146
 
... ...
@@ -1769,11 +1769,15 @@ purity_filter <- function(x,
1769 1769
 #'     dplyr::filter(.data$Tissue == "PB")
1770 1770
 #' source <- iss_source(df1, df2)
1771 1771
 #' source
1772
-#' ggplot2::ggplot(source$PT001, ggplot2::aes(x = as.factor(g2_TimePoint),
1773
-#'                                            y = sharing_perc, fill = g1)) +
1772
+#' ggplot2::ggplot(source$PT001, ggplot2::aes(
1773
+#'     x = as.factor(g2_TimePoint),
1774
+#'     y = sharing_perc, fill = g1
1775
+#' )) +
1774 1776
 #'     ggplot2::geom_col() +
1775
-#'     ggplot2::labs(x = "Time point", y = "Shared IS % with MNC BM",
1776
-#'                   title = "Source of is MNC BM vs MNC PB")
1777
+#'     ggplot2::labs(
1778
+#'         x = "Time point", y = "Shared IS % with MNC BM",
1779
+#'         title = "Source of is MNC BM vs MNC PB"
1780
+#'     )
1777 1781
 iss_source <- function(reference,
1778 1782
     selection,
1779 1783
     ref_group_key = c(
... ...
@@ -264,15 +264,6 @@ remove_collisions <- function(x,
264 264
         )
265 265
         pre_summary <- .summary_input(pre_process, quant_cols)
266 266
         per_pool_stats_pre <- .per_pool_stats(joined, quant_cols)
267
-        sharing_pre <- is_sharing(joined,
268
-            group_key = c(
269
-                "ProjectID",
270
-                "SubjectID"
271
-            ), n_comp = 2,
272
-            is_count = FALSE,
273
-            minimal = FALSE,
274
-            include_self_comp = TRUE
275
-        )
276 267
         coll_info <- list(
277 268
             coll_n = splitted_df$collisions %>%
278 269
                 dplyr::distinct(
... ...
@@ -293,14 +284,44 @@ remove_collisions <- function(x,
293 284
             post_joined,
294 285
             quant_cols
295 286
         )
296
-        sharing_post <- is_sharing(post_joined,
297
-            group_key = c(
298
-                "ProjectID",
299
-                "SubjectID"
300
-            ), n_comp = 2,
301
-            is_count = FALSE,
302
-            minimal = FALSE,
303
-            include_self_comp = TRUE
287
+        sharing_pre <- sharing_post <- NULL
288
+        withCallingHandlers(
289
+            {
290
+                withRestarts(
291
+                    {
292
+                        sharing_pre <- is_sharing(joined,
293
+                            group_key = c(
294
+                                "ProjectID",
295
+                                "SubjectID"
296
+                            ), n_comp = 2,
297
+                            is_count = FALSE,
298
+                            minimal = FALSE,
299
+                            include_self_comp = TRUE
300
+                        )
301
+                        sharing_post <- is_sharing(post_joined,
302
+                            group_key = c(
303
+                                "ProjectID",
304
+                                "SubjectID"
305
+                            ), n_comp = 2,
306
+                            is_count = FALSE,
307
+                            minimal = FALSE,
308
+                            include_self_comp = TRUE
309
+                        )
310
+                    },
311
+                    sharing_err = function(e) {
312
+                        rlang::inform(c(paste(
313
+                            "Unable to compute sharing:",
314
+                            conditionMessage(e)
315
+                        ),
316
+                        i = "Skipping"
317
+                        ))
318
+                    }
319
+                )
320
+            },
321
+            error = function(cnd) {
322
+                rest <- findRestart("sharing_err")
323
+                invokeRestart(rest, cnd)
324
+            }
304 325
         )
305 326
         summary_tbl <- .summary_table(
306 327
             before = joined, after = post_joined,
... ...
@@ -784,12 +784,24 @@ import_parallel_Vispa2Matrices <- function(association_file,
784 784
             !!!mult_args
785 785
         ))
786 786
     }
787
+    annotation_problems <- if (getOption("ISAnalytics.reports") == TRUE &
788
+        !is.null(report_path)) {
789
+        tmp <- if (!multi_quant_matrix) {
790
+            comparison_matrix(matrices)
791
+        } else {
792
+            matrices
793
+        }
794
+        annotation_issues(tmp)
795
+    } else {
796
+        NULL
797
+    }
787 798
     withCallingHandlers(
788 799
         {
789 800
             .produce_report("matrix_imp",
790 801
                 params = list(
791 802
                     files_found = files_found,
792
-                    files_imp = fimported
803
+                    files_imp = fimported,
804
+                    annot_prob = annotation_problems
793 805
                 ),
794 806
                 path = report_path
795 807
             )
... ...
@@ -894,6 +906,63 @@ import_parallel_Vispa2Matrices_auto <- function(association_file,
894 906
 }
895 907
 
896 908
 
909
#' Check for genomic annotation problems in IS matrices.
#'
#' \lifecycle{experimental}
#' This helper function checks if each individual integration site,
#' identified by the triplet (chr, integration locus, strand),
#' has been annotated with two or more distinct gene symbols.
#'
#' @param matrix Either a single matrix or a list of matrices, ideally obtained
#' via `import_parallel_Vispa2Matrices()` or `import_single_Vispa2Matrix()`
#'
#' @return For a single matrix, either `NULL` if no issues were detected or a
#' data frame with genomic coordinates of the IS and the number of distinct
#' genes associated. For a list of matrices, a list with one such result
#' (`NULL` or a data frame) per input matrix.
#' @export
#'
#' @family Import functions helpers
#'
#' @examples
#' data("integration_matrices", package = "ISAnalytics")
#' annotation_issues(integration_matrices)
annotation_issues <- function(matrix) {
    # Data frames are lists too, so this accepts both supported input shapes
    stopifnot(is.list(matrix))
    # isTRUE() also copes with the option being unset (NULL), which would
    # otherwise make the `if` conditions below fail
    verbose <- isTRUE(getOption("ISAnalytics.verbose"))
    # Returns the problematic integration sites of one matrix, or NULL when
    # there are none (or when required columns are missing)
    find_probs <- function(m) {
        needed <- c(mandatory_IS_vars(), annotation_IS_vars())
        missing_cols <- needed[!needed %in% colnames(m)]
        if (length(missing_cols) > 0) {
            rlang::inform(.missing_needed_cols(missing_cols))
            return(NULL)
        }
        # Count, for each integration site, the distinct combinations of
        # annotation columns it appears with
        issues <- m %>%
            dplyr::select(dplyr::all_of(needed)) %>%
            dplyr::distinct() %>%
            dplyr::group_by(
                dplyr::across(dplyr::all_of(mandatory_IS_vars()))
            ) %>%
            # "drop_last" is the default grouping of the result; stating it
            # explicitly only silences the informational message
            dplyr::summarise(
                distinct_genes = dplyr::n(),
                .groups = "drop_last"
            ) %>%
            dplyr::filter(.data$distinct_genes > 1)
        if (nrow(issues) == 0) {
            return(NULL)
        }
        issues
    }
    if (is.data.frame(matrix)) {
        probs <- find_probs(matrix)
        if (is.null(probs) && verbose) {
            rlang::inform("No annotation issues found")
        }
        return(probs)
    }
    probs <- purrr::map(matrix, find_probs)
    # is.null() is not vectorised over list elements: test each result
    # individually to know whether every matrix came back clean
    if (verbose && all(purrr::map_lgl(probs, is.null))) {
        rlang::inform("No annotation issues found")
    }
    probs
}
965
+
897 966
 
898 967
 #' Possible choices for the `quantification_type` parameter.
899 968
 #'
... ...
@@ -515,9 +515,9 @@ integration_alluvial_plot <- function(x,
515 515
 
516 516
     # Compute plots in parallel
517 517
     p <- BiocParallel::MulticoreParam(
518
-            stop.on.error = FALSE, progressbar = TRUE,
519
-            tasks = length(groups_to_plot), exportglobals = FALSE
520
-        )
518
+        stop.on.error = FALSE, progressbar = TRUE,
519
+        tasks = length(groups_to_plot), exportglobals = FALSE
520
+    )
521 521
 
522 522
 
523 523
     FUN <- function(group_df,
... ...
@@ -13,6 +13,7 @@ output:
13 13
 params:
14 14
   files_found: null
15 15
   files_imp: null
16
+  annot_prob: null
16 17
 ---
17 18
 
18 19
 ```{r setup, include=FALSE}
... ...
@@ -70,8 +71,9 @@ datatable(
70 71
     columnDefs = list(
71 72
       list(targets = 3,
72 73
            render = found_render)
73
-      )
74
-    )
74
+      ),
75
+      scrollY = 350
76
+    ),
75 77
 ) %>% formatStyle(columns = 4, fontWeight = 'bold', textAlign = 'center')
76 78
 ```
77 79
 
... ...
@@ -81,7 +83,7 @@ datatable(
81 83
   files_details,
82 84
   rownames = FALSE, 
83 85
   colnames = stringr::str_replace_all(colnames(files_details), "_", " "), 
84
-  filter = 'top'
86
+  filter = 'top', options = list(scrollY = 350)
85 87
 )
86 88
 ```
87 89
 
... ...
@@ -116,7 +118,8 @@ datatable(
116 118
   rownames = FALSE,
117 119
   colnames = stringr::str_replace_all(colnames(files_imp), "_", " "),
118 120
   options = list(
119
-            order = list(list(4, 'asc'))
121
+            order = list(list(4, 'asc')),
122
+            scrollY = 350
120 123
           )
121 124
 ) %>% formatStyle(columns = "Imported", 
122 125
               color = styleEqual(
... ...
@@ -134,6 +137,31 @@ cat("*Nothing to report*")
134 137
 Summary of files chosen for import. If `Imported` is false, something may
135 138
 have gone wrong during the import phase (most likely I/O errors).
136 139
 
140
+Annotation issues {data-orientation=rows}
141
+===============================================================================
142
+
143
+Row1 {data-height=600 .tabset .tabset-fade}
144
+-------------------------------------------------------------------------------
145
+### Possible annotation problems
146
```{r eval=!is.null(params$annot_prob)}
# Table of integration sites annotated with more than one gene.
# The data is read from `params`, exactly like the chunk's `eval` guard, and
# the header labels are derived from the displayed table itself.
datatable(
  params$annot_prob,
  rownames = FALSE,
  colnames = stringr::str_replace_all(
    colnames(params$annot_prob), "_", " "
  ),
  filter = 'top', options = list(scrollY = 350)
)
```
154
+
155
+```{r eval=is.null(params$annot_prob), results='asis'}
156
+cat("*Nothing to report*")
157
+```
158
+
159
+### About
160
+
161
+Report on annotation issues. This page lists, if any are found, the integration
162
+sites that have been annotated with 2 or more different genes. If the table
163
+isn't empty, you might want to check your matrices or re-run the annotation step.
164
+
137 165
 Reproducibility {data-orientation=rows}
138 166
 ===============================================================================
139 167
 
140 168
new file mode 100644
... ...
@@ -0,0 +1,33 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/import-functions.R
3
+\name{annotation_issues}
4
+\alias{annotation_issues}
5
+\title{Check for genomic annotation problems in IS matrices.}
6
+\usage{
7
+annotation_issues(matrix)
8
+}
9
+\arguments{
10
+\item{matrix}{Either a single matrix or a list of matrices, ideally obtained
11
+via \code{import_parallel_Vispa2Matrices()} or \code{import_single_Vispa2Matrix()}}
12
+}
13
+\value{
14
+Either \code{NULL} if no issues were detected or 1 or more data frames
15
+with genomic coordinates of the IS and the number of distinct
16
+genes associated
17
+}
18
+\description{
19
+\lifecycle{experimental}
20
+This helper function checks if each individual integration site,
21
+identified by the triplet (chr, integration locus, strand),
22
+has been annotated with two or more distinct gene symbols.
23
+}
24
+\examples{
25
+data("integration_matrices", package = "ISAnalytics")
26
+annotation_issues(integration_matrices)
27
+}
28
+\seealso{
29
+Other Import functions helpers: 
30
+\code{\link{matching_options}()},
31
+\code{\link{quantification_types}()}
32
+}
33
+\concept{Import functions helpers}
... ...
@@ -62,11 +62,15 @@ df2 <- aggreg \%>\%
62 62
     dplyr::filter(.data$Tissue == "PB")
63 63
 source <- iss_source(df1, df2)
64 64
 source
65
-ggplot2::ggplot(source$PT001, ggplot2::aes(x = as.factor(g2_TimePoint),
66
-                                           y = sharing_perc, fill = g1)) +
65
+ggplot2::ggplot(source$PT001, ggplot2::aes(
66
+    x = as.factor(g2_TimePoint),
67
+    y = sharing_perc, fill = g1
68
+)) +
67 69
     ggplot2::geom_col() +
68
-    ggplot2::labs(x = "Time point", y = "Shared IS \% with MNC BM",
69
-                  title = "Source of is MNC BM vs MNC PB")
70
+    ggplot2::labs(
71
+        x = "Time point", y = "Shared IS \% with MNC BM",
72
+        title = "Source of is MNC BM vs MNC PB"
73
+    )
70 74
 }
71 75
 \seealso{
72 76
 Other Analysis functions: 
... ...
@@ -32,6 +32,7 @@ opts <- matching_options()
32 32
 \code{\link{import_parallel_Vispa2Matrices_auto}}
33 33
 
34 34
 Other Import functions helpers: 
35
+\code{\link{annotation_issues}()},
35 36
 \code{\link{quantification_types}()}
36 37
 }
37 38
 \concept{Import functions helpers}
... ...
@@ -33,6 +33,7 @@ quant_types <- quantification_types()
33 33
 \code{\link{import_parallel_Vispa2Matrices_auto}}
34 34
 
35 35
 Other Import functions helpers: 
36
+\code{\link{annotation_issues}()},
36 37
 \code{\link{matching_options}()}
37 38
 }
38 39
 \concept{Import functions helpers}
... ...
@@ -187,3 +187,46 @@ test_that("as_sparse_matrix works with list of matrices", {
187 187
         colnames(sparse[[2]])))
188 188
     expect_equal(nrow(sparse[[2]]), 3)
189 189
 })
190
+
191
+#------------------------------------------------------------------------------#
192
+# Tests annotation_issues
193
+#------------------------------------------------------------------------------#
194
# Fixture: the site (1, 123456, +) carries three distinct annotation
# combinations, the other two sites carry exactly one each
test_df_issues <- tibble::tibble(
    chr = c("1", "1", "1", "2", "1"),
    integration_locus = c(123456, 123456, 123456, 5674653, 4578768),
    strand = c("+", "+", "+", "-", "-"),
    GeneName = c("ABCDE", "ABCDE", "FGHIL", "FGHIL", "RSPQX"),
    GeneStrand = c("+", "-", "-", "-", "-"),
    CompleteAmplificationID = c("ID1", "ID2", "ID3", "ID2", "ID3"),
    Value = c(56, 675, 67, 873, 983)
)
203
+
204
# Fixture: every integration site carries a single annotation combination
test_df_no_issues <- tibble::tibble(
    chr = c("1", "1", "1", "2", "1"),
    integration_locus = c(123456, 123456, 123456, 5674653, 4578768),
    strand = c("+", "+", "+", "-", "-"),
    GeneName = c("ABCDE", "ABCDE", "ABCDE", "FGHIL", "RSPQX"),
    GeneStrand = c("+", "+", "+", "-", "-"),
    CompleteAmplificationID = c("ID1", "ID2", "ID3", "ID2", "ID3"),
    Value = c(56, 675, 67, 873, 983)
)
213
+
214
test_that("annotation_issues returns df if issues", {
    res <- annotation_issues(test_df_issues)
    expect_false(is.null(res))
    # One expectation per property so a failure points at the exact mismatch
    expect_equal(nrow(res), 1)
    expect_equal(res$chr, "1")
    expect_equal(res$integration_locus, 123456)
    expect_equal(res$strand, "+")
    # Three distinct (GeneName, GeneStrand) pairs exist for this site
    expect_equal(res$distinct_genes, 3)
})
221
+
222
test_that("annotation_issues returns null if no issues", {
    # A matrix with a single annotation per site yields no report at all
    expect_null(annotation_issues(test_df_no_issues))
})
226
+
227
test_that("annotation_issues works with lists", {
    res <- annotation_issues(list(a = test_df_issues, b = test_df_no_issues))
    # One result per input matrix, NULL entries included
    expect_length(res, 2)
    expect_null(res[[2]])
    expect_equal(nrow(res[[1]]), 1)
})