Browse code

[U] Update to 1.7.3

Giulia Pais authored on 17/06/2022 14:47:19
Showing1 changed files
... ...
@@ -700,7 +700,7 @@ gene_frequency_fisher <- function(cis_x,
700 700
         purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
701 701
     }
702 702
     if (nrow(merged) == 0) {
703
-        if (getOption("ISAnalytics.verbose") == TRUE) {
703
+        if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
704 704
             msg <- c("Data frame empty after filtering",
705 705
                 i = paste(
706 706
                     "Data frame is empty after applying filter on IS,",
... ...
@@ -778,7 +778,7 @@ gene_frequency_fisher <- function(cis_x,
778 778
             dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
779 779
             dplyr::select(-.data$to_exclude_from_test)
780 780
         if (nrow(merged) == 0) {
781
-            if (getOption("ISAnalytics.verbose") == TRUE) {
781
+            if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
782 782
                 msg <- c("Data frame empty after filtering",
783 783
                     i = paste(
784 784
                         "Data frame is after removing unbalanced IS,",
... ...
@@ -951,7 +951,7 @@ sample_statistics <- function(x,
951 951
             result <- result %>%
952 952
                 dplyr::left_join(nIS, by = sample_key)
953 953
         } else {
954
-            if (getOption("ISAnalytics.verbose")) {
954
+            if (getOption("ISAnalytics.verbose", TRUE)) {
955 955
                 rlang::inform(.inform_skip_count_is())
956 956
             }
957 957
         }
... ...
@@ -1015,7 +1015,7 @@ sample_statistics <- function(x,
1015 1015
 #' ("SubjectID" column must be included in the input data frame).
1016 1016
 #' @param return_missing_as_df Returns those genes present in the input df
1017 1017
 #' but not in the refgenes as a data frame?
1018
-#' @param results_as_list Relevant only if `by` is not `NULL` - if `TRUE`
1018
+#' @param results_as_list If `TRUE`
1019 1019
 #' return the group computations as a named list, otherwise return a single
1020 1020
 #' df with an additional column containing the group id
1021 1021
 #'
... ...
@@ -1037,172 +1037,229 @@ CIS_grubbs <- function(x,
1037 1037
     by = NULL,
1038 1038
     return_missing_as_df = TRUE,
1039 1039
     results_as_list = TRUE) {
1040
-    ## Check x has the correct structure
1041
-    stopifnot(is.data.frame(x))
1042
-    ## Check dyn vars for required tags
1043
-    req_mand_vars <- .check_required_cols(c("chromosome", "locus"),
1044
-        vars_df = mandatory_IS_vars(TRUE),
1045
-        duplicate_politic = "error"
1046
-    )
1047
-    chrom_col <- req_mand_vars %>%
1048
-        dplyr::filter(.data$tag == "chromosome") %>%
1049
-        dplyr::pull(.data$names)
1050
-    locus_col <- req_mand_vars %>%
1051
-        dplyr::filter(.data$tag == "locus") %>%
1052
-        dplyr::pull(.data$names)
1053
-    strand_col <- if ("is_strand" %in% mandatory_IS_vars(TRUE)$tag) {
1054
-        col <- mandatory_IS_vars(TRUE) %>%
1055
-            dplyr::filter(.data$tag == "is_strand") %>%
1056
-            dplyr::pull(.data$names)
1057
-        if (col %in% colnames(x)) {
1058
-            col
1059
-        } else {
1060
-            NULL
1061
-        }
1062
-    } else {
1063
-        NULL
1064
-    }
1065
-    req_annot_col <- .check_required_cols(
1066
-        list(gene_symbol = "char", gene_strand = "char"),
1067
-        vars_df = annotation_IS_vars(TRUE),
1068
-        "error"
1040
+    res_checks <- .cis_param_check(
1041
+        x, genomic_annotation_file,
1042
+        grubbs_flanking_gene_bp,
1043
+        threshold_alpha,
1044
+        return_missing_as_df
1069 1045
     )
1070
-    gene_symbol_col <- req_annot_col %>%
1071
-        dplyr::filter(.data$tag == "gene_symbol") %>%
1072
-        dplyr::pull(.data$names)
1073
-    gene_strand_col <- req_annot_col %>%
1074
-        dplyr::filter(.data$tag == "gene_strand") %>%
1075
-        dplyr::pull(.data$names)
1076
-    cols_required <- c(chrom_col, locus_col, gene_symbol_col, gene_strand_col)
1077
-    if (!all(cols_required %in% colnames(x))) {
1078
-        rlang::abort(.missing_user_cols_error(
1079
-            cols_required[!cols_required %in% colnames(x)]
1080
-        ))
1081
-    }
1082
-    # Check other parameters
1083
-    stopifnot(is.data.frame(genomic_annotation_file) ||
1084
-        is.character(genomic_annotation_file))
1085
-    genomic_annotation_file <- genomic_annotation_file[1]
1086
-    if (is.character(genomic_annotation_file) &&
1087
-        !genomic_annotation_file %in% c("hg19", "mm9")) {
1088
-        err_msg <- c("Genomic annotation file unknown",
1089
-            x = paste(
1090
-                "Since ISAnalytics 1.5.4, if provided as",
1091
-                "character vector, `genomic_annotation_file`",
1092
-                "parameter must be one of 'hg19' or 'mm9'"
1093
-            ),
1094
-            i = paste(
1095
-                "For using other genome reference files",
1096
-                "import them in the R environment and pass",
1097
-                "them to the function"
1098
-            )
1099
-        )
1100
-        rlang::abort(err_msg, class = "genomic_file_char")
1101
-    }
1102
-    if (is.character(genomic_annotation_file)) {
1103
-        gen_file <- paste0("refGenes_", genomic_annotation_file)
1104
-        utils::data(list = gen_file, envir = rlang::current_env())
1105
-        refgenes <- rlang::eval_tidy(rlang::sym(gen_file))
1106
-    } else {
1107
-        # Check annotation file format
1108
-        refgenes <- genomic_annotation_file
1109
-        if (!all(refGene_table_cols() %in% colnames(refgenes))) {
1110
-            rlang::abort(.non_standard_annotation_structure())
1111
-        }
1112
-        refgenes <- tibble::as_tibble(refgenes) %>%
1113
-            dplyr::mutate(chrom = stringr::str_replace_all(
1114
-                .data$chrom,
1115
-                "chr", ""
1116
-            ))
1117
-    }
1118
-    stopifnot(is.numeric(grubbs_flanking_gene_bp) ||
1119
-        is.integer(grubbs_flanking_gene_bp))
1120
-    grubbs_flanking_gene_bp <- grubbs_flanking_gene_bp[1]
1121
-    stopifnot(is.numeric(threshold_alpha))
1122
-    threshold_alpha <- threshold_alpha[1]
1123 1046
     stopifnot(is.null(by) || is.character(by))
1124 1047
     if (!all(by %in% colnames(x))) {
1125 1048
         rlang::abort(.missing_user_cols_error(by[!by %in% colnames(x)]))
1126 1049
     }
1127
-    stopifnot(is.logical(return_missing_as_df))
1128
-    return_missing_as_df <- return_missing_as_df[1]
1050
+    result <- list()
1051
+    join_ref_res <- .cis_join_ref(x, res_checks)
1052
+    result <- append(result, join_ref_res[
1053
+        names(join_ref_res) %in% c("missing_genes", "missing_is")
1054
+    ])
1129 1055
     if (is.null(by)) {
1130
-        result <- .cis_grubb_calc(
1131
-            x = x,
1132
-            refgenes = refgenes,
1133
-            grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
1134
-            threshold_alpha = threshold_alpha,
1135
-            gene_symbol_col = gene_symbol_col,
1136
-            gene_strand_col = gene_strand_col,
1137
-            chr_col = chrom_col, locus_col = locus_col,
1138
-            strand_col = strand_col
1056
+        cis <- .cis_grubb_calc(
1057
+            x = join_ref_res$joint_ref,
1058
+            grubbs_flanking_gene_bp = res_checks$grubbs_flanking_gene_bp,
1059
+            threshold_alpha = res_checks$threshold_alpha,
1060
+            gene_symbol_col = res_checks$gene_symbol_col,
1061
+            gene_strand_col = res_checks$gene_strand_col,
1062
+            chr_col = res_checks$chrom_col, locus_col = res_checks$locus_col,
1063
+            strand_col = res_checks$strand_col
1139 1064
         )
1140
-        missing_df <- result$missing
1141 1065
     } else {
1142
-        grouped <- x %>%
1066
+        grouped <- join_ref_res$joint_ref %>%
1143 1067
             dplyr::group_by(dplyr::across(dplyr::all_of(by)))
1144
-        group_keys <- grouped %>%
1068
+        group_ks <- grouped %>%
1145 1069
             dplyr::group_keys() %>%
1146 1070
             tidyr::unite(col = "id", dplyr::everything()) %>%
1147 1071
             dplyr::pull(.data$id)
1148
-        split <- x %>%
1149
-            dplyr::group_by(dplyr::across(dplyr::all_of(by))) %>%
1072
+        split <- grouped %>%
1150 1073
             dplyr::group_split() %>%
1151
-            purrr::set_names(group_keys)
1152
-        result <- purrr::map(split, ~ .cis_grubb_calc(
1074
+            purrr::set_names(group_ks)
1075
+        cis <- purrr::map(split, ~ .cis_grubb_calc(
1153 1076
             x = .x,
1154
-            refgenes = refgenes,
1155
-            grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
1156
-            threshold_alpha = threshold_alpha,
1157
-            gene_symbol_col = gene_symbol_col,
1158
-            gene_strand_col = gene_strand_col,
1159
-            chr_col = chrom_col, locus_col = locus_col,
1160
-            strand_col = strand_col
1077
+            grubbs_flanking_gene_bp = res_checks$grubbs_flanking_gene_bp,
1078
+            threshold_alpha = res_checks$threshold_alpha,
1079
+            gene_symbol_col = res_checks$gene_symbol_col,
1080
+            gene_strand_col = res_checks$gene_strand_col,
1081
+            chr_col = res_checks$chrom_col, locus_col = res_checks$locus_col,
1082
+            strand_col = res_checks$strand_col
1161 1083
         ))
1162
-        missing_df <- purrr::map(result, ~ .x$missing) %>%
1163
-            purrr::reduce(dplyr::bind_rows) %>%
1164
-            dplyr::distinct()
1084
+        if (!results_as_list) {
1085
+            cis <- purrr::map2(cis, names(cis), ~ {
1086
+                .x %>%
1087
+                    dplyr::mutate(group = .y)
1088
+            }) %>% purrr::reduce(dplyr::bind_rows)
1089
+        }
1165 1090
     }
1166
-    if (nrow(missing_df) > 0 & getOption("ISAnalytics.verbose") == TRUE) {
1167
-        warn_miss <- c("Warning: missing genes in refgenes table",
1168
-            i = paste(paste(
1169
-                "A total of", nrow(missing_df),
1170
-                "genes",
1171
-                "were found in the input data but not",
1172
-                "in the refgene table. This may be caused by",
1173
-                "a mismatch in the annotation phase of",
1174
-                "the matrix. Here is a summary: "
1175
-            ),
1176
-            paste0(utils::capture.output({
1177
-                print(missing_df, n = Inf)
1178
-            }), collapse = "\n"),
1179
-            sep = "\n"
1091
+    result$cis <- cis
1092
+    return(result)
1093
+}
1094
+
1095
+
1096
+#' Compute CIS and Grubbs test over different time points and groups.
1097
+#'
1098
+#' @description
1099
+#' `r lifecycle::badge("experimental")`
1100
+#' Computes common insertion sites and Grubbs test for each separate group
1101
+#' and separating different time points among the same group. The logic
1102
+#' applied is the same as the function `CIS_grubbs()`.
1103
+#'
1104
+#' @inherit CIS_grubbs details
1105
+#'
1106
+#' @inheritParams CIS_grubbs
1107
+#' @param group A character vector of column names that identifies a group.
1108
+#' Each group must contain one or more time points.
1109
+#' @param timepoint_col What is the name of the column containing time points?
1110
+#' @param as_df Choose the result format: if `TRUE` the results are returned
1111
+#' as a single data frame containing a column for the group id and a column
1112
+#' for the time point, if `FALSE` results are returned in the form of nested
1113
+#' lists (one table for each time point and for each group), if `"group"`
1114
+#' results are returned as a list separated for each group but containing a
1115
+#' single table with all time points.
1116
+#' @param max_workers Maximum number of parallel workers. If `NULL` the
1117
+#' maximum number of workers is calculated automatically.
1118
+#'
1119
+#' @return A list with results and optionally missing genes info
1120
+#' @export
1121
+#'
1122
+#' @examples
1123
+#' data("integration_matrices", package = "ISAnalytics")
1124
+#' data("association_file", package = "ISAnalytics")
1125
+#' aggreg <- aggregate_values_by_key(
1126
+#'     x = integration_matrices,
1127
+#'     association_file = association_file,
1128
+#'     value_cols = c("seqCount", "fragmentEstimate")
1129
+#' )
1130
+#' cis_overtime <- CIS_grubbs_overtime(aggreg)
1131
+#' cis_overtime
1132
+CIS_grubbs_overtime <- function(x,
1133
+    genomic_annotation_file = "hg19",
1134
+    grubbs_flanking_gene_bp = 100000,
1135
+    threshold_alpha = 0.05,
1136
+    group = "SubjectID",
1137
+    timepoint_col = "TimePoint",
1138
+    as_df = TRUE,
1139
+    return_missing_as_df = TRUE,
1140
+    max_workers = NULL) {
1141
+    result <- list()
1142
+    res_checks <- .cis_param_check(
1143
+        x, genomic_annotation_file,
1144
+        grubbs_flanking_gene_bp, threshold_alpha,
1145
+        return_missing_as_df
1146
+    )
1147
+    stopifnot(is.character(group))
1148
+    stopifnot(is.character(timepoint_col))
1149
+    stopifnot(is.null(max_workers) || is.numeric(max_workers))
1150
+    timepoint_col <- timepoint_col[1]
1151
+    if (!all(c(group, timepoint_col) %in% colnames(x))) {
1152
+        rlang::abort(.missing_user_cols_error(
1153
+            c(group, timepoint_col)[!c(group, timepoint_col) %in% colnames(x)]
1154
+        ))
1155
+    }
1156
+    # Manage workers
1157
+    if (.Platform$OS.type == "windows" & is.null(max_workers)) {
1158
+        max_workers <- BiocParallel::snowWorkers()
1159
+    } else if (.Platform$OS.type != "windows" & is.null(max_workers)) {
1160
+        max_workers <- BiocParallel::multicoreWorkers()
1161
+    }
1162
+    stopifnot(is.logical(as_df) || is.character(as_df))
1163
+    result_as_df <- if (is.logical(as_df)) {
1164
+        if (as_df[1] == TRUE) {
1165
+            1
1166
+        } else {
1167
+            0
1168
+        }
1169
+    } else if (is.character(as_df) & as_df[1] == "group") {
1170
+        2
1171
+    } else {
1172
+        err_as_df <- c("The argument `as_df` must be one of the allowed values",
1173
+            x = paste(
1174
+                "Arg can be either logical (T/F) or",
1175
+                "equal to 'group'"
1180 1176
             ),
1181
-            i = paste(
1182
-                "NOTE: missing genes will be removed from",
1183
-                "the final output! Review results carefully"
1184
-            )
1177
+            i = paste("See `?CIS_grubbs_overtime`")
1185 1178
         )
1186
-        rlang::inform(warn_miss, class = "warn_miss_genes")
1179
+        rlang::abort(err_as_df)
1187 1180
     }
1188
-    if (!is.null(by)) {
1189
-        result <- if (results_as_list) {
1190
-            purrr::map(result, ~ .x$df)
1181
+    join_ref_res <- .cis_join_ref(x, res_checks)
1182
+    result <- append(result, join_ref_res[
1183
+        names(join_ref_res) %in% c("missing_genes", "missing_is")
1184
+    ])
1185
+    # --- Split according to group
1186
+    split <- join_ref_res$joint_ref %>%
1187
+        dplyr::group_by(dplyr::across(dplyr::all_of(group)))
1188
+    keys_g <- split %>%
1189
+        dplyr::group_keys() %>%
1190
+        tidyr::unite(col = "id") %>%
1191
+        dplyr::pull(.data$id)
1192
+    split <- split %>%
1193
+        dplyr::group_split() %>%
1194
+        purrr::set_names(keys_g)
1195
+
1196
+    # --- Calculate for each group and each tp
1197
+    tp_slice_cis <- function(df, timepoint_col, res_checks, result_as_df) {
1198
+        tmp <- df %>%
1199
+            dplyr::group_by(dplyr::across(dplyr::all_of(timepoint_col)))
1200
+        g_keys <- tmp %>%
1201
+            dplyr::group_keys() %>%
1202
+            dplyr::pull(!!timepoint_col)
1203
+        tmp <- tmp %>%
1204
+            dplyr::group_split() %>%
1205
+            purrr::set_names(g_keys)
1206
+        res <- if (result_as_df == 0) {
1207
+            purrr::map(tmp, ~ .cis_grubb_calc(
1208
+                x = .x,
1209
+                grubbs_flanking_gene_bp = res_checks$grubbs_flanking_gene_bp,
1210
+                threshold_alpha = res_checks$threshold_alpha,
1211
+                gene_symbol_col = res_checks$gene_symbol_col,
1212
+                gene_strand_col = res_checks$gene_strand_col,
1213
+                chr_col = res_checks$chrom_col, locus_col = res_checks$locus_col,
1214
+                strand_col = res_checks$strand_col
1215
+            ))
1191 1216
         } else {
1192
-            purrr::map2(result, names(result), ~ {
1193
-                .x$df %>%
1194
-                    dplyr::mutate(group = .y)
1195
-            }) %>% purrr::reduce(dplyr::bind_rows)
1196
-        }
1197
-        if (return_missing_as_df) {
1198
-            return(list(cis = result, missing_genes = missing_df))
1217
+            purrr::map2_df(tmp, names(tmp), ~ .cis_grubb_calc(
1218
+                x = .x,
1219
+                grubbs_flanking_gene_bp = res_checks$grubbs_flanking_gene_bp,
1220
+                threshold_alpha = res_checks$threshold_alpha,
1221
+                gene_symbol_col = res_checks$gene_symbol_col,
1222
+                gene_strand_col = res_checks$gene_strand_col,
1223
+                chr_col = res_checks$chrom_col, locus_col = res_checks$locus_col,
1224
+                strand_col = res_checks$strand_col
1225
+            ) %>% dplyr::mutate(!!timepoint_col := .y))
1199 1226
         }
1200
-        return(result)
1227
+        return(res)
1201 1228
     }
1202
-    if (return_missing_as_df) {
1203
-        return(list(cis = result$df, missing_genes = missing_df))
1229
+
1230
+    if (.Platform$OS.type == "windows") {
1231
+        p <- BiocParallel::SnowParam(
1232
+            stop.on.error = TRUE,
1233
+            progressbar = getOption("ISAnalytics.verbose", TRUE),
1234
+            tasks = length(split),
1235
+            workers = max_workers
1236
+        )
1237
+    } else {
1238
+        p <- BiocParallel::MulticoreParam(
1239
+            stop.on.error = TRUE,
1240
+            progressbar = getOption("ISAnalytics.verbose", TRUE),
1241
+            tasks = length(split),
1242
+            workers = max_workers
1243
+        )
1204 1244
     }
1205
-    return(result$df)
1245
+    cis_overtime <- BiocParallel::bplapply(split,
1246
+        FUN = tp_slice_cis,
1247
+        timepoint_col = timepoint_col,
1248
+        res_checks = res_checks,
1249
+        result_as_df = result_as_df,
1250
+        BPPARAM = p
1251
+    )
1252
+    BiocParallel::bpstop(p)
1253
+
1254
+    if (result_as_df == 1) {
1255
+        cis_overtime <- purrr::map2_df(
1256
+            cis_overtime, names(cis_overtime),
1257
+            ~ .x %>%
1258
+                dplyr::mutate(group = .y)
1259
+        )
1260
+    }
1261
+    result$cis <- cis_overtime
1262
+    return(result)
1206 1263
 }
1207 1264
 
1208 1265
 #' Integrations cumulative count in time by sample
... ...
@@ -1500,7 +1557,7 @@ is_sharing <- function(...,
1500 1557
             rlang::abort(.keys_not_char_err())
1501 1558
         }
1502 1559
         if (length(unique(group_keys)) == 1) {
1503
-            if (getOption("ISAnalytics.verbose") == TRUE) {
1560
+            if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
1504 1561
                 one_key_list <- c("Single key in list",
1505 1562
                     i = paste(
1506 1563
                         "Provided a single key in list,",
... ...
@@ -1670,7 +1727,7 @@ is_sharing <- function(...,
1670 1727
             venn = table_for_venn
1671 1728
         )
1672 1729
     }
1673
-    if (getOption("ISAnalytics.verbose") == TRUE) {
1730
+    if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
1674 1731
         rlang::inform("Done!")
1675 1732
     }
1676 1733
     return(sharing)
Browse code

[UPDATE] Final partial update for 1.5.4 - finished documentation, ready to merge

Giulia Pais authored on 20/04/2022 15:58:57
Showing1 changed files
... ...
@@ -4,7 +4,8 @@
4 4
 
5 5
 #' Computes the abundance for every integration event in the input data frame.
6 6
 #'
7
-#' \lifecycle{stable}
7
+#' @description
8
+#' `r lifecycle::badge("stable")`
8 9
 #' Abundance is obtained for every integration event by calculating the ratio
9 10
 #' between the single value and the total value for the given group.
10 11
 #'
... ...
@@ -13,6 +14,11 @@
13 14
 #' relative abundance column (and optionally a percentage abundance
14 15
 #' column) will be produced.
15 16
 #'
17
+#' @section Required tags:
18
+#' The function will explicitly check for the presence of these tags:
19
+#'
20
+#' * All columns declared in `mandatory_IS_vars()`
21
+#'
16 22
 #' @param x An integration matrix - aka a data frame that includes
17 23
 #' the `mandatory_IS_vars()` as columns. The matrix can either be aggregated
18 24
 #' (via `aggregate_values_by_key()`) or not.
... ...
@@ -29,12 +35,6 @@
29 35
 #'
30 36
 #' @family Analysis functions
31 37
 #'
32
-#' @importFrom magrittr `%>%`
33
-#' @importFrom dplyr group_by across all_of summarise left_join mutate
34
-#' @importFrom dplyr cur_column distinct select contains rename_with
35
-#' @importFrom rlang .data eval_tidy parse_expr abort
36
-#' @importFrom purrr map_lgl
37
-#' @importFrom stringr str_replace
38 38
 #' @return Either a single data frame with computed abundance values or
39 39
 #' a list of 2 data frames (abundance_df, quant_totals)
40 40
 #' @export
... ...
@@ -130,7 +130,7 @@ compute_abundance <- function(x,
130 130
 #' Filter data frames with custom predicates
131 131
 #'
132 132
 #' @description
133
-#' \lifecycle{experimental}
133
+#' `r lifecycle::badge("stable")`
134 134
 #' Filter a single data frame or a list of data frames with custom
135 135
 #' predicates assembled from the function parameters.
136 136
 #'
... ...
@@ -242,7 +242,7 @@ compute_abundance <- function(x,
242 242
 #' character vectors. Must be one of the allowed values between
243 243
 #' `c("<", ">", "==", "!=", ">=", "<=")`
244 244
 #'
245
-#' @family Analysis functions
245
+#' @family Data cleaning and pre-processing
246 246
 #'
247 247
 #' @return A data frame or a list of data frames
248 248
 #' @export
... ...
@@ -291,7 +291,8 @@ threshold_filter <- function(x,
291 291
 #' Sorts and keeps the top n integration sites based on the values
292 292
 #' in a given column.
293 293
 #'
294
-#' \lifecycle{experimental}
294
+#' @description
295
+#' `r lifecycle::badge("stable")`
295 296
 #' The input data frame will be sorted by the highest values in
296 297
 #' the columns specified and the top n rows will be returned as output.
297 298
 #' The user can choose to keep additional columns in the output
... ...
@@ -300,6 +301,11 @@ threshold_filter <- function(x,
300 301
 #' * `keep = "nothing"` only keeps the mandatory columns
301 302
 #' (`mandatory_IS_vars()`) plus the columns in the `columns` parameter.
302 303
 #'
304
+#' @section Required tags:
305
+#' The function will explicitly check for the presence of these tags:
306
+#'
307
+#' * All columns declared in `mandatory_IS_vars()`
308
+#'
303 309
 #' @param x An integration matrix (data frame containing
304 310
 #' `mandatory_IS_vars()`)
305 311
 #' @param n How many integrations should be sliced (in total or
... ...
@@ -316,8 +322,6 @@ threshold_filter <- function(x,
316 322
 #'
317 323
 #' @family Analysis functions
318 324
 #'
319
-#' @importFrom magrittr `%>%`
320
-#' @importFrom rlang abort
321 325
 #'
322 326
 #' @return Either a data frame with at most n rows or
323 327
 #' a data frames with at most n*(number of groups) rows.
... ...
@@ -344,7 +348,7 @@ threshold_filter <- function(x,
344 348
 #'     keep = "Value2",
345 349
 #'     key = "CompleteAmplificationID"
346 350
 #' )
347
-#top_abundant_is
351
+# top_abundant_is
348 352
 top_integrations <- function(x,
349 353
     n = 20,
350 354
     columns = "fragmentEstimate_sum_RelAbundance",
... ...
@@ -408,6 +412,75 @@ top_integrations <- function(x,
408 412
 }
409 413
 
410 414
 
415
+#' Top n targeted genes based on number of IS.
416
+#'
417
+#' @description
418
+#' `r lifecycle::badge("experimental")`
419
+#' Produces a summary of the number of integration events per gene, orders
420
+#' the table in decreasing order and slices the first n rows - either on
421
+#' all the data frame or by group.
422
+#'
423
+#' @details
424
+#' ## Gene grouping
425
+#' When producing a summary of IS by gene, there are different options that
426
+#' can be chosen.
427
+#' The argument `consider_chr` accounts for the fact that some genes (same
428
+#' gene symbol) may span more than one chromosome: if set to `TRUE`
429
+#' counts of IS will be separated for those genes that span 2 or more
430
+#' chromosomes - in other words they will be in 2 different rows of the
431
+#' output table. On the contrary, if the argument is set to `FALSE`,
432
+#' counts will be produced in a single row.
433
+#'
434
+#' NOTE: the function counts **DISTINCT** integration events, which logically
435
+#' corresponds to a union of sets. Be aware of the fact that counts per group
436
+#' and counts with different arguments might be different: if for example
437
+#' counts are performed by considering chromosome and there is one gene symbol
438
+#' with 2 different counts, the sum of those 2 will likely not be equal to
439
+#' the count obtained by performing the calculations without
440
+#' considering the chromosome.
441
+#'
442
+#' The same reasoning can be applied for the argument `consider_gene_strand`,
443
+#' that takes into account the strand of the gene.
444
+#'
445
+#' @section Required tags:
446
+#' The function will explicitly check for the presence of these tags:
447
+#'
448
+#' ```{r echo=FALSE, results="asis"}
449
+#' all_tags <- available_tags()
450
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
451
+#'  ~ "top_targeted_genes" %in% .x)][["tag"]])
452
+#'  cat(paste0("* ", needed, collapse="\n"))
453
+#' ```
454
+#'
455
+#' Note that the tags "gene_strand" and "chromosome" are explicitly required
456
+#' only if `consider_chr = TRUE` and/or `consider_gene_strand = TRUE`.
457
+#'
458
+#' @param x An integration matrix - must be annotated
459
+#' @param n Number of rows to slice
460
+#' @param key If slice has to be performed for each group, the character
461
+#' vector of column names that identify the groups. If `NULL` considers the
462
+#' whole input data frame.
463
+#' @param consider_chr Logical, should the chromosome be taken into account?
464
+#' See details.
465
+#' @param consider_gene_strand Logical, should the gene strand be taken into
466
+#' account? See details.
467
+#' @param as_df If computation is performed by group, `TRUE` returns all
468
+#' groups merged in a single data frame with a column containing the group id.
469
+#' If `FALSE` returns a named list.
470
+#'
471
+#' @importFrom rlang sym
472
+#'
473
+#' @return A data frame or a list of data frames
474
+#' @export
475
+#' @family Analysis functions
476
+#'
477
+#' @examples
478
+#' data("integration_matrices", package = "ISAnalytics")
479
+#' top_targ <- top_targeted_genes(
480
+#'     integration_matrices,
481
+#'     key = NULL
482
+#' )
483
+#' top_targ
411 484
 top_targeted_genes <- function(x,
412 485
     n = 20,
413 486
     key = c(
... ...
@@ -417,70 +490,134 @@ top_targeted_genes <- function(x,
417 490
     consider_chr = TRUE,
418 491
     consider_gene_strand = TRUE,
419 492
     as_df = TRUE) {
420
-  stopifnot(is.data.frame(x))
421
-  data.table::setDT(x)
422
-  stopifnot(is.numeric(n) || is.integer(n))
423
-  stopifnot(is.null(key) || is.character(key))
424
-  stopifnot(is.logical(consider_chr))
425
-  stopifnot(is.logical(consider_gene_strand))
493
+    stopifnot(is.data.frame(x))
494
+    data.table::setDT(x)
495
+    stopifnot(is.numeric(n) || is.integer(n))
496
+    stopifnot(is.null(key) || is.character(key))
497
+    stopifnot(is.logical(consider_chr))
498
+    stopifnot(is.logical(consider_gene_strand))
426 499
 
427
-  required_annot_tags <- c("gene_symbol")
428
-  if (consider_gene_strand) {
429
-    required_annot_tags <- c(required_annot_tags, "gene_strand")
430
-  }
431
-  annot_tag_cols <- .check_required_cols(required_annot_tags,
432
-                                         annotation_IS_vars(TRUE),
433
-                                         "error")
434
-  if (consider_chr) {
435
-    chr_tag_col <- .check_required_cols(c("chromosome", "locus"),
436
-                                        mandatory_IS_vars(TRUE),
437
-                                        "error")
438
-    annot_tag_cols <- annot_tag_cols %>%
439
-      dplyr::bind_rows(chr_tag_col)
440
-  }
441
-  data.table::setDT(annot_tag_cols)
442
-  cols_to_check <- c(annot_tag_cols$names, key)
443
-  if (!all(cols_to_check %in% colnames(x))) {
444
-    rlang::abort(.missing_needed_cols(
445
-      cols_to_check[!cols_to_check %in% colnames(x)]))
446
-  }
500
+    required_annot_tags <- c("gene_symbol")
501
+    if (consider_gene_strand) {
502
+        required_annot_tags <- c(required_annot_tags, "gene_strand")
503
+    }
504
+    annot_tag_cols <- .check_required_cols(
505
+        required_annot_tags,
506
+        annotation_IS_vars(TRUE),
507
+        "error"
508
+    )
509
+    if (consider_chr) {
510
+        chr_tag_col <- .check_required_cols(
511
+            c("chromosome", "locus"),
512
+            mandatory_IS_vars(TRUE),
513
+            "error"
514
+        )
515
+        annot_tag_cols <- annot_tag_cols %>%
516
+            dplyr::bind_rows(chr_tag_col)
517
+    }
518
+    data.table::setDT(annot_tag_cols)
519
+    cols_to_check <- c(annot_tag_cols$names, key)
520
+    if (!all(cols_to_check %in% colnames(x))) {
521
+        rlang::abort(.missing_needed_cols(
522
+            cols_to_check[!cols_to_check %in% colnames(x)]
523
+        ))
524
+    }
447 525
 
448
-  df_with_is_counts <- if (is.null(key)) {
449
-    .count_distinct_is_per_gene(
450
-      x = x, include_chr = consider_chr,
451
-      include_gene_strand = consider_gene_strand,
452
-      gene_sym_col = annot_tag_cols[
453
-        eval(sym("tag")) == "gene_symbol"][["names"]],
454
-      gene_strand_col = annot_tag_cols[
455
-        eval(sym("tag")) == "gene_strand"][["names"]],
456
-      chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]],
457
-      mand_vars_to_check = mandatory_IS_vars(TRUE)
458
-    ) %>%
459
-      dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
460
-      dplyr::slice_head(n = n)
461
-  } else {
462
-    tmp <- x[, .count_distinct_is_per_gene(
463
-      x = .SD, include_chr = consider_chr,
464
-      include_gene_strand = consider_gene_strand,
465
-      gene_sym_col = annot_tag_cols[
466
-        eval(sym("tag")) == "gene_symbol"][["names"]],
467
-      gene_strand_col = annot_tag_cols[
468
-        eval(sym("tag")) == "gene_strand"][["names"]],
469
-      chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]],
470
-      mand_vars_to_check = mandatory_IS_vars(TRUE)
471
-    ), by = eval(key)]
472
-    tmp[,
473
-        .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
474
-          dplyr::slice_head(n = n),
475
-        by = eval(key)]
476
-  }
477
-  if (as_df) {
478
-    return(df_with_is_counts)
479
-  }
480
-  return(split(df_with_is_counts, by = key))
526
+    df_with_is_counts <- if (is.null(key)) {
527
+        .count_distinct_is_per_gene(
528
+            x = x, include_chr = consider_chr,
529
+            include_gene_strand = consider_gene_strand,
530
+            gene_sym_col = annot_tag_cols[
531
+                eval(sym("tag")) == "gene_symbol"
532
+            ][["names"]],
533
+            gene_strand_col = annot_tag_cols[
534
+                eval(sym("tag")) == "gene_strand"
535
+            ][["names"]],
536
+            chr_col = annot_tag_cols[eval(sym("tag")) ==
537
+                "chromosome"][["names"]],
538
+            mand_vars_to_check = mandatory_IS_vars(TRUE)
539
+        ) %>%
540
+            dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
541
+            dplyr::slice_head(n = n)
542
+    } else {
543
+        tmp <- x[, .count_distinct_is_per_gene(
544
+            x = .SD, include_chr = consider_chr,
545
+            include_gene_strand = consider_gene_strand,
546
+            gene_sym_col = annot_tag_cols[
547
+                eval(sym("tag")) == "gene_symbol"
548
+            ][["names"]],
549
+            gene_strand_col = annot_tag_cols[
550
+                eval(sym("tag")) == "gene_strand"
551
+            ][["names"]],
552
+            chr_col = annot_tag_cols[eval(sym("tag")) ==
553
+                "chromosome"][["names"]],
554
+            mand_vars_to_check = mandatory_IS_vars(TRUE)
555
+        ), by = eval(key)]
556
+        tmp[,
557
+            .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
558
+                dplyr::slice_head(n = n),
559
+            by = eval(key)
560
+        ]
561
+    }
562
+    if (as_df) {
563
+        return(df_with_is_counts)
564
+    }
565
+    return(split(df_with_is_counts, by = key))
481 566
 }
482 567
 
483 568
 
569
+#' Compute Fisher's exact test on gene frequencies.
570
+#'
571
+#' @description
572
+#' `r lifecycle::badge("experimental")`
573
+#' Provided 2 data frames with calculations for CIS, via `CIS_grubbs()`,
574
+#' computes Fisher's exact test.
575
+#' Results can be plotted via `fisher_scatterplot()`.
576
+#'
577
+#' @param cis_x A data frame obtained via `CIS_grubbs()`
578
+#' @param cis_y A data frame obtained via `CIS_grubbs()`
579
+#' @param min_is_per_gene Used for pre-filtering purposes. Genes with a
580
+#' number of distinct integration less than this number will be filtered out
581
+#' prior calculations. Single numeric or integer.
582
+#' @param gene_set_method One between "intersection" and "union". When merging
583
+#' the 2 data frames, `intersection` will perform an inner join operation,
584
+#' while `union` will perform a full join operation.
585
+#' @param significance_threshold Significance threshold for the Fisher's
586
+#' test p-value
587
+#' @param remove_unbalanced_0 Remove from the final output those pairs in
588
+#' which there are no IS for one group or the other and the number of
589
+#' IS of the non-missing group are less than the mean number of IS for that
590
+#' group
591
+#'
592
+#' @template genes_db
593
+#'
594
+#' @section Required tags:
595
+#' The function will explicitly check for the presence of these tags:
596
+#'
597
+#' ```{r echo=FALSE, results="asis"}
598
+#' all_tags <- available_tags()
599
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
600
+#'  ~ "gene_frequency_fisher" %in% .x)][["tag"]])
601
+#'  cat(paste0("* ", needed, collapse="\n"))
602
+#' ```
603
+#'
604
+#' @return A data frame
605
+#' @export
606
+#' @family Analysis functions
607
+#'
608
+#' @examples
609
+#' data("integration_matrices", package = "ISAnalytics")
610
+#' data("association_file", package = "ISAnalytics")
611
+#' aggreg <- aggregate_values_by_key(
612
+#'     x = integration_matrices,
613
+#'     association_file = association_file,
614
+#'     value_cols = c("seqCount", "fragmentEstimate")
615
+#' )
616
+#' cis <- CIS_grubbs(aggreg, by = "SubjectID")
617
+#' fisher <- gene_frequency_fisher(cis$cis$PT001, cis$cis$PT002,
618
+#'     min_is_per_gene = 2
619
+#' )
620
+#' fisher
484 621
 gene_frequency_fisher <- function(cis_x,
485 622
     cis_y,
486 623
     min_is_per_gene = 3,
... ...
@@ -506,142 +643,169 @@ gene_frequency_fisher <- function(cis_x,
506 643
     stopifnot(is.logical(remove_unbalanced_0))
507 644
     ## -- Fetch gene symbol column
508 645
     gene_sym_col <- .check_required_cols(
509
-      "gene_symbol", annotation_IS_vars(TRUE), duplicate_politic = "error"
646
+        "gene_symbol", annotation_IS_vars(TRUE),
647
+        duplicate_politic = "error"
510 648
     )[["names"]]
511
-    req_cis_cols <- c(gene_sym_col, "n_IS_perGene", "average_TxLen",
512
-                      "raw_gene_integration_frequency")
649
+    req_cis_cols <- c(
650
+        gene_sym_col, "n_IS_perGene", "average_TxLen",
651
+        "raw_gene_integration_frequency"
652
+    )
513 653
     quiet_expand <- purrr::quietly(.expand_cis_df)
514
-    cols_for_join <- c(gene_sym_col,
515
-                     "Onco1_TS2", "ClinicalRelevance", "DOIReference",
516
-                     "KnownGeneClass", "KnownClonalExpansion",
517
-                     "CriticalForInsMut")
654
+    cols_for_join <- c(
655
+        gene_sym_col,
656
+        "Onco1_TS2", "ClinicalRelevance", "DOIReference",
657
+        "KnownGeneClass", "KnownClonalExpansion",
658
+        "CriticalForInsMut"
659
+    )
518 660
     ## --- Calculations to perform on each df
519 661
     append_calc <- function(df, group_n) {
520
-      if (!all(req_cis_cols %in% colnames(df))) {
521
-        rlang::abort(
522
-          .missing_needed_cols(req_cis_cols[!req_cis_cols %in% colnames(df)]))
523
-      }
524
-      modified <- quiet_expand(df, gene_sym_col,
525
-                                 onco_db_file, tumor_suppressors_db_file,
526
-                                 species, known_onco, suspicious_genes)$result
527
-      modified <- modified %>%
528
-        dplyr::mutate(
529
-          IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000,
530
-          Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen, na.rm = TRUE),
531
-          IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen /
532
-                                              .data$Sum_IS_per_kbGeneLen) * 1e6
533
-        ) %>%
534
-        dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>%
535
-        dplyr::select(dplyr::all_of(c(req_cis_cols, cols_for_join,
536
-                                      "IS_per_kbGeneLen",
537
-                                      "Sum_IS_per_kbGeneLen",
538
-                                      "IS_per_kbGeneLen_perMDepth_TPM")))
539
-      colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste(
540
-        colnames(modified)[!colnames(modified) %in% cols_for_join], group_n,
541
-        sep = "_"
542
-      )
543
-      return(modified)
662
+        if (!all(req_cis_cols %in% colnames(df))) {
663
+            rlang::abort(
664
+                .missing_needed_cols(req_cis_cols[!req_cis_cols %in%
665
+                    colnames(df)])
666
+            )
667
+        }
668
+        modified <- quiet_expand(
669
+            df, gene_sym_col,
670
+            onco_db_file, tumor_suppressors_db_file,
671
+            species, known_onco, suspicious_genes
672
+        )$result
673
+        modified <- modified %>%
674
+            dplyr::mutate(
675
+                IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000,
676
+                Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen,
677
+                    na.rm = TRUE
678
+                ),
679
+                IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen /
680
+                    .data$Sum_IS_per_kbGeneLen) * 1e6
681
+            ) %>%
682
+            dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>%
683
+            dplyr::select(dplyr::all_of(c(
684
+                req_cis_cols, cols_for_join,
685
+                "IS_per_kbGeneLen",
686
+                "Sum_IS_per_kbGeneLen",
687
+                "IS_per_kbGeneLen_perMDepth_TPM"
688
+            )))
689
+        colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste(
690
+            colnames(modified)[!colnames(modified) %in% cols_for_join], group_n,
691
+            sep = "_"
692
+        )
693
+        return(modified)
544 694
     }
545
-    cis_mod <- purrr::map2(list(cis_x, cis_y), c(1,2), append_calc)
695
+    cis_mod <- purrr::map2(list(cis_x, cis_y), c(1, 2), append_calc)
546 696
     ## --- Merge the two in 1 df
547 697
     merged <- if (gene_set_method == "union") {
548
-      purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join))
698
+        purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join))
549 699
     } else {
550
-      purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
700
+        purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
551 701
     }
552 702
     if (nrow(merged) == 0) {
553
-      if (getOption("ISAnalytics.verbose") == TRUE) {
554
-        msg <- c("Data frame empty after filtering",
555
-                 i = paste("Data frame is empty after applying filter on IS,",
556
-                           "is your filter too stringent?"),
557
-                 x = "Nothing to return")
558
-        rlang::inform(msg, class = "empty_df_gene_freq")
559
-      }
560
-      return(NULL)
703
+        if (getOption("ISAnalytics.verbose") == TRUE) {
704
+            msg <- c("Data frame empty after filtering",
705
+                i = paste(
706
+                    "Data frame is empty after applying filter on IS,",
707
+                    "is your filter too stringent?"
708
+                ),
709
+                x = "Nothing to return"
710
+            )
711
+            rlang::inform(msg, class = "empty_df_gene_freq")
712
+        }
713
+        return(NULL)
561 714
     }
562 715
     ## --- Actual computation of fisher test: test is applied on each row
563 716
     ## (each gene)
564 717
     merged <- merged %>%
565
-      dplyr::mutate(
566
-        tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE),
567
-        tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE)
568
-      )
718
+        dplyr::mutate(
719
+            tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE),
720
+            tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE)
721
+        )
569 722
     compute_fisher <- function(...) {
570
-      row <- list(...)
571
-      n_IS_perGene_1 <- row$n_IS_perGene_1
572
-      n_IS_perGene_2 <- row$n_IS_perGene_2
573
-      n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0
574
-      n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0
575
-      matrix <- matrix(
576
-        data = c(n_IS_perGene_1,
577
-                 row$tot_n_IS_perGene_1 - n_IS_perGene_1,
578
-                 n_IS_perGene_2,
579
-                 row$tot_n_IS_perGene_2 - n_IS_perGene_2),
580
-        nrow = 2,
581
-        dimnames = list(G1 = c("IS_of_gene", "TotalIS"),
582
-                        G2 = c("IS_of_gene", "TotalIS"))
583
-      )
584
-      ft <- stats::fisher.test(matrix)
585
-      return(ft$p.value)
723
+        row <- list(...)
724
+        n_IS_perGene_1 <- row$n_IS_perGene_1
725
+        n_IS_perGene_2 <- row$n_IS_perGene_2
726
+        n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0
727
+        n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0
728
+        matrix <- matrix(
729
+            data = c(
730
+                n_IS_perGene_1,
731
+                row$tot_n_IS_perGene_1 - n_IS_perGene_1,
732
+                n_IS_perGene_2,
733
+                row$tot_n_IS_perGene_2 - n_IS_perGene_2
734
+            ),
735
+            nrow = 2,
736
+            dimnames = list(
737
+                G1 = c("IS_of_gene", "TotalIS"),
738
+                G2 = c("IS_of_gene", "TotalIS")
739
+            )
740
+        )
741
+        ft <- stats::fisher.test(matrix)
742
+        return(ft$p.value)
586 743
     }
587 744
     merged <- merged %>%
588
-      dplyr::mutate(
589
-        Fisher_p_value = purrr::pmap_dbl(., compute_fisher)
590
-      ) %>%
591
-      dplyr::mutate(
592
-        Fisher_p_value_significant = dplyr::if_else(
593
-          condition = .data$Fisher_p_value < significance_threshold,
594
-          true = TRUE, false = FALSE
745
+        dplyr::mutate(
746
+            Fisher_p_value = purrr::pmap_dbl(., compute_fisher)
747
+        ) %>%
748
+        dplyr::mutate(
749
+            Fisher_p_value_significant = dplyr::if_else(
750
+                condition = .data$Fisher_p_value < significance_threshold,
751
+                true = TRUE, false = FALSE
752
+            )
595 753
         )
596
-      )
597 754
     ## --- Removing unbalanced 0s if requested - this scenario applies
598 755
     ## only if "union" is selected as method for join
599 756
     if (remove_unbalanced_0) {
600
-      mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE))
601
-      mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE))
602
-      test_exclude <- function(...) {
603
-        row <- list(...)
604
-        if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) {
605
-          to_ex <- ifelse(
606
-              test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) &
607
-                (is.na(row$n_IS_perGene_2))) |
608
-                ((is.na(row$n_IS_perGene_1)) &
609
-                   (row$n_IS_perGene_2 < mean_is_per_gene_2)),
610
-              yes = TRUE,
611
-              no = FALSE
612
-          )
613
-          return(to_ex)
757
+        mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE))
758
+        mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE))
759
+        test_exclude <- function(...) {
760
+            row <- list(...)
761
+            if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) {
762
+                to_ex <- ifelse(
763
+                    test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) &
764
+                        (is.na(row$n_IS_perGene_2))) |
765
+                        ((is.na(row$n_IS_perGene_1)) &
766
+                            (row$n_IS_perGene_2 < mean_is_per_gene_2)),
767
+                    yes = TRUE,
768
+                    no = FALSE
769
+                )
770
+                return(to_ex)
771
+            }
772
+            return(FALSE)
614 773
         }
615
-        return(FALSE)
616
-      }
617
-      merged <- merged %>%
618
-        dplyr::mutate(
619
-          to_exclude_from_test = purrr::pmap(., test_exclude)
620
-        ) %>%
621
-        dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
622
-        dplyr::select(-.data$to_exclude_from_test)
623
-      if (nrow(merged) == 0) {
624
-        if (getOption("ISAnalytics.verbose") == TRUE) {
625
-          msg <- c("Data frame empty after filtering",
626
-                   i = paste("Data frame is after removing unbalanced IS,",
627
-                             "nothing to return"))
628
-          rlang::inform(msg, class = "empty_df_gene_freq_unbal")
774
+        merged <- merged %>%
775
+            dplyr::mutate(
776
+                to_exclude_from_test = purrr::pmap(., test_exclude)
777
+            ) %>%
778
+            dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
779
+            dplyr::select(-.data$to_exclude_from_test)
780
+        if (nrow(merged) == 0) {
781
+            if (getOption("ISAnalytics.verbose") == TRUE) {
782
+                msg <- c("Data frame empty after filtering",
783
+                    i = paste(
784
+                        "Data frame is after removing unbalanced IS,",
785
+                        "nothing to return"
786
+                    )
787
+                )
788
+                rlang::inform(msg, class = "empty_df_gene_freq_unbal")
789
+            }
790
+            return(NULL)
629 791
         }
630
-        return(NULL)
631
-      }
632 792
     }
633 793
     ## --- Apply statistical corrections to p-value
634 794
     merged <- merged %>%
635
-      dplyr::mutate(
636
-        Fisher_p_value_fdr = p.adjust(.data$Fisher_p_value, method = "fdr",
637
-                                      n = length(.data$Fisher_p_value)),
638
-        Fisher_p_value_benjamini = p.adjust(.data$Fisher_p_value, method = "BY",
639
-                                            n = length(.data$Fisher_p_value)),
640
-        minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10)
641
-      ) %>%
642
-      dplyr::mutate(
643
-        minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10),
644
-      )
795
+        dplyr::mutate(
796
+            Fisher_p_value_fdr = stats::p.adjust(.data$Fisher_p_value,
797
+                method = "fdr",
798
+                n = length(.data$Fisher_p_value)
799
+            ),
800
+            Fisher_p_value_benjamini = stats::p.adjust(.data$Fisher_p_value,
801
+                method = "BY",
802
+                n = length(.data$Fisher_p_value)
803
+            ),
804
+            minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10)
805
+        ) %>%
806
+        dplyr::mutate(
807
+            minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10),
808
+        )
645 809
     return(merged)
646 810
 }
647 811
 
... ...
@@ -650,7 +814,7 @@ gene_frequency_fisher <- function(cis_x,
650 814
 #' the metadata data frame accordingly.
651 815
 #'
652 816
 #' @description
653
-#' \lifecycle{experimental}
817
+#' `r lifecycle::badge("stable")`
654 818
 #' The function operates on a data frame by grouping the content by
655 819
 #' the sample key and computing every function specified on every
656 820
 #' column in the `value_columns` parameter. After that the metadata
... ...
@@ -686,14 +850,17 @@ gene_frequency_fisher <- function(cis_x,
686 850
 #' @param functions A named list of function or purrr-style lambdas
687 851
 #' @param add_integrations_count Add the count of distinct integration sites
688 852
 #' for each group? Can be computed only if `x` contains the mandatory columns
689
-#' `chr`, `integration_locus`, `strand`
853
+#' `mandatory_IS_vars()`
854
+#'
855
+#' @section Required tags:
856
+#' The function will explicitly check for the presence of these tags:
857
+#'
858
+#' * All columns declared in `mandatory_IS_vars()`
859
+#'
860
+#' These are checked only if `add_integrations_count = TRUE`.
690 861
 #'
691 862
 #' @family Analysis functions
692
-#' @importFrom rlang eval_tidy expr abort .data sym inform
693
-#' @importFrom purrr is_function is_formula map_lgl walk map set_names
694
-#' @importFrom dplyr group_by across all_of summarise rename_with bind_cols
695
-#' @importFrom dplyr n_distinct left_join
696
-#' @importFrom magrittr `%>%`
863
+#' @importFrom rlang .data sym
697 864
 #'
698 865
 #' @return A list with modified x and metadata data frames
699 866
 #' @export
... ...
@@ -797,7 +964,8 @@ sample_statistics <- function(x,
797 964
 
798 965
 #' Grubbs test for Common Insertion Sites (CIS).
799 966
 #'
800
-#' \lifecycle{stable}
967
+#' @description
968
+#' `r lifecycle::badge("stable")`
801 969
 #' Statistical approach for the validation of common insertion sites
802 970
 #' significance based on the comparison of the integration frequency
803 971
 #' at the CIS gene with respect to other genes contained in the
... ...
@@ -824,6 +992,16 @@ sample_statistics <- function(x,
824 992
 #'
825 993
 #' `r refGene_table_cols()`
826 994
 #'
995
+#' @section Required tags:
996
+#' The function will explicitly check for the presence of these tags:
997
+#'
998
+#' ```{r echo=FALSE, results="asis"}
999
+#' all_tags <- available_tags()
1000
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
1001
+#'  ~ "CIS_grubbs" %in% .x)][["tag"]])
1002
+#'  cat(paste0("* ", needed, collapse="\n"))
1003
+#' ```
1004
+#'
827 1005
 #' @param x An integration matrix, must include the `mandatory_IS_vars()`
828 1006
 #' columns and the `annotation_IS_vars()` columns
829 1007
 #' @param genomic_annotation_file Database file for gene annotation,
... ...
@@ -834,17 +1012,16 @@ sample_statistics <- function(x,
834 1012
 #' NULL, the function will perform calculations for each group and return
835 1013
 #' a list of data frames with the results. E.g. for `by = "SubjectID"`,
836 1014
 #' CIS will be computed for each distinct SubjectID found in the table
837
-#' (of course, "SubjectID" column must be included in the input data frame).
1015
+#' ("SubjectID" column must be included in the input data frame).
1016
+#' @param return_missing_as_df Returns those genes present in the input df
1017
+#' but not in the refgenes as a data frame?
1018
+#' @param results_as_list Relevant only if `by` is not `NULL` - if `TRUE`
1019
+#' return the group computations as a named list, otherwise return a single
1020
+#' df with an additional column containing the group id
838 1021
 #'
839 1022
 #' @family Analysis functions
840 1023
 #'
841
-#' @importFrom tibble as_tibble
842
-#' @importFrom rlang .data abort current_env eval_tidy sym
843
-#' @importFrom magrittr `%>%`
844
-#' @importFrom utils read.csv
845
-#' @importFrom stringr str_replace_all
846
-#' @importFrom tidyr unite
847
-#' @importFrom purrr set_names map
1024
+#' @importFrom rlang .data sym
848 1025
 #'
849 1026
 #' @return A data frame
850 1027
 #' @export
... ...
@@ -852,7 +1029,7 @@ sample_statistics <- function(x,
852 1029
 #' @examples
853 1030
 #' data("integration_matrices", package = "ISAnalytics")
854 1031
 #' cis <- CIS_grubbs(integration_matrices)
855
-#' head(cis)
1032
+#' cis
856 1033
 CIS_grubbs <- function(x,
857 1034
     genomic_annotation_file = "hg19",
858 1035
     grubbs_flanking_gene_bp = 100000,
... ...
@@ -996,7 +1173,7 @@ CIS_grubbs <- function(x,
996 1173
                 "a mismatch in the annotation phase of",
997 1174
                 "the matrix. Here is a summary: "
998 1175
             ),
999
-            paste0(capture.output({
1176
+            paste0(utils::capture.output({
1000 1177
                 print(missing_df, n = Inf)
1001 1178
             }), collapse = "\n"),
1002 1179
             sep = "\n"
... ...
@@ -1030,36 +1207,11 @@ CIS_grubbs <- function(x,
1030 1207
 
1031 1208
 #' Integrations cumulative count in time by sample
1032 1209
 #'
1033
-#' \lifecycle{experimental}
1034
-#' This function computes the cumulative number of integrations
1035
-#' observed in each sample at different time points by assuming that
1036
-#' if an integration is observed at time point "t" then it is also observed in
1037
-#' time point "t+1".
1210
+#' @description
1211
+#' `r lifecycle::badge("deprecated")`
1212
+#' This function was deprecated in favour of a single function,
1213
+#' please use `cumulative_is` instead.
1038 1214
 #'
1039
-#' @details
1040
-#' ## Input data frame
1041
-#' The user can provide as input for the `x` parameter both a simple
1042
-#' integration matrix AND setting the `aggregate` parameter to TRUE,
1043
-#' or provide an already aggregated matrix via
1044
-#' \link{aggregate_values_by_key}.
1045
-#' If the user supplies a matrix to be aggregated the `association_file`
1046
-#' parameter must not be NULL: aggregation will be done by an internal
1047
-#' call to the aggregation function.
1048
-#' If the user supplies an already aggregated matrix, the `key` parameter
1049
-#' is the key used for aggregation -
1050
-#' **NOTE: for this operation is mandatory
1051
-#' that the time point column is included in the key.**
1052
-#' ## Assumptions on time point format
1053
-#' By using the functions provided by this package, when imported,
1054
-#' an association file will be correctly formatted for future usage.
1055
-#' In the formatting process there is also a padding operation performed on
1056
-#' time points: this means the functions expects the time point column to
1057
-#' be of type character and to be correctly padded with 0s. If the
1058
-#' chosen column for time point is detected as numeric the function will
1059
-#' attempt the conversion to character and automatic padding.
1060
-#' If you choose to import the association file not using the
1061
-#' \link{import_association_file} function, be sure to check the format of
1062
-#' the chosen column to avoid undesired results.
1063 1215
 #'
1064 1216
 #' @param x A simple integration matrix or an aggregated matrix (see details)
1065 1217
 #' @param association_file NULL or the association file for x if `aggregate`
... ...
@@ -1071,20 +1223,9 @@ CIS_grubbs <- function(x,
1071 1223
 #' @param aggregate Should x be aggregated?
1072 1224
 #' @param ... Additional parameters to pass to `aggregate_values_by_key`
1073 1225
 #'
1074
-#' @family Analysis functions
1075
-#'
1076
-#' @importFrom dplyr mutate filter across all_of select summarise group_by
1077
-#' @importFrom dplyr arrange group_split first full_join starts_with distinct
1078
-#' @importFrom dplyr semi_join n rename
1079
-#' @importFrom magrittr `%>%`
1080
-#' @importFrom rlang .data abort inform `:=`
1081
-#' @importFrom stringr str_pad
1082
-#' @importFrom purrr reduce is_empty
1083
-#' @importFrom tidyr pivot_longer
1084
-#' @importFrom stats na.omit
1085
-#'
1086 1226
 #' @return A data frame
1087 1227
 #' @export
1228
+#' @keywords internal
1088 1229
 #'
1089 1230
 #' @examples
1090 1231
 #' data("integration_matrices", package = "ISAnalytics")
... ...
@@ -1114,7 +1255,8 @@ cumulative_count_union <- function(x,
1114 1255
         what = "cumulative_count_union()",
1115 1256
         with = "cumulative_is()",
1116 1257
         details = c(paste(
1117
-            "Use option `counts = TRUE`. Function will be likely dropped in the",
1258
+            "Use option `counts = TRUE`.",
1259
+            "Function will be likely dropped in the",
1118 1260
             "next release cycle"
1119 1261
         ))
1120 1262
     )
... ...
@@ -1124,9 +1266,10 @@ cumulative_count_union <- function(x,
1124 1266
     )
1125 1267
 }
1126 1268
 
1127
-#' Expands integration matrix with the cumulative is union over time.
1269
+#' Expands integration matrix with the cumulative IS union over time.
1128 1270
 #'
1129
-#' @description \lifecycle{experimental}
1271
+#' @description
1272
+#' `r lifecycle::badge("experimental")`
1130 1273
 #' Given an input integration matrix that can be grouped over time,
1131 1274
 #' this function adds integrations in groups assuming that
1132 1275
 #' if an integration is observed at time point "t" then it is also observed in
... ...
@@ -1141,6 +1284,14 @@ cumulative_count_union <- function(x,
1141 1284
 #' @param expand If `FALSE`, for each group, the set of integration sites is
1142 1285
 #' returned in a separate column as a nested table, otherwise the resulting
1143 1286
 #' column is unnested.
1287
+#' @param counts Add cumulative counts? Logical
1288
+#'
1289
+#' @section Required tags:
1290
+#' The function will explicitly check for the presence of these tags:
1291
+#'
1292
+#' * All columns declared in `mandatory_IS_vars()`
1293
+#' * Checks if the matrix is annotated by assessing presence of
1294
+#' `annotation_IS_vars()`
1144 1295
 #'
1145 1296
 #' @family Analysis functions
1146 1297
 #' @return A data frame
... ...
@@ -1210,12 +1361,12 @@ cumulative_is <- function(x,
1210 1361
             .keep_all = TRUE
1211 1362
         )
1212 1363
     data.table::setDT(temp)
1213
-    temp <- temp[, .(is = list(.SD)), by = key]
1364
+    temp <- temp[, list(is = list(.SD)), by = key]
1214 1365
     no_tp_key <- key[key != timepoint_col]
1215 1366
     split <- split(temp, by = no_tp_key)
1216 1367
     cumulate <- purrr::map(split, function(x) {
1217 1368
         x[, cumulative_is := purrr::accumulate(
1218
-            is,
1369
+            get("is"),
1219 1370
             ~ data.table::funion(.x, .y)
1220 1371
         )]
1221 1372
     })
... ...
@@ -1246,13 +1397,14 @@ cumulative_is <- function(x,
1246 1397
 
1247 1398
 #' Sharing of integration sites between given groups.
1248 1399
 #'
1249
-#' \lifecycle{experimental}
1400
+#' @description
1401
+#' `r lifecycle::badge("stable")`
1250 1402
 #' Computes the amount of integration sites shared between the groups identified
1251 1403
 #' in the input data.
1252 1404
 #'
1253 1405
 #' @details
1254
-#' An integration site is always identified by the triple
1255
-#' `(chr, integration_locus, strand)`, thus these columns must be present
1406
+#' An integration site is always identified by the combination of fields in
1407
+#' `mandatory_IS_vars()`, thus these columns must be present
1256 1408
 #' in the input(s).
1257 1409
 #'
1258 1410
 #' The function accepts multiple inputs for different scenarios, please refer
... ...
@@ -1270,6 +1422,11 @@ cumulative_is <- function(x,
1270 1422
 #' function \code{\link{sharing_heatmap}} or via the function
1271 1423
 #' \code{\link{sharing_venn}}
1272 1424
 #'
1425
+#' @section Required tags:
1426
+#' The function will explicitly check for the presence of these tags:
1427
+#'
1428
+#' * All columns declared in `mandatory_IS_vars()`
1429
+#'
1273 1430
 #' @param ... One or more integration matrices
1274 1431
 #' @param group_key Character vector of column names which identify a
1275 1432
 #' single group. An associated group id will be derived by concatenating
... ...
@@ -1522,7 +1679,8 @@ is_sharing <- function(...,
1522 1679
 
1523 1680
 #' Find the source of IS by evaluating sharing.
1524 1681
 #'
1525
-#' @description \lifecycle{experimental}
1682
+#' @description
1683
+#' `r lifecycle::badge("stable")`
1526 1684
 #' The function computes the sharing between a reference group of interest
1527 1685
 #' for each time point and a selection of groups of interest. In this way
1528 1686
 #' it is possible to observe the percentage of shared integration sites between
Browse code

[UPDATE] Partial update 1.5.4 - fixed all functions, increased package coverage, documentation still to fix

Giulia Pais authored on 15/04/2022 16:47:59
Showing1 changed files
... ...
@@ -59,24 +59,20 @@ compute_abundance <- function(x,
59 59
     if (.check_mandatory_vars(x) == FALSE) {
60 60
         rlang::abort(.missing_mand_vars())
61 61
     }
62
-    stopifnot(is.logical(percentage) & length(percentage) == 1)
63
-    if (!all(columns %in% colnames(x)) | !all(key %in% colnames(x))) {
62
+    stopifnot(is.logical(percentage))
63
+    percentage <- percentage[1]
64
+    if (!all(c(columns, key) %in% colnames(x))) {
64 65
         missing_cols <- c(
65 66
             columns[!columns %in% colnames(x)],
66 67
             key[!key %in% colnames(x)]
67 68
         )
68 69
         rlang::abort(.missing_user_cols_error(missing_cols))
69 70
     }
70
-    non_num_cols <- purrr::map_lgl(columns, function(col) {
71
-        expr <- rlang::expr(`$`(x, !!col))
72
-        if (is.numeric(rlang::eval_tidy(expr)) |
73
-            is.integer(rlang::eval_tidy(expr))) {
74
-            return(FALSE)
75
-        } else {
76
-            return(TRUE)
77
-        }
78
-    })
79
-    if (any(non_num_cols)) {
71
+    non_num_cols <- purrr::map_lgl(
72
+        columns,
73
+        ~ is.numeric(x[[.x]]) || is.integer(x[[.x]])
74
+    )
75
+    if (any(!non_num_cols)) {
80 76
         rlang::abort(.non_num_user_cols_error(columns[non_num_cols]))
81 77
     }
82 78
     stopifnot(is.logical(keep_totals) || keep_totals == "df")
... ...
@@ -131,182 +127,6 @@ compute_abundance <- function(x,
131 127
     }
132 128
 }
133 129
 
134
-
135
-#' obtain a single integration matrix from individual quantification
136
-#' matrices.
137
-#'
138
-#' \lifecycle{stable}
139
-#' Takes a list of integration matrices referring to different quantification
140
-#' types and merges them in a single data frame that has multiple
141
-#' value columns, each renamed according to their quantification type
142
-#' of reference.
143
-#'
144
-#' @param x A named list of integration matrices, ideally obtained via
145
-#' \link{import_parallel_Vispa2Matrices_interactive} or
146
-#' \link{import_parallel_Vispa2Matrices_auto}. Names must be
147
-#' quantification types.
148
-#' @param fragmentEstimate The name of the output column for fragment
149
-#' estimate values
150
-#' @param seqCount The name of the output column for sequence
151
-#' count values
152
-#' @param barcodeCount The name of the output column for barcode count
153
-#' values
154
-#' @param cellCount The name of the output column for cell count values
155
-#' @param ShsCount The name of the output column for Shs count values
156
-#'
157
-#' @importFrom purrr walk map2 reduce
158
-#' @importFrom dplyr rename full_join intersect
159
-#' @importFrom magrittr `%>%`
160
-#' @importFrom rlang .data `:=`
161
-#'
162
-#' @family Analysis functions
163
-#'
164