Browse code

[FIX] Fixed internal of top_cis_overtime_heatmap

Giulia Pais authored on 17/06/2022 17:10:34
Showing1 changed files
... ...
@@ -556,8 +556,8 @@ top_cis_overtime_heatmap <- function(x,
556 556
         dplyr::group_by(dplyr::across(dplyr::all_of(
557 557
             c(group_col, timepoint_col)
558 558
         ))) %>%
559
-        dplyr::group_modify(.f = arrange_slice_top) %>%
560
-        dplyr::ungroup()
559
+        dplyr::group_map(.f = arrange_slice_top, .keep = TRUE) %>%
560
+        purrr::reduce(dplyr::bind_rows)
561 561
 
562 562
     groups <- unique(slice_groups_tps[[group_col]])
563 563
 
Browse code

[HOTFIX] Fixed typo in top_cis_overtime_heatmap

Giulia Pais authored on 17/06/2022 16:58:09
Showing1 changed files
... ...
@@ -496,7 +496,7 @@ top_cis_overtime_heatmap <- function(x,
496 496
                 ) %>%
497 497
                     dplyr::mutate(!!group_col := .y)
498 498
             } else if (is.data.frame(tmp)) {
499
-                temp %>% dplyr::mutate(!!group_col := .y)
499
+                tmp %>% dplyr::mutate(!!group_col := .y)
500 500
             } else {
501 501
                 non_list_el <- c("Element is not list or df",
502 502
                     x = paste(
Browse code

[U] Update to 1.7.3

Giulia Pais authored on 17/06/2022 14:47:19
Showing1 changed files
... ...
@@ -97,10 +97,10 @@ CIS_volcano_plot <- function(x,
97 97
         "neg_zscore_minus_log2_int_freq_tolerance"
98 98
     )
99 99
     cis_grubbs_df <- if (!all(min_cis_col %in% colnames(x))) {
100
-        if (getOption("ISAnalytics.verbose") == TRUE) {
100
+        if (getOption("ISAnalytics.verbose", TRUE) == TRUE) {
101 101
             rlang::inform("Calculating CIS_grubbs for x...")
102 102
         }
103
-        CIS_grubbs(x, return_missing_as_df = FALSE)
103
+        (CIS_grubbs(x, return_missing_as_df = FALSE))$cis
104 104
     } else {
105 105
         x
106 106
     }
... ...
@@ -274,6 +274,540 @@ clinical_relevant_suspicious_genes <- function() {
274 274
     )
275 275
 }
276 276
 
277
+#' Heatmaps for the top N common insertion sites over time.
278
+#'
279
+#' @description
280
+#' `r lifecycle::badge("experimental")`
281
+#' This function computes the visualization of the results of the function
282
+#' `CIS_grubbs_overtime()` in the form of heatmaps for the top N selected
283
+#' genes over time.
284
+#'
285
+#' @template genes_db
286
+#' @family Plotting functions
287
+#'
288
+#' @inheritParams CIS_volcano_plot
289
+#' @param x Output of the function `CIS_grubbs_overtime()`, either in single
290
+#' data frame form or nested lists
291
+#' @param n_genes Number of top genes to consider
292
+#' @param timepoint_col The name of the time point column in `x`
293
+#' @param group_col The name of the group column in `x`
294
+#' @param plot_values Which kind of values should be plotted? Can either be
295
+#' `"p"` for the p-value or `"minus_log_p"` for a scaled p-value of the
296
+#' Grubbs test
297
+#' @param p_value_correction One among `"bonferroni"` and `"fdr"`
298
+#' @param prune_tp_treshold Minimum number of genes to retain a time point.
299
+#' See details.
300
+#' @param gene_selection_param The descriptive statistic measure to decide
301
+#' which genes to plot, possible choices are
302
+#' `"trimmed", "n", "mean", "sd", "median","mad", "min", "max"`. See details.
303
+#' @param fill_0_selection Fill NA values with 0s before computing statistics
304
+#' for each gene? (TRUE/FALSE)
305
+#' @param fill_NA_in_heatmap Fill NA values with 0 when plotting the heatmap?
306
+#' (TRUE/FALSE)
307
+#' @param heatmap_color_palette Colors for values in the heatmaps,
308
+#' either `"default"` or a function producing
309
+#' a color palette, obtainable via `grDevices::colorRampPalette`.
310
+#' @param title_generator Either `NULL` or a function. See details.
311
+#' @param save_as_files Should heatmaps be saved to files on disk? (TRUE/FALSE)
312
+#' @param files_format The extension of the files produced, supported
313
+#' formats are `"pdf", "png", "tiff", "bmp", "jpg"`. Relevant only if
314
+#' `files_format = TRUE`
315
+#' @param folder_path Path to the folder where files will be saved
316
+#' @param ... Other params to pass to `pheatmap::pheatmap`
317
+#'
318
+#' @details
319
+#' ## Top N gene selection
320
+#' Since the genes present in different time point slices are likely different,
321
+#' the decision process to select the final top N genes to represent in the
322
+#' heatmap follows this logic:
323
+#'
324
+#' * Each time point slice is arranged either in ascending order (if we want to
325
+#' plot the p-value) or in descending order (if we want to plot the scaled
326
+#' p-value) and the top n genes are selected
327
+#' * A series of statistics are computed over the union set of genes on ALL
328
+#' time points (min, max, mean, ...)
329
+#' * A decision is taken by considering the ordered `gene_selection_param`
330
+#' (order depends once again if the values are scaled or not), and the first
331
+#' N genes are selected for plotting.
332
+#'
333
+#' ### Filling NA values prior calculations
334
+#' It is possible to fill NA values (aka missing combinations of GENE/TP) with
335
+#' 0s prior computing the descriptive statistics on which gene selection is
336
+#' based. Please keep in mind that this has an impact on the final result,
337
+#' since for computing metrics such as the mean, NA values are usually removed,
338
+#' decreasing the overall number of values considered - this does not hold
339
+#' when NA values are substituted with 0s.
340
+#'
341
+#' ### The statistics
342
+#' Statistics are computed for each gene over all time points of each group.
343
+#' More in detail, `n`: counts the number of instances (rows)
344
+#' in which the genes appears, aka it counts the time points in which the gene
345
+#' is present. NOTE: if
346
+#' `fill_0_selection` option is set to `TRUE` this value will be equal for
347
+#' all genes! All other statistics as per the argument `gene_selection_param`
348
+#' map to the corresponding R functions with the exception of `trimmed` which
349
+#' is a simple call to the `mean` function with the argument `trimmed = 0.1`.
350
+#'
351
+#' ## Aesthetics
352
+#' It is possible to customise the appearence of the plot through different
353
+#' parameters.
354
+#'
355
+#' * `fill_NA_in_heatmap` tells the function whether missing combinations of
356
+#' GENE/TP should be plotted as NA or filled with a value (1 if p-value, 0
357
+#' if scaled p-value)
358
+#' * A title generator function can be provided to dynamically create a title
359
+#' for the plots: the function can accept two positional arguments for
360
+#' the group identifier and the number of selected genes respectively. If one or
361
+#' none of the arguments are of interest, they can be absorbed with `...`.
362
+#' * `heatmap_color_palette` can be used to specify a function from which
363
+#' colors are sampled (refers to the colors of values only)
364
+#' * To change the colors associated with annotations instead, use the
365
+#' argument `annotation_colors` of `pheatmap::pheatmap()` - it must be set to a
366
+#' list with this format:
367
+#' ```
368
+#' list(
369
+#'   KnownGeneClass = c("OncoGene" = color_spec,
370
+#'                      "Other" = color_spec,
371
+#'                      "TumSuppressor" = color_spec),
372
+#'   ClinicalRelevance = c("TRUE" = color_spec,
373
+#'                         "FALSE" = color_spec),
374
+#'   CriticalForInsMut = c("TRUE" = color_spec,
375
+#'                         "FALSE" = color_spec)
376
+#' )
377
+#' ```
378
+#'
379
+#' @return Either a list of graphical objects or a list of paths where
380
+#' plots were saved
381
+#' @export
382
+#'
383
+#' @examples
384
+#' data("integration_matrices", package = "ISAnalytics")
385
+#' data("association_file", package = "ISAnalytics")
386
+#' aggreg <- aggregate_values_by_key(
387
+#'     x = integration_matrices,
388
+#'     association_file = association_file,
389
+#'     value_cols = c("seqCount", "fragmentEstimate")
390
+#' )
391
+#' cis_overtime <- CIS_grubbs_overtime(aggreg)
392
+#' hmaps <- top_cis_overtime_heatmap(cis_overtime$cis,
393
+#'     fill_NA_in_heatmap = TRUE
394
+#' )
395
+#'
396
+#' # To re-plot:
397
+#' # grid::grid.newpage()
398
+#' # grid::grid.draw(hmaps$PT001$gtable)
399
+top_cis_overtime_heatmap <- function(x,
400
+    n_genes = 20,
401
+    timepoint_col = "TimePoint",
402
+    group_col = "group",
403
+    onco_db_file = "proto_oncogenes",
404
+    tumor_suppressors_db_file = "tumor_suppressors",
405
+    species = "human",
406
+    known_onco = known_clinical_oncogenes(),
407
+    suspicious_genes =
408
+        clinical_relevant_suspicious_genes(),
409
+    significance_threshold = 0.05,
410
+    plot_values = c("minus_log_p", "p"),
411
+    p_value_correction = c("fdr", "bonferroni"),
412
+    prune_tp_treshold = 20,
413
+    gene_selection_param = c(
414
+        "trimmed", "n", "mean", "sd", "median",
415
+        "mad", "min", "max"
416
+    ),
417
+    fill_0_selection = TRUE,
418
+    fill_NA_in_heatmap = FALSE,
419
+    heatmap_color_palette = "default",
420
+    title_generator = NULL,
421
+    save_as_files = FALSE,
422
+    files_format = c("pdf", "png", "tiff", "bmp", "jpg"),
423
+    folder_path = NULL,
424
+    ...) {
425
+    # --- Preliminary checks
426
+    if (!requireNamespace("pheatmap", quietly = TRUE)) {
427
+        rlang::abort(.missing_pkg_error("pheatmap"))
428
+    }
429
+    stopifnot(is.list(x))
430
+    stopifnot(is.numeric(n_genes))
431
+    if (is.list(x) & !is.data.frame(x) & is.null(names(x))) {
432
+        err_named <- c("Input list must have names",
433
+            x = paste(
434
+                "Input should follow the output format of",
435
+                "`CIS_grubbs_overtime()`"
436
+            )
437
+        )
438
+        rlang::abort(err_named)
439
+    } else if (is.data.frame(x) &
440
+        !all(c(timepoint_col, group_col) %in% colnames(x))) {
441
+        err_df <- c("Input df is missing columns",
442
+            x = paste(
443
+                "Input should follow the output format of",
444
+                "`CIS_grubbs_overtime()`"
445
+            ),
446
+            x = paste(
447
+                "Time point column ('", timepoint_col,
448
+                "') and/or group column ('", group_col,
449
+                "') are missing"
450
+            )
451
+        )
452
+        rlang::abort(err_df)
453
+    }
454
+    p_value_correction <- rlang::arg_match(p_value_correction)
455
+    plot_values <- rlang::arg_match(plot_values)
456
+    values_to_plot <- if (plot_values == "p") {
457
+        paste0("tdist_", p_value_correction)
458
+    } else {
459
+        paste0("minus_log_p_", p_value_correction)
460
+    }
461
+    gene_selection_param <- rlang::arg_match(gene_selection_param)
462
+    stopifnot(is.numeric(prune_tp_treshold))
463
+    stopifnot(is.logical(fill_0_selection))
464
+    fill_0_selection <- fill_0_selection[1]
465
+    stopifnot(is.logical(fill_NA_in_heatmap))
466
+    fill_NA_in_heatmap <- fill_NA_in_heatmap[1]
467
+    stopifnot(is.logical(save_as_files))
468
+    save_as_files <- save_as_files[1]
469
+    stopifnot(is.null(folder_path) || is.character(folder_path))
470
+    if (is.character(folder_path[1])) {
471
+        fs::dir_create(folder_path[1])
472
+    }
473
+    stopifnot(
474
+        (is.character(heatmap_color_palette) &
475
+            heatmap_color_palette == "default") ||
476
+            is.function(heatmap_color_palette)
477
+    )
478
+    files_format <- rlang::arg_match(files_format)
479
+    if (save_as_files == TRUE & is.null(folder_path) &
480
+        getOption("ISAnalytics.verbose", TRUE) == TRUE) {
481
+        warn_msg <- c("Warning: you did not set a folder for saving files",
482
+            i = "Returning heatmaps as a list in R env"
483
+        )
484
+        rlang::inform(warn_msg, class = "no_fold_files")
485
+    }
486
+    stopifnot(is.null(title_generator) || is.function(title_generator))
487
+    dots <- rlang::dots_list(..., .named = TRUE)
488
+    # --- If input is in list form, convert in single df
489
+    if (is.list(x) & !is.data.frame(x)) {
490
+        x <- purrr::map2_df(x, names(x), ~ {
491
+            tmp <- .x
492
+            if (is.list(tmp) & !is.data.frame(tmp)) {
493
+                purrr::map2_df(
494
+                    tmp, names(tmp),
495
+                    ~ .x %>% dplyr::mutate(!!timepoint_col := .y)
496
+                ) %>%
497
+                    dplyr::mutate(!!group_col := .y)
498
+            } else if (is.data.frame(tmp)) {
499
+                temp %>% dplyr::mutate(!!group_col := .y)
500
+            } else {
501
+                non_list_el <- c("Element is not list or df",
502
+                    x = paste(
503
+                        "Element", .y, "in x is not a list or a",
504
+                        "data frame"
505
+                    )
506
+                )
507
+                rlang::abort(non_list_el)
508
+            }
509
+        })
510
+    }
511
+    gene_sym_col <- annotation_IS_vars(TRUE) %>%
512
+        dplyr::filter(.data$tag == "gene_symbol") %>%
513
+        dplyr::pull(.data$names)
514
+    # --- Expand the df with gene info
515
+    expanded <- .expand_cis_df(
516
+        x, gene_sym_col,
517
+        onco_db_file, tumor_suppressors_db_file,
518
+        species, known_onco, suspicious_genes
519
+    )
520
+    # --- Add log conversions if needed
521
+    if (plot_values == "minus_log_p") {
522
+        expanded <- expanded %>%
523
+            dplyr::mutate(
524
+                minus_log_p_bonferroni = -log(.data$tdist_bonferroni,
525
+                    base = 10
526
+                ),
527
+                minus_log_p_fdr = -log(.data$tdist_fdr, base = 10)
528
+            )
529
+    }
530
+    # --- For each combo (group, tp) arrange and slice the top n
531
+    arrange_slice_top <- function(group_df, ...) {
532
+        if (nrow(group_df) < prune_tp_treshold) {
533
+            return(NULL)
534
+        }
535
+        if (plot_values == "p") {
536
+            return(
537
+                group_df %>%
538
+                    dplyr::arrange(.data[[values_to_plot]]) %>%
539
+                    dplyr::slice_head(n = n_genes)
540
+            )
541
+        } else {
542
+            return(
543
+                group_df %>%
544
+                    dplyr::arrange(dplyr::desc(.data[[values_to_plot]])) %>%
545
+                    dplyr::slice_head(n = n_genes)
546
+            )
547
+        }
548
+    }
549
+    slice_groups_tps <- expanded %>%
550
+        dplyr::select(
551
+            dplyr::all_of(c(
552
+                gene_sym_col, timepoint_col,
553
+                group_col, values_to_plot
554
+            ))
555
+        ) %>%
556
+        dplyr::group_by(dplyr::across(dplyr::all_of(
557
+            c(group_col, timepoint_col)
558
+        ))) %>%
559
+        dplyr::group_modify(.f = arrange_slice_top) %>%
560
+        dplyr::ungroup()
561
+
562
+    groups <- unique(slice_groups_tps[[group_col]])
563
+
564
+    # --- Evaluate statistics for genes in top n slices
565
+    eval_candidates <- function(group_id) {
566
+        df <- slice_groups_tps %>%
567
+            dplyr::filter(.data[[group_col]] == group_id) %>%
568
+            dplyr::select(-.data[[group_col]])
569
+        if (fill_0_selection == TRUE) {
570
+            value_fill <- list(0)
571
+            names(value_fill) <- values_to_plot
572
+            df <- df %>%
573
+                tidyr::complete(.data[[timepoint_col]], .data[[gene_sym_col]],
574
+                    fill = value_fill
575
+                )
576
+        }
577
+        df %>%
578
+            dplyr::group_by(dplyr::across(dplyr::all_of(gene_sym_col))) %>%
579
+            dplyr::summarise(
580
+                n = dplyr::n(),
581
+                mean = mean(.data[[values_to_plot]], na.rm = TRUE),
582
+                sd = stats::sd(.data[[values_to_plot]], na.rm = TRUE),
583
+                median = stats::median(.data[[values_to_plot]], na.rm = TRUE),
584
+                trimmed = mean(.data[[values_to_plot]],
585
+                    na.rm = TRUE,
586
+                    trim = 0.1
587
+                ),
588
+                mad = stats::mad(.data[[values_to_plot]], na.rm = TRUE),
589
+                min = min(.data[[values_to_plot]], na.rm = TRUE),
590
+                max = max(.data[[values_to_plot]], na.rm = TRUE),
591
+                .groups = "drop"
592
+            )
593
+    }
594
+    candidates <- purrr::map(groups, eval_candidates) %>%
595
+        purrr::set_names(groups)
596
+
597
+    # --- Select from candidates according to gene_selection_param
598
+    select_from_candidates <- function(group_df) {
599
+        if (gene_selection_param == "n") {
600
+            return(
601
+                group_df %>%
602
+                    dplyr::arrange(dplyr::desc(.data$n)) %>%
603
+                    dplyr::slice_head(n = n_genes) %>%
604
+                    dplyr::pull(.data[[gene_sym_col]])
605
+            )
606
+        }
607
+        if (plot_values == "p") {
608
+            ## Order ascending
609
+            return(
610
+                group_df %>%
611
+                    dplyr::arrange(.data[[gene_selection_param]]) %>%
612
+                    dplyr::slice_head(n = n_genes) %>%
613
+                    dplyr::pull(.data[[gene_sym_col]])
614
+            )
615
+        } else {
616
+            ## Order descending
617
+            return(
618
+                group_df %>%
619
+                    dplyr::arrange(dplyr::desc(
620
+                        .data[[gene_selection_param]]
621
+                    )) %>%
622
+                    dplyr::slice_head(n = n_genes) %>%
623
+                    dplyr::pull(.data[[gene_sym_col]])
624
+            )
625
+        }
626
+    }
627
+    gene_selection <- purrr::map(candidates, select_from_candidates)
628
+
629
+    # --- Extract only relevant genes from input
630
+    genes_to_map <- purrr::map(groups, ~ expanded %>%
631
+        dplyr::filter(group == .x) %>%
632
+        dplyr::filter(.data[[timepoint_col]] %in%
633
+            unique((slice_groups_tps %>%
634
+                dplyr::filter(group == .x))[[timepoint_col]])) %>%
635
+        dplyr::filter(.data[[gene_sym_col]] %in%
636
+            gene_selection[[.x]]) %>%
637
+        dplyr::select(
638
+            dplyr::all_of(c(
639
+                gene_sym_col, timepoint_col,
640
+                group_col, values_to_plot
641
+            )),
642
+            .data$CriticalForInsMut, .data$KnownGeneClass,
643
+            .data$ClinicalRelevance
644
+        )) %>%
645
+        purrr::set_names(groups)
646
+
647
+    # --- Obtain heatmaps
648
+    ## --- Define global defaults
649
+    if (is.character(heatmap_color_palette)) {
650
+        heatmap_color_palette <- if (plot_values == "minus_log_p") {
651
+            grDevices::colorRampPalette(
652
+                c(
653
+                    "steelblue", "white", "red", "firebrick", "firebrick", "darkred",
654
+                    "darkred", "violet", "violet"
655
+                )
656
+            )
657
+        } else {
658
+            grDevices::colorRampPalette(
659
+                rev(c(
660
+                    "steelblue", "white", "red", "firebrick", "darkred",
661
+                    "violet"
662
+                )),
663
+                bias = 10
664
+            )
665
+        }
666
+    }
667
+    annotation_palette <- if ("annotation_colors" %in% names(dots)) {
668
+        dots$annotation_colors
669
+    } else {
670
+        list(
671
+            KnownGeneClass = c(
672
+                "OncoGene" = "red2",
673
+                "Other" = "palegreen",
674
+                "TumSuppressor" = "dodgerblue4"
675
+            ),
676
+            ClinicalRelevance = c(
677
+                "TRUE" = "gold",
678
+                "FALSE" = "gray90"
679
+            ),
680
+            CriticalForInsMut = c(
681
+                "TRUE" = "red2",
682
+                "FALSE" = "gray90"
683
+            )
684
+        )
685
+    }
686
+    plotting_step <- if (plot_values == "minus_log_p") {
687
+        round((-log(0.05, base = 10) / 3), 3)
688
+    } else {
689
+        round(1 / 100, 3)
690
+    }
691
+
692
+    trace_heatmap <- function(data_df) {
693
+        ## --- Fix annotations (fill na, convert to char)
694
+        data_df <- data_df %>%
695
+            tidyr::replace_na(list(ClinicalRelevance = FALSE)) %>%
696
+            dplyr::mutate(
697
+                CriticalForInsMut = as.character(.data$CriticalForInsMut),
698
+                ClinicalRelevance = as.character(.data$ClinicalRelevance)
699
+            )
700
+        ## --- Obtain data matrix and annotations
701
+        wide <- if (fill_NA_in_heatmap == TRUE) {
702
+            data_df %>%
703
+                tidyr::pivot_wider(
704
+                    names_from = timepoint_col,
705
+                    values_from = values_to_plot,
706
+                    values_fill = dplyr::if_else(plot_values == "p",
707
+                        1, 0
708
+                    )
709
+                )
710
+        } else {
711
+            data_df %>%
712
+                tidyr::pivot_wider(
713
+                    names_from = timepoint_col,
714
+                    values_from = values_to_plot
715
+                )
716
+        }
717
+        matrix <- wide %>%
718
+            dplyr::select(dplyr::all_of(unique(data_df[[timepoint_col]]))) %>%
719
+            as.matrix()
720
+        rownames(matrix) <- wide[[gene_sym_col]]
721
+        annotations <- wide %>%
722
+            dplyr::select(
723
+                .data$CriticalForInsMut, .data$KnownGeneClass,
724
+                .data$ClinicalRelevance
725
+            ) %>%
726
+            as.data.frame()
727
+        rownames(annotations) <- wide[[gene_sym_col]]
728
+        ## --- Obtain other params
729
+        plot_breaks <- if (plot_values == "minus_log_p") {
730
+            seq(
731
+                0, ceiling(max(data_df[[values_to_plot]], na.rm = TRUE)),
732
+                plotting_step
733
+            )
734
+        } else {
735
+            seq(0, 1, plotting_step)
736
+        }
737
+        params_to_pass <- list()
738
+        params_to_pass$color <- heatmap_color_palette(length(plot_breaks) + 1)
739
+        params_to_pass$annotation_colors <- annotation_palette
740
+        params_to_pass$annotation_row <- annotations
741
+        params_to_pass$breaks <- plot_breaks
742
+        if (save_as_files == TRUE & !is.null(folder_path)) {
743
+            file_name <- paste0(
744
+                data_df$group[1], ".", lubridate::today(),
745
+                "_top", n_genes, "-CIS-overtime_using-",
746
+                gene_selection_param, ".", files_format
747
+            )
748
+            params_to_pass$filename <- fs::path(folder_path[1], file_name)
749
+        }
750
+        params_to_pass$cluster_rows <- if ("cluster_rows" %in% names(dots)) {
751
+            dots$cluster_rows
752
+        } else {
753
+            TRUE
754
+        }
755
+        params_to_pass$cluster_cols <- if ("cluster_cols" %in% names(dots)) {
756
+            dots$cluster_cols
757
+        } else {
758
+            FALSE
759
+        }
760
+        params_to_pass$scale <- if ("scale" %in% names(dots)) {
761
+            dots$scale
762
+        } else {
763
+            "none"
764
+        }
765
+        params_to_pass$display_numbers <- if ("display_numbers" %in%
766
+            names(dots)) {
767
+            dots$display_numbers
768
+        } else {
769
+            TRUE
770
+        }
771
+        params_to_pass$number_format <- if ("number_format" %in% names(dots)) {
772
+            dots$number_format
773
+        } else {
774
+            "%.2f"
775
+        }
776
+        params_to_pass$main <- if (!is.null(title_generator)) {
777
+            title_generator(data_df$group[1], n_genes)
778
+        } else {
779
+            method_str <- if (plot_values == "minus_log_p") {
780
+                paste0(
781
+                    "-log(p-value/", p_value_correction, ")\n",
782
+                    "[CIS iif p-value < 0.05; -log(0.05) = ",
783
+                    (round(-log(0.05, base = 10), 3)), "]"
784
+                )
785
+            } else {
786
+                paste(
787
+                    "p-value/", p_value_correction, "\n",
788
+                    "[CIS iif p-value < 0.05]"
789
+                )
790
+            }
791
+            paste0(
792
+                "Patient ", data_df$group[1],
793
+                " - Top ", n_genes,
794
+                " hotspot genes\nAnalysis over time, ",
795
+                method_str
796
+            )
797
+        }
798
+        dots <- dots[!names(dots) %in% c(names(params_to_pass), "mat")]
799
+        params_to_pass <- append(params_to_pass, dots)
800
+        ## --- Plot
801
+        map <- rlang::exec(pheatmap::pheatmap, mat = matrix, !!!params_to_pass)
802
+        if ("filename" %in% names(params_to_pass)) {
803
+            map <- params_to_pass[["filename"]]
804
+        }
805
+        return(map)
806
+    }
807
+
808
+    purrr::map(genes_to_map, trace_heatmap)
809
+}
810
+
277 811
 #' Plot results of gene frequency Fisher's exact test.
278 812
 #'
279 813
 #' @description
... ...
@@ -346,47 +880,47 @@ fisher_scatterplot <- function(fisher_df,
346 880
     gene_sym_col = "GeneName",
347 881
     do_not_highlight = NULL,
348 882
     keep_not_highlighted = TRUE) {
349
-    stopifnot(is.null(do_not_highlight) || is.character(do_not_highlight)
350
-              || rlang::is_expression(do_not_highlight))
883
+    stopifnot(is.null(do_not_highlight) || is.character(do_not_highlight) ||
884
+        rlang::is_expression(do_not_highlight))
351 885
     if (is.null(do_not_highlight)) {
352
-      below_threshold <- fisher_df %>%
353
-        dplyr::filter(.data[[p_value_col]] < annot_threshold)
354
-      above_threshold <- fisher_df %>%
355
-        dplyr::filter(.data[[p_value_col]] >= annot_threshold)
356
-    } else if (is.character(do_not_highlight)) {
357
-      below_threshold <- fisher_df %>%
358
-        dplyr::filter(.data[[p_value_col]] < annot_threshold &
359
-                        !.data[[gene_sym_col]] %in% do_not_highlight)
360
-      if (keep_not_highlighted) {
886
+        below_threshold <- fisher_df %>%
887
+            dplyr::filter(.data[[p_value_col]] < annot_threshold)
361 888
         above_threshold <- fisher_df %>%
362
-          dplyr::anti_join(below_threshold, by = gene_sym_col)
363
-      } else {
364
-        above_threshold <- fisher_df %>%
365
-          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
366
-      }
889
+            dplyr::filter(.data[[p_value_col]] >= annot_threshold)
890
+    } else if (is.character(do_not_highlight)) {
891
+        below_threshold <- fisher_df %>%
892
+            dplyr::filter(.data[[p_value_col]] < annot_threshold &
893
+                !.data[[gene_sym_col]] %in% do_not_highlight)
894
+        if (keep_not_highlighted) {
895
+            above_threshold <- fisher_df %>%
896
+                dplyr::anti_join(below_threshold, by = gene_sym_col)
897
+        } else {
898
+            above_threshold <- fisher_df %>%
899
+                dplyr::filter(.data[[p_value_col]] >= annot_threshold)
900
+        }
367 901
     } else if (rlang::is_formula(do_not_highlight)) {
368
-      below_threshold <- fisher_df %>%
369
-        dplyr::filter(.data[[p_value_col]] < annot_threshold &
370
-                        !rlang::as_function(do_not_highlight)(
371
-                          .data[[gene_sym_col]]))
372
-      if (keep_not_highlighted) {
373
-        above_threshold <- fisher_df %>%
374
-          dplyr::anti_join(below_threshold, by = gene_sym_col)
375
-      } else {
376
-        above_threshold <- fisher_df %>%
377
-          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
378
-      }
902
+        below_threshold <- fisher_df %>%
903
+            dplyr::filter(.data[[p_value_col]] < annot_threshold &
904
+                !rlang::as_function(do_not_highlight)(
905
+                    .data[[gene_sym_col]]))
906
+        if (keep_not_highlighted) {
907
+            above_threshold <- fisher_df %>%
908
+                dplyr::anti_join(below_threshold, by = gene_sym_col)
909
+        } else {
910
+            above_threshold <- fisher_df %>%
911
+                dplyr::filter(.data[[p_value_col]] >= annot_threshold)
912
+        }
379 913
     } else {
380
-      below_threshold <- fisher_df %>%
381
-        dplyr::filter(.data[[p_value_col]] < annot_threshold &
382
-                        (rlang::eval_tidy(do_not_highlight)))
383
-      if (keep_not_highlighted) {
384
-        above_threshold <- fisher_df %>%
385
-          dplyr::anti_join(below_threshold, by = gene_sym_col)
386
-      } else {
387
-        above_threshold <- fisher_df %>%
388
-          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
389
-      }
914
+        below_threshold <- fisher_df %>%
915
+            dplyr::filter(.data[[p_value_col]] < annot_threshold &
916
+                (rlang::eval_tidy(do_not_highlight)))
917
+        if (keep_not_highlighted) {
918
+            above_threshold <- fisher_df %>%
919
+                dplyr::anti_join(below_threshold, by = gene_sym_col)
920
+        } else {
921
+            above_threshold <- fisher_df %>%
922
+                dplyr::filter(.data[[p_value_col]] >= annot_threshold)
923
+        }
390 924
     }
391 925
     plot <- ggplot2::ggplot(
392 926
         above_threshold,
... ...
@@ -952,7 +1486,7 @@ top_abund_tableGrob <- function(df,
952 1486
     tops_by_x <- purrr::map(distinct_x, function(x) {
953 1487
         tmp <- top %>%
954 1488
             dplyr::filter(
955
-              dplyr::if_all(.cols = dplyr::all_of(by), .fns = ~ .x == x)
1489
+                dplyr::if_all(.cols = dplyr::all_of(by), .fns = ~ .x == x)
956 1490
             ) %>%
957 1491
             dplyr::select(-dplyr::all_of(by))
958 1492
         if (nrow(tmp) < top_n) {
Browse code

[FIX] Fixes for next version

Giulia Pais authored on 04/05/2022 12:58:45
Showing1 changed files
... ...
@@ -281,6 +281,31 @@ clinical_relevant_suspicious_genes <- function() {
281 281
 #' Plots results of Fisher's exact test on gene frequency obtained via
282 282
 #' `gene_frequency_fisher()` as a scatterplot.
283 283
 #'
284
+#' @details
285
+#' ## Specifying genes to avoid highlighting
286
+#'
287
+#' In some cases, users might want to avoid highlighting certain genes
288
+#' even if their p-value is below the threshold. To do so, use the
289
+#' argument `do_not_highlight`: character vectors are appropriate for specific
290
+#' genes that are to be excluded, expressions or lambdas allow a finer control.
291
+#' For example we can supply:
292
+#' ```{r eval=FALSE}
293
+#' expr <- rlang::expr(!stringr::str_starts(GeneName, "MIR") &
294
+#'                       average_TxLen_1 >= 300)
295
+#' ```
296
+#'
297
+#' with this expression, genes that have a p-value < threshold and start with
298
+#' "MIR" or have an average_TxLen_1 lower than 300 are excluded from the
299
+#' highlighted points.
300
+#' NOTE: keep in mind that expressions are evaluated inside a `dplyr::filter`
301
+#' context.
302
+#'
303
+#' Similarly, lambdas are passed to the filtering function but only operate
304
+#' on the column containing the gene symbol.
305
+#' ```{r eval=FALSE}
306
+#' lambda <- ~ stringr::str_starts(.x, "MIR")
307
+#' ```
308
+#'
284 309
 #' @param fisher_df Test results obtained via `gene_frequency_fisher()`
285 310
 #' @param p_value_col Name of the column containing the p-value to consider
286 311
 #' @param annot_threshold Annotate with a different color if a point is below
... ...
@@ -288,6 +313,14 @@ clinical_relevant_suspicious_genes <- function() {
288 313
 #' @param annot_color The color in which points below the threshold should be
289 314
 #' annotated
290 315
 #' @param gene_sym_col The name of the column containing the gene symbol
316
+#' @param do_not_highlight Either `NULL`, a character vector, an expression
317
+#' or a purrr-style lambda. Tells the function to ignore the highlighting
318
+#' and labeling of these genes even if their p-value is below the threshold.
319
+#' See details.
320
+#' @param keep_not_highlighted If present, how should not highlighted genes
321
+#' be treated? If set to `TRUE` points are plotted and colored with the
322
+#' chosen color scale. If set to `FALSE` the points are removed entirely from
323
+#' the plot.
291 324
 #'
292 325
 #' @family Plotting functions
293 326
 #' @return A plot
... ...
@@ -310,11 +343,51 @@ fisher_scatterplot <- function(fisher_df,
310 343
     p_value_col = "Fisher_p_value_fdr",
311 344
     annot_threshold = 0.05,
312 345
     annot_color = "red",
313
-    gene_sym_col = "GeneName") {
314
-    below_threshold <- fisher_df %>%
346
+    gene_sym_col = "GeneName",
347
+    do_not_highlight = NULL,
348
+    keep_not_highlighted = TRUE) {
349
+    stopifnot(is.null(do_not_highlight) || is.character(do_not_highlight)
350
+              || rlang::is_expression(do_not_highlight))
351
+    if (is.null(do_not_highlight)) {
352
+      below_threshold <- fisher_df %>%
315 353
         dplyr::filter(.data[[p_value_col]] < annot_threshold)
316
-    above_threshold <- fisher_df %>%
354
+      above_threshold <- fisher_df %>%
317 355
         dplyr::filter(.data[[p_value_col]] >= annot_threshold)
356
+    } else if (is.character(do_not_highlight)) {
357
+      below_threshold <- fisher_df %>%
358
+        dplyr::filter(.data[[p_value_col]] < annot_threshold &
359
+                        !.data[[gene_sym_col]] %in% do_not_highlight)
360
+      if (keep_not_highlighted) {
361
+        above_threshold <- fisher_df %>%
362
+          dplyr::anti_join(below_threshold, by = gene_sym_col)
363
+      } else {
364
+        above_threshold <- fisher_df %>%
365
+          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
366
+      }
367
+    } else if (rlang::is_formula(do_not_highlight)) {
368
+      below_threshold <- fisher_df %>%
369
+        dplyr::filter(.data[[p_value_col]] < annot_threshold &
370
+                        !rlang::as_function(do_not_highlight)(
371
+                          .data[[gene_sym_col]]))
372
+      if (keep_not_highlighted) {
373
+        above_threshold <- fisher_df %>%
374
+          dplyr::anti_join(below_threshold, by = gene_sym_col)
375
+      } else {
376
+        above_threshold <- fisher_df %>%
377
+          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
378
+      }
379
+    } else {
380
+      below_threshold <- fisher_df %>%
381
+        dplyr::filter(.data[[p_value_col]] < annot_threshold &
382
+                        (rlang::eval_tidy(do_not_highlight)))
383
+      if (keep_not_highlighted) {
384
+        above_threshold <- fisher_df %>%
385
+          dplyr::anti_join(below_threshold, by = gene_sym_col)
386
+      } else {
387
+        above_threshold <- fisher_df %>%
388
+          dplyr::filter(.data[[p_value_col]] >= annot_threshold)
389
+      }
390
+    }
318 391
     plot <- ggplot2::ggplot(
319 392
         above_threshold,
320 393
         ggplot2::aes_(
... ...
@@ -323,7 +396,7 @@ fisher_scatterplot <- function(fisher_df,
323 396
             color = ggplot2::sym(p_value_col)
324 397
         )
325 398
     ) +
326
-        ggplot2::geom_point() +
399
+        ggplot2::geom_point(alpha = 0.65) +
327 400
         ggplot2::geom_point(data = below_threshold, color = annot_color) +
328 401
         ggplot2::theme_bw() +
329 402
         ggplot2::labs(
... ...
@@ -879,9 +952,7 @@ top_abund_tableGrob <- function(df,
879 952
     tops_by_x <- purrr::map(distinct_x, function(x) {
880 953
         tmp <- top %>%
881 954
             dplyr::filter(
882
-                dplyr::across(
883
-                    dplyr::all_of(by), ~ .x == x
884
-                )
955
+              dplyr::if_all(.cols = dplyr::all_of(by), .fns = ~ .x == x)
885 956
             ) %>%
886 957
             dplyr::select(-dplyr::all_of(by))
887 958
         if (nrow(tmp) < top_n) {
Browse code

[UPDATE] Update to 1.5.4

Merge branch 'dynamicVarsFeature'

# Conflicts:
# DESCRIPTION

Giulia Pais authored on 20/04/2022 19:09:10
Showing0 changed files
Browse code

[UPDATE] Final partial update for 1.5.4 - finished documentation, ready to merge

Giulia Pais authored on 20/04/2022 15:58:57
Showing1 changed files
... ...
@@ -4,7 +4,8 @@
4 4
 
5 5
 #' Trace volcano plot for computed CIS data.
6 6
 #'
7
-#' \lifecycle{stable}
7
+#' @description
8
+#' `r lifecycle::badge("stable")`
8 9
 #' Traces a volcano plot for IS frequency and CIS results.
9 10
 #'
10 11
 #' @details
... ...
@@ -14,40 +15,10 @@
14 15
 #' In the first case an internal call to
15 16
 #' the function `CIS_grubbs()` is performed.
16 17
 #'
17
-#' ## Oncogene and tumor suppressor genes files
18
-#' These files are included in the package for user convenience and are
19
-#' simply UniProt files with gene annotations for human and mouse.
20
-#' For more details on how this files were generated use the help
21
-#' `?tumor_suppressors`, `?proto_oncogenes`
22
-#'
23
-#' ## Known oncogenes
24
-#' The default values are included in this package and
25
-#' it can be accessed by doing:
26
-#'
27
-#' ```{r}
28
-#' head(known_clinical_oncogenes())
29
-#'
30
-#' ```
31
-#' If the user wants to change this parameter the input data frame must
32
-#' preserve the column structure. The same goes for the `suspicious_genes`
33
-#' parameter (DOIReference column is optional):
34
-#'
35
-#' ```{r}
36
-#' head(clinical_relevant_suspicious_genes())
37
-#'
38
-#' ```
39 18
 #' @family Plotting functions
40 19
 #'
41 20
 #' @param x Either a simple integration matrix or a data frame resulting
42 21
 #' from the call to \link{CIS_grubbs} with `add_standard_padjust = TRUE`
43
-#' @param onco_db_file Uniprot file for proto-oncogenes (see details).
44
-#' If different from default, should be supplied as a path to a file.
45
-#' @param tumor_suppressors_db_file Uniprot file for tumor-suppressor genes.
46
-#' If different from default, should be supplied as a path to a file.
47
-#' @param species One between `"human"`, `"mouse"` and `"all"`
48
-#' @param known_onco Data frame with known oncogenes. See details.
49
-#' @param suspicious_genes Data frame with clinical relevant suspicious
50
-#' genes. See details.
51 22
 #' @param significance_threshold The significance threshold
52 23
 #' @param annotation_threshold_ontots Value above which genes are annotated
53 24
 #' with colorful labels
... ...
@@ -62,13 +33,19 @@
62 33
 #' ggplot2. If TRUE the function returns a list containing both the plot
63 34
 #' and the data frame.
64 35
 #'
65
-#' @importFrom ggplot2 ggplot aes_ geom_point geom_hline scale_y_continuous
66
-#' @importFrom ggplot2 scale_x_continuous unit theme element_text labs
67
-#' @importFrom ggrepel geom_label_repel
68
-#' @importFrom dplyr left_join mutate filter
69
-#' @importFrom rlang .data inform
70
-#' @importFrom purrr is_empty
71
-#' @importFrom stringr str_to_lower
36
+#' @template genes_db
37
+#'
38
+#' @section Required tags:
39
+#' The function will explicitly check for the presence of these tags:
40
+#'
41
+#' ```{r echo=FALSE, results="asis"}
42
+#' all_tags <- available_tags()
43
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
44
+#'  ~ "CIS_volcano_plot" %in% .x)][["tag"]])
45
+#'  cat(paste0("* ", needed, collapse="\n"))
46
+#' ```
47
+#'
48
+#' @importFrom rlang .data
72 49
 #'
73 50
 #' @return A plot or a list containing a plot and a data frame
74 51
 #' @export
... ...
@@ -123,17 +100,19 @@ CIS_volcano_plot <- function(x,
123 100
         if (getOption("ISAnalytics.verbose") == TRUE) {
124 101
             rlang::inform("Calculating CIS_grubbs for x...")
125 102
         }
126
-        CIS_grubbs(x)
103
+        CIS_grubbs(x, return_missing_as_df = FALSE)
127 104
     } else {
128 105
         x
129 106
     }
130 107
     gene_sym_col <- annotation_IS_vars(TRUE) %>%
131
-      dplyr::filter(.data$tag == "gene_symbol") %>%
132
-      dplyr::pull(.data$names)
108
+        dplyr::filter(.data$tag == "gene_symbol") %>%
109
+        dplyr::pull(.data$names)
133 110
     ## Add info to CIS
134
-    cis_grubbs_df <- .expand_cis_df(cis_grubbs_df, gene_sym_col,
135
-                                    onco_db_file, tumor_suppressors_db_file,
136
-                                    species, known_onco, suspicious_genes)
111
+    cis_grubbs_df <- .expand_cis_df(
112
+        cis_grubbs_df, gene_sym_col,
113
+        onco_db_file, tumor_suppressors_db_file,
114
+        species, known_onco, suspicious_genes
115
+    )
137 116
     cis_grubbs_df <- cis_grubbs_df %>%
138 117
         dplyr::mutate(minus_log_p = -log(.data$tdist_bonferroni_default,
139 118
             base = 10
... ...
@@ -295,34 +274,73 @@ clinical_relevant_suspicious_genes <- function() {
295 274
     )
296 275
 }
297 276
 
277
+#' Plot results of gene frequency Fisher's exact test.
278
+#'
279
+#' @description
280
+#' `r lifecycle::badge("stable")`
281
+#' Plots results of Fisher's exact test on gene frequency obtained via
282
+#' `gene_frequency_fisher()` as a scatterplot.
283
+#'
284
+#' @param fisher_df Test results obtained via `gene_frequency_fisher()`
285
+#' @param p_value_col Name of the column containing the p-value to consider
286
+#' @param annot_threshold Annotate with a different color if a point is below
287
+#' the significance threshold. Single numerical value.
288
+#' @param annot_color The color in which points below the threshold should be
289
+#' annotated
290
+#' @param gene_sym_col The name of the column containing the gene symbol
291
+#'
292
+#' @family Plotting functions
293
+#' @return A plot
294
+#' @export
295
+#'
296
+#' @examples
297
+#' data("integration_matrices", package = "ISAnalytics")
298
+#' data("association_file", package = "ISAnalytics")
299
+#' aggreg <- aggregate_values_by_key(
300
+#'     x = integration_matrices,
301
+#'     association_file = association_file,
302
+#'     value_cols = c("seqCount", "fragmentEstimate")
303
+#' )
304
+#' cis <- CIS_grubbs(aggreg, by = "SubjectID")
305
+#' fisher <- gene_frequency_fisher(cis$cis$PT001, cis$cis$PT002,
306
+#'     min_is_per_gene = 2
307
+#' )
308
+#' fisher_scatterplot(fisher)
298 309
 fisher_scatterplot <- function(fisher_df,
299
-                               p_value_col = "Fisher_p_value_fdr",
300
-                               annot_threshold = 0.05,
301
-                               annot_color = "red",
302
-                               gene_sym_col = "GeneName") {
303
-  below_threshold <- fisher_df %>%
304
-    dplyr::filter(.data[[p_value_col]] < annot_threshold)
305
-  above_threshold <- fisher_df %>%
306
-    dplyr::filter(.data[[p_value_col]] >= annot_threshold)
307
-  plot <- ggplot2::ggplot(above_threshold,
308
-                          ggplot2::aes_(x = ~ IS_per_kbGeneLen_1,
309
-                                        y = ~ IS_per_kbGeneLen_2,
310
-                                        color = ggplot2::sym(p_value_col))) +
311
-    ggplot2::geom_point() +
312
-    ggplot2::geom_point(data = below_threshold, color = annot_color) +
313
-    ggplot2::theme_bw() +
314
-    ggplot2::labs(x = "Gene frequency G1", y = "Gene frequency G2",
315
-                  color = "Fisher's test p-value") +
316
-    ggrepel::geom_label_repel(data = below_threshold,
317
-                              ggplot2::aes_string(label = gene_sym_col),
318
-                              box.padding = ggplot2::unit(0.35, "lines"),
319
-                              point.padding = ggplot2::unit(0.3, "lines"),
320
-                              max.overlaps = Inf, color = annot_color,
321
-                              fill = ggplot2::alpha("white", alpha = 0.6)
310
+    p_value_col = "Fisher_p_value_fdr",
311
+    annot_threshold = 0.05,
312
+    annot_color = "red",
313
+    gene_sym_col = "GeneName") {
314
+    below_threshold <- fisher_df %>%
315
+        dplyr::filter(.data[[p_value_col]] < annot_threshold)
316
+    above_threshold <- fisher_df %>%
317
+        dplyr::filter(.data[[p_value_col]] >= annot_threshold)
318
+    plot <- ggplot2::ggplot(
319
+        above_threshold,
320
+        ggplot2::aes_(
321
+            x = ~IS_per_kbGeneLen_1,
322
+            y = ~IS_per_kbGeneLen_2,
323
+            color = ggplot2::sym(p_value_col)
324
+        )
322 325
     ) +
323
-    ggplot2::scale_x_continuous() +
324
-    ggplot2::scale_y_continuous()
325
-  return(plot)
326
+        ggplot2::geom_point() +
327
+        ggplot2::geom_point(data = below_threshold, color = annot_color) +
328
+        ggplot2::theme_bw() +
329
+        ggplot2::labs(
330
+            x = "Gene frequency G1", y = "Gene frequency G2",
331
+            color = "Fisher's test p-value"
332
+        ) +
333
+        ggrepel::geom_label_repel(
334
+            data = below_threshold,
335
+            ggplot2::aes_string(label = gene_sym_col),
336
+            box.padding = ggplot2::unit(0.35, "lines"),
337
+            point.padding = ggplot2::unit(0.3, "lines"),
338
+            max.overlaps = Inf, color = annot_color,
339
+            fill = ggplot2::alpha("white", alpha = 0.6)
340
+        ) +
341
+        ggplot2::scale_x_continuous() +
342
+        ggplot2::scale_y_continuous()
343
+    return(plot)
326 344
 }
327 345
 
328 346
 #' Plot of the estimated HSC population size for each patient.
... ...
@@ -417,28 +435,43 @@ HSC_population_plot <- function(estimates,
417 435
 
418 436
 #' Alluvial plots for IS distribution in time.
419 437
 #'
420
-#' \lifecycle{experimental}
438
+#' @description
439
+#' `r lifecycle::badge("stable")`
421 440
 #' Alluvial plots allow the visualization of integration sites distribution
422 441
 #' in different points in time in the same group.
423 442
 #' This functionality requires the suggested package
424 443
 #' [ggalluvial](https://corybrunson.github.io/ggalluvial/).
425 444
 #'
426 445
 #' @details
427
-#' ### Input data frame
446
+#' ## Input data frame
428 447
 #' The input data frame must contain all the columns specified in the
429 448
 #' arguments `group`, `plot_x`, `plot_y` and `alluvia`. The standard
430 449
 #' input for this function is the data frame obtained via the
431 450
 #' \link{compute_abundance} function.
432 451
 #'
433
-#' ### Plotting threshold on y
452
+#' ## Plotting threshold on y
434 453
 #' The plotting threshold on the quantification on the y axis has the
435 454
 #' function to highlight only relevant information on the plot and reduce
436 455
 #' computation time. The default value is 1, that acts on the default column
437
-#' plotted on the y axis which holds a percentage value. This translates
456
+#' plotted on the y axis which contains a percentage value. This translates
438 457
 #' in natural language roughly as "highlight with colors only those
439 458
 #' integrations (alluvia) that at least in 1 point in time have an
440 459
 #' abundance value >= 1 %". The remaining integrations will be plotted
441
-#' as transparent in the strata.
460
+#' as a unique layer in the column, colored as specified by the argument
461
+#' `empty_space_color`.
462
+#'
463
+#' ## Customizing the plot
464
+#' The returned plots are ggplot2 objects and can therefore further modified
465
+#' as any other ggplot2 object. For example, if the user decides to change the
466
+#' fill scale it is sufficient to do
467
+#'
468
+#' ```{r eval=FALSE}
469
+#' plot +
470
+#'   ggplot2::scale_fill_viridis_d(...) + # or any other discrete fill scale
471
+#'   ggplot2::theme(...) # change theme options
472
+#' ```
473
+#' NOTE: if you requested the computation of the top ten abundant tables and
474
+#' you want the colors to match you should re-compute them
442 475
 #'
443 476
 #' @param x A data frame. See details.
444 477
 #' @param group Character vector containing the column names that identify
... ...
@@ -549,8 +582,10 @@ integration_alluvial_plot <- function(x,
549 582
             group_df, plot_x, plot_y,
550 583
             alluvia, alluvia_plot_y_threshold
551 584
         )
552
-        alluv_plot <- .alluvial_plot(cleaned, plot_x, plot_y,
553
-                                     empty_space_color)
585
+        alluv_plot <- .alluvial_plot(
586
+            cleaned, plot_x, plot_y,
587
+            empty_space_color
588
+        )
554 589
         if (top_abundant_tbl == TRUE) {
555 590
             summary_tbls <- NULL
556 591
             withCallingHandlers(
... ...
@@ -569,10 +604,10 @@ integration_alluvial_plot <- function(x,
569 604
                             )
570 605
                         },
571 606
                         missing = function() {
572
-                          msg <- paste(
573
-                            "Failed to produce top",
574
-                            "abundance tables, skipping"
575
-                          )
607
+                            msg <- paste(
608
+                                "Failed to produce top",
609
+                                "abundance tables, skipping"
610
+                            )
576 611
                             rlang::inform(msg)
577 612
                         }
578 613
                     )
... ...
@@ -626,7 +661,6 @@ integration_alluvial_plot <- function(x,
626 661
     tbl <- if (length(alluvia) > 1) {
627 662
         df %>%
628 663
             tidyr::unite(col = "alluvia_id", {{ alluvia }})
629
-
630 664
     } else {
631 665
         df %>%
632 666
             dplyr::rename(alluvia_id = alluvia)
... ...
@@ -643,8 +677,10 @@ integration_alluvial_plot <- function(x,
643 677
         dplyr::pull(.data[["alluvia_id"]])
644 678
     # Modify ids - identifiers that are below threshold are converted to
645 679
     # NA and the quantities in y are aggregated
646
-    tbl[!eval(sym("alluvia_id")) %in% alluvia_to_plot,
647
-        c("alluvia_id") := NA_character_]
680
+    tbl[
681
+        !eval(sym("alluvia_id")) %in% alluvia_to_plot,
682
+        c("alluvia_id") := NA_character_
683
+    ]
648 684
     tbl <- tbl[, setNames(list(sum(.SD)), plot_y),
649 685
         by = c(plot_x, "alluvia_id"), .SDcols = plot_y
650 686
     ]
... ...
@@ -669,11 +705,12 @@ integration_alluvial_plot <- function(x,
669 705
         dplyr::group_by(dplyr::across(dplyr::all_of(plot_x))) %>%
670 706
         dplyr::summarise(count = .data$count[1], .groups = "drop")
671 707
     tbl <- tbl %>%
672
-      dplyr::group_by(dplyr::across(dplyr::all_of(plot_x))) %>%
673
-      dplyr::arrange(dplyr::desc(dplyr::across(dplyr::all_of(plot_y))),
674
-                     .by_group = TRUE) %>%
675
-      dplyr::mutate(alluvia_id = forcats::as_factor(.data$alluvia_id)) %>%
676
-      dplyr::ungroup()
708
+        dplyr::group_by(dplyr::across(dplyr::all_of(plot_x))) %>%
709
+        dplyr::arrange(dplyr::desc(dplyr::across(dplyr::all_of(plot_y))),
710
+            .by_group = TRUE
711
+        ) %>%
712
+        dplyr::mutate(alluvia_id = forcats::as_factor(.data$alluvia_id)) %>%
713
+        dplyr::ungroup()
677 714
     alluv <- ggplot2::ggplot(
678 715
         tbl,
679 716
         ggplot2::aes_(
... ...
@@ -682,19 +719,19 @@ integration_alluvial_plot <- function(x,
682 719
             alluvium = ~alluvia_id
683 720
         )
684 721
     ) +
685
-      ggalluvial::stat_stratum(ggplot2::aes_(stratum = ~alluvia_id),
686
-                               na.rm = FALSE,
687
-                               fill = empty_space_color
688
-      ) +
689
-      ggalluvial::geom_alluvium(ggplot2::aes_(fill = ~alluvia_id),
690
-                                na.rm = FALSE,
691
-                                alpha = .75,
692
-                                aes.bind = "alluvia"
693
-      )
722
+        ggalluvial::stat_stratum(ggplot2::aes_(stratum = ~alluvia_id),
723
+            na.rm = FALSE,
724
+            fill = empty_space_color
725
+        ) +
726
+        ggalluvial::geom_alluvium(ggplot2::aes_(fill = ~alluvia_id),
727
+            na.rm = FALSE,
728
+            alpha = .75,
729
+            aes.bind = "alluvia"
730
+        )
694 731
     alluv <- alluv +
695
-      ggplot2::scale_fill_hue(na.value = NA_character_) +
696
-      ggplot2::theme_bw() +
697
-      ggplot2::theme(legend.position = "none") +
732
+        ggplot2::scale_fill_hue(na.value = NA_character_) +
733
+        ggplot2::theme_bw() +
734
+        ggplot2::theme(legend.position = "none") +
698 735
         ggplot2::geom_text(
699 736
             data = labels,
700 737
             ggplot2::aes_(
... ...
@@ -890,7 +927,8 @@ top_abund_tableGrob <- function(df,
890 927
 
891 928
 #' Plot IS sharing heatmaps.
892 929
 #'
893
-#' \lifecycle{experimental}
930
+#' @description
931
+#' `r lifecycle::badge("stable")`
894 932
 #' Displays the IS sharing calculated via \link{is_sharing} as heatmaps.
895 933
 #'
896 934
 #' @param sharing_df The data frame containing the IS sharing data
... ...
@@ -1089,7 +1127,8 @@ sharing_heatmap <- function(sharing_df,
1089 1127
 
1090 1128
 #' Produce tables to plot sharing venn or euler diagrams.
1091 1129
 #'
1092
-#' @description \lifecycle{experimental}
1130
+#' @description
1131
+#' `r lifecycle::badge("stable")`
1093 1132
 #' This function processes a sharing data frame obtained via `is_sharing()`
1094 1133
 #' with the option `table_for_venn = TRUE` to obtain a list of objects
1095 1134
 #' that can be plotted as venn or euler diagrams.
... ...
@@ -1182,7 +1221,8 @@ sharing_venn <- function(sharing_df,
1182 1221
 
1183 1222
 #' Trace a circos plot of genomic densities.
1184 1223
 #'
1185
-#' @description \lifecycle{experimental}
1224
+#' @description
1225
+#' `r lifecycle::badge("stable")`
1186 1226
 #' For this functionality
1187 1227
 #' the suggested package
1188 1228
 #' [circlize](https://cran.r-project.org/web/packages/circlize/index.html)
... ...
@@ -1320,11 +1360,11 @@ circos_genomic_density <- function(data,
1320 1360
                 rlang::as_function(device)
1321 1361
             )]
1322 1362
         if (device == "pdf") {
1323
-          device_args <- device_args[!names(device_args) %in% c("file")]
1324
-          rlang::exec(.fn = device, file = file_path, !!!device_args)
1363
+            device_args <- device_args[!names(device_args) %in% c("file")]
1364
+            rlang::exec(.fn = device, file = file_path, !!!device_args)
1325 1365
         } else {
1326
-          device_args <- device_args[!names(device_args) %in% c("filename")]
1327
-          rlang::exec(.fn = device, filename = file_path, !!!device_args)
1366
+            device_args <- device_args[!names(device_args) %in% c("filename")]
1367
+            rlang::exec(.fn = device, filename = file_path, !!!device_args)
1328 1368
         }
1329 1369
     }
1330 1370
     ## Draw plot
Browse code

[UPDATE] Partial update 1.5.4 - fixed all functions, increased package coverage, documentation still to fix

Giulia Pais authored on 15/04/2022 16:47:59
Showing1 changed files
... ...
@@ -114,12 +114,6 @@ CIS_volcano_plot <- function(x,
114 114
     } else {
115 115
         title_prefix <- paste(title_prefix, collapse = " ")
116 116
     }
117
-    ## Load onco and ts
118
-    oncots_to_use <- .load_onco_ts_genes(
119
-        onco_db_file,
120
-        tumor_suppressors_db_file,
121
-        species
122
-    )
123 117
     ## Check if CIS function was already called
124 118
     min_cis_col <- c(
125 119
         "tdist_bonferroni_default", "tdist_fdr",
... ...
@@ -133,12 +127,13 @@ CIS_volcano_plot <- function(x,
133 127
     } else {
134 128
         x
135 129
     }
136
-    ## Join all dfs by gene
137
-    cis_grubbs_df <- cis_grubbs_df %>%
138
-        dplyr::left_join(oncots_to_use, by = "GeneName") %>%
139
-        dplyr::left_join(known_onco, by = "GeneName") %>%
140
-        dplyr::left_join(suspicious_genes, by = "GeneName")
130
+    gene_sym_col <- annotation_IS_vars(TRUE) %>%
131
+      dplyr::filter(.data$tag == "gene_symbol") %>%
132
+      dplyr::pull(.data$names)
141 133
     ## Add info to CIS
134
+    cis_grubbs_df <- .expand_cis_df(cis_grubbs_df, gene_sym_col,
135
+                                    onco_db_file, tumor_suppressors_db_file,
136
+                                    species, known_onco, suspicious_genes)
142 137
     cis_grubbs_df <- cis_grubbs_df %>%
143 138
         dplyr::mutate(minus_log_p = -log(.data$tdist_bonferroni_default,
144 139
             base = 10
... ...
@@ -153,20 +148,6 @@ CIS_volcano_plot <- function(x,
153 148
                 no = FALSE
154 149
             )
155 150
         )
156
-    cis_grubbs_df <- cis_grubbs_df %>%
157
-        dplyr::mutate(
158
-            KnownGeneClass = ifelse(
159
-                is.na(.data$Onco1_TS2),
160
-                yes = "Other",
161
-                no = ifelse(.data$Onco1_TS2 == 1,
162
-                    yes = "OncoGene",
163
-                    no = "TumSuppressor"
164
-                )
165
-            ),
166
-            CriticalForInsMut = ifelse(!is.na(.data$KnownClonalExpansion),
167
-                yes = TRUE, no = FALSE
168
-            )
169
-        )
170 151
     significance_threshold_minus_log_p <- -log(significance_threshold,
171 152
         base = 10
172 153
     )
... ...
@@ -199,7 +180,7 @@ CIS_volcano_plot <- function(x,
199 180
                 cis_grubbs_df,
200 181
                 .data$tdist_fdr < significance_threshold
201 182
             ),
202
-            ggplot2::aes_(label = ~GeneName),
183
+            ggplot2::aes_string(label = gene_sym_col),
203 184
             box.padding = ggplot2::unit(0.35, "lines"),
204 185
             point.padding = ggplot2::unit(0.3, "lines"),
205 186
             color = "white",
... ...
@@ -231,7 +212,7 @@ CIS_volcano_plot <- function(x,
231 212
         ggplot2::labs(
232 213
             title = paste(
233 214
                 title_prefix,
234
-                "- Volcano plot of IS gene frequency and",
215
+                "Volcano plot of IS gene frequency and",
235 216
                 "CIS results"
236 217
             ),
237 218
             y = "P-value Grubbs test (-log10(p))",
... ...
@@ -253,14 +234,14 @@ CIS_volcano_plot <- function(x,
253 234
         ## Look for the genes (case insensitive)
254 235
         to_highlight <- cis_grubbs_df %>%
255 236
             dplyr::filter(
256
-                stringr::str_to_lower(.data$GeneName) %in%
237
+                stringr::str_to_lower(.data[[gene_sym_col]]) %in%
257 238
                     stringr::str_to_lower(highlight_genes),
258 239
                 .data$tdist_fdr >= significance_threshold
259 240
             )
260 241
         plot_cis_fdr_slice <- plot_cis_fdr_slice +
261 242
             ggrepel::geom_label_repel(
262 243
                 data = to_highlight,
263
-                ggplot2::aes_(label = ~GeneName),
244
+                ggplot2::aes_string(label = gene_sym_col),
264 245
                 box.padding = ggplot2::unit(0.35, "lines"),
265 246
                 point.padding = ggplot2::unit(0.3, "lines"),
266 247
                 color = "white",
... ...
@@ -314,6 +295,36 @@ clinical_relevant_suspicious_genes <- function() {
314 295
     )
315 296
 }
316 297
 
298
+fisher_scatterplot <- function(fisher_df,
299
+                               p_value_col = "Fisher_p_value_fdr",
300
+                               annot_threshold = 0.05,
301
+                               annot_color = "red",
302
+                               gene_sym_col = "GeneName") {
303
+  below_threshold <- fisher_df %>%
304
+    dplyr::filter(.data[[p_value_col]] < annot_threshold)
305
+  above_threshold <- fisher_df %>%
306
+    dplyr::filter(.data[[p_value_col]] >= annot_threshold)
307
+  plot <- ggplot2::ggplot(above_threshold,
308
+                          ggplot2::aes_(x = ~ IS_per_kbGeneLen_1,
309
+                                        y = ~ IS_per_kbGeneLen_2,
310
+                                        color = ggplot2::sym(p_value_col))) +
311
+    ggplot2::geom_point() +
312
+    ggplot2::geom_point(data = below_threshold, color = annot_color) +
313
+    ggplot2::theme_bw() +
314
+    ggplot2::labs(x = "Gene frequency G1", y = "Gene frequency G2",
315
+                  color = "Fisher's test p-value") +
316
+    ggrepel::geom_label_repel(data = below_threshold,
317
+                              ggplot2::aes_string(label = gene_sym_col),
318
+                              box.padding = ggplot2::unit(0.35, "lines"),
319
+                              point.padding = ggplot2::unit(0.3, "lines"),
320
+                              max.overlaps = Inf, color = annot_color,
321
+                              fill = ggplot2::alpha("white", alpha = 0.6)
322
+    ) +
323
+    ggplot2::scale_x_continuous() +
324
+    ggplot2::scale_y_continuous()
325
+  return(plot)
326
+}
327
+
317 328
 #' Plot of the estimated HSC population size for each patient.
318 329
 #'
319 330
 #' @param estimates The estimates data frame, obtained via
... ...
@@ -440,6 +451,11 @@ HSC_population_plot <- function(estimates,
440 451
 #' threshold on y will be plotted in grey and aggregated. See details.
441 452
 #' @param top_abundant_tbl Logical. Produce the summary top abundant tables
442 453
 #' via \link{top_abund_tableGrob}?
454
+#' @param empty_space_color Color of the empty portion of the bars (IS below
455
+#' the threshold). Can be either a string of known colors, an hex code or
456
+#' `NA_character` to set the space transparent. All color
457
+#' specs accepted in ggplot2
458
+#' are suitable here.
443 459
 #' @param ... Additional arguments to pass on to \link{top_abund_tableGrob}
444 460
 #'
445 461
 #' @family Plotting functions
... ...
@@ -484,6 +500,7 @@ integration_alluvial_plot <- function(x,
484 500
     alluvia = mandatory_IS_vars(),
485 501
     alluvia_plot_y_threshold = 1,
486 502
     top_abundant_tbl = TRUE,
503
+    empty_space_color = "grey90",
487 504
     ...) {
488 505
     if (!requireNamespace("ggalluvial", quietly = TRUE)) {
489 506
         rlang::abort(.missing_pkg_error("ggalluvial"))
... ...
@@ -526,12 +543,14 @@ integration_alluvial_plot <- function(x,
526 543
     alluvia,
527 544
     alluvia_plot_y_threshold,
528 545
     top_abundant_tbl,
546
+    empty_space_color,
529 547
     other_params) {
530 548
         cleaned <- .clean_data(
531 549
             group_df, plot_x, plot_y,
532 550
             alluvia, alluvia_plot_y_threshold
533 551
         )
534
-        alluv_plot <- .alluvial_plot(cleaned, plot_x, plot_y)
552
+        alluv_plot <- .alluvial_plot(cleaned, plot_x, plot_y,
553
+                                     empty_space_color)
535 554
         if (top_abundant_tbl == TRUE) {
536 555
             summary_tbls <- NULL
537 556
             withCallingHandlers(
... ...
@@ -550,10 +569,11 @@ integration_alluvial_plot <- function(x,
550 569
                             )
551 570
                         },
552 571
                         missing = function() {
553
-                            rlang::inform(paste(
554
-                                "Failed to produce top",
555
-                                "abundance tables, skipping"
556
-                            ))
572
+                          msg <- paste(
573
+                            "Failed to produce top",
574
+                            "abundance tables, skipping"
575
+                          )
576
+                            rlang::inform(msg)
557 577
                         }
558 578
                     )
559 579
                 },
... ...
@@ -584,6 +604,7 @@ integration_alluvial_plot <- function(x,
584 604
         alluvia = alluvia,
585 605
         alluvia_plot_y_threshold = alluvia_plot_y_threshold,
586 606
         top_abundant_tbl = top_abundant_tbl,
607
+        empty_space_color = empty_space_color,
587 608
         other_params = dot_args,
588 609
         BPPARAM = p
589 610
     )
... ...
@@ -604,13 +625,13 @@ integration_alluvial_plot <- function(x,
604 625
     alluvia_plot_y_threshold) {
605 626
     tbl <- if (length(alluvia) > 1) {
606 627
         df %>%
607
-            tidyr::unite(col = "alluvia_id", {{ alluvia }}) %>%
608
-            data.table::setDT()
628
+            tidyr::unite(col = "alluvia_id", {{ alluvia }})
629
+
609 630
     } else {
610 631
         df %>%
611
-            dplyr::rename(alluvia_id = alluvia) %>%
612
-            data.table::setDT()
632
+            dplyr::rename(alluvia_id = alluvia)
613 633
     }
634
+    data.table::setDT(tbl)
614 635
     ## Getting counts by x
615 636
     counts_by_x <- tbl %>%
616 637
         dplyr::group_by(dplyr::across({{ plot_x }})) %>%
... ...
@@ -622,7 +643,8 @@ integration_alluvial_plot <- function(x,
622 643
         dplyr::pull(.data[["alluvia_id"]])
623