... | ... |
@@ -2,6 +2,16 @@ |
2 | 2 |
title: "NEWS" |
3 | 3 |
output: github_document |
4 | 4 |
--- |
5 |
+# ISAnalytics 1.3.6 (2021-10-05) |
|
6 |
+ |
|
7 |
+## NEW |
|
8 |
+ |
|
9 |
+* Added new feature `purity_filter()` |
|
10 |
+ |
|
11 |
+## FIXES |
|
12 |
+ |
|
13 |
+* Fixed small issue in printing information in reports |
|
14 |
+ |
|
5 | 15 |
# ISAnalytics 1.3.5 (2021-09-21) |
6 | 16 |
|
7 | 17 |
## MAJOR CHANGES |
... | ... |
@@ -1,6 +1,16 @@ |
1 | 1 |
NEWS |
2 | 2 |
================ |
3 | 3 |
|
4 |
+# ISAnalytics 1.3.6 (2021-10-05) |
|
5 |
+ |
|
6 |
+## NEW |
|
7 |
+ |
|
8 |
+- Added new feature `purity_filter()` |
|
9 |
+ |
|
10 |
+## FIXES |
|
11 |
+ |
|
12 |
+- Fixed small issue in printing information in reports |
|
13 |
+ |
|
4 | 14 |
# ISAnalytics 1.3.5 (2021-09-21) |
5 | 15 |
|
6 | 16 |
## MAJOR CHANGES |
... | ... |
@@ -1462,6 +1462,255 @@ is_sharing <- function(..., |
1462 | 1462 |
} |
1463 | 1463 |
|
1464 | 1464 |
|
1465 |
+#' Filter integration sites based on purity. |
|
1466 |
+#' |
|
1467 |
+#' @description |
|
1468 |
+#' \lifecycle{experimental} |
|
1469 |
+#' Filter that targets possible contamination between cell lines based on |
|
1470 |
+#' a numeric quantification (likely abundance or sequence count). |
|
1471 |
+#' |
|
1472 |
+#' @details |
|
1473 |
+#' ## Setting input arguments |
|
1474 |
+#' |
|
1475 |
+#' The input matrix can be re-aggregated with the provided `group_key` |
|
1476 |
+#' argument. This key contains the names of the columns to group on |
|
1477 |
+#' (besides the columns holding genomic coordinates of the integration |
|
1478 |
+#' sites) and must be contained in at least one of `x` or `lineages` |
|
1479 |
+#' data frames. If the key is not found only in `x`, then a join operation |
|
1480 |
+#' with the `lineages` data frame is performed on the common column(s) |
|
1481 |
+#' `join_on`. |
|
1482 |
+#' |
|
1483 |
+#' ## Group selection |
|
1484 |
+#' It is possible for the user to specify on which groups the logic of the |
|
1485 |
+#' filter should be applied to. For example: if we have |
|
1486 |
+#' `group_key = c("HematoLineage")` and we set |
|
1487 |
+#' `selected_groups = c("CD34", "Myeloid","Lymphoid")` |
|
1488 |
+#' it means that a single integration will be evaluated for the filter only |
|
1489 |
+#' for groups that have the values of "CD34", "Myeloid" and "Lymphoid" in |
|
1490 |
+#' the "HematoLineage" column. |
|
1491 |
+#' If the same integration is present in other groups it is |
|
1492 |
+#' kept as it is. `selected_groups` can be set to `NULL` if we want |
|
1493 |
+#' the logic to apply to every group present in the data frame, |
|
1494 |
+#' it can be set as a simple character vector as the example above if |
|
1495 |
+#' the group key has length 1 (and there is no need to filter on time point). |
|
1496 |
+#' If the group key is longer than 1 then the filter is applied only on the |
|
1497 |
+#' first element of the key. |
|
1498 |
+#' |
|
1499 |
+#' If a more refined selection on groups is needed, a data frame can |
|
1500 |
+#' be provided instead: |
|
1501 |
+#' |
|
1502 |
+#' ``` |
|
1503 |
+#' group_key = c("CellMarker", "Tissue") |
|
1504 |
+#' selected_groups = tibble::tribble( |
|
1505 |
+#' ~ CellMarker, ~ Tissue, |
|
1506 |
+#' "CD34", "BM", |
|
1507 |
+#' "CD14", "BM", |
|
1508 |
+#' "CD14", "PB" |
|
1509 |
+#' ) |
|
1510 |
+#' ``` |
|
1511 |
+#' |
|
1512 |
+#' Columns in the data frame should be the same as group key (plus, |
|
1513 |
+#' eventually, the time point column). In this example only those groups |
|
1514 |
+#' identified by the rows in the provided data frame are processed. |
|
1515 |
+#' |
|
1516 |
+#' @family Analysis functions |
|
1517 |
+#' |
|
1518 |
+#' @param x An aggregated integration matrix, obtained via |
|
1519 |
+#' `aggregate_values_by_key()` |
|
1520 |
+#' @param lineages A data frame containing cell lineages information |
|
1521 |
+#' @param aggregation_key The key used for aggregating `x` |
|
1522 |
+#' @param group_key A character vector of column names for re-aggregation. |
|
1523 |
+#' Column names must be either in `x` or in `lineages`. See details. |
|
1524 |
+#' @param selected_groups Either NULL, a character vector or a |
|
1525 |
+#' data frame for group selection. See details. |
|
1526 |
+#' @param join_on Common columns to perform a join operation on |
|
1527 |
+#' @param min_value A minimum value to filter the input matrix. Integrations |
|
1528 |
+#' with a value strictly lower than `min_value` are excluded (dropped) from |
|
1529 |
+#' the output. |
|
1530 |
+#' @param impurity_threshold The ratio threshold for impurity in groups |
|
1531 |
+#' @param by_timepoint Should filtering be applied on each time point? If |
|
1532 |
+#' `FALSE`, all time points are merged together |
|
1533 |
+#' @param timepoint_column Column in `x` containing the time point |
|
1534 |
+#' @param value_column Column in `x` containing the numeric |
|
1535 |
+#' quantification of interest |
|
1536 |
+#' |
|
1537 |
+#' @return A data frame |
|
1538 |
+#' @export |
|
1539 |
+#' |
|
1540 |
+#' @examples |
|
1541 |
+#' data("integration_matrices", package = "ISAnalytics") |
|
1542 |
+#' data("association_file", package = "ISAnalytics") |
|
1543 |
+#' aggreg <- aggregate_values_by_key( |
|
1544 |
+#' x = rlang::current_env()$integration_matrices, |
|
1545 |
+#' association_file = rlang::current_env()$association_file, |
|
1546 |
+#' value_cols = c("seqCount", "fragmentEstimate") |
|
1547 |
+#' ) |
|
1548 |
+#' filtered_by_purity <- purity_filter(x = aggreg, |
|
1549 |
+#' value_column = "seqCount_sum") |
|
1550 |
+#' head(filtered_by_purity) |
|
1551 |
+purity_filter <- function(x, |
|
1552 |
+ lineages = blood_lineages_default(), |
|
1553 |
+ aggregation_key = c("SubjectID", "CellMarker", |
|
1554 |
+ "Tissue", "TimePoint"), |
|
1555 |
+ group_key = c("CellMarker", "Tissue"), |
|
1556 |
+ selected_groups = NULL, |
|
1557 |
+ join_on = "CellMarker", |
|
1558 |
+ min_value = 3, |
|
1559 |
+ impurity_threshold = 10, |
|
1560 |
+ by_timepoint = TRUE, |
|
1561 |
+ timepoint_column = "TimePoint", |
|
1562 |
+ value_column = "seqCount_sum") { |
|
1563 |
+ ## Checks |
|
1564 |
+ #### - Base |
|
1565 |
+ stopifnot(is.data.frame(x)) |
|
1566 |
+ stopifnot(is.character(aggregation_key)) |
|
1567 |
+ stopifnot(is.character(group_key)) |
|
1568 |
+ stopifnot(is.logical(by_timepoint)) |
|
1569 |
+ stopifnot(is.numeric(min_value) || is.integer(min_value)) |
|
1570 |
+ stopifnot(is.numeric(impurity_threshold) || is.integer(impurity_threshold)) |
|
1571 |
+ stopifnot(is.character(value_column)) |
|
1572 |
+ stopifnot(is.null(selected_groups) || is.character(selected_groups) || |
|
1573 |
+ is.data.frame(selected_groups)) |
|
1574 |
+ #### - Keys |
|
1575 |
+ if (!all(aggregation_key %in% colnames(x))) { |
|
1576 |
+ rlang::abort(.missing_user_cols_error( |
|
1577 |
+ aggregation_key[!aggregation_key %in% colnames(x)] |
|
1578 |
+ )) |
|
1579 |
+ } |
|
1580 |
+ if (!value_column[1] %in% colnames(x)) { |
|
1581 |
+ rlang::abort(.missing_user_cols_error(value_column)) |
|
1582 |
+ } |
|
1583 |
+ to_join <- if (all(group_key %in% colnames(x))) { |
|
1584 |
+ ### If the groups rely only on attributes not related to lineages |
|
1585 |
+ ### and are contained in the input matrix |
|
1586 |
+ FALSE |
|
1587 |
+ } else { |
|
1588 |
+ ### If lineages info is needed |
|
1589 |
+ stopifnot(is.data.frame(lineages)) |
|
1590 |
+ if (!all(group_key %in% unique(c(colnames(x), colnames(lineages))))) { |
|
1591 |
+ missing_cols <- group_key[!group_key %in% unique(c(colnames(x), |
|
1592 |
+ colnames(lineages)))] |
|
1593 |
+ rlang::abort(.missing_user_cols_error(missing_cols)) |
|
1594 |
+ } |
|
1595 |
+ if (!(all(join_on %in% colnames(x)) & |
|
1596 |
+ all(join_on %in% colnames(lineages))) |
|
1597 |
+ ) { |
|
1598 |
+ missing_common <- c("Missing common column(s) to join on", |
|
1599 |
+ i = paste("The column(s) provided in argument", |
|
1600 |
+ "`join_on` is missing from one or", |
|
1601 |
+ "both data frames, aborting")) |
|
1602 |
+ rlang::abort(missing_common) |
|
1603 |
+ } |
|
1604 |
+ TRUE |
|
1605 |
+ } |
|
1606 |
+ if (by_timepoint) { |
|
1607 |
+ stopifnot(is.character(timepoint_column)) |
|
1608 |
+ timepoint_column <- timepoint_column[1] |
|
1609 |
+ if (!timepoint_column %in% colnames(x)) { |
|
1610 |
+ rlang::abort(.missing_user_cols_error(timepoint_column)) |
|
1611 |
+ } |
|
1612 |
+ group_key <- union(group_key, timepoint_column) |
|
1613 |
+ } |
|
1614 |
+ ## Pre-processing |
|
1615 |
+ #### - Join if needed |
|
1616 |
+ if (to_join) { |
|
1617 |
+ x <- x %>% |
|
1618 |
+ dplyr::left_join(lineages, by = join_on) |
|
1619 |
+ } |
|
1620 |
+ #### - Group and sum |
|
1621 |
+ is_vars <- if (.is_annotated(x)) { |
|
1622 |
+ c(mandatory_IS_vars(), annotation_IS_vars()) |
|
1623 |
+ } else { |
|
1624 |
+ mandatory_IS_vars() |
|
1625 |
+ } |
|
1626 |
+ grouped <- x %>% |
|
1627 |
+ dplyr::group_by(dplyr::across(dplyr::all_of(c(is_vars, group_key)))) %>% |
|
1628 |
+ dplyr::summarise(Value = sum(.data[[value_column]]), |
|
1629 |
+ .groups = "drop") |
|
1630 |
+ #### - value filter |
|
1631 |
+ filtered_value <- threshold_filter(x = grouped, |
|
1632 |
+ threshold = min_value, |
|
1633 |
+ cols_to_compare = "Value", |
|
1634 |
+ comparators = ">=") |
|
1635 |
+ #### - Separating IS 1: group filtering |
|
1636 |
+ pre_filt <- list() |
|
1637 |
+ if (is.null(selected_groups) || purrr::is_empty(selected_groups)) { |
|
1638 |
+ pre_filt[["process"]] <- filtered_value |
|
1639 |
+ pre_filt[["keep"]] <- filtered_value[0, ] |
|
1640 |
+ } else if (is.character(selected_groups)) { |
|
1641 |
+ pre_filt[["process"]] <- filtered_value %>% |
|
1642 |
+ dplyr::filter(.data[[group_key[1]]] %in% selected_groups) |
|
1643 |
+ pre_filt[["keep"]] <- filtered_value %>% |
|
1644 |
+ dplyr::filter(!.data[[group_key[1]]] %in% selected_groups) |
|
1645 |
+ } else { |
|
1646 |
+ ok_cols <- colnames(selected_groups)[colnames(selected_groups) %in% |
|
1647 |
+ group_key] |
|
1648 |
+ selected_groups <- selected_groups %>% |
|
1649 |
+ dplyr::select(dplyr::all_of(ok_cols)) %>% |
|
1650 |
+ dplyr::distinct() |
|
1651 |
+ if (ncol(selected_groups) == 0 || |
|
1652 |
+ nrow(selected_groups) == 0) { |
|
1653 |
+ pre_filt[["process"]] <- filtered_value |
|
1654 |
+ pre_filt[["keep"]] <- filtered_value[0, ] |
|
1655 |
+ } else { |
|
1656 |
+ pre_filt[["process"]] <- dplyr::inner_join(filtered_value, |
|
1657 |
+ selected_groups, |
|
1658 |
+ by = ok_cols) |
|
1659 |
+ pre_filt[["keep"]] <- dplyr::anti_join(filtered_value, |
|
1660 |
+ selected_groups, |
|
1661 |
+ by = ok_cols) |
|
1662 |
+ } |
|
1663 |
+ } |
|
1664 |
+ if (nrow(pre_filt$process) == 0) { |
|
1665 |
+ if (getOption("ISAnalytics.verbose")) { |
|
1666 |
+ rlang::inform("No iss to process, done") |
|
1667 |
+ } |
|
1668 |
+ return(filtered_value) |
|
1669 |
+ } |
|
1670 |
+ #### - Separating IS 2: iss that are shared between groups are going to be |
|
1671 |
+ #### processed, unique iss are kept as they are |
|
1672 |
+ vars_to_group <- if (by_timepoint) { |
|
1673 |
+ c(is_vars, timepoint_column) |
|
1674 |
+ } else { |
|
1675 |
+ is_vars |
|
1676 |
+ } |
|
1677 |
+ by_is <- pre_filt$process %>% |
|
1678 |
+ dplyr::group_by(dplyr::across(dplyr::all_of(vars_to_group))) %>% |
|
1679 |
+ dplyr::summarise(n = n(), .groups = "drop") |
|
1680 |
+ to_process <- by_is %>% |
|
1681 |
+ dplyr::filter(.data$n > 1) %>% |
|
1682 |
+ dplyr::select(-.data$n) %>% |
|
1683 |
+ dplyr::inner_join(pre_filt$process, by = vars_to_group) |
|
1684 |
+ if (nrow(to_process) == 0) { |
|
1685 |
+ ## If there are no shared iss there is nothing to process, |
|
1686 |
+ ## return just the filtered matrix |
|
1687 |
+ return(filtered_value) |
|
1688 |
+ } |
|
1689 |
+ to_keep <- by_is %>% |
|
1690 |
+ dplyr::filter(.data$n == 1) %>% |
|
1691 |
+ dplyr::select(-.data$n) %>% |
|
1692 |
+ dplyr::inner_join(pre_filt$process, by = vars_to_group) |
|
1693 |
+ #### - Process groups |
|
1694 |
+ .filter_by_purity <- function(group) { |
|
1695 |
+ max_val <- max(group$Value) |
|
1696 |
+ processed <- group %>% |
|
1697 |
+ dplyr::mutate(remove = (max_val / .data$Value) > |
|
1698 |
+ impurity_threshold) %>% |
|
1699 |
+ dplyr::filter(remove == FALSE) %>% |
|
1700 |
+ dplyr::select(-.data$remove) |
|
1701 |
+ processed |
|
1702 |
+ } |
|
1703 |
+ processed_iss <- to_process %>% |
|
1704 |
+ dplyr::group_by(dplyr::across(vars_to_group)) %>% |
|
1705 |
+ dplyr::group_modify(~ .filter_by_purity(.x)) %>% |
|
1706 |
+ dplyr::ungroup() |
|
1707 |
+ #### - Re-compose matrix |
|
1708 |
+ final <- processed_iss %>% |
|
1709 |
+ dplyr::bind_rows(to_keep) %>% |
|
1710 |
+ dplyr::bind_rows(pre_filt$keep) |
|
1711 |
+ final |
|
1712 |
+} |
|
1713 |
+ |
|
1465 | 1714 |
#' A set of pre-defined functions for `sample_statistics`. |
1466 | 1715 |
#' |
1467 | 1716 |
#' @return A named list of functions/purrr-style lambdas |
... | ... |
@@ -137,6 +137,16 @@ options("ISAnalytics.reports" = TRUE) |
137 | 137 |
Show more |
138 | 138 |
</summary> |
139 | 139 |
|
140 |
+# ISAnalytics 1.3.6 (2021-10-05) |
|
141 |
+ |
|
142 |
+## NEW |
|
143 |
+ |
|
144 |
+- Added new feature `purity_filter()` |
|
145 |
+ |
|
146 |
+## FIXES |
|
147 |
+ |
|
148 |
+- Fixed small issue in printing information in reports |
|
149 |
+ |
|
140 | 150 |
# ISAnalytics 1.3.5 (2021-09-21) |
141 | 151 |
|
142 | 152 |
## MAJOR CHANGES |
... | ... |
@@ -129,16 +129,16 @@ logic <- if (length(params$key) > 1) { |
129 | 129 |
combined <- rbind(base_flag, c(flag_logic, "")) |
130 | 130 |
paste( |
131 | 131 |
"* Key length > 1, flagging formula used: ", |
132 |
- paste(combined, collapse = " ") |
|
132 |
+ paste0(combined, collapse = " ") |
|
133 | 133 |
) |
134 | 134 |
} else { |
135 | 135 |
base_flag <- paste0( |
136 |
- "(tdist_", key, " < ", params$outlier_thresh, |
|
137 |
- " & zscore_", key, " < 0)" |
|
136 |
+ "(tdist_", params$key, " < ", params$outlier_thresh, |
|
137 |
+ " & zscore_", params$key, " < 0)" |
|
138 | 138 |
) |
139 | 139 |
paste( |
140 | 140 |
"* Flagging formula used: ", |
141 |
- paste(base_flag) |
|
141 |
+ paste0(base_flag) |
|
142 | 142 |
) |
143 | 143 |
} |
144 | 144 |
log2 <- if (params$log2_req) { |
... | ... |
@@ -80,6 +80,7 @@ Other Analysis functions: |
80 | 80 |
\code{\link{cumulative_count_union}()}, |
81 | 81 |
\code{\link{cumulative_is}()}, |
82 | 82 |
\code{\link{is_sharing}()}, |
83 |
+\code{\link{purity_filter}()}, |
|
83 | 84 |
\code{\link{sample_statistics}()}, |
84 | 85 |
\code{\link{separate_quant_matrices}()}, |
85 | 86 |
\code{\link{threshold_filter}()}, |
... | ... |
@@ -81,29 +81,29 @@ For more details on how this files were generated use the help |
81 | 81 |
|
82 | 82 |
The default values are included in this package and |
83 | 83 |
it can be accessed by doing:\if{html}{\out{<div class="r">}}\preformatted{head(known_clinical_oncogenes()) |
84 |
-#> # A tibble: 5 × 2 |
|
85 |
-#> GeneName KnownClonalExpansion |
|
86 |
-#> <chr> <lgl> |
|
87 |
-#> 1 MECOM TRUE |
|
88 |
-#> 2 CCND2 TRUE |
|
89 |
-#> 3 TAL1 TRUE |
|
90 |
-#> 4 LMO2 TRUE |
|
91 |
-#> 5 HMGA2 TRUE |
|
92 |
-}\if{html}{\out{</div>}} |
|
84 |
+}\if{html}{\out{</div>}}\preformatted{## # A tibble: 5 × 2 |
|
85 |
+## GeneName KnownClonalExpansion |
|
86 |
+## <chr> <lgl> |
|
87 |
+## 1 MECOM TRUE |
|
88 |
+## 2 CCND2 TRUE |
|
89 |
+## 3 TAL1 TRUE |
|
90 |
+## 4 LMO2 TRUE |
|
91 |
+## 5 HMGA2 TRUE |
|
92 |
+} |
|
93 | 93 |
|
94 | 94 |
If the user wants to change this parameter the input data frame must |
95 | 95 |
preserve the column structure. The same goes for the \code{suspicious_genes} |
96 | 96 |
parameter (DOIReference column is optional):\if{html}{\out{<div class="r">}}\preformatted{head(clinical_relevant_suspicious_genes()) |
97 |
-#> # A tibble: 6 × 3 |
|
98 |
-#> GeneName ClinicalRelevance DOIReference |
|
99 |
-#> <chr> <lgl> <chr> |
|
100 |
-#> 1 DNMT3A TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
101 |
-#> 2 TET2 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
102 |
-#> 3 ASXL1 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
103 |
-#> 4 JAK2 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
104 |
-#> 5 CBL TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
105 |
-#> 6 TP53 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
106 |
-}\if{html}{\out{</div>}} |
|
97 |
+}\if{html}{\out{</div>}}\preformatted{## # A tibble: 6 × 3 |
|
98 |
+## GeneName ClinicalRelevance DOIReference |
|
99 |
+## <chr> <lgl> <chr> |
|
100 |
+## 1 DNMT3A TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
101 |
+## 2 TET2 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
102 |
+## 3 ASXL1 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
103 |
+## 4 JAK2 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
104 |
+## 5 CBL TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
105 |
+## 6 TP53 TRUE https://doi.org/10.1182/blood-2018-01-829937 |
|
106 |
+} |
|
107 | 107 |
} |
108 | 108 |
} |
109 | 109 |
\examples{ |
... | ... |
@@ -70,6 +70,7 @@ Other Analysis functions: |
70 | 70 |
\code{\link{cumulative_count_union}()}, |
71 | 71 |
\code{\link{cumulative_is}()}, |
72 | 72 |
\code{\link{is_sharing}()}, |
73 |
+\code{\link{purity_filter}()}, |
|
73 | 74 |
\code{\link{sample_statistics}()}, |
74 | 75 |
\code{\link{separate_quant_matrices}()}, |
75 | 76 |
\code{\link{threshold_filter}()}, |
... | ... |
@@ -62,6 +62,7 @@ Other Analysis functions: |
62 | 62 |
\code{\link{cumulative_count_union}()}, |
63 | 63 |
\code{\link{cumulative_is}()}, |
64 | 64 |
\code{\link{is_sharing}()}, |
65 |
+\code{\link{purity_filter}()}, |
|
65 | 66 |
\code{\link{sample_statistics}()}, |
66 | 67 |
\code{\link{separate_quant_matrices}()}, |
67 | 68 |
\code{\link{threshold_filter}()}, |
... | ... |
@@ -91,6 +91,7 @@ Other Analysis functions: |
91 | 91 |
\code{\link{compute_abundance}()}, |
92 | 92 |
\code{\link{cumulative_is}()}, |
93 | 93 |
\code{\link{is_sharing}()}, |
94 |
+\code{\link{purity_filter}()}, |
|
94 | 95 |
\code{\link{sample_statistics}()}, |
95 | 96 |
\code{\link{separate_quant_matrices}()}, |
96 | 97 |
\code{\link{threshold_filter}()}, |
... | ... |
@@ -57,6 +57,7 @@ Other Analysis functions: |
57 | 57 |
\code{\link{compute_abundance}()}, |
58 | 58 |
\code{\link{cumulative_count_union}()}, |
59 | 59 |
\code{\link{is_sharing}()}, |
60 |
+\code{\link{purity_filter}()}, |
|
60 | 61 |
\code{\link{sample_statistics}()}, |
61 | 62 |
\code{\link{separate_quant_matrices}()}, |
62 | 63 |
\code{\link{threshold_filter}()}, |
... | ... |
@@ -95,6 +95,7 @@ Other Analysis functions: |
95 | 95 |
\code{\link{compute_abundance}()}, |
96 | 96 |
\code{\link{cumulative_count_union}()}, |
97 | 97 |
\code{\link{cumulative_is}()}, |
98 |
+\code{\link{purity_filter}()}, |
|
98 | 99 |
\code{\link{sample_statistics}()}, |
99 | 100 |
\code{\link{separate_quant_matrices}()}, |
100 | 101 |
\code{\link{threshold_filter}()}, |
101 | 102 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,128 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/analysis-functions.R |
|
3 |
+\name{purity_filter} |
|
4 |
+\alias{purity_filter} |
|
5 |
+\title{Filter integration sites based on purity.} |
|
6 |
+\usage{ |
|
7 |
+purity_filter( |
|
8 |
+ x, |
|
9 |
+ lineages = blood_lineages_default(), |
|
10 |
+ aggregation_key = c("SubjectID", "CellMarker", "Tissue", "TimePoint"), |
|
11 |
+ group_key = c("CellMarker", "Tissue"), |
|
12 |
+ selected_groups = NULL, |
|
13 |
+ join_on = "CellMarker", |
|
14 |
+ min_value = 3, |
|
15 |
+ impurity_threshold = 10, |
|
16 |
+ by_timepoint = TRUE, |
|
17 |
+ timepoint_column = "TimePoint", |
|
18 |
+ value_column = "seqCount_sum" |
|
19 |
+) |
|
20 |
+} |
|
21 |
+\arguments{ |
|
22 |
+\item{x}{An aggregated integration matrix, obtained via |
|
23 |
+\code{aggregate_values_by_key()}} |
|
24 |
+ |
|
25 |
+\item{lineages}{A data frame containing cell lineages information} |
|
26 |
+ |
|
27 |
+\item{aggregation_key}{The key used for aggregating \code{x}} |
|
28 |
+ |
|
29 |
+\item{group_key}{A character vector of column names for re-aggregation. |
|
30 |
+Column names must be either in \code{x} or in \code{lineages}. See details.} |
|
31 |
+ |
|
32 |
+\item{selected_groups}{Either NULL, a character vector or a |
|
33 |
+data frame for group selection. See details.} |
|
34 |
+ |
|
35 |
+\item{join_on}{Common columns to perform a join operation on} |
|
36 |
+ |
|
37 |
+\item{min_value}{A minimum value to filter the input matrix. Integrations |
|
38 |
+with a value strictly lower than \code{min_value} are excluded (dropped) from |
|
39 |
+the output.} |
|
40 |
+ |
|
41 |
+\item{impurity_threshold}{The ratio threshold for impurity in groups} |
|
42 |
+ |
|
43 |
+\item{by_timepoint}{Should filtering be applied on each time point? If |
|
44 |
+\code{FALSE}, all time points are merged together} |
|
45 |
+ |
|
46 |
+\item{timepoint_column}{Column in \code{x} containing the time point} |
|
47 |
+ |
|
48 |
+\item{value_column}{Column in \code{x} containing the numeric |
|
49 |
+quantification of interest} |
|
50 |
+} |
|
51 |
+\value{ |
|
52 |
+A data frame |
|
53 |
+} |
|
54 |
+\description{ |
|
55 |
+\lifecycle{experimental} |
|
56 |
+Filter that targets possible contamination between cell lines based on |
|
57 |
+a numeric quantification (likely abundance or sequence count). |
|
58 |
+} |
|
59 |
+\details{ |
|
60 |
+\subsection{Setting input arguments}{ |
|
61 |
+ |
|
62 |
+The input matrix can be re-aggregated with the provided \code{group_key} |
|
63 |
+argument. This key contains the names of the columns to group on |
|
64 |
+(besides the columns holding genomic coordinates of the integration |
|
65 |
+sites) and must be contained in at least one of \code{x} or \code{lineages} |
|
66 |
+data frames. If the key is not found only in \code{x}, then a join operation |
|
67 |
+with the \code{lineages} data frame is performed on the common column(s) |
|
68 |
+\code{join_on}. |
|
69 |
+} |
|
70 |
+ |
|
71 |
+\subsection{Group selection}{ |
|
72 |
+ |
|
73 |
+It is possible for the user to specify on which groups the logic of the |
|
74 |
+filter should be applied to. For example: if we have |
|
75 |
+\code{group_key = c("HematoLineage")} and we set |
|
76 |
+\code{selected_groups = c("CD34", "Myeloid","Lymphoid")} |
|
77 |
+it means that a single integration will be evaluated for the filter only |
|
78 |
+for groups that have the values of "CD34", "Myeloid" and "Lymphoid" in |
|
79 |
+the "HematoLineage" column. |
|
80 |
+If the same integration is present in other groups it is |
|
81 |
+kept as it is. \code{selected_groups} can be set to \code{NULL} if we want |
|
82 |
+the logic to apply to every group present in the data frame, |
|
83 |
+it can be set as a simple character vector as the example above if |
|
84 |
+the group key has length 1 (and there is no need to filter on time point). |
|
85 |
+If the group key is longer than 1 then the filter is applied only on the |
|
86 |
+first element of the key. |
|
87 |
+ |
|
88 |
+If a more refined selection on groups is needed, a data frame can |
|
89 |
+be provided instead:\preformatted{group_key = c("CellMarker", "Tissue") |
|
90 |
+selected_groups = tibble::tribble( |
|
91 |
+~ CellMarker, ~ Tissue, |
|
92 |
+"CD34", "BM", |
|
93 |
+"CD14", "BM", |
|
94 |
+"CD14", "PB" |
|
95 |
+) |
|
96 |
+} |
|
97 |
+ |
|
98 |
+Columns in the data frame should be the same as group key (plus, |
|
99 |
+eventually, the time point column). In this example only those groups |
|
100 |
+identified by the rows in the provided data frame are processed. |
|
101 |
+} |
|
102 |
+} |
|
103 |
+\examples{ |
|
104 |
+data("integration_matrices", package = "ISAnalytics") |
|
105 |
+data("association_file", package = "ISAnalytics") |
|
106 |
+aggreg <- aggregate_values_by_key( |
|
107 |
+ x = rlang::current_env()$integration_matrices, |
|
108 |
+ association_file = rlang::current_env()$association_file, |
|
109 |
+ value_cols = c("seqCount", "fragmentEstimate") |
|
110 |
+) |
|
111 |
+filtered_by_purity <- purity_filter(x = aggreg, |
|
112 |
+value_column = "seqCount_sum") |
|
113 |
+head(filtered_by_purity) |
|
114 |
+} |
|
115 |
+\seealso{ |
|
116 |
+Other Analysis functions: |
|
117 |
+\code{\link{CIS_grubbs}()}, |
|
118 |
+\code{\link{comparison_matrix}()}, |
|
119 |
+\code{\link{compute_abundance}()}, |
|
120 |
+\code{\link{cumulative_count_union}()}, |
|
121 |
+\code{\link{cumulative_is}()}, |
|
122 |
+\code{\link{is_sharing}()}, |
|
123 |
+\code{\link{sample_statistics}()}, |
|
124 |
+\code{\link{separate_quant_matrices}()}, |
|
125 |
+\code{\link{threshold_filter}()}, |
|
126 |
+\code{\link{top_integrations}()} |
|
127 |
+} |
|
128 |
+\concept{Analysis functions} |
... | ... |
@@ -78,6 +78,7 @@ Other Analysis functions: |
78 | 78 |
\code{\link{cumulative_count_union}()}, |
79 | 79 |
\code{\link{cumulative_is}()}, |
80 | 80 |
\code{\link{is_sharing}()}, |
81 |
+\code{\link{purity_filter}()}, |
|
81 | 82 |
\code{\link{separate_quant_matrices}()}, |
82 | 83 |
\code{\link{threshold_filter}()}, |
83 | 84 |
\code{\link{top_integrations}()} |
... | ... |
@@ -62,6 +62,7 @@ Other Analysis functions: |
62 | 62 |
\code{\link{cumulative_count_union}()}, |
63 | 63 |
\code{\link{cumulative_is}()}, |
64 | 64 |
\code{\link{is_sharing}()}, |
65 |
+\code{\link{purity_filter}()}, |
|
65 | 66 |
\code{\link{sample_statistics}()}, |
66 | 67 |
\code{\link{threshold_filter}()}, |
67 | 68 |
\code{\link{top_integrations}()} |
... | ... |
@@ -78,8 +78,7 @@ It is also possible to filter different data frames with different |
78 | 78 |
sets of conditions. Besides having the possibility of defining the |
79 | 79 |
other parameters as simple vector, which has the same results as |
80 | 80 |
operating on an unnamed list, the user can define the parameters as |
81 |
-named lists containing vectors. For example:\if{html}{\out{<div class="r">}}\preformatted{ |
|
82 |
-example_df <- tibble::tibble(a = c(20, 30, 40), |
|
81 |
+named lists containing vectors. For example:\if{html}{\out{<div class="r">}}\preformatted{example_df <- tibble::tibble(a = c(20, 30, 40), |
|
83 | 82 |
b = c(40, 50, 60), |
84 | 83 |
c = c("a", "b", "c"), |
85 | 84 |
d = c(3L, 4L, 5L)) |
... | ... |
@@ -87,31 +86,30 @@ example_list <- list(first = example_df, |
87 | 86 |
second = example_df, |
88 | 87 |
third = example_df) |
89 | 88 |
print(example_list) |
90 |
-#> $first |
|
91 |
-#> # A tibble: 3 × 4 |
|
92 |
-#> a b c d |
|
93 |
-#> <dbl> <dbl> <chr> <int> |
|
94 |
-#> 1 20 40 a 3 |
|
95 |
-#> 2 30 50 b 4 |
|
96 |
-#> 3 40 60 c 5 |
|
97 |
-#> |
|
98 |
-#> $second |
|
99 |
-#> # A tibble: 3 × 4 |
|
100 |
-#> a b c d |
|
101 |
-#> <dbl> <dbl> <chr> <int> |
|
102 |
-#> 1 20 40 a 3 |
|
103 |
-#> 2 30 50 b 4 |
|
104 |
-#> 3 40 60 c 5 |
|
105 |
-#> |
|
106 |
-#> $third |
|
107 |
-#> # A tibble: 3 × 4 |
|
108 |
-#> a b c d |
|
109 |
-#> <dbl> <dbl> <chr> <int> |
|
110 |
-#> 1 20 40 a 3 |
|
111 |
-#> 2 30 50 b 4 |
|
112 |
-#> 3 40 60 c 5 |
|
113 |
- |
|
114 |
-filtered <- threshold_filter(example_list, |
|
89 |
+}\if{html}{\out{</div>}}\preformatted{## $first |
|
90 |
+## # A tibble: 3 × 4 |
|
91 |
+## a b c d |
|
92 |
+## <dbl> <dbl> <chr> <int> |
|
93 |
+## 1 20 40 a 3 |
|
94 |
+## 2 30 50 b 4 |
|
95 |
+## 3 40 60 c 5 |
|
96 |
+## |
|
97 |
+## $second |
|
98 |
+## # A tibble: 3 × 4 |
|
99 |
+## a b c d |
|
100 |
+## <dbl> <dbl> <chr> <int> |
|
101 |
+## 1 20 40 a 3 |
|
102 |
+## 2 30 50 b 4 |
|
103 |
+## 3 40 60 c 5 |
|
104 |
+## |
|
105 |
+## $third |
|
106 |
+## # A tibble: 3 × 4 |
|
107 |
+## a b c d |
|
108 |
+## <dbl> <dbl> <chr> <int> |
|
109 |
+## 1 20 40 a 3 |
|
110 |
+## 2 30 50 b 4 |
|
111 |
+## 3 40 60 c 5 |
|
112 |
+}\if{html}{\out{<div class="r">}}\preformatted{filtered <- threshold_filter(example_list, |
|
115 | 113 |
threshold = list(first = c(20, 60), |
116 | 114 |
third = c(25)), |
117 | 115 |
cols_to_compare = list(first = c("a", "b"), |
... | ... |
@@ -119,27 +117,27 @@ third = c("a")), |
119 | 117 |
comparators = list(first = c(">", "<"), |
120 | 118 |
third = c(">="))) |
121 | 119 |
print(filtered) |
122 |
-#> $first |
|
123 |
-#> # A tibble: 1 × 4 |
|
124 |
-#> a b c d |
|
125 |
-#> <dbl> <dbl> <chr> <int> |
|
126 |
-#> 1 30 50 b 4 |
|
127 |
-#> |
|
128 |
-#> $second |
|
129 |
-#> # A tibble: 3 × 4 |
|
130 |
-#> a b c d |
|
131 |
-#> <dbl> <dbl> <chr> <int> |
|
132 |
-#> 1 20 40 a 3 |
|
133 |
-#> 2 30 50 b 4 |
|
134 |
-#> 3 40 60 c 5 |
|
135 |
-#> |
|
136 |
-#> $third |
|
137 |
-#> # A tibble: 2 × 4 |
|
138 |
-#> a b c d |
|
139 |
-#> <dbl> <dbl> <chr> <int> |
|
140 |
-#> 1 30 50 b 4 |
|
141 |
-#> 2 40 60 c 5 |
|
142 |
-}\if{html}{\out{</div>}} |
|
120 |
+}\if{html}{\out{</div>}}\preformatted{## $first |
|
121 |
+## # A tibble: 1 × 4 |
|
122 |
+## a b c d |
|
123 |
+## <dbl> <dbl> <chr> <int> |
|
124 |
+## 1 30 50 b 4 |
|
125 |
+## |
|
126 |
+## $second |
|
127 |
+## # A tibble: 3 × 4 |
|
128 |
+## a b c d |
|
129 |
+## <dbl> <dbl> <chr> <int> |
|
130 |
+## 1 20 40 a 3 |
|
131 |
+## 2 30 50 b 4 |
|
132 |
+## 3 40 60 c 5 |
|
133 |
+## |
|
134 |
+## $third |
|
135 |
+## # A tibble: 2 × 4 |
|
136 |
+## a b c d |
|
137 |
+## <dbl> <dbl> <chr> <int> |
|
138 |
+## 1 30 50 b 4 |
|
139 |
+## 2 40 60 c 5 |
|
140 |
+} |
|
143 | 141 |
|
144 | 142 |
The above signature will roughly be translated as: |
145 | 143 |
\itemize{ |
... | ... |
@@ -213,6 +211,7 @@ Other Analysis functions: |
213 | 211 |
\code{\link{cumulative_count_union}()}, |
214 | 212 |
\code{\link{cumulative_is}()}, |
215 | 213 |
\code{\link{is_sharing}()}, |
214 |
+\code{\link{purity_filter}()}, |
|
216 | 215 |
\code{\link{sample_statistics}()}, |
217 | 216 |
\code{\link{separate_quant_matrices}()}, |
218 | 217 |
\code{\link{top_integrations}()} |
... | ... |
@@ -78,6 +78,7 @@ Other Analysis functions: |
78 | 78 |
\code{\link{cumulative_count_union}()}, |
79 | 79 |
\code{\link{cumulative_is}()}, |
80 | 80 |
\code{\link{is_sharing}()}, |
81 |
+\code{\link{purity_filter}()}, |
|
81 | 82 |
\code{\link{sample_statistics}()}, |
82 | 83 |
\code{\link{separate_quant_matrices}()}, |
83 | 84 |
\code{\link{threshold_filter}()} |
... | ... |
@@ -108,6 +108,7 @@ test_that(paste(func_name, "reads type NEW standard"), { |
108 | 108 |
}) |
109 | 109 |
|
110 | 110 |
test_that(paste(func_name, "reads type NEW different params"), { |
111 |
+ skip_on_os("windows") |
|
111 | 112 |
tf <- withr::local_tempfile(fileext = ".tsv") |
112 | 113 |
readr::write_tsv(sample_df, tf) |
113 | 114 |
expected_summary_msg <- .summary_ism_import_msg( |
114 | 115 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,328 @@ |
1 |
+library(ISAnalytics) |
|
2 |
+func_name <- "purity_filter" |
|
3 |
+#------------------------------------------------------------------------------# |
|
4 |
+# Global vars |
|
5 |
+#------------------------------------------------------------------------------# |
|
6 |
+ |
|
7 |
+df <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', '1', '1', '1', |
|
8 |
+ '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', |
|
9 |
+ '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', |
|
10 |
+ '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'), |
|
11 |
+ integration_locus = c(121249, 251227, 645551, 732938, |
|
12 |
+ 775536, 846681, 1029785, 1036835, |
|
13 |
+ 121249, 251227, 645551, 732938, |
|
14 |
+ 775536, 846681, 1029785, 1036835, |
|
15 |
+ 121249, 251227, 645551, 732938, |
|
16 |
+ 775536, 846681, 1029785, 1036835, |
|
17 |
+ 121249, 251227, 645551, 732938, |
|
18 |
+ 775536, 846681, 1029785, 1036835, |
|
19 |
+ 121249, 251227, 645551, 732938, |
|
20 |
+ 775536, 846681, 1029785, 1036835), |
|
21 |
+ strand = c('+', '+', '+', '+', '+', '+', '-', '+', '+', |
|
22 |
+ '+', '+', '+', '+', '+', '-', '+', '+', '+', |
|
23 |
+ '+', '+', '+', '+', '-', '+', '+', '+', '+', |
|
24 |
+ '+', '+', '+', '-', '+', '+', '+', '+', '+', |
|
25 |
+ '+', '+', '-', '+'), |
|
26 |
+ GeneName = c('LOC729737', 'LOC100132287', 'LOC100133331', |
|
27 |
+ 'LOC100288069', 'LINC01128', 'LOC100130417', |
|
28 |
+ 'C1orf159', 'C1orf159', 'LOC729737', |
|
29 |
+ 'LOC100132287', 'LOC100133331', |
|
30 |
+ 'LOC100288069', 'LINC01128', 'LOC100130417', |
|
31 |
+ 'C1orf159', 'C1orf159', 'LOC729737', |
|
32 |
+ 'LOC100132287', 'LOC100133331', |
|
33 |
+ 'LOC100288069', 'LINC01128', 'LOC100130417', |
|
34 |
+ 'C1orf159', 'C1orf159', 'LOC729737', |
|
35 |
+ 'LOC100132287', 'LOC100133331', |
|
36 |
+ 'LOC100288069', 'LINC01128', 'LOC100130417', |
|
37 |
+ 'C1orf159', 'C1orf159', 'LOC729737', |
|
38 |
+ 'LOC100132287', 'LOC100133331', |
|
39 |
+ 'LOC100288069', 'LINC01128', 'LOC100130417', |
|
40 |
+ 'C1orf159', 'C1orf159'), |
|
41 |
+ GeneStrand = c('-', '+', '-', '-', '+', '-', '-', '-', |
|
42 |
+ '-', '+', '-', '-', '+', '-', '-', '-', |
|
43 |
+ '-', '+', '-', '-', '+', '-', '-', '-', |
|
44 |
+ '-', '+', '-', '-', '+', '-', '-', '-', |
|
45 |
+ '-', '+', '-', '-', '+', '-', '-', '-'), |
|
46 |
+ CellMarker = c('CD13', 'CD13', 'CD13', 'CD13', 'CD13', |
|
47 |
+ 'CD13', 'CD13', 'CD13', 'CD14', 'CD14', |
|
48 |
+ 'CD14', 'CD14', 'CD14', 'CD14', 'CD14', |
|
49 |
+ 'CD14', 'CD19', 'CD19', 'CD19', 'CD19', |
|
50 |
+ 'CD19', 'CD19', 'CD19', 'CD19', 'CD3', |
|
51 |
+ 'CD3', 'CD3', 'CD3', 'CD3', 'CD3', 'CD3', |
|
52 |
+ 'CD3', 'CD34', 'CD34', 'CD34', 'CD34', |
|
53 |
+ 'CD34', 'CD34', 'CD34', 'CD34'), |
|
54 |
+ Tissue = c('BM', 'BM', 'BM', 'BM', 'BM', 'BM', 'BM', |
|
55 |
+ 'BM', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', |
|
56 |
+ 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', |
|
57 |
+ 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', |
|
58 |
+ 'PB', 'PB', 'PB', 'PB', 'BM', 'BM', 'BM', |
|
59 |
+ 'BM', 'BM', 'BM', 'BM', 'BM'), |
|
60 |
+ TimePoint = c('01', '01', '01', '01', '01', '01', '01', |
|
61 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
62 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
63 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
64 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
65 |
+ '01', '01', '01', '01', '01'), |
|
66 |
+ Value = c(1, 1, 1, 1000, 1, 10, 3, 3, 1, 1000, 1000, 500, |
|
67 |
+ 1, 12, 30, 1000, 1, 1, 500, 1, 10, 14, 30, 90, |
|
68 |
+ 1, 1, 1, 1, 10, 9, 30, 90, 1000, 1, 1, 300, |
|
69 |
+ 10, 8, 1000, 3)) |
|
70 |
+ |
|
71 |
+expected_output_sc <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', |
|
72 |
+ '1', '1', '1', '1', '1', '1', '1', |
|
73 |
+ '1', '1', '1', '1', '1'), |
|
74 |
+ integration_locus = c(121249, 251227, |
|
75 |
+ 645551, 645551, |
|
76 |
+ 732938, 732938, |
|
77 |
+ 732938, 775536, |
|
78 |
+ 775536, 775536, |
|
79 |
+ 775536, 775536, |
|
80 |
+ 846681, 846681, |
|
81 |
+ 846681, 846681, |
|
82 |
+ 846681, 1029785, |
|
83 |
+ 1036835), |
|
84 |
+ strand = c('+', '+', '+', '+', '+', '+', |
|
85 |
+ '+', '+', '+', '+', '+', '+', |
|
86 |
+ '+', '+', '+', '+', '+', '-', |
|
87 |
+ '+'), |
|
88 |
+ GeneName = c('LOC729737', 'LOC100132287', |
|
89 |
+ 'LOC100133331', |
|
90 |
+ 'LOC100133331', |
|
91 |
+ 'LOC100288069', |
|
92 |
+ 'LOC100288069', |
|
93 |
+ 'LOC100288069', |
|
94 |
+ 'LINC01128', |
|
95 |
+ 'LINC01128', |
|
96 |
+ 'LINC01128', |
|
97 |
+ 'LINC01128', 'LINC01128', |
|
98 |
+ 'LOC100130417', |
|
99 |
+ 'LOC100130417', |
|
100 |
+ 'LOC100130417', |
|
101 |
+ 'LOC100130417', |
|
102 |
+ 'LOC100130417', 'C1orf159', |
|
103 |
+ 'C1orf159'), |
|
104 |
+ GeneStrand = c('-', '+', '-', '-', '-', |
|
105 |
+ '-', '-', '+', '+', '+', |
|
106 |
+ '+', '+', '-', '-', '-', |
|
107 |
+ '-', '-', '-', '-'), |
|
108 |
+ TimePoint = c('01', '01', '01', '01', |
|
109 |
+ '01', '01', '01', '01', |
|
110 |
+ '01', '01', '01', '01', |
|
111 |
+ '01', '01', '01', '01', |
|
112 |
+ '01', '01', '01'), |
|
113 |
+ CellMarker = c('CD34', 'CD14', 'CD14', |
|
114 |
+ 'CD19', 'CD13', 'CD14', |
|
115 |
+ 'CD34', 'CD13', 'CD14', |
|
116 |
+ 'CD19', 'CD3', 'CD34', |
|
117 |
+ 'CD13', 'CD14', 'CD19', |
|
118 |
+ 'CD3', 'CD34', 'CD34', |
|
119 |
+ 'CD14'), |
|
120 |
+ Tissue = c('BM', 'PB', 'PB', 'PB', 'BM', |
|
121 |
+ 'PB', 'BM', 'BM', 'PB', 'PB', |
|
122 |
+ 'PB', 'BM', 'BM', 'PB', 'PB', |
|
123 |
+ 'PB', 'BM', 'BM', 'PB'), |
|
124 |
+ Value = c(1000, 1000, 1000, 500, 1000, |
|
125 |
+ 500, 300, 1, 1, 10, 10, 10, 10, |
|
126 |
+ 12, 14, 9, 8, 1000, 1000)) |
|
127 |
+expected_output_ab <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', |
|
128 |
+ '1', '1', '1', '1', '1', '1', '1', |
|
129 |
+ '1', '1', '1', '1'), |
|
130 |
+ integration_locus = c(121249, 251227, 645551, 645551, |
|
131 |
+ 732938, 732938, 732938, 775536, |
|
132 |
+ 775536, 846681, 846681, 846681, |
|
133 |
+ 1029785, 1029785, 1029785, 1036835, |
|
134 |
+ 1036835, 1036835), |
|
135 |
+ strand = c('+', '+', '+', '+', '+', '+', '+', '+', '+', |
|
136 |
+ '+', '+', '+', '-', '-', '-', '+', '+', '+'), |
|
137 |
+ GeneName = c('LOC729737', 'LOC100132287', 'LOC100133331', |
|
138 |
+ 'LOC100133331', 'LOC100288069', |
|
139 |
+ 'LOC100288069', 'LOC100288069', 'LINC01128', |
|
140 |
+ 'LINC01128', 'LOC100130417', 'LOC100130417', |
|
141 |
+ 'LOC100130417', 'C1orf159', 'C1orf159', |
|
142 |
+ 'C1orf159', 'C1orf159', 'C1orf159', |
|
143 |
+ 'C1orf159'), |
|
144 |
+ GeneStrand = c('-', '+', '-', '-', '-', '-', '-', '+', |
|
145 |
+ '+', '-', '-', '-', '-', '-', '-', '-', |
|
146 |
+ '-', '-'), |
|
147 |
+ TimePoint = c('01', '01', '01', '01', '01', '01', '01', |
|
148 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
149 |
+ '01', '01', '01', '01'), |
|
150 |
+ CellMarker = c('CD34', 'CD14', 'CD14', 'CD19', 'CD13', |
|
151 |
+ 'CD14', 'CD34', 'CD19', 'CD3', 'CD13', |
|
152 |
+ 'CD19', 'CD3', 'CD19', 'CD3', 'CD34', |
|
153 |
+ 'CD14', 'CD19', 'CD3'), |
|
154 |
+ Tissue = c('BM', 'PB', 'PB', 'PB', 'BM', 'PB', 'BM', |
|
155 |
+ 'PB', 'PB', 'BM', 'PB', 'PB', 'PB', 'PB', |
|
156 |
+ 'BM', 'PB', 'PB', 'PB'), |
|
157 |
+ Value = c(43.0477830391735, 28.2167042889391, |
|
158 |
+ 28.2167042889391, 77.2797527047913, |
|
159 |
+ 98.0392156862745, 14.1083521444695, |
|
160 |
+ 12.914334911752, 1.54559505409583, |
|
161 |
+ 6.99300699300699, 0.980392156862745, |
|
162 |
+ 2.16383307573416, 6.29370629370629, |
|
163 |
+ 4.63678516228748, 20.979020979021, |
|
164 |
+ 43.0477830391735, 28.2167042889391, |
|
165 |
+ 13.9103554868624, 62.9370629370629)) |
|
166 |
+ |
|
167 |
+#------------------------------------------------------------------------------# |
|
168 |
+# Tests |
|
169 |
+#------------------------------------------------------------------------------# |
|
170 |
+test_that(paste(func_name, "produces expected output - sc"), { |
|
171 |
+ purity_filtered <- purity_filter(x = df, |
|
172 |
+ aggregation_key = c("CellMarker", |
|
173 |
+ "Tissue", |
|
174 |
+ "TimePoint"), |
|
175 |
+ group_key = c("CellMarker", "Tissue"), |
|
176 |
+ min_value = 0, impurity_threshold = 10, |
|
177 |
+ by_timepoint = TRUE, value_column = "Value") |
|
178 |
+ expect_equal(purity_filtered, expected_output_sc) |
|
179 |
+}) |
|
180 |
+ |
|
181 |
+test_that(paste(func_name, "produces expected output - abundance"), { |
|
182 |
+ abund <- compute_abundance(x = df, columns = "Value", key = c("CellMarker", |
|
183 |
+ "Tissue", |
|
184 |
+ "TimePoint")) |
|
185 |
+ purity_filtered <- purity_filter(x = abund, |
|
186 |
+ aggregation_key = c("CellMarker", |
|
187 |
+ "Tissue", |
|
188 |
+ "TimePoint"), |
|
189 |
+ group_key = c("CellMarker", "Tissue"), |
|
190 |
+ min_value = 0, |
|
191 |
+ impurity_threshold = 10, |
|
192 |
+ by_timepoint = TRUE, |
|
193 |
+ value_column = "Value_PercAbundance") |
|
194 |
+ expect_equal(purity_filtered, expected_output_ab) |
|
195 |
+}) |
|
196 |
+ |
|
197 |
+test_that(paste(func_name, "produces expected output - join"), { |
|
198 |
+ purity_filtered <- purity_filter(x = df, |
|
199 |
+ aggregation_key = c("CellMarker", |
|
200 |
+ "Tissue", |
|
201 |
+ "TimePoint"), |
|
202 |
+ group_key = c("HematoLineage"), |
|
203 |
+ min_value = 3, |
|
204 |
+ impurity_threshold = 10, |
|
205 |
+ by_timepoint = TRUE, |
|
206 |
+ value_column = "Value") |
|
207 |
+ expected <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', '1', |
|
208 |
+ '1', '1', '1', '1', '1', '1'), |
|
209 |
+ integration_locus = c(645551, 645551, 732938, 732938, |
|
210 |
+ 775536, 775536, 846681, 846681, |
|
211 |
+ 846681, 1029785, 1036835, 1036835, |
|
212 |
+ 121249, 251227), |
|
213 |
+ strand = c('+', '+', '+', '+', '+', '+', '+', '+', '+', |
|
214 |
+ '-', '+', '+', '+', '+'), |
|
215 |
+ GeneName = c('LOC100133331', 'LOC100133331', |
|
216 |
+ 'LOC100288069', 'LOC100288069', |
|
217 |
+ 'LINC01128', 'LINC01128', 'LOC100130417', |
|
218 |
+ 'LOC100130417', 'LOC100130417', 'C1orf159', |
|
219 |
+ 'C1orf159', 'C1orf159', 'LOC729737', |
|
220 |
+ 'LOC100132287'), |
|
221 |
+ GeneStrand = c('-', '-', '-', '-', '+', '+', '-', '-', |
|
222 |
+ '-', '-', '-', '-', '-', '+'), |
|
223 |
+ TimePoint = c('01', '01', '01', '01', '01', '01', '01', |
|
224 |
+ '01', '01', '01', '01', '01', '01', '01'), |
|
225 |
+ HematoLineage = c('Lymphoid', 'Myeloid', 'CD34', |
|
226 |
+ 'Myeloid', 'CD34', 'Lymphoid', |
|
227 |
+ 'CD34', 'Lymphoid', 'Myeloid', |
|
228 |
+ 'CD34', 'Lymphoid', 'Myeloid', |
|
229 |
+ 'CD34', 'Myeloid'), |
|
230 |
+ Value = c(501, 1001, 300, 1500, 10, 20, 8, 23, 22, |
|
231 |
+ 1000, 180, 1003, 1000, 1001)) |
|
232 |
+ expect_equal(purity_filtered, expected) |
|
233 |
+}) |
|
234 |
+ |
|
235 |
+test_that(paste(func_name, "produces expected output - group selection"), { |
|
236 |
+ ## Vector |
|
237 |
+ purity_filtered <- purity_filter(x = df, |
|
238 |
+ aggregation_key = c("CellMarker", |
|
239 |
+ "Tissue", |
|
240 |
+ "TimePoint"), |
|
241 |
+ group_key = c("CellMarker", "Tissue"), |
|
242 |
+ selected_groups = c("CD34", "CD13"), |
|
243 |
+ min_value = 0, |
|
244 |
+ impurity_threshold = 10, |
|
245 |
+ by_timepoint = TRUE, |
|
246 |
+ value_column = "Value") |
|
247 |
+ expected <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', '1', |
|
248 |
+ '1', '1', '1', '1', '1', '1', '1', '1', |
|
249 |
+ '1', '1', '1', '1', '1', '1', '1', '1', |
|
250 |
+ '1', '1', '1', '1', '1', '1', '1', '1', |
|
251 |
+ '1', '1', '1', '1', '1', '1'), |
|
252 |
+ integration_locus = c(121249, 251227, 251227, 645551, |
|
253 |
+ 645551, 732938, 732938, 775536, |
|
254 |
+ 775536, 846681, 846681, 1029785, |
|
255 |
+ 1036835, 1036835, 121249, 121249, |
|
256 |
+ 121249, 251227, 251227, 251227, |
|
257 |
+ 645551, 645551, 645551, 732938, |
|
258 |
+ 732938, 732938, 775536, 775536, |
|
259 |
+ 775536, 846681, 846681, 846681, |
|
260 |
+ 1029785, 1029785, 1029785, 1036835, |
|
261 |
+ 1036835, 1036835), |
|
262 |
+ strand = c('+', '+', '+', '+', '+', '+', '+', '+', '+', |
|
263 |
+ '+', '+', '-', '+', '+', '+', '+', '+', '+', |
|
264 |
+ '+', '+', '+', '+', '+', '+', '+', '+', '+', |
|
265 |
+ '+', '+', '+', '+', '+', '-', '-', '-', '+', |
|
266 |
+ '+', '+'), |
|
267 |
+ GeneName = c('LOC729737', 'LOC100132287', 'LOC100132287', |
|
268 |
+ 'LOC100133331', 'LOC100133331', |
|
269 |
+ 'LOC100288069', 'LOC100288069', |
|
270 |
+ 'LINC01128', 'LINC01128', 'LOC100130417', |
|
271 |
+ 'LOC100130417', 'C1orf159', 'C1orf159', |
|
272 |
+ 'C1orf159', 'LOC729737', 'LOC729737', |
|
273 |
+ 'LOC729737', 'LOC100132287', 'LOC100132287', |
|
274 |
+ 'LOC100132287', 'LOC100133331', |
|
275 |
+ 'LOC100133331', 'LOC100133331', |
|
276 |
+ 'LOC100288069', 'LOC100288069', |
|
277 |
+ 'LOC100288069', 'LINC01128', 'LINC01128', |
|
278 |
+ 'LINC01128', 'LOC100130417', 'LOC100130417', |
|
279 |
+ 'LOC100130417', 'C1orf159', 'C1orf159', |
|
280 |
+ 'C1orf159', 'C1orf159', 'C1orf159', |
|
281 |
+ 'C1orf159'), |
|
282 |
+ GeneStrand = c('-', '+', '+', '-', '-', '-', '-', '+', |
|
283 |
+ '+', '-', '-', '-', '-', '-', '-', '-', |
|
284 |
+ '-', '+', '+', '+', '-', '-', '-', '-', |
|
285 |
+ '-', '-', '+', '+', '+', '-', '-', '-', |
|
286 |
+ '-', '-', '-', '-', '-', '-'), |
|
287 |
+ TimePoint = c('01', '01', '01', '01', '01', '01', '01', |
|
288 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
289 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
290 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
291 |
+ '01', '01', '01', '01', '01', '01', '01', |
|
292 |
+ '01', '01', '01'), |
|
293 |
+ CellMarker = c('CD34', 'CD13', 'CD34', 'CD13', 'CD34', |
|
294 |
+ 'CD13', 'CD34', 'CD13', 'CD34', 'CD13', |
|
295 |
+ 'CD34', 'CD34', 'CD13', 'CD34', 'CD14', |
|
296 |
+ 'CD19', 'CD3', 'CD14', 'CD19', 'CD3', |
|
297 |
+ 'CD14', 'CD19', 'CD3', 'CD14', 'CD19', |
|
298 |
+ 'CD3', 'CD14', 'CD19', 'CD3', 'CD14', |
|
299 |
+ 'CD19', 'CD3', 'CD14', 'CD19', 'CD3', |
|
300 |
+ 'CD14', 'CD19', 'CD3'), |
|
301 |
+ Tissue = c('BM', 'BM', 'BM', 'BM', 'BM', 'BM', 'BM', |
|
302 |
+ 'BM', 'BM', 'BM', 'BM', 'BM', 'BM', 'BM', |
|
303 |
+ 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', |
|
304 |
+ 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', |
|
305 |
+ 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB', |
|
306 |
+ 'PB', 'PB', 'PB'), |
|
307 |
+ Value = c(1000, 1, 1, 1, 1, 1000, 300, 1, 10, 10, 8, |
|
308 |
+ 1000, 3, 3, 1, 1, 1, 1000, 1, 1, 1000, 500, |
|
309 |
+ 1, 500, 1, 1, 1, 10, 10, 12, 14, 9, 30, 30, |
|
310 |
+ 30, 1000, 90, 90)) |
|
311 |
+ expect_equal(purity_filtered, expected) |
|
312 |
+ ## DF |
|
313 |
+ purity_filtered <- purity_filter(x = df, |
|
314 |
+ aggregation_key = c("CellMarker", |
|
315 |
+ "Tissue", |
|
316 |
+ "TimePoint"), |
|
317 |
+ group_key = c("CellMarker", "Tissue"), |
|
318 |
+ selected_groups = tibble::tribble( |
|
319 |
+ ~ CellMarker, ~ Tissue, |
|
320 |
+ "CD34", "BM", |
|
321 |
+ "CD13", "BM" |
|
322 |
+ ), |
|
323 |
+ min_value = 0, |
|
324 |
+ impurity_threshold = 10, |
|
325 |
+ by_timepoint = TRUE, |
|
326 |
+ value_column = "Value") |
|
327 |
+ expect_equal(purity_filtered, expected) |
|
328 |
+}) |