Browse code

[UPDATE] Update version 1.3.6

Giulia Pais authored on 05/10/2021 10:05:45
Showing 21 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: ISAnalytics
2 2
 Title: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies
3
-Version: 1.3.5
3
+Version: 1.3.6
4 4
 Date: 2020-07-03
5 5
 Authors@R: c(
6 6
   person(given = "Andrea",
... ...
@@ -39,6 +39,7 @@ export(mandatory_IS_vars)
39 39
 export(matching_options)
40 40
 export(outlier_filter)
41 41
 export(outliers_by_pool_fragments)
42
+export(purity_filter)
42 43
 export(quantification_types)
43 44
 export(realign_after_collisions)
44 45
 export(reduced_AF_columns)
... ...
@@ -2,6 +2,16 @@
2 2
 title: "NEWS"
3 3
 output: github_document
4 4
 ---
5
+# ISAnalytics 1.3.6 (2021-10-05)
6
+
7
+## NEW
8
+
9
+* Added new feature `purity_filter()`
10
+
11
+## FIXES
12
+
13
+* Fixed small issue in printing information in reports
14
+
5 15
 # ISAnalytics 1.3.5 (2021-09-21)
6 16
 
7 17
 ## MAJOR CHANGES
... ...
@@ -1,6 +1,16 @@
1 1
 NEWS
2 2
 ================
3 3
 
4
+# ISAnalytics 1.3.6 (2021-10-05)
5
+
6
+## NEW
7
+
8
+-   Added new feature `purity_filter()`
9
+
10
+## FIXES
11
+
12
+-   Fixed small issue in printing information in reports
13
+
4 14
 # ISAnalytics 1.3.5 (2021-09-21)
5 15
 
6 16
 ## MAJOR CHANGES
... ...
@@ -1462,6 +1462,255 @@ is_sharing <- function(...,
1462 1462
 }
1463 1463
 
1464 1464
 
1465
+#' Filter integration sites based on purity.
1466
+#'
1467
+#' @description
1468
+#' \lifecycle{experimental}
1469
+#' Filter that targets possible contamination between cell lines based on
1470
+#' a numeric quantification (likely abundance or sequence count).
1471
+#'
1472
+#' @details
1473
+#' ## Setting input arguments
1474
+#'
1475
+#' The input matrix can be re-aggregated with the provided `group_key`
1476
+#' argument. This key contains the names of the columns to group on
1477
+#' (besides the columns holding genomic coordinates of the integration
1478
+#' sites) and must be contained in at least one of `x` or `lineages`
1479
+#' data frames. If the key is not found only in `x`, then a join operation
1480
+#' with the `lineages` data frame is performed on the common column(s)
1481
+#' `join_on`.
1482
+#'
1483
+#' ## Group selection
1484
+#' It is possible for the user to specify on which groups the logic of the
1485
+#' filter should be applied to. For example: if we have
1486
+#' `group_key = c("HematoLineage")` and we set
1487
+#' `selected_groups = c("CD34", "Myeloid","Lymphoid")`
1488
+#' it means that a single integration will be evaluated for the filter only
1489
+#' for groups that have the values of "CD34", "Myeloid" and "Lymphoid" in
1490
+#' the "HematoLineage" column.
1491
+#' If the same integration is present in other groups it is
1492
+#' kept as it is. `selected_groups` can be set to `NULL` if we want
1493
+#' the logic to apply to every group present in the data frame,
1494
+#' it can be set as a simple character vector as the example above if
1495
+#' the group key has length 1 (and there is no need to filter on time point).
1496
+#' If the group key is longer than 1 then the filter is applied only on the
1497
+#' first element of the key.
1498
+#'
1499
+#' If a more refined selection on groups is needed, a data frame can
1500
+#' be provided instead:
1501
+#'
1502
+#' ```
1503
+#' group_key = c("CellMarker", "Tissue")
1504
+#' selected_groups = tibble::tribble(
1505
+#' ~ CellMarker, ~ Tissue,
1506
+#' "CD34", "BM",
1507
+#' "CD14", "BM",
1508
+#' "CD14", "PB"
1509
+#' )
1510
+#' ```
1511
+#'
1512
+#' Columns in the data frame should be the same as group key (plus,
1513
+#' eventually, the time point column). In this example only those groups
1514
+#' identified by the rows in the provided data frame are processed.
1515
+#'
1516
+#' @family Analysis functions
1517
+#'
1518
+#' @param x An aggregated integration matrix, obtained via
1519
+#' `aggregate_values_by_key()`
1520
+#' @param lineages A data frame containing cell lineages information
1521
+#' @param aggregation_key The key used for aggregating `x`
1522
+#' @param group_key A character vector of column names for re-aggregation.
1523
+#' Column names must be either in `x` or in `lineages`. See details.
1524
+#' @param selected_groups Either NULL, a character vector or a
1525
+#' data frame for group selection. See details.
1526
+#' @param join_on Common columns to perform a join operation on
1527
+#' @param min_value A minimum value to filter the input matrix. Integrations
1528
+#' with a value strictly lower than `min_value` are excluded (dropped) from
1529
+#' the output.
1530
+#' @param impurity_threshold The ratio threshold for impurity in groups
1531
+#' @param by_timepoint Should filtering be applied on each time point? If
1532
+#' `FALSE`, all time points are merged together
1533
+#' @param timepoint_column Column in `x` containing the time point
1534
+#' @param value_column Column in `x` containing the numeric
1535
+#' quantification of interest
1536
+#'
1537
+#' @return A data frame
1538
+#' @export
1539
+#'
1540
+#' @examples
1541
+#' data("integration_matrices", package = "ISAnalytics")
1542
+#' data("association_file", package = "ISAnalytics")
1543
+#' aggreg <- aggregate_values_by_key(
1544
+#'     x = rlang::current_env()$integration_matrices,
1545
+#'     association_file = rlang::current_env()$association_file,
1546
+#'     value_cols = c("seqCount", "fragmentEstimate")
1547
+#' )
1548
+#' filtered_by_purity <- purity_filter(x = aggreg,
1549
+#' value_column = "seqCount_sum")
1550
+#' head(filtered_by_purity)
1551
+purity_filter <- function(x,
1552
+                          lineages = blood_lineages_default(),
1553
+                          aggregation_key = c("SubjectID", "CellMarker",
1554
+                                              "Tissue", "TimePoint"),
1555
+                          group_key = c("CellMarker", "Tissue"),
1556
+                          selected_groups = NULL,
1557
+                          join_on = "CellMarker",
1558
+                          min_value = 3,
1559
+                          impurity_threshold = 10,
1560
+                          by_timepoint = TRUE,
1561
+                          timepoint_column = "TimePoint",
1562
+                          value_column = "seqCount_sum") {
1563
+    ## Checks
1564
+    #### - Base
1565
+    stopifnot(is.data.frame(x))
1566
+    stopifnot(is.character(aggregation_key))
1567
+    stopifnot(is.character(group_key))
1568
+    stopifnot(is.logical(by_timepoint))
1569
+    stopifnot(is.numeric(min_value) || is.integer(min_value))
1570
+    stopifnot(is.numeric(impurity_threshold) || is.integer(impurity_threshold))
1571
+    stopifnot(is.character(value_column))
1572
+    stopifnot(is.null(selected_groups) || is.character(selected_groups) ||
1573
+                  is.data.frame(selected_groups))
1574
+    #### - Keys
1575
+    if (!all(aggregation_key %in% colnames(x))) {
1576
+        rlang::abort(.missing_user_cols_error(
1577
+            aggregation_key[!aggregation_key %in% colnames(x)]
1578
+            ))
1579
+    }
1580
+    if (!value_column[1] %in% colnames(x)) {
1581
+        rlang::abort(.missing_user_cols_error(value_column))
1582
+    }
1583
+    to_join <- if (all(group_key %in% colnames(x))) {
1584
+        ### If the groups rely only on attributes not related to lineages
1585
+        ### and are contained in the input matrix
1586
+        FALSE
1587
+    } else {
1588
+        ### If lineages info is needed
1589
+        stopifnot(is.data.frame(lineages))
1590
+        if (!all(group_key %in% unique(c(colnames(x), colnames(lineages))))) {
1591
+            missing_cols <- group_key[!group_key %in% unique(c(colnames(x),
1592
+                                               colnames(lineages)))]
1593
+            rlang::abort(.missing_user_cols_error(missing_cols))
1594
+        }
1595
+        if (!(all(join_on %in% colnames(x)) &
1596
+              all(join_on %in% colnames(lineages)))
1597
+            ) {
1598
+            missing_common <- c("Missing common column(s) to join on",
1599
+                                i = paste("The column(s) provided in argument",
1600
+                                          "`join_on` is missing from one or",
1601
+                                          "both data frames, aborting"))
1602
+           rlang::abort(missing_common)
1603
+        }
1604
+        TRUE
1605
+    }
1606
+    if (by_timepoint) {
1607
+       stopifnot(is.character(timepoint_column))
1608
+        timepoint_column <- timepoint_column[1]
1609
+        if (!timepoint_column %in% colnames(x)) {
1610
+            rlang::abort(.missing_user_cols_error(timepoint_column))
1611
+        }
1612
+        group_key <- union(group_key, timepoint_column)
1613
+    }
1614
+    ## Pre-processing
1615
+    #### - Join if needed
1616
+    if (to_join) {
1617
+        x <- x %>%
1618
+            dplyr::left_join(lineages, by = join_on)
1619
+    }
1620
+    #### - Group and sum
1621
+    is_vars <- if (.is_annotated(x)) {
1622
+        c(mandatory_IS_vars(), annotation_IS_vars())
1623
+    } else {
1624
+        mandatory_IS_vars()
1625
+    }
1626
+    grouped <- x %>%
1627
+        dplyr::group_by(dplyr::across(dplyr::all_of(c(is_vars, group_key)))) %>%
1628
+        dplyr::summarise(Value = sum(.data[[value_column]]),
1629
+                         .groups = "drop")
1630
+    #### - value filter
1631
+    filtered_value <- threshold_filter(x = grouped,
1632
+                                 threshold = min_value,
1633
+                                 cols_to_compare = "Value",
1634
+                                 comparators = ">=")
1635
+    #### - Separating IS 1: group filtering
1636
+    pre_filt <- list()
1637
+    if (is.null(selected_groups) || purrr::is_empty(selected_groups)) {
1638
+        pre_filt[["process"]] <- filtered_value
1639
+        pre_filt[["keep"]] <- filtered_value[0, ]
1640
+    } else if (is.character(selected_groups)) {
1641
+        pre_filt[["process"]] <- filtered_value %>%
1642
+            dplyr::filter(.data[[group_key[1]]] %in% selected_groups)
1643
+        pre_filt[["keep"]] <- filtered_value %>%
1644
+            dplyr::filter(!.data[[group_key[1]]] %in% selected_groups)
1645
+    } else {
1646
+        ok_cols <- colnames(selected_groups)[colnames(selected_groups) %in%
1647
+                                                 group_key]
1648
+        selected_groups <- selected_groups %>%
1649
+            dplyr::select(dplyr::all_of(ok_cols)) %>%
1650
+            dplyr::distinct()
1651
+        if (ncol(selected_groups) == 0 ||
1652
+            nrow(selected_groups) == 0) {
1653
+            pre_filt[["process"]] <- filtered_value
1654
+            pre_filt[["keep"]] <- filtered_value[0, ]
1655
+        } else {
1656
+            pre_filt[["process"]] <- dplyr::inner_join(filtered_value,
1657
+                                                       selected_groups,
1658
+                                                       by = ok_cols)
1659
+            pre_filt[["keep"]] <- dplyr::anti_join(filtered_value,
1660
+                                                   selected_groups,
1661
+                                                   by = ok_cols)
1662
+        }
1663
+    }
1664
+    if (nrow(pre_filt$process) == 0) {
1665
+        if (getOption("ISAnalytics.verbose")) {
1666
+            rlang::inform("No iss to process, done")
1667
+        }
1668
+        return(filtered_value)
1669
+    }
1670
+    #### - Separating IS 2: iss that are shared between groups are going to be
1671
+    #### processed, unique iss are kept as they are
1672
+    vars_to_group <- if (by_timepoint) {
1673
+        c(is_vars, timepoint_column)
1674
+    } else {
1675
+        is_vars
1676
+    }
1677
+    by_is <- pre_filt$process %>%
1678
+        dplyr::group_by(dplyr::across(dplyr::all_of(vars_to_group))) %>%
1679
+        dplyr::summarise(n = n(), .groups = "drop")
1680
+    to_process <- by_is %>%
1681
+        dplyr::filter(.data$n > 1) %>%
1682
+        dplyr::select(-.data$n) %>%
1683
+        dplyr::inner_join(pre_filt$process, by = vars_to_group)
1684
+    if (nrow(to_process) == 0) {
1685
+        ## If there are no shared iss there is nothing to process,
1686
+        ## return just the filtered matrix
1687
+        return(filtered_value)
1688
+    }
1689
+    to_keep <- by_is %>%
1690
+        dplyr::filter(.data$n == 1) %>%
1691
+        dplyr::select(-.data$n) %>%
1692
+        dplyr::inner_join(pre_filt$process, by = vars_to_group)
1693
+    #### - Process groups
1694
+    .filter_by_purity <- function(group) {
1695
+        max_val <- max(group$Value)
1696
+        processed <- group %>%
1697
+            dplyr::mutate(remove = (max_val / .data$Value) >
1698
+                              impurity_threshold) %>%
1699
+            dplyr::filter(remove == FALSE) %>%
1700
+            dplyr::select(-.data$remove)
1701
+        processed
1702
+    }
1703
+    processed_iss <- to_process %>%
1704
+        dplyr::group_by(dplyr::across(vars_to_group)) %>%
1705
+        dplyr::group_modify(~ .filter_by_purity(.x)) %>%
1706
+        dplyr::ungroup()
1707
+    #### - Re-compose matrix
1708
+    final <- processed_iss %>%
1709
+        dplyr::bind_rows(to_keep) %>%
1710
+        dplyr::bind_rows(pre_filt$keep)
1711
+    final
1712
+}
1713
+
1465 1714
 #' A set of pre-defined functions for `sample_statistics`.
1466 1715
 #'
1467 1716
 #' @return A named list of functions/purrr-style lambdas
... ...
@@ -137,6 +137,16 @@ options("ISAnalytics.reports" = TRUE)
137 137
 Show more
138 138
 </summary>
139 139
 
140
+# ISAnalytics 1.3.6 (2021-10-05)
141
+
142
+## NEW
143
+
144
+-   Added new feature `purity_filter()`
145
+
146
+## FIXES
147
+
148
+-   Fixed small issue in printing information in reports
149
+
140 150
 # ISAnalytics 1.3.5 (2021-09-21)
141 151
 
142 152
 ## MAJOR CHANGES
... ...
@@ -129,16 +129,16 @@ logic <- if (length(params$key) > 1) {
129 129
         combined <- rbind(base_flag, c(flag_logic, ""))
130 130
         paste(
131 131
             "* Key length > 1, flagging formula used: ",
132
-            paste(combined, collapse = " ")
132
+            paste0(combined, collapse = " ")
133 133
         )
134 134
     } else {
135 135
         base_flag <- paste0(
136
-            "(tdist_", key, " < ", params$outlier_thresh,
137
-            " & zscore_", key, " < 0)"
136
+            "(tdist_", params$key, " < ", params$outlier_thresh,
137
+            " & zscore_", params$key, " < 0)"
138 138
         )
139 139
         paste(
140 140
             "* Flagging formula used: ",
141
-            paste(base_flag)
141
+            paste0(base_flag)
142 142
         )
143 143
     }
144 144
 log2 <- if (params$log2_req) {
... ...
@@ -80,6 +80,7 @@ Other Analysis functions:
80 80
 \code{\link{cumulative_count_union}()},
81 81
 \code{\link{cumulative_is}()},
82 82
 \code{\link{is_sharing}()},
83
+\code{\link{purity_filter}()},
83 84
 \code{\link{sample_statistics}()},
84 85
 \code{\link{separate_quant_matrices}()},
85 86
 \code{\link{threshold_filter}()},
... ...
@@ -81,29 +81,29 @@ For more details on how this files were generated use the help
81 81
 
82 82
 The default values are included in this package and
83 83
 it can be accessed by doing:\if{html}{\out{<div class="r">}}\preformatted{head(known_clinical_oncogenes())
84
-#> # A tibble: 5 × 2
85
-#>   GeneName KnownClonalExpansion
86
-#>   <chr>    <lgl>               
87
-#> 1 MECOM    TRUE                
88
-#> 2 CCND2    TRUE                
89
-#> 3 TAL1     TRUE                
90
-#> 4 LMO2     TRUE                
91
-#> 5 HMGA2    TRUE
92
-}\if{html}{\out{</div>}}
84
+}\if{html}{\out{</div>}}\preformatted{## # A tibble: 5 × 2
85
+##   GeneName KnownClonalExpansion
86
+##   <chr>    <lgl>               
87
+## 1 MECOM    TRUE                
88
+## 2 CCND2    TRUE                
89
+## 3 TAL1     TRUE                
90
+## 4 LMO2     TRUE                
91
+## 5 HMGA2    TRUE
92
+}
93 93
 
94 94
 If the user wants to change this parameter the input data frame must
95 95
 preserve the column structure. The same goes for the \code{suspicious_genes}
96 96
 parameter (DOIReference column is optional):\if{html}{\out{<div class="r">}}\preformatted{head(clinical_relevant_suspicious_genes())
97
-#> # A tibble: 6 × 3
98
-#>   GeneName ClinicalRelevance DOIReference                                
99
-#>   <chr>    <lgl>             <chr>                                       
100
-#> 1 DNMT3A   TRUE              https://doi.org/10.1182/blood-2018-01-829937
101
-#> 2 TET2     TRUE              https://doi.org/10.1182/blood-2018-01-829937
102
-#> 3 ASXL1    TRUE              https://doi.org/10.1182/blood-2018-01-829937
103
-#> 4 JAK2     TRUE              https://doi.org/10.1182/blood-2018-01-829937
104
-#> 5 CBL      TRUE              https://doi.org/10.1182/blood-2018-01-829937
105
-#> 6 TP53     TRUE              https://doi.org/10.1182/blood-2018-01-829937
106
-}\if{html}{\out{</div>}}
97
+}\if{html}{\out{</div>}}\preformatted{## # A tibble: 6 × 3
98
+##   GeneName ClinicalRelevance DOIReference                                
99
+##   <chr>    <lgl>             <chr>                                       
100
+## 1 DNMT3A   TRUE              https://doi.org/10.1182/blood-2018-01-829937
101
+## 2 TET2     TRUE              https://doi.org/10.1182/blood-2018-01-829937
102
+## 3 ASXL1    TRUE              https://doi.org/10.1182/blood-2018-01-829937
103
+## 4 JAK2     TRUE              https://doi.org/10.1182/blood-2018-01-829937
104
+## 5 CBL      TRUE              https://doi.org/10.1182/blood-2018-01-829937
105
+## 6 TP53     TRUE              https://doi.org/10.1182/blood-2018-01-829937
106
+}
107 107
 }
108 108
 }
109 109
 \examples{
... ...
@@ -70,6 +70,7 @@ Other Analysis functions:
70 70
 \code{\link{cumulative_count_union}()},
71 71
 \code{\link{cumulative_is}()},
72 72
 \code{\link{is_sharing}()},
73
+\code{\link{purity_filter}()},
73 74
 \code{\link{sample_statistics}()},
74 75
 \code{\link{separate_quant_matrices}()},
75 76
 \code{\link{threshold_filter}()},
... ...
@@ -62,6 +62,7 @@ Other Analysis functions:
62 62
 \code{\link{cumulative_count_union}()},
63 63
 \code{\link{cumulative_is}()},
64 64
 \code{\link{is_sharing}()},
65
+\code{\link{purity_filter}()},
65 66
 \code{\link{sample_statistics}()},
66 67
 \code{\link{separate_quant_matrices}()},
67 68
 \code{\link{threshold_filter}()},
... ...
@@ -91,6 +91,7 @@ Other Analysis functions:
91 91
 \code{\link{compute_abundance}()},
92 92
 \code{\link{cumulative_is}()},
93 93
 \code{\link{is_sharing}()},
94
+\code{\link{purity_filter}()},
94 95
 \code{\link{sample_statistics}()},
95 96
 \code{\link{separate_quant_matrices}()},
96 97
 \code{\link{threshold_filter}()},
... ...
@@ -57,6 +57,7 @@ Other Analysis functions:
57 57
 \code{\link{compute_abundance}()},
58 58
 \code{\link{cumulative_count_union}()},
59 59
 \code{\link{is_sharing}()},
60
+\code{\link{purity_filter}()},
60 61
 \code{\link{sample_statistics}()},
61 62
 \code{\link{separate_quant_matrices}()},
62 63
 \code{\link{threshold_filter}()},
... ...
@@ -95,6 +95,7 @@ Other Analysis functions:
95 95
 \code{\link{compute_abundance}()},
96 96
 \code{\link{cumulative_count_union}()},
97 97
 \code{\link{cumulative_is}()},
98
+\code{\link{purity_filter}()},
98 99
 \code{\link{sample_statistics}()},
99 100
 \code{\link{separate_quant_matrices}()},
100 101
 \code{\link{threshold_filter}()},
101 102
new file mode 100644
... ...
@@ -0,0 +1,128 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/analysis-functions.R
3
+\name{purity_filter}
4
+\alias{purity_filter}
5
+\title{Filter integration sites based on purity.}
6
+\usage{
7
+purity_filter(
8
+  x,
9
+  lineages = blood_lineages_default(),
10
+  aggregation_key = c("SubjectID", "CellMarker", "Tissue", "TimePoint"),
11
+  group_key = c("CellMarker", "Tissue"),
12
+  selected_groups = NULL,
13
+  join_on = "CellMarker",
14
+  min_value = 3,
15
+  impurity_threshold = 10,
16
+  by_timepoint = TRUE,
17
+  timepoint_column = "TimePoint",
18
+  value_column = "seqCount_sum"
19
+)
20
+}
21
+\arguments{
22
+\item{x}{An aggregated integration matrix, obtained via
23
+\code{aggregate_values_by_key()}}
24
+
25
+\item{lineages}{A data frame containing cell lineages information}
26
+
27
+\item{aggregation_key}{The key used for aggregating \code{x}}
28
+
29
+\item{group_key}{A character vector of column names for re-aggregation.
30
+Column names must be either in \code{x} or in \code{lineages}. See details.}
31
+
32
+\item{selected_groups}{Either NULL, a character vector or a
33
+data frame for group selection. See details.}
34
+
35
+\item{join_on}{Common columns to perform a join operation on}
36
+
37
+\item{min_value}{A minimum value to filter the input matrix. Integrations
38
+with a value strictly lower than \code{min_value} are excluded (dropped) from
39
+the output.}
40
+
41
+\item{impurity_threshold}{The ratio threshold for impurity in groups}
42
+
43
+\item{by_timepoint}{Should filtering be applied on each time point? If
44
+\code{FALSE}, all time points are merged together}
45
+
46
+\item{timepoint_column}{Column in \code{x} containing the time point}
47
+
48
+\item{value_column}{Column in \code{x} containing the numeric
49
+quantification of interest}
50
+}
51
+\value{
52
+A data frame
53
+}
54
+\description{
55
+\lifecycle{experimental}
56
+Filter that targets possible contamination between cell lines based on
57
+a numeric quantification (likely abundance or sequence count).
58
+}
59
+\details{
60
+\subsection{Setting input arguments}{
61
+
62
+The input matrix can be re-aggregated with the provided \code{group_key}
63
+argument. This key contains the names of the columns to group on
64
+(besides the columns holding genomic coordinates of the integration
65
+sites) and must be contained in at least one of \code{x} or \code{lineages}
66
+data frames. If the key is not found only in \code{x}, then a join operation
67
+with the \code{lineages} data frame is performed on the common column(s)
68
+\code{join_on}.
69
+}
70
+
71
+\subsection{Group selection}{
72
+
73
+It is possible for the user to specify on which groups the logic of the
74
+filter should be applied to. For example: if we have
75
+\code{group_key = c("HematoLineage")} and we set
76
+\code{selected_groups = c("CD34", "Myeloid","Lymphoid")}
77
+it means that a single integration will be evaluated for the filter only
78
+for groups that have the values of "CD34", "Myeloid" and "Lymphoid" in
79
+the "HematoLineage" column.
80
+If the same integration is present in other groups it is
81
+kept as it is. \code{selected_groups} can be set to \code{NULL} if we want
82
+the logic to apply to every group present in the data frame,
83
+it can be set as a simple character vector as the example above if
84
+the group key has length 1 (and there is no need to filter on time point).
85
+If the group key is longer than 1 then the filter is applied only on the
86
+first element of the key.
87
+
88
+If a more refined selection on groups is needed, a data frame can
89
+be provided instead:\preformatted{group_key = c("CellMarker", "Tissue")
90
+selected_groups = tibble::tribble(
91
+~ CellMarker, ~ Tissue,
92
+"CD34", "BM",
93
+"CD14", "BM",
94
+"CD14", "PB"
95
+)
96
+}
97
+
98
+Columns in the data frame should be the same as group key (plus,
99
+eventually, the time point column). In this example only those groups
100
+identified by the rows in the provided data frame are processed.
101
+}
102
+}
103
+\examples{
104
+data("integration_matrices", package = "ISAnalytics")
105
+data("association_file", package = "ISAnalytics")
106
+aggreg <- aggregate_values_by_key(
107
+    x = rlang::current_env()$integration_matrices,
108
+    association_file = rlang::current_env()$association_file,
109
+    value_cols = c("seqCount", "fragmentEstimate")
110
+)
111
+filtered_by_purity <- purity_filter(x = aggreg,
112
+value_column = "seqCount_sum")
113
+head(filtered_by_purity)
114
+}
115
+\seealso{
116
+Other Analysis functions: 
117
+\code{\link{CIS_grubbs}()},
118
+\code{\link{comparison_matrix}()},
119
+\code{\link{compute_abundance}()},
120
+\code{\link{cumulative_count_union}()},
121
+\code{\link{cumulative_is}()},
122
+\code{\link{is_sharing}()},
123
+\code{\link{sample_statistics}()},
124
+\code{\link{separate_quant_matrices}()},
125
+\code{\link{threshold_filter}()},
126
+\code{\link{top_integrations}()}
127
+}
128
+\concept{Analysis functions}
... ...
@@ -78,6 +78,7 @@ Other Analysis functions:
78 78
 \code{\link{cumulative_count_union}()},
79 79
 \code{\link{cumulative_is}()},
80 80
 \code{\link{is_sharing}()},
81
+\code{\link{purity_filter}()},
81 82
 \code{\link{separate_quant_matrices}()},
82 83
 \code{\link{threshold_filter}()},
83 84
 \code{\link{top_integrations}()}
... ...
@@ -62,6 +62,7 @@ Other Analysis functions:
62 62
 \code{\link{cumulative_count_union}()},
63 63
 \code{\link{cumulative_is}()},
64 64
 \code{\link{is_sharing}()},
65
+\code{\link{purity_filter}()},
65 66
 \code{\link{sample_statistics}()},
66 67
 \code{\link{threshold_filter}()},
67 68
 \code{\link{top_integrations}()}
... ...
@@ -78,8 +78,7 @@ It is also possible to filter different data frames with different
78 78
 sets of conditions. Besides having the possibility of defining the
79 79
 other parameters as simple vector, which has the same results as
80 80
 operating on an unnamed list, the user can define the parameters as
81
-named lists containing vectors. For example:\if{html}{\out{<div class="r">}}\preformatted{
82
-example_df <- tibble::tibble(a = c(20, 30, 40),
81
+named lists containing vectors. For example:\if{html}{\out{<div class="r">}}\preformatted{example_df <- tibble::tibble(a = c(20, 30, 40),
83 82
                              b = c(40, 50, 60),
84 83
                              c = c("a", "b", "c"),
85 84
                              d = c(3L, 4L, 5L))
... ...
@@ -87,31 +86,30 @@ example_list <- list(first = example_df,
87 86
                      second = example_df,
88 87
                      third = example_df)
89 88
 print(example_list)
90
-#> $first
91
-#> # A tibble: 3 × 4
92
-#>       a     b c         d
93
-#>   <dbl> <dbl> <chr> <int>
94
-#> 1    20    40 a         3
95
-#> 2    30    50 b         4
96
-#> 3    40    60 c         5
97
-#> 
98
-#> $second
99
-#> # A tibble: 3 × 4
100
-#>       a     b c         d
101
-#>   <dbl> <dbl> <chr> <int>
102
-#> 1    20    40 a         3
103
-#> 2    30    50 b         4
104
-#> 3    40    60 c         5
105
-#> 
106
-#> $third
107
-#> # A tibble: 3 × 4
108
-#>       a     b c         d
109
-#>   <dbl> <dbl> <chr> <int>
110
-#> 1    20    40 a         3
111
-#> 2    30    50 b         4
112
-#> 3    40    60 c         5
113
-
114
-filtered <- threshold_filter(example_list,
89
+}\if{html}{\out{</div>}}\preformatted{## $first
90
+## # A tibble: 3 × 4
91
+##       a     b c         d
92
+##   <dbl> <dbl> <chr> <int>
93
+## 1    20    40 a         3
94
+## 2    30    50 b         4
95
+## 3    40    60 c         5
96
+## 
97
+## $second
98
+## # A tibble: 3 × 4
99
+##       a     b c         d
100
+##   <dbl> <dbl> <chr> <int>
101
+## 1    20    40 a         3
102
+## 2    30    50 b         4
103
+## 3    40    60 c         5
104
+## 
105
+## $third
106
+## # A tibble: 3 × 4
107
+##       a     b c         d
108
+##   <dbl> <dbl> <chr> <int>
109
+## 1    20    40 a         3
110
+## 2    30    50 b         4
111
+## 3    40    60 c         5
112
+}\if{html}{\out{<div class="r">}}\preformatted{filtered <- threshold_filter(example_list,
115 113
 threshold = list(first = c(20, 60),
116 114
 third = c(25)),
117 115
 cols_to_compare = list(first = c("a", "b"),
... ...
@@ -119,27 +117,27 @@ third = c("a")),
119 117
 comparators = list(first = c(">", "<"),
120 118
 third = c(">=")))
121 119
 print(filtered)
122
-#> $first
123
-#> # A tibble: 1 × 4
124
-#>       a     b c         d
125
-#>   <dbl> <dbl> <chr> <int>
126
-#> 1    30    50 b         4
127
-#> 
128
-#> $second
129
-#> # A tibble: 3 × 4
130
-#>       a     b c         d
131
-#>   <dbl> <dbl> <chr> <int>
132
-#> 1    20    40 a         3
133
-#> 2    30    50 b         4
134
-#> 3    40    60 c         5
135
-#> 
136
-#> $third
137
-#> # A tibble: 2 × 4
138
-#>       a     b c         d
139
-#>   <dbl> <dbl> <chr> <int>
140
-#> 1    30    50 b         4
141
-#> 2    40    60 c         5
142
-}\if{html}{\out{</div>}}
120
+}\if{html}{\out{</div>}}\preformatted{## $first
121
+## # A tibble: 1 × 4
122
+##       a     b c         d
123
+##   <dbl> <dbl> <chr> <int>
124
+## 1    30    50 b         4
125
+## 
126
+## $second
127
+## # A tibble: 3 × 4
128
+##       a     b c         d
129
+##   <dbl> <dbl> <chr> <int>
130
+## 1    20    40 a         3
131
+## 2    30    50 b         4
132
+## 3    40    60 c         5
133
+## 
134
+## $third
135
+## # A tibble: 2 × 4
136
+##       a     b c         d
137
+##   <dbl> <dbl> <chr> <int>
138
+## 1    30    50 b         4
139
+## 2    40    60 c         5
140
+}
143 141
 
144 142
 The above signature will roughly be translated as:
145 143
 \itemize{
... ...
@@ -213,6 +211,7 @@ Other Analysis functions:
213 211
 \code{\link{cumulative_count_union}()},
214 212
 \code{\link{cumulative_is}()},
215 213
 \code{\link{is_sharing}()},
214
+\code{\link{purity_filter}()},
216 215
 \code{\link{sample_statistics}()},
217 216
 \code{\link{separate_quant_matrices}()},
218 217
 \code{\link{top_integrations}()}
... ...
@@ -78,6 +78,7 @@ Other Analysis functions:
78 78
 \code{\link{cumulative_count_union}()},
79 79
 \code{\link{cumulative_is}()},
80 80
 \code{\link{is_sharing}()},
81
+\code{\link{purity_filter}()},
81 82
 \code{\link{sample_statistics}()},
82 83
 \code{\link{separate_quant_matrices}()},
83 84
 \code{\link{threshold_filter}()}
... ...
@@ -108,6 +108,7 @@ test_that(paste(func_name, "reads type NEW standard"), {
108 108
 })
109 109
 
110 110
 test_that(paste(func_name, "reads type NEW different params"), {
111
+    skip_on_os("windows")
111 112
     tf <- withr::local_tempfile(fileext = ".tsv")
112 113
     readr::write_tsv(sample_df, tf)
113 114
     expected_summary_msg <- .summary_ism_import_msg(
114 115
new file mode 100644
... ...
@@ -0,0 +1,328 @@
1
+library(ISAnalytics)
2
+func_name <- "purity_filter"
3
+#------------------------------------------------------------------------------#
4
+# Global vars
5
+#------------------------------------------------------------------------------#
6
+
7
+df <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
8
+                             '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
9
+                             '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
10
+                             '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'),
11
+                     integration_locus = c(121249, 251227, 645551, 732938,
12
+                                           775536, 846681, 1029785, 1036835,
13
+                                           121249, 251227, 645551, 732938,
14
+                                           775536, 846681, 1029785, 1036835,
15
+                                           121249, 251227, 645551, 732938,
16
+                                           775536, 846681, 1029785, 1036835,
17
+                                           121249, 251227, 645551, 732938,
18
+                                           775536, 846681, 1029785, 1036835,
19
+                                           121249, 251227, 645551, 732938,
20
+                                           775536, 846681, 1029785, 1036835),
21
+                     strand = c('+', '+', '+', '+', '+', '+', '-', '+', '+',
22
+                                '+', '+', '+', '+', '+', '-', '+', '+', '+',
23
+                                '+', '+', '+', '+', '-', '+', '+', '+', '+',
24
+                                '+', '+', '+', '-', '+', '+', '+', '+', '+',
25
+                                '+', '+', '-', '+'),
26
+                     GeneName = c('LOC729737', 'LOC100132287', 'LOC100133331',
27
+                                  'LOC100288069', 'LINC01128', 'LOC100130417',
28
+                                  'C1orf159', 'C1orf159', 'LOC729737',
29
+                                  'LOC100132287', 'LOC100133331',
30
+                                  'LOC100288069', 'LINC01128', 'LOC100130417',
31
+                                  'C1orf159', 'C1orf159', 'LOC729737',
32
+                                  'LOC100132287', 'LOC100133331',
33
+                                  'LOC100288069', 'LINC01128', 'LOC100130417',
34
+                                  'C1orf159', 'C1orf159', 'LOC729737',
35
+                                  'LOC100132287', 'LOC100133331',
36
+                                  'LOC100288069', 'LINC01128', 'LOC100130417',
37
+                                  'C1orf159', 'C1orf159', 'LOC729737',
38
+                                  'LOC100132287', 'LOC100133331',
39
+                                  'LOC100288069', 'LINC01128', 'LOC100130417',
40
+                                  'C1orf159', 'C1orf159'),
41
+                     GeneStrand = c('-', '+', '-', '-', '+', '-', '-', '-',
42
+                                    '-', '+', '-', '-', '+', '-', '-', '-',
43
+                                    '-', '+', '-', '-', '+', '-', '-', '-',
44
+                                    '-', '+', '-', '-', '+', '-', '-', '-',
45
+                                    '-', '+', '-', '-', '+', '-', '-', '-'),
46
+                     CellMarker = c('CD13', 'CD13', 'CD13', 'CD13', 'CD13',
47
+                                    'CD13', 'CD13', 'CD13', 'CD14', 'CD14',
48
+                                    'CD14', 'CD14', 'CD14', 'CD14', 'CD14',
49
+                                    'CD14', 'CD19', 'CD19', 'CD19', 'CD19',
50
+                                    'CD19', 'CD19', 'CD19', 'CD19', 'CD3',
51
+                                    'CD3', 'CD3', 'CD3', 'CD3', 'CD3', 'CD3',
52
+                                    'CD3', 'CD34', 'CD34', 'CD34', 'CD34',
53
+                                    'CD34', 'CD34', 'CD34', 'CD34'),
54
+                     Tissue = c('BM', 'BM', 'BM', 'BM', 'BM', 'BM', 'BM',
55
+                                'BM', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB',
56
+                                'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB',
57
+                                'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB',
58
+                                'PB', 'PB', 'PB', 'PB', 'BM', 'BM', 'BM',
59
+                                'BM', 'BM', 'BM', 'BM', 'BM'),
60
+                     TimePoint = c('01', '01', '01', '01', '01', '01', '01',
61
+                                   '01', '01', '01', '01', '01', '01', '01',
62
+                                   '01', '01', '01', '01', '01', '01', '01',
63
+                                   '01', '01', '01', '01', '01', '01', '01',
64
+                                   '01', '01', '01', '01', '01', '01', '01',
65
+                                   '01', '01', '01', '01', '01'),
66
+                     Value = c(1, 1, 1, 1000, 1, 10, 3, 3, 1, 1000, 1000, 500,
67
+                               1, 12, 30, 1000, 1, 1, 500, 1, 10, 14, 30, 90,
68
+                               1, 1, 1, 1, 10, 9, 30, 90, 1000, 1, 1, 300,
69
+                               10, 8, 1000, 3))
70
+
71
+expected_output_sc <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1',
72
+                                             '1', '1', '1', '1', '1', '1', '1',
73
+                                             '1', '1', '1', '1', '1'),
74
+                                     integration_locus = c(121249, 251227,
75
+                                                           645551, 645551,
76
+                                                           732938, 732938,
77
+                                                           732938, 775536,
78
+                                                           775536, 775536,
79
+                                                           775536, 775536,
80
+                                                           846681, 846681,
81
+                                                           846681, 846681,
82
+                                                           846681, 1029785,
83
+                                                           1036835),
84
+                                     strand = c('+', '+', '+', '+', '+', '+',
85
+                                                '+', '+', '+', '+', '+', '+',
86
+                                                '+', '+', '+', '+', '+', '-',
87
+                                                '+'),
88
+                                     GeneName = c('LOC729737', 'LOC100132287',
89
+                                                  'LOC100133331',
90
+                                                  'LOC100133331',
91
+                                                  'LOC100288069',
92
+                                                  'LOC100288069',
93
+                                                  'LOC100288069',
94
+                                                  'LINC01128',
95
+                                                  'LINC01128',
96
+                                                  'LINC01128',
97
+                                                  'LINC01128', 'LINC01128',
98
+                                                  'LOC100130417',
99
+                                                  'LOC100130417',
100
+                                                  'LOC100130417',
101
+                                                  'LOC100130417',
102
+                                                  'LOC100130417', 'C1orf159',
103
+                                                  'C1orf159'),
104
+                                     GeneStrand = c('-', '+', '-', '-', '-',
105
+                                                    '-', '-', '+', '+', '+',
106
+                                                    '+', '+', '-', '-', '-',
107
+                                                    '-', '-', '-', '-'),
108
+                                     TimePoint = c('01', '01', '01', '01',
109
+                                                   '01', '01', '01', '01',
110
+                                                   '01', '01', '01', '01',
111
+                                                   '01', '01', '01', '01',
112
+                                                   '01', '01', '01'),
113
+                                     CellMarker = c('CD34', 'CD14', 'CD14',
114
+                                                    'CD19', 'CD13', 'CD14',
115
+                                                    'CD34', 'CD13', 'CD14',
116
+                                                    'CD19', 'CD3', 'CD34',
117
+                                                    'CD13', 'CD14', 'CD19',
118
+                                                    'CD3', 'CD34', 'CD34',
119
+                                                    'CD14'),
120
+                                     Tissue = c('BM', 'PB', 'PB', 'PB', 'BM',
121
+                                                'PB', 'BM', 'BM', 'PB', 'PB',
122
+                                                'PB', 'BM', 'BM', 'PB', 'PB',
123
+                                                'PB', 'BM', 'BM', 'PB'),
124
+                                     Value = c(1000, 1000, 1000, 500, 1000,
125
+                                               500, 300, 1, 1, 10, 10, 10, 10,
126
+                                               12, 14, 9, 8, 1000, 1000))
127
+expected_output_ab <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1',
128
+                                             '1', '1', '1', '1', '1', '1', '1',
129
+                                             '1', '1', '1', '1'),
130
+                     integration_locus = c(121249, 251227, 645551, 645551,
131
+                                           732938, 732938, 732938, 775536,
132
+                                           775536, 846681, 846681, 846681,
133
+                                           1029785, 1029785, 1029785, 1036835,
134
+                                           1036835, 1036835),
135
+                     strand = c('+', '+', '+', '+', '+', '+', '+', '+', '+',
136
+                                '+', '+', '+', '-', '-', '-', '+', '+', '+'),
137
+                     GeneName = c('LOC729737', 'LOC100132287', 'LOC100133331',
138
+                                  'LOC100133331', 'LOC100288069',
139
+                                  'LOC100288069', 'LOC100288069', 'LINC01128',
140
+                                  'LINC01128', 'LOC100130417', 'LOC100130417',
141
+                                  'LOC100130417', 'C1orf159', 'C1orf159',
142
+                                  'C1orf159', 'C1orf159', 'C1orf159',
143
+                                  'C1orf159'),
144
+                     GeneStrand = c('-', '+', '-', '-', '-', '-', '-', '+',
145
+                                    '+', '-', '-', '-', '-', '-', '-', '-',
146
+                                    '-', '-'),
147
+                     TimePoint = c('01', '01', '01', '01', '01', '01', '01',
148
+                                   '01', '01', '01', '01', '01', '01', '01',
149
+                                   '01', '01', '01', '01'),
150
+                     CellMarker = c('CD34', 'CD14', 'CD14', 'CD19', 'CD13',
151
+                                    'CD14', 'CD34', 'CD19', 'CD3', 'CD13',
152
+                                    'CD19', 'CD3', 'CD19', 'CD3', 'CD34',
153
+                                    'CD14', 'CD19', 'CD3'),
154
+                     Tissue = c('BM', 'PB', 'PB', 'PB', 'BM', 'PB', 'BM',
155
+                                'PB', 'PB', 'BM', 'PB', 'PB', 'PB', 'PB',
156
+                                'BM', 'PB', 'PB', 'PB'),
157
+                     Value = c(43.0477830391735, 28.2167042889391,
158
+                               28.2167042889391, 77.2797527047913,
159
+                               98.0392156862745, 14.1083521444695,
160
+                               12.914334911752, 1.54559505409583,
161
+                               6.99300699300699, 0.980392156862745,
162
+                               2.16383307573416, 6.29370629370629,
163
+                               4.63678516228748, 20.979020979021,
164
+                               43.0477830391735, 28.2167042889391,
165
+                               13.9103554868624, 62.9370629370629))
166
+
167
+#------------------------------------------------------------------------------#
168
+# Tests
169
+#------------------------------------------------------------------------------#
170
+test_that(paste(func_name, "produces expected output - sc"), {
171
+  purity_filtered <- purity_filter(x = df,
172
+                                   aggregation_key = c("CellMarker",
173
+                                                       "Tissue",
174
+                                                       "TimePoint"),
175
+                                   group_key = c("CellMarker", "Tissue"),
176
+                                   min_value = 0, impurity_threshold = 10,
177
+                                   by_timepoint = TRUE, value_column = "Value")
178
+  expect_equal(purity_filtered, expected_output_sc)
179
+})
180
+
181
+test_that(paste(func_name, "produces expected output - abundance"), {
182
+  abund <- compute_abundance(x = df, columns = "Value", key = c("CellMarker",
183
+                                                                "Tissue",
184
+                                                                "TimePoint"))
185
+  purity_filtered <- purity_filter(x = abund,
186
+                                   aggregation_key = c("CellMarker",
187
+                                                       "Tissue",
188
+                                                       "TimePoint"),
189
+                                   group_key = c("CellMarker", "Tissue"),
190
+                                   min_value = 0,
191
+                                   impurity_threshold = 10,
192
+                                   by_timepoint = TRUE,
193
+                                   value_column = "Value_PercAbundance")
194
+  expect_equal(purity_filtered, expected_output_ab)
195
+})
196
+
197
+test_that(paste(func_name, "produces expected output - join"), {
198
+  purity_filtered <- purity_filter(x = df,
199
+                                   aggregation_key = c("CellMarker",
200
+                                                       "Tissue",
201
+                                                       "TimePoint"),
202
+                                   group_key = c("HematoLineage"),
203
+                                   min_value = 3,
204
+                                   impurity_threshold = 10,
205
+                                   by_timepoint = TRUE,
206
+                                   value_column = "Value")
207
+  expected <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', '1',
208
+                                     '1', '1', '1', '1', '1', '1'),
209
+                       integration_locus = c(645551, 645551, 732938, 732938,
210
+                                             775536, 775536, 846681, 846681,
211
+                                             846681, 1029785, 1036835, 1036835,
212
+                                             121249, 251227),
213
+                       strand = c('+', '+', '+', '+', '+', '+', '+', '+', '+',
214
+                                  '-', '+', '+', '+', '+'),
215
+                       GeneName = c('LOC100133331', 'LOC100133331',
216
+                                    'LOC100288069', 'LOC100288069',
217
+                                    'LINC01128', 'LINC01128', 'LOC100130417',
218
+                                    'LOC100130417', 'LOC100130417', 'C1orf159',
219
+                                    'C1orf159', 'C1orf159', 'LOC729737',
220
+                                    'LOC100132287'),
221
+                       GeneStrand = c('-', '-', '-', '-', '+', '+', '-', '-',
222
+                                      '-', '-', '-', '-', '-', '+'),
223
+                       TimePoint = c('01', '01', '01', '01', '01', '01', '01',
224
+                                     '01', '01', '01', '01', '01', '01', '01'),
225
+                       HematoLineage = c('Lymphoid', 'Myeloid', 'CD34',
226
+                                         'Myeloid', 'CD34', 'Lymphoid',
227
+                                         'CD34', 'Lymphoid', 'Myeloid',
228
+                                         'CD34', 'Lymphoid', 'Myeloid',
229
+                                         'CD34', 'Myeloid'),
230
+                       Value = c(501, 1001, 300, 1500, 10, 20, 8, 23, 22,
231
+                                 1000, 180, 1003, 1000, 1001))
232
+  expect_equal(purity_filtered, expected)
233
+})
234
+
235
+test_that(paste(func_name, "produces expected output - group selection"), {
236
+  ## Vector
237
+  purity_filtered <- purity_filter(x = df,
238
+                                   aggregation_key = c("CellMarker",
239
+                                                       "Tissue",
240
+                                                       "TimePoint"),
241
+                                   group_key = c("CellMarker", "Tissue"),
242
+                                   selected_groups = c("CD34", "CD13"),
243
+                                   min_value = 0,
244
+                                   impurity_threshold = 10,
245
+                                   by_timepoint = TRUE,
246
+                                   value_column = "Value")
247
+  expected <- tibble::tibble(chr = c('1', '1', '1', '1', '1', '1', '1', '1',
248
+                                     '1', '1', '1', '1', '1', '1', '1', '1',
249
+                                     '1', '1', '1', '1', '1', '1', '1', '1',
250
+                                     '1', '1', '1', '1', '1', '1', '1', '1',
251
+                                     '1', '1', '1', '1', '1', '1'),
252
+                       integration_locus = c(121249, 251227, 251227, 645551,
253
+                                             645551, 732938, 732938, 775536,
254
+                                             775536, 846681, 846681, 1029785,
255
+                                             1036835, 1036835, 121249, 121249,
256
+                                             121249, 251227, 251227, 251227,
257
+                                             645551, 645551, 645551, 732938,
258
+                                             732938, 732938, 775536, 775536,
259
+                                             775536, 846681, 846681, 846681,
260
+                                             1029785, 1029785, 1029785, 1036835,
261
+                                             1036835, 1036835),
262
+                       strand = c('+', '+', '+', '+', '+', '+', '+', '+', '+',
263
+                                  '+', '+', '-', '+', '+', '+', '+', '+', '+',
264
+                                  '+', '+', '+', '+', '+', '+', '+', '+', '+',
265
+                                  '+', '+', '+', '+', '+', '-', '-', '-', '+',
266
+                                  '+', '+'),
267
+                       GeneName = c('LOC729737', 'LOC100132287', 'LOC100132287',
268
+                                    'LOC100133331', 'LOC100133331',
269
+                                    'LOC100288069', 'LOC100288069',
270
+                                    'LINC01128', 'LINC01128', 'LOC100130417',
271
+                                    'LOC100130417', 'C1orf159', 'C1orf159',
272
+                                    'C1orf159', 'LOC729737', 'LOC729737',
273
+                                    'LOC729737', 'LOC100132287', 'LOC100132287',
274
+                                    'LOC100132287', 'LOC100133331',
275
+                                    'LOC100133331', 'LOC100133331',
276
+                                    'LOC100288069', 'LOC100288069',
277
+                                    'LOC100288069', 'LINC01128', 'LINC01128',
278
+                                    'LINC01128', 'LOC100130417', 'LOC100130417',
279
+                                    'LOC100130417', 'C1orf159', 'C1orf159',
280
+                                    'C1orf159', 'C1orf159', 'C1orf159',
281
+                                    'C1orf159'),
282
+                       GeneStrand = c('-', '+', '+', '-', '-', '-', '-', '+',
283
+                                      '+', '-', '-', '-', '-', '-', '-', '-',
284
+                                      '-', '+', '+', '+', '-', '-', '-', '-',
285
+                                      '-', '-', '+', '+', '+', '-', '-', '-',
286
+                                      '-', '-', '-', '-', '-', '-'),
287
+                       TimePoint = c('01', '01', '01', '01', '01', '01', '01',
288
+                                     '01', '01', '01', '01', '01', '01', '01',
289
+                                     '01', '01', '01', '01', '01', '01', '01',
290
+                                     '01', '01', '01', '01', '01', '01', '01',
291
+                                     '01', '01', '01', '01', '01', '01', '01',
292
+                                     '01', '01', '01'),
293
+                       CellMarker = c('CD34', 'CD13', 'CD34', 'CD13', 'CD34',
294
+                                      'CD13', 'CD34', 'CD13', 'CD34', 'CD13',
295
+                                      'CD34', 'CD34', 'CD13', 'CD34', 'CD14',
296
+                                      'CD19', 'CD3', 'CD14', 'CD19', 'CD3',
297
+                                      'CD14', 'CD19', 'CD3', 'CD14', 'CD19',
298
+                                      'CD3', 'CD14', 'CD19', 'CD3', 'CD14',
299
+                                      'CD19', 'CD3', 'CD14', 'CD19', 'CD3',
300
+                                      'CD14', 'CD19', 'CD3'),
301
+                       Tissue = c('BM', 'BM', 'BM', 'BM', 'BM', 'BM', 'BM',
302
+                                  'BM', 'BM', 'BM', 'BM', 'BM', 'BM', 'BM',
303
+                                  'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB',
304
+                                  'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB',
305
+                                  'PB', 'PB', 'PB', 'PB', 'PB', 'PB', 'PB',
306
+                                  'PB', 'PB', 'PB'),
307
+                       Value = c(1000, 1, 1, 1, 1, 1000, 300, 1, 10, 10, 8,
308
+                                 1000, 3, 3, 1, 1, 1, 1000, 1, 1, 1000, 500,
309
+                                 1, 500, 1, 1, 1, 10, 10, 12, 14, 9, 30, 30,
310
+                                 30, 1000, 90, 90))
311
+  expect_equal(purity_filtered, expected)
312
+  ## DF
313
+  purity_filtered <- purity_filter(x = df,
314
+                                   aggregation_key = c("CellMarker",
315
+                                                       "Tissue",
316
+                                                       "TimePoint"),
317
+                                   group_key = c("CellMarker", "Tissue"),
318
+                                   selected_groups = tibble::tribble(
319
+                                     ~ CellMarker, ~ Tissue,
320
+                                     "CD34", "BM",
321
+                                     "CD13", "BM"
322
+                                   ),
323
+                                   min_value = 0,
324
+                                   impurity_threshold = 10,
325
+                                   by_timepoint = TRUE,
326
+                                   value_column = "Value")
327
+  expect_equal(purity_filtered, expected)
328
+})