Browse code

Minor updates BIOM functions (#645)

Tuomas Borman authored on 27/09/2024 07:17:45 • GitHub committed on 27/09/2024 07:17:45
Showing 4 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: mia
2 2
 Type: Package
3
-Version: 1.13.43
3
+Version: 1.13.44
4 4
 Authors@R:
5 5
     c(person(given = "Felix G.M.", family = "Ernst", role = c("aut"),
6 6
              email = "felix.gm.ernst@outlook.com",
... ...
@@ -1,4 +1,5 @@
1
-#' Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} results
1
+#' Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM}
2
+#' results
2 3
 #'
3 4
 #' For convenience, a few functions are available to convert BIOM, DADA2 and 
4 5
 #' phyloseq objects to 
... ...
@@ -9,8 +10,8 @@
9 10
 #' 
10 11
 #' @param prefix.rm \code{Logical scalar}. Should
11 12
 #' taxonomic prefixes be removed? The prefixes is removed only from detected
12
-#' taxa columns meaning that \code{rank.from.prefix} should be enabled in the most cases.
13
-#' (Default: \code{FALSE})
13
+#' taxa columns meaning that \code{rank.from.prefix} should be enabled in the
14
+#' most cases. (Default: \code{FALSE})
14 15
 #' 
15 16
 #' @param removeTaxaPrefixes Deprecated. Use \code{prefix.rm} instead.
16 17
 #' 
... ...
@@ -27,7 +28,8 @@
27 28
 #' @param remove.artifacts Deprecated. Use \code{artifact.rm} instead.
28 29
 #' 
29 30
 #' @details 
30
-#' \code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} object to a 
31
+#' \code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}}
32
+#' object to a 
31 33
 #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
32 34
 #' object.
33 35
 #' 
... ...
@@ -37,7 +39,7 @@
37 39
 #'   
38 40
 #' @return
39 41
 #' \code{convertFromBIOM} returns an object of class
40
-#'   \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
42
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
41 43
 #'   
42 44
 #' @name importBIOM
43 45
 #' 
... ...
@@ -75,7 +77,7 @@ NULL
75 77
 #' 
76 78
 #' @return 
77 79
 #' \code{importBIOM} returns an object of class
78
-#'   \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
80
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
79 81
 #' 
80 82
 #' @name importBIOM
81 83
 #' 
... ...
@@ -187,7 +189,8 @@ convertFromBIOM <- function(
187 189
         # Transposing feature_data and make it df object
188 190
         feature_data <- as.data.frame(feature_data)
189 191
         # Add column that includes all the data
190
-        feature_data[["taxonomy_unparsed"]] <- apply(feature_data, 1, paste0, collapse = ";")
192
+        merged_col <- apply(feature_data, 1, paste0, collapse = ";")
193
+        feature_data <- cbind(feature_data, merged_col)
191 194
         # Add correct colnames
192 195
         colnames(feature_data) <- c(colnames, "taxonomy_unparsed")
193 196
     }
... ...
@@ -195,27 +198,29 @@ convertFromBIOM <- function(
195 198
     # that the taxonomy is not parsed. Try to parse it.
196 199
     if( ncol(feature_data) == 1 ){
197 200
         colnames(feature_data) <- "taxonomy_unparsed"
198
-        tax_tab <- .parse_taxonomy(feature_data, column_name = colnames(feature_data))
201
+        tax_tab <- .parse_taxonomy(
202
+            feature_data, column_name = colnames(feature_data))
199 203
         feature_data <- cbind(tax_tab, feature_data)
200 204
         feature_data <- as.data.frame(feature_data)
201 205
     }
202 206
     
203
-    # Clean feature_data from possible character artifacts if specified
207
+    # Clean feature_data from possible character artifacts if specified.
204 208
     if( artifact.rm ){
205 209
         feature_data <- .detect_taxa_artifacts_and_clean(feature_data, ...)
206 210
     }
207
-    
208
-    # Replace taxonomy ranks with ranks found based on prefixes
209
-    if( rank.from.prefix && all(
210
-        unlist(lapply(colnames(feature_data),
211
-                        function(x) !x %in% TAXONOMY_RANKS)))){
211
+    # Replace taxonomy ranks with ranks found based on prefixes. Test first
212
+    # if columns are already ranks. If they are, do not change them.
213
+    cols_not_already_ranks <- lapply(
214
+        colnames(feature_data), function(x) !x %in% TAXONOMY_RANKS) |>
215
+        unlist() |> all()
216
+    if( rank.from.prefix && cols_not_already_ranks ){
212 217
         # Find ranks
213
-        ranks <- lapply(colnames(feature_data),
214
-                        .replace_colnames_based_on_prefix, x=feature_data)
218
+        ranks <- lapply(
219
+            colnames(feature_data), .replace_colnames_based_on_prefix,
220
+            x = feature_data)
215 221
         # Replace old ranks with found ranks
216 222
         colnames(feature_data) <- unlist(ranks)
217 223
     }
218
-    
219 224
     # Adjust row and colnames
220 225
     rownames(counts) <- rownames(feature_data) <- biomformat::rownames(x)
221 226
     colnames(counts) <- rownames(sample_data) <- biomformat::colnames(x)
... ...
@@ -225,27 +230,24 @@ convertFromBIOM <- function(
225 230
     feature_data <- DataFrame(feature_data)
226 231
     # Convert into list
227 232
     assays <- SimpleList(counts = counts)
228
-    
229 233
     # Create TreeSE
230 234
     tse <- TreeSummarizedExperiment(
231 235
         assays = assays,
232 236
         colData = sample_data,
233 237
         rowData = feature_data)
234
-    
235 238
     # Set ranks based on rowData columns if user has specified to do so
236 239
     temp <- .set_ranks_based_on_rowdata(tse, ...)
237
-    
238 240
     # Remove prefixes if specified and rowData includes info
239 241
     if(prefix.rm && ncol(rowData(tse)) > 0){
240 242
         rowData(tse) <- .remove_prefixes_from_taxa(rowData(tse), ...)
241 243
     }
242
-    
243 244
     return(tse)
244 245
 }
245 246
 
246 247
 #' @rdname importBIOM
247 248
 #' 
248
-#' @param x \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
249
+#' @param x
250
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
249 251
 #' 
250 252
 #' @param assay.type \code{Character scaler}. The name of assay.
251 253
 #' (Default: \code{"counts"})
... ...
@@ -280,7 +282,7 @@ setMethod(
280 282
         feature_tab,
281 283
         prefixes = paste0(
282 284
             "(", paste0(.taxonomy_rank_prefixes, collapse = "|"), ")__"),
283
-        only.taxa.col = TRUE, ...){
285
+        only.taxa.col = TRUE, ignore.col = "taxonomy_unparsed", ...){
284 286
     #
285 287
     if( !.is_a_bool(only.taxa.col) ){
286 288
         stop("'only.taxa.col' must be TRUE or FALSE.", call. = FALSE)
... ...
@@ -289,17 +291,22 @@ setMethod(
289 291
     if( !.is_a_string(prefixes) ){
290 292
         stop("'prefixes' must be a single character value.", call. = FALSE)
291 293
     }
294
+    #
295
+    if( !(is.character(ignore.col) || is.null(ignore.col)) ){
296
+        stop("'ignore.col' must be a character value or NULL.", call. = FALSE)
297
+    }
298
+    #
292 299
     # Subset by taking only taxonomy info if user want to remove the pattern 
293 300
     # only from those. (Might be too restricting, e.g., if taxonomy columns are 
294 301
     # not detected in previous steps. That is way the default is FALSE)
302
+    ind <- rep(TRUE, ncol(feature_tab))
295 303
     if( only.taxa.col ){
296 304
         ind <- tolower(colnames(feature_tab)) %in% TAXONOMY_RANKS
297
-        temp <- feature_tab[, ind, drop = FALSE]
298
-    } else{
299
-        ind <- rep(TRUE, ncol(feature_tab))
300
-        temp <- feature_tab
301 305
     }
302
-    
306
+    # Also remove those columns that are specified to ignore (by default the
307
+    # column that has unparsed data)
308
+    ind <- ind & !colnames(feature_tab) %in% ignore.col
309
+    temp <- feature_tab[, ind, drop = FALSE]
303 310
     # If there are columns left for removing the pattern
304 311
     if( ncol(temp) > 0 ){
305 312
         # Remove patterns
... ...
@@ -321,7 +328,7 @@ setMethod(
321 328
 # rank that is fed to function.
322 329
 .replace_colnames_based_on_prefix <- function(colname, x){
323 330
     # Get column
324
-    col = x[ , colname]
331
+    col <- x[ , colname]
325 332
     # List prefixes
326 333
     all_ranks <- .taxonomy_rank_prefixes
327 334
     prefixes <- paste0("^", all_ranks, "__")
... ...
@@ -340,35 +347,53 @@ setMethod(
340 347
 }
341 348
 
342 349
 # Detect and clean non wanted characters from Taxonomy data if needed.
343
-.detect_taxa_artifacts_and_clean <- function(x, pattern = "auto", ...) {
350
+.detect_taxa_artifacts_and_clean <- function(
351
+        x, pattern = "auto", ignore.col = "taxonomy_unparsed", ...) {
344 352
     #
345 353
     if( !.is_non_empty_character(pattern) ){
346 354
         stop("'pattern' must be a single character value.", call. = FALSE)
347 355
     }
348 356
     #
357
+    if( !(is.character(ignore.col) || is.null(ignore.col)) ){
358
+        stop("'ignore.col' must be a character value or NULL.", call. = FALSE)
359
+    }
360
+    #
361
+    # Store rownames because they are lost during modifications
349 362
     row_names <- rownames(x)
350
-    # Remove artifacts
351
-    if( pattern == "auto" ){
352
-        .require_package("stringr")
353
-        # Remove all but these characters
354
-        pattern <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]"
355
-        x <- lapply(x, function(col){
356
-            # Take all specified characters as a matrix where each column is a 
357
-            # character
358
-            temp <- stringr::str_extract_all(col, pattern = pattern, simplify = TRUE)
359
-            # Collapse matrix to strings
360
-            temp <- apply(temp, 1, paste, collapse = "")
361
-            # Now NAs are converted into characters. Convert them back
362
-            temp[ temp == "NA" ] <- NA
363
-            # Convert also empty strings to NA
364
-            temp[ temp == "" ] <- NA
365
-            return(temp)
366
-        })
367
-    } else{
368
-        # Remove pattern specified by user
369
-        x <- lapply(x, gsub, pattern = pattern, replacement = "")
363
+    # Get those columns that are not modified
364
+    not_mod <- x[ , colnames(x) %in% ignore.col, drop = FALSE]
365
+    # Subset the data being modified so that it includes only those that user
366
+    # wants to modify
367
+    x <- x[ , !colnames(x) %in% ignore.col, drop = FALSE]
368
+    # If there are still columns to clean after subsetting
369
+    if( ncol(x) > 0 ){
370
+        # Remove artifacts
371
+        if( pattern == "auto" ){
372
+            .require_package("stringr")
373
+            # Remove all but these characters
374
+            pattern <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]"
375
+            x <- lapply(x, function(col){
376
+                # Take all specified characters as a matrix where each column
377
+                # is a character
378
+                temp <- stringr::str_extract_all(
379
+                    col, pattern = pattern, simplify = TRUE)
380
+                # Collapse matrix to strings
381
+                temp <- apply(temp, 1, paste, collapse = "")
382
+                # Now NAs are converted into characters. Convert them back
383
+                temp[ temp == "NA" ] <- NA
384
+                # Convert also empty strings to NA
385
+                temp[ temp == "" ] <- NA
386
+                return(temp)
387
+            })
388
+        } else{
389
+            # Remove pattern specified by user
390
+            x <- lapply(x, gsub, pattern = pattern, replacement = "")
391
+        }
370 392
     }
393
+    # Ensure that the data is data.frame
371 394
     x <- as.data.frame(x)
395
+    # Combine modified and non-modified data
396
+    x <- cbind(x, not_mod)
372 397
     # Add rownames because they are dropped while removing artifacts
373 398
     rownames(x) <- row_names
374 399
     return(x)
... ...
@@ -400,4 +425,4 @@ setMethod(
400 425
     # Create biom
401 426
     biom <- do.call(biomformat::make_biom, args)
402 427
     return(biom)
403
-}
404 428
\ No newline at end of file
429
+}
... ...
@@ -540,7 +540,8 @@
540 540
 # This function sets taxonomy ranks based on rowData of TreeSE. With this,
541 541
 # user can automatically set ranks based on imported data.
542 542
 .set_ranks_based_on_rowdata <- function(
543
-        tse, set.ranks = FALSE, verbose = TRUE, ...){
543
+        tse, set.ranks = FALSE, verbose = TRUE,
544
+        ignore.col = "taxonomy_unparsed", ...){
544 545
     #
545 546
     if( !.is_a_bool(set.ranks) ){
546 547
         stop("'set.ranks' must be TRUE or FALSE.", call. = FALSE)
... ...
@@ -550,28 +551,38 @@
550 551
         stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
551 552
     }
552 553
     #
553
-    # If user do not want to set ranks
554
-    if( !set.ranks ){
555
-        return(NULL)
554
+    if( !(is.character(ignore.col) || is.null(ignore.col)) ){
555
+        stop("'ignore.col' must be a character value or NULL.", call. = FALSE)
556 556
     }
557
+    #
557 558
     # Get ranks from rowData
558
-    ranks <- colnames(rowData(tse))
559
+    rd <- rowData(tse)
560
+    # Remove those columns that are ignored. By default, the column
561
+    # containing unparsed taxonomy.
562
+    rd <- rd[ , !colnames(rd) %in% ignore.col, drop = FALSE]
559 563
     # Ranks must be character columns
560
-    is_char <- lapply(rowData(tse), function(x) is.character(x) || is.factor(x))
564
+    is_char <- lapply(
565
+        rd, function(x) is.character(x) || is.factor(x))
561 566
     is_char <- unlist(is_char)
562
-    ranks <- ranks[ is_char ]
563
-    # rowData is empty, cannot set ranks
564
-    if( length(ranks) == 0 ){
567
+    rd <- rd[ , is_char, drop = FALSE]
568
+    # If user wants to set ranks and there are ranks after filtering out
569
+    # those columns that are not characters.
570
+    if( set.ranks && ncol(rd) > 0L ){
571
+        # Finally, set ranks and give message
572
+        ranks <- colnames(rd)
573
+        temp <- setTaxonomyRanks(ranks)
574
+        if( verbose ){
575
+            message(
576
+                "TAXONOMY_RANKS set to: '",
577
+                paste0(ranks, collapse = "', '"), "'")
578
+        }
579
+    }
580
+    # If user wanted to set ranks but there were no suitable columns in rowData,
581
+    # give warning
582
+    if( set.ranks && ncol(rd) == 0L ){
565 583
         warning(
566 584
             "Ranks cannot be set. rowData(x) does not include columns ",
567 585
             "specifying character values.", call. = FALSE)
568
-        return(NULL)
569
-    }
570
-    # Finally, set ranks and give message
571
-    temp <- setTaxonomyRanks(ranks)
572
-    if( verbose ){
573
-        message(
574
-            "TAXONOMY_RANKS set to: '", paste0(ranks, collapse = "', '"), "'")
575 586
     }
576 587
     return(NULL)
577 588
 }
... ...
@@ -5,7 +5,8 @@
5 5
 \alias{convertFromBIOM}
6 6
 \alias{convertToBIOM}
7 7
 \alias{convertToBIOM,SummarizedExperiment-method}
8
-\title{Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} results}
8
+\title{Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM}
9
+results}
9 10
 \usage{
10 11
 importBIOM(file, ...)
11 12
 
... ...
@@ -33,8 +34,8 @@ convertToBIOM(x, assay.type = "counts", ...)
33 34
 
34 35
 \item{prefix.rm}{\code{Logical scalar}. Should
35 36
 taxonomic prefixes be removed? The prefixes is removed only from detected
36
-taxa columns meaning that \code{rank.from.prefix} should be enabled in the most cases.
37
-(Default: \code{FALSE})}
37
+taxa columns meaning that \code{rank.from.prefix} should be enabled in the
38
+most cases. (Default: \code{FALSE})}
38 39
 
39 40
 \item{removeTaxaPrefixes}{Deprecated. Use \code{prefix.rm} instead.}
40 41
 
... ...
@@ -69,7 +70,8 @@ objects, and
69 70
 objects to phyloseq objects.
70 71
 }
71 72
 \details{
72
-\code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} object to a
73
+\code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}}
74
+object to a
73 75
 \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
74 76
 object.
75 77