... | ... |
@@ -1,4 +1,5 @@ |
1 |
-#' Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} results |
|
1 |
+#' Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} |
|
2 |
+#' results |
|
2 | 3 |
#' |
3 | 4 |
#' For convenience, a few functions are available to convert BIOM, DADA2 and |
4 | 5 |
#' phyloseq objects to |
... | ... |
@@ -9,8 +10,8 @@ |
9 | 10 |
#' |
10 | 11 |
#' @param prefix.rm \code{Logical scalar}. Should |
11 | 12 |
#' taxonomic prefixes be removed? The prefixes is removed only from detected |
12 |
-#' taxa columns meaning that \code{rank.from.prefix} should be enabled in the most cases. |
|
13 |
-#' (Default: \code{FALSE}) |
|
13 |
+#' taxa columns meaning that \code{rank.from.prefix} should be enabled in the |
|
14 |
+#' most cases. (Default: \code{FALSE}) |
|
14 | 15 |
#' |
15 | 16 |
#' @param removeTaxaPrefixes Deprecated. Use \code{prefix.rm} instead. |
16 | 17 |
#' |
... | ... |
@@ -27,7 +28,8 @@ |
27 | 28 |
#' @param remove.artifacts Deprecated. Use \code{artifact.rm} instead. |
28 | 29 |
#' |
29 | 30 |
#' @details |
30 |
-#' \code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} object to a |
|
31 |
+#' \code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} |
|
32 |
+#' object to a |
|
31 | 33 |
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
32 | 34 |
#' object. |
33 | 35 |
#' |
... | ... |
@@ -37,7 +39,7 @@ |
37 | 39 |
#' |
38 | 40 |
#' @return |
39 | 41 |
#' \code{convertFromBIOM} returns an object of class |
40 |
-#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
|
42 |
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
|
41 | 43 |
#' |
42 | 44 |
#' @name importBIOM |
43 | 45 |
#' |
... | ... |
@@ -75,7 +77,7 @@ NULL |
75 | 77 |
#' |
76 | 78 |
#' @return |
77 | 79 |
#' \code{importBIOM} returns an object of class |
78 |
-#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
|
80 |
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
|
79 | 81 |
#' |
80 | 82 |
#' @name importBIOM |
81 | 83 |
#' |
... | ... |
@@ -187,7 +189,8 @@ convertFromBIOM <- function( |
187 | 189 |
# Transposing feature_data and make it df object |
188 | 190 |
feature_data <- as.data.frame(feature_data) |
189 | 191 |
# Add column that includes all the data |
190 |
- feature_data[["taxonomy_unparsed"]] <- apply(feature_data, 1, paste0, collapse = ";") |
|
192 |
+ merged_col <- apply(feature_data, 1, paste0, collapse = ";") |
|
193 |
+ feature_data <- cbind(feature_data, merged_col) |
|
191 | 194 |
# Add correct colnames |
192 | 195 |
colnames(feature_data) <- c(colnames, "taxonomy_unparsed") |
193 | 196 |
} |
... | ... |
@@ -195,27 +198,29 @@ convertFromBIOM <- function( |
195 | 198 |
# that the taxonomy is not parsed. Try to parse it. |
196 | 199 |
if( ncol(feature_data) == 1 ){ |
197 | 200 |
colnames(feature_data) <- "taxonomy_unparsed" |
198 |
- tax_tab <- .parse_taxonomy(feature_data, column_name = colnames(feature_data)) |
|
201 |
+ tax_tab <- .parse_taxonomy( |
|
202 |
+ feature_data, column_name = colnames(feature_data)) |
|
199 | 203 |
feature_data <- cbind(tax_tab, feature_data) |
200 | 204 |
feature_data <- as.data.frame(feature_data) |
201 | 205 |
} |
202 | 206 |
|
203 |
- # Clean feature_data from possible character artifacts if specified |
|
207 |
+ # Clean feature_data from possible character artifacts if specified. |
|
204 | 208 |
if( artifact.rm ){ |
205 | 209 |
feature_data <- .detect_taxa_artifacts_and_clean(feature_data, ...) |
206 | 210 |
} |
207 |
- |
|
208 |
- # Replace taxonomy ranks with ranks found based on prefixes |
|
209 |
- if( rank.from.prefix && all( |
|
210 |
- unlist(lapply(colnames(feature_data), |
|
211 |
- function(x) !x %in% TAXONOMY_RANKS)))){ |
|
211 |
+ # Replace taxonomy ranks with ranks found based on prefixes. Test first |
|
212 |
+ # if columns are already ranks. If they are, do not change them. |
|
213 |
+ cols_not_already_ranks <- lapply( |
|
214 |
+ colnames(feature_data), function(x) !x %in% TAXONOMY_RANKS) |> |
|
215 |
+ unlist() |> all() |
|
216 |
+ if( rank.from.prefix && cols_not_already_ranks ){ |
|
212 | 217 |
# Find ranks |
213 |
- ranks <- lapply(colnames(feature_data), |
|
214 |
- .replace_colnames_based_on_prefix, x=feature_data) |
|
218 |
+ ranks <- lapply( |
|
219 |
+ colnames(feature_data), .replace_colnames_based_on_prefix, |
|
220 |
+ x = feature_data) |
|
215 | 221 |
# Replace old ranks with found ranks |
216 | 222 |
colnames(feature_data) <- unlist(ranks) |
217 | 223 |
} |
218 |
- |
|
219 | 224 |
# Adjust row and colnames |
220 | 225 |
rownames(counts) <- rownames(feature_data) <- biomformat::rownames(x) |
221 | 226 |
colnames(counts) <- rownames(sample_data) <- biomformat::colnames(x) |
... | ... |
@@ -225,27 +230,24 @@ convertFromBIOM <- function( |
225 | 230 |
feature_data <- DataFrame(feature_data) |
226 | 231 |
# Convert into list |
227 | 232 |
assays <- SimpleList(counts = counts) |
228 |
- |
|
229 | 233 |
# Create TreeSE |
230 | 234 |
tse <- TreeSummarizedExperiment( |
231 | 235 |
assays = assays, |
232 | 236 |
colData = sample_data, |
233 | 237 |
rowData = feature_data) |
234 |
- |
|
235 | 238 |
# Set ranks based on rowData columns if user has specified to do so |
236 | 239 |
temp <- .set_ranks_based_on_rowdata(tse, ...) |
237 |
- |
|
238 | 240 |
# Remove prefixes if specified and rowData includes info |
239 | 241 |
if(prefix.rm && ncol(rowData(tse)) > 0){ |
240 | 242 |
rowData(tse) <- .remove_prefixes_from_taxa(rowData(tse), ...) |
241 | 243 |
} |
242 |
- |
|
243 | 244 |
return(tse) |
244 | 245 |
} |
245 | 246 |
|
246 | 247 |
#' @rdname importBIOM |
247 | 248 |
#' |
248 |
-#' @param x \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
|
249 |
+#' @param x |
|
250 |
+#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
|
249 | 251 |
#' |
250 | 252 |
#' @param assay.type \code{Character scaler}. The name of assay. |
251 | 253 |
#' (Default: \code{"counts"}) |
... | ... |
@@ -280,7 +282,7 @@ setMethod( |
280 | 282 |
feature_tab, |
281 | 283 |
prefixes = paste0( |
282 | 284 |
"(", paste0(.taxonomy_rank_prefixes, collapse = "|"), ")__"), |
283 |
- only.taxa.col = TRUE, ...){ |
|
285 |
+ only.taxa.col = TRUE, ignore.col = "taxonomy_unparsed", ...){ |
|
284 | 286 |
# |
285 | 287 |
if( !.is_a_bool(only.taxa.col) ){ |
286 | 288 |
stop("'only.taxa.col' must be TRUE or FALSE.", call. = FALSE) |
... | ... |
@@ -289,17 +291,22 @@ setMethod( |
289 | 291 |
if( !.is_a_string(prefixes) ){ |
290 | 292 |
stop("'prefixes' must be a single character value.", call. = FALSE) |
291 | 293 |
} |
294 |
+ # |
|
295 |
+ if( !(is.character(ignore.col) || is.null(ignore.col)) ){ |
|
296 |
+ stop("'ignore.col' must be a character value or NULL.", call. = FALSE) |
|
297 |
+ } |
|
298 |
+ # |
|
292 | 299 |
# Subset by taking only taxonomy info if user want to remove the pattern |
293 | 300 |
# only from those. (Might be too restricting, e.g., if taxonomy columns are |
294 | 301 |
# not detected in previous steps. That is way the default is FALSE) |
302 |
+ ind <- rep(TRUE, ncol(feature_tab)) |
|
295 | 303 |
if( only.taxa.col ){ |
296 | 304 |
ind <- tolower(colnames(feature_tab)) %in% TAXONOMY_RANKS |
297 |
- temp <- feature_tab[, ind, drop = FALSE] |
|
298 |
- } else{ |
|
299 |
- ind <- rep(TRUE, ncol(feature_tab)) |
|
300 |
- temp <- feature_tab |
|
301 | 305 |
} |
302 |
- |
|
306 |
+ # Also remove those columns that are specified to ignore (by default the |
|
307 |
+ # column that has unparsed data) |
|
308 |
+ ind <- ind & !colnames(feature_tab) %in% ignore.col |
|
309 |
+ temp <- feature_tab[, ind, drop = FALSE] |
|
303 | 310 |
# If there are columns left for removing the pattern |
304 | 311 |
if( ncol(temp) > 0 ){ |
305 | 312 |
# Remove patterns |
... | ... |
@@ -321,7 +328,7 @@ setMethod( |
321 | 328 |
# rank that is fed to function. |
322 | 329 |
.replace_colnames_based_on_prefix <- function(colname, x){ |
323 | 330 |
# Get column |
324 |
- col = x[ , colname] |
|
331 |
+ col <- x[ , colname] |
|
325 | 332 |
# List prefixes |
326 | 333 |
all_ranks <- .taxonomy_rank_prefixes |
327 | 334 |
prefixes <- paste0("^", all_ranks, "__") |
... | ... |
@@ -340,35 +347,53 @@ setMethod( |
340 | 347 |
} |
341 | 348 |
|
342 | 349 |
# Detect and clean non wanted characters from Taxonomy data if needed. |
343 |
-.detect_taxa_artifacts_and_clean <- function(x, pattern = "auto", ...) { |
|
350 |
+.detect_taxa_artifacts_and_clean <- function( |
|
351 |
+ x, pattern = "auto", ignore.col = "taxonomy_unparsed", ...) { |
|
344 | 352 |
# |
345 | 353 |
if( !.is_non_empty_character(pattern) ){ |
346 | 354 |
stop("'pattern' must be a single character value.", call. = FALSE) |
347 | 355 |
} |
348 | 356 |
# |
357 |
+ if( !(is.character(ignore.col) || is.null(ignore.col)) ){ |
|
358 |
+ stop("'ignore.col' must be a character value or NULL.", call. = FALSE) |
|
359 |
+ } |
|
360 |
+ # |
|
361 |
+ # Store rownames because they are lost during modifications |
|
349 | 362 |
row_names <- rownames(x) |
350 |
- # Remove artifacts |
|
351 |
- if( pattern == "auto" ){ |
|
352 |
- .require_package("stringr") |
|
353 |
- # Remove all but these characters |
|
354 |
- pattern <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]" |
|
355 |
- x <- lapply(x, function(col){ |
|
356 |
- # Take all specified characters as a matrix where each column is a |
|
357 |
- # character |
|
358 |
- temp <- stringr::str_extract_all(col, pattern = pattern, simplify = TRUE) |
|
359 |
- # Collapse matrix to strings |
|
360 |
- temp <- apply(temp, 1, paste, collapse = "") |
|
361 |
- # Now NAs are converted into characters. Convert them back |
|
362 |
- temp[ temp == "NA" ] <- NA |
|
363 |
- # Convert also empty strings to NA |
|
364 |
- temp[ temp == "" ] <- NA |
|
365 |
- return(temp) |
|
366 |
- }) |
|
367 |
- } else{ |
|
368 |
- # Remove pattern specified by user |
|
369 |
- x <- lapply(x, gsub, pattern = pattern, replacement = "") |
|
363 |
+ # Get those columns that are not modified |
|
364 |
+ not_mod <- x[ , colnames(x) %in% ignore.col, drop = FALSE] |
|
365 |
+ # Subset the data being modified so that it includes only those that user |
|
366 |
+ # wants to modify |
|
367 |
+ x <- x[ , !colnames(x) %in% ignore.col, drop = FALSE] |
|
368 |
+ # If there are still columns to clean after subsetting |
|
369 |
+ if( ncol(x) > 0 ){ |
|
370 |
+ # Remove artifacts |
|
371 |
+ if( pattern == "auto" ){ |
|
372 |
+ .require_package("stringr") |
|
373 |
+ # Remove all but these characters |
|
374 |
+ pattern <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]" |
|
375 |
+ x <- lapply(x, function(col){ |
|
376 |
+ # Take all specified characters as a matrix where each column |
|
377 |
+ # is a character |
|
378 |
+ temp <- stringr::str_extract_all( |
|
379 |
+ col, pattern = pattern, simplify = TRUE) |
|
380 |
+ # Collapse matrix to strings |
|
381 |
+ temp <- apply(temp, 1, paste, collapse = "") |
|
382 |
+ # Now NAs are converted into characters. Convert them back |
|
383 |
+ temp[ temp == "NA" ] <- NA |
|
384 |
+ # Convert also empty strings to NA |
|
385 |
+ temp[ temp == "" ] <- NA |
|
386 |
+ return(temp) |
|
387 |
+ }) |
|
388 |
+ } else{ |
|
389 |
+ # Remove pattern specified by user |
|
390 |
+ x <- lapply(x, gsub, pattern = pattern, replacement = "") |
|
391 |
+ } |
|
370 | 392 |
} |
393 |
+ # Ensure that the data is data.frame |
|
371 | 394 |
x <- as.data.frame(x) |
395 |
+ # Combine modified and non-modified data |
|
396 |
+ x <- cbind(x, not_mod) |
|
372 | 397 |
# Add rownames because they are dropped while removing artifacts |
373 | 398 |
rownames(x) <- row_names |
374 | 399 |
return(x) |
... | ... |
@@ -400,4 +425,4 @@ setMethod( |
400 | 425 |
# Create biom |
401 | 426 |
biom <- do.call(biomformat::make_biom, args) |
402 | 427 |
return(biom) |
403 |
-} |
|
404 | 428 |
\ No newline at end of file |
429 |
+} |
... | ... |
@@ -540,7 +540,8 @@ |
540 | 540 |
# This function sets taxonomy ranks based on rowData of TreeSE. With this, |
541 | 541 |
# user can automatically set ranks based on imported data. |
542 | 542 |
.set_ranks_based_on_rowdata <- function( |
543 |
- tse, set.ranks = FALSE, verbose = TRUE, ...){ |
|
543 |
+ tse, set.ranks = FALSE, verbose = TRUE, |
|
544 |
+ ignore.col = "taxonomy_unparsed", ...){ |
|
544 | 545 |
# |
545 | 546 |
if( !.is_a_bool(set.ranks) ){ |
546 | 547 |
stop("'set.ranks' must be TRUE or FALSE.", call. = FALSE) |
... | ... |
@@ -550,28 +551,38 @@ |
550 | 551 |
stop("'verbose' must be TRUE or FALSE.", call. = FALSE) |
551 | 552 |
} |
552 | 553 |
# |
553 |
- # If user do not want to set ranks |
|
554 |
- if( !set.ranks ){ |
|
555 |
- return(NULL) |
|
554 |
+ if( !(is.character(ignore.col) || is.null(ignore.col)) ){ |
|
555 |
+ stop("'ignore.col' must be a character value or NULL.", call. = FALSE) |
|
556 | 556 |
} |
557 |
+ # |
|
557 | 558 |
# Get ranks from rowData |
558 |
- ranks <- colnames(rowData(tse)) |
|
559 |
+ rd <- rowData(tse) |
|
560 |
+ # Remove those columns that are ignored. By default, the column |
|
561 |
+ # containing unparsed taxonomy. |
|
562 |
+ rd <- rd[ , !colnames(rd) %in% ignore.col, drop = FALSE] |
|
559 | 563 |
# Ranks must be character columns |
560 |
- is_char <- lapply(rowData(tse), function(x) is.character(x) || is.factor(x)) |
|
564 |
+ is_char <- lapply( |
|
565 |
+ rd, function(x) is.character(x) || is.factor(x)) |
|
561 | 566 |
is_char <- unlist(is_char) |
562 |
- ranks <- ranks[ is_char ] |
|
563 |
- # rowData is empty, cannot set ranks |
|
564 |
- if( length(ranks) == 0 ){ |
|
567 |
+ rd <- rd[ , is_char, drop = FALSE] |
|
568 |
+ # If user wants to set ranks and there are ranks after filtering out |
|
569 |
+ # those columns that are not characters. |
|
570 |
+ if( set.ranks && ncol(rd) > 0L ){ |
|
571 |
+ # Finally, set ranks and give message |
|
572 |
+ ranks <- colnames(rd) |
|
573 |
+ temp <- setTaxonomyRanks(ranks) |
|
574 |
+ if( verbose ){ |
|
575 |
+ message( |
|
576 |
+ "TAXONOMY_RANKS set to: '", |
|
577 |
+ paste0(ranks, collapse = "', '"), "'") |
|
578 |
+ } |
|
579 |
+ } |
|
580 |
+ # If user wanted to set ranks but there were no suitable columns in rowData, |
|
581 |
+ # give warning |
|
582 |
+ if( set.ranks && ncol(rd) == 0L ){ |
|
565 | 583 |
warning( |
566 | 584 |
"Ranks cannot be set. rowData(x) does not include columns ", |
567 | 585 |
"specifying character values.", call. = FALSE) |
568 |
- return(NULL) |
|
569 |
- } |
|
570 |
- # Finally, set ranks and give message |
|
571 |
- temp <- setTaxonomyRanks(ranks) |
|
572 |
- if( verbose ){ |
|
573 |
- message( |
|
574 |
- "TAXONOMY_RANKS set to: '", paste0(ranks, collapse = "', '"), "'") |
|
575 | 586 |
} |
576 | 587 |
return(NULL) |
577 | 588 |
} |
... | ... |
@@ -5,7 +5,8 @@ |
5 | 5 |
\alias{convertFromBIOM} |
6 | 6 |
\alias{convertToBIOM} |
7 | 7 |
\alias{convertToBIOM,SummarizedExperiment-method} |
8 |
-\title{Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} results} |
|
8 |
+\title{Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} |
|
9 |
+results} |
|
9 | 10 |
\usage{ |
10 | 11 |
importBIOM(file, ...) |
11 | 12 |
|
... | ... |
@@ -33,8 +34,8 @@ convertToBIOM(x, assay.type = "counts", ...) |
33 | 34 |
|
34 | 35 |
\item{prefix.rm}{\code{Logical scalar}. Should |
35 | 36 |
taxonomic prefixes be removed? The prefixes is removed only from detected |
36 |
-taxa columns meaning that \code{rank.from.prefix} should be enabled in the most cases. |
|
37 |
-(Default: \code{FALSE})} |
|
37 |
+taxa columns meaning that \code{rank.from.prefix} should be enabled in the |
|
38 |
+most cases. (Default: \code{FALSE})} |
|
38 | 39 |
|
39 | 40 |
\item{removeTaxaPrefixes}{Deprecated. Use \code{prefix.rm} instead.} |
40 | 41 |
|
... | ... |
@@ -69,7 +70,8 @@ objects, and |
69 | 70 |
objects to phyloseq objects. |
70 | 71 |
} |
71 | 72 |
\details{ |
72 |
-\code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} object to a |
|
73 |
+\code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} |
|
74 |
+object to a |
|
73 | 75 |
\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} |
74 | 76 |
object. |
75 | 77 |
|