#' Classify contigs as Prophage-like, Sloping, HighCovNoPattern, and NoPattern #' #' Performs all the pattern-matching and summarizes the results into a list. The #' first item in the list is a table consisting of the summary information of #' all the contigs that passed through pattern-matching (i.e were not filtered #' out). The second item in the list is a table consisting of the summary #' information of all contigs that were classified via pattern-matching. The #' third item in the list contains the pattern-match information associated with #' each contig in the previous table. The fourth object in the list is a table #' containing the contigs that were filtered out prior to pattern-matching. The #' fifth item is the windowSize used for the search. #' #' @param VLPpileup VLP-fraction pileup file generated by mapping sequencing reads #' from a sample's ultra-purified VLP-fraction mapped to the sample's whole-community #' metagenome assembly. The pileup file MUST have the following format: #' * V1: Contig accession #' * V2: Mapped read coverage values averaged over 100 bp windows #' * V3: Starting position (bp) of each 100 bp window. Restarts from 0 at the #' start of each new contig. #' * V4: Starting position (bp) of each 100 bp window. Does NOT restart at the #' start of each new contig. #' @param WCpileup A whole-community pileup file generated by mapping sequencing reads #' from a sample's whole-community mapped to the sample's whole-community metagenome assembly. #' The pileup file MUST have the following format: #' * V1: Contig accession #' * V2: Mapped read coverage values averaged over 100 bp windows #' * V3: Starting position (bp) of each 100 bp window. Restarts from 0 at the #' start of each new contig. #' * V4: Starting position (bp) of each 100 bp window. Does NOT restart at the #' start of each new contig. #' @param windowSize The number of basepairs to average read coverage values #' over. Options are 100, 200, 500, 1000 ONLY. Default is 1000. #' @param minBlockSize The minimum size (in bp) of the Prophage-like block #' pattern. Default is 10000. Must be at least 1000. #' @param maxBlockSize The maximum size (in bp) of the Prophage-like block #' pattern. Default is NA (no maximum). #' @param minContigLength The minimum contig size (in bp) to perform #' pattern-matching on. Must be at least 25000. Default is 30000. #' @param minSlope The minimum slope value to test for sloping patterns. Default #' is 0.001 (i.e minimum change of 10x read coverage over 100,000 bp). #' @param SaveFilesTo Optional, Provide a path to the directory you wish to save #' output to. A folder will be made within the provided directory to store #' results. #' @param verbose TRUE or FALSE. Print progress messages to console. Default is TRUE. #' @param suggFiltThresh TRUE or FALSE, Suggest a filtering threshold for #' TrIdent classifications based on the normalized pattern-match scores. #' Default is FALSE. #' @importFrom utils capture.output #' @return Large list containing 5 objects #' @export #' @examples #' data("VLPFractionSamplePileup") #' data("WholeCommunitySamplePileup") #' #' TrIdent_results <- TrIdentClassifier( #' VLPpileup = VLPFractionSamplePileup, #' WCpileup = WholeCommunitySamplePileup #' ) TrIdentClassifier <- function(VLPpileup, WCpileup, windowSize = 1000, minBlockSize = 10000, maxBlockSize = Inf, minContigLength = 30000, minSlope = 0.001, suggFiltThresh = FALSE, verbose = TRUE, SaveFilesTo) { ## error catching if (!(windowSize %in% list(100, 200, 500, 1000))) { stop("windowSize must be either 100, 200, 500, or 1000 bp!") } if (minContigLength <= 25000) { stop("minContigLength must be at least 25,000 bp for pattern-matching!") } if (minBlockSize <= 1000) { stop("minBlockSize must be greater than 1000 bp!") } ## input validation if (nrow(VLPpileup) != nrow(WCpileup)) { stop("VLP and WC pileup files have differing row numbers") } if (abs(VLPpileup[1, 3] - VLPpileup[2, 3]) != 100 | abs(WCpileup[1, 3] - WCpileup[2, 3]) != 100) { stop("pileup files MUST have a windowSize/binsize of 100!") } if (all(VLPpileup[, 1] == WCpileup[, 1]) == FALSE) { stop("The first column of the VLP and WC pileup file should be identical if mapping was performed correctly...") } ## main algorithm start startTime <- Sys.time() if (verbose) {message("Reformatting pileup files")} VLPpileup <- pileupFormatter(VLPpileup) WCpileup <- pileupFormatter(WCpileup) if (verbose) {message("Starting pattern-matching...")} classificationSummary <- patternMatcher( VLPpileup, WCpileup, windowSize, minBlockSize, maxBlockSize, minContigLength, minSlope, verbose ) summaryTable <- contigClassSumm(classificationSummary[[1]]) summaryTable <- VLPtoWCRatioCalc(summaryTable, WCpileup, VLPpileup) summaryTable <- patternMatchSize( summaryTable, classificationSummary[[1]], windowSize, verbose ) summaryTable <- prophageLikeElevation( summaryTable, allProphageLikeClassifs(classificationSummary[[1]]), VLPpileup, WCpileup, windowSize, verbose ) summaryTable <- slopeSumm( summaryTable, allSlopingClassifs(classificationSummary[[1]]), windowSize ) if (verbose) {message("Finalizing output")} summaryList <- list( SummaryTable = summaryTable, CleanedSummaryTable = summaryTable[which( summaryTable[, 2] == "Prophage-like" | summaryTable[, 2] == "Sloping" | summaryTable[, 2] == "HighCovNoPattern" ), ], PatternMatchInfo = allPatternMatches( classificationSummary[[1]], summaryTable ), FilteredOutContigTable = classificationSummary[[2]], windowSize = windowSize ) plot <- resultsHisto(summaryList, suggFiltThresh) summaryList <- c(summaryList, ResultHistogram = plot) endTime <- Sys.time() duration <- difftime(endTime, startTime) if (verbose) { message( "Execution time: ", round(duration[[1]], 2), units(duration))} if (verbose) { message( length(which( classificationSummary[[2]][, 2] == "Low VLP-fraction read cov" )), " contigs were filtered out based on low read coverage" ) } if (verbose) { message( length(which( classificationSummary[[2]][, 2] == "Contig length too small" )), " contigs were filtered out based on length" ) } table <- (table(summaryList[[1]][, 2])) if (verbose) { message(paste0(capture.output(table), collapse = "\n")) } if (verbose) { message( length(which(summaryList[[1]][, 8] == "Elevated")), " of the prophage-like classifications are highly active or abundant" ) } if (verbose) { message( length(which(summaryList[[1]][, 8] == "Depressed")), " of the prophage-like classifications are mixed, i.e. heterogenously integrated into their bacterial host population" ) } if (missing(SaveFilesTo) == FALSE) { ifelse(!dir.exists(paths = paste0(SaveFilesTo, "\\TrIdentOutput")), dir.create(paste0(SaveFilesTo, "\\TrIdentOutput")), stop( "'TrIdentOutput' folder exists already in the provided directory" ) ) write.table( summaryTable, file = paste0( SaveFilesTo, "\\TrIdentOutput\\TrIdentSummaryTable.csv" ), sep = ",", row.names = FALSE ) write.table( summaryTable[which( summaryTable[, 2] == "Prophage-like" | summaryTable[, 2] == "Sloping" | summaryTable[, 2] == "HighCovNoPattern" ), ], file = paste0( SaveFilesTo, "\\TrIdentOutput\\TrIdentSummaryTableCleaned.csv" ), sep = ",", row.names = FALSE ) write.table( classificationSummary[[2]], file = paste0( SaveFilesTo, "\\TrIdentOutput\\TrIdentFilteredOutContigs.csv" ), sep = ",", row.names = FALSE ) ggsave( filename = paste0( SaveFilesTo, "\\TrIdentOutput\\TrIdentResultHistogram.png" ), plot = plot, width = 4, height = 4 ) return(summaryList) } else { return(summaryList) } }