#' Search for a sequence #' #' Search for one or more amino acid or nucleotide CDR3 sequences in a list of #' data frames. #' #' @param list A list of data frames generated by the LymphoSeq functions readImmunoSeq #' or productiveSeq. "aminoAcid" or "nucleotide", "frequencyCount", and #' "count" are required columns. #' @param sequence A character vector of one ore more amino acid or nucleotide #' CDR3 sequences to search. #' @param type A character vector specifying the type of sequence(s) to be #' searched. Available options are "aminoAcid" or "nucleotide". #' @param match A character vector specifying whether an exact partial or exact global #' match of the searched sequence(s) is desired. Available options are #' "partial" and "global". #' @param editDistance An integer giving the minimum edit distance that the #' sequence must be less than or equal to. See details below. #' @details An exact partial match means the searched sequence is contained within #' target sequence. An exact global match means the searched sequence is identical to #' the target sequence. #' #' Edit distance is a way of quantifying how dissimilar two sequences #' are to one another by counting the minimum number of operations required to #' transform one sequence into the other. For example, an edit distance of 0 #' means the sequences are identical and an edit distance of 1 indicates that #' the sequences different by a single amino acid or nucleotide. #' @return Returns the rows for every instance in the list of data frames where #' the searched sequence(s) appeared. #' @examples #' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") #' #' file.list <- readImmunoSeq(path = file.path) #' #' aa1 <- "CASSPVSNEQFF" #' #' aa2 <- "CASSQEVPPYQAFF" #' #' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", #' match = "global", editDistance = 0) #' #' searchSeq(list = file.list, sequence = c(aa1, aa2), #' type = "aminoAcid", match = "global", editDistance = 0) #' #' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", editDistance = 1) #' #' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA" #' #' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 3) #' #' searchSeq(list = file.list, sequence = "CASSPVS", type = "aminoAcid", #' match = "partial", editDistance = 0) #' #' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 0) #' @export #' @importFrom plyr llply #' @importFrom utils adist searchSeq <- function(list, sequence, type = "aminoAcid", match = "global", editDistance = 0) { if (editDistance >= 1) { merged.results <- data.frame() i <- 1 for (i in 1:length(list)) { file <- list[[i]] ed <- adist(sequence, file[, type]) rownames(ed) <- sequence colnames(ed) <- file[, type] ed.index <- which(ed <= editDistance, arr.ind = TRUE) if (nrow(ed.index) >= 1) { ed.subset <- as.integer() j <- 1 for (j in 1:nrow(ed.index)) { ed.j <- ed[ed.index[j, "row"], ed.index[j, "col"]] ed.subset <- c(ed.subset, ed.j) } results <- data.frame(searchSequnece = rownames(ed)[ed.index[, 1]], foundSequnece = colnames(ed)[ed.index[, 2]], editDistance = ed.subset) results <- results[!duplicated(results), ] search <- plyr::llply(list[i], function(x) x[which(x[, type] %in% results$foundSequnece), ]) found <- NULL k <- 1 for (k in 1:length(search)) { if (nrow(search[[k]]) != 0) { search[[k]] <- cbind(rep(names(search)[k], nrow(search[[k]])), search[[k]]) colnames(search[[k]])[1] <- "sample" found <- rbind(found, search[[k]]) } } names(found)[which(names(found) == type)] <- "foundSequnece" merged.search <- merge(results, found) merged.results <- rbind(merged.results, merged.search) merged.results <- merged.results[c("sample", setdiff(names(merged.results), "sample"))] } } if (nrow(merged.results) >= 1) { merged.results$foundSequnece = as.character(merged.results$foundSequnece) merged.results$searchSequnece = as.character(merged.results$searchSequnece) merged.results = merged.results[order(merged.results$frequencyCount, decreasing = TRUE),] rownames(merged.results) = NULL return(merged.results) } else { cat("No sequences found.") } } else { if (match == "global") { search <- plyr::llply(list, function(x) x[which(x[, type] %in% sequence), ]) } if (match == "partial") { search <- plyr::llply(list, function(x) x[grep(paste(sequence, collapse = "|"), x[, type]), ]) } found <- NULL l <- 1 for (l in 1:length(search)) { if (nrow(search[[l]]) != 0) { search[[l]] <- cbind(rep(names(search)[l], nrow(search[[l]])), search[[l]]) colnames(search[[l]])[1] <- "sample" found <- rbind(found, search[[l]]) } } if (is.null(found)) { message("No sequences found.") } else { found = found[order(found$frequencyCount, decreasing = TRUE),] rownames(found) = NULL return(found) } } }