##' site mask ##' ##' ##' @title mask ##' @param tree_object tree object ##' @param field selected field ##' @param site site ##' @param mask_site if TRUE, site will be masked. ##' if FALSE, selected site will not be masked, while other sites will be masked. ##' @return updated tree object ##' @export ##' @author Guangchuang Yu mask <- function(tree_object, field, site, mask_site=FALSE) { has_field <- has.field(tree_object, field) if (has_field == FALSE) { stop("'field' is not available in 'tree_object'...") } has_slot <- attr(has_field, "has_slot") is_codeml <- attr(has_field, "is_codeml") if (has_slot) { if (is_codeml) { field_info <- slot(tree_object@rst, field) } else { field_info <- slot(tree_object, field) } field_data <- field_info[,2] } else { field_data <- tree_object@extraInfo[, field] } field_data <- sapply(field_data, gsub, pattern="\n", replacement="/") x <- field_data[field_data != ""] x <- x[!is.na(x)] pos <- strsplit(x, " / ") %>% unlist %>% gsub("^[a-zA-Z]+", "", . ) %>% gsub("[a-zA-Z]\\s*$", "", .) %>% as.numeric if (mask_site == FALSE) { pos2 <- 1:max(pos) pos2 <- pos2[-site] site <- pos2 } site <- site[site %in% pos] for (i in seq_along(field_data)) { if (is.na(field_data[i])) next for (j in seq_along(site)) { pattern <- paste0("/*\\s*[a-zA-Z]", site[j], "[a-zA-Z]\\s*") field_data[i] <- gsub(pattern, "", field_data[i]) } field_data[i] <- gsub("^/\\s", "", field_data[i]) %>% .add_new_line } if (has_slot) { field_info[,2] <- field_data if (is_codeml) { slot(tree_object@rst, field) <- field_info } else { slot(tree_object, field) <- field_info } } else { tree_object@extraInfo[, field] <- field_data } tree_object } read.tip_seq_mlc <- function(mlcfile) { info <- getPhyInfo(mlcfile) mlc <- readLines(mlcfile) ## remove blank lines blk <- grep("^\\s*$", mlc) if (length(blk) > 0) { mlc <- mlc[-blk] } seqs <- mlc[2:(info$num+1)] seqs <- gsub("\\s+", "", seqs) wd <- info$width res <- sapply(seqs, function(x) substring(x, (base::nchar(x) - wd + 1), base::nchar(x))) nn <- sapply(seqs, function(x) substring(x, 1, (base::nchar(x) - wd))) names(res) <- nn return(res) } read.tip_seq_mlb <- read.tip_seq_mlc read.dnds_mlc <- function(mlcfile) { mlc <- readLines(mlcfile) i <- grep("dN & dS for each branch", mlc) j <- grep("tree length for dN", mlc) mlc <- mlc[i:j] hi <- grep("dN/dS", mlc) cn <- strsplit(mlc[hi], " ") %>% unlist %>% `[`(nzchar(.)) ii <- grep("\\d+\\.\\.\\d+", mlc) info <- mlc[ii] info %<>% sub("^\\s+", "", .) info %<>% sub("\\s+$", "", .) res <- t(sapply(info, function(x) { y <- unlist(strsplit(x, "\\s+")) edge <- unlist(strsplit(y[1], "\\.\\.")) yy <- c(edge, y[-1]) as.numeric(yy) })) row.names(res) <- NULL colnames(res) <- c("parent", "node", cn[-1]) colnames(res) <- gsub("\\*", "_x_", colnames(res)) colnames(res) <- gsub("\\/", "_vs_", colnames(res)) return(res) } read.treetext_paml_mlc <- function(mlcfile) { read.treetext_paml(mlcfile, "mlc") } read.treetext_paml_rst<- function(rstfile) { read.treetext_paml(rstfile, "rst") } read.treetext_paml <- function(file, by) { ## works fine with baseml and codeml x <- readLines(file) tr.idx <- get_tree_index_paml(x) if ( by == "rst" ) { ii <- 1 } else if (by == "mlc") { ii <- 3 } else { stop("_by_ should be one of 'rst' or 'mlc'") } return(x[tr.idx][ii]) } ##' @importFrom ape Ntip read.phylo_paml_mlc <- function(mlcfile) { parent <- node <- label <- NULL mlc <- readLines(mlcfile) edge <- get_tree_edge_paml(mlc) tr.idx <- get_tree_index_paml(mlc) tr2 <- read.tree(text=mlc[tr.idx[2]]) tr3 <- read.tree(text=mlc[tr.idx[3]]) treeinfo <- as.data.frame(edge) colnames(treeinfo) <- c("parent", "node") treeinfo$length <- NA treeinfo$label <- NA treeinfo$isTip <- FALSE ntip <- Ntip(tr3) ii <- match(tr2$tip.label, treeinfo[, "node"]) treeinfo[ii, "label"] <- tr3$tip.label treeinfo[ii, "isTip"] <- TRUE ## jj <- match(1:ntip, tr3$edge[,2]) treeinfo[ii, "length"] <- tr2$edge.length[ii] ##tr3$edge.length[jj] root <- getRoot(tr3) ## always == (Ntip(tr3) + 1) currentNode <- treeinfo$label[ii] treeinfo.tr3 <- fortify(tr3) treeinfo$visited <- FALSE while(any(treeinfo$visited == FALSE)) { pNode <- c() for( kk in currentNode ) { i <- which(treeinfo$label == kk) treeinfo[i, "visited"] <- TRUE j <- which(treeinfo.tr3$label == kk) ip <- treeinfo[i, "parent"] if (ip != root) { ii <- which(treeinfo[, "node"] == ip) if (treeinfo$visited[ii] == FALSE) { jp <- treeinfo.tr3[j, "parent"] jj <- which(treeinfo.tr3[, "node"] == jp) treeinfo[ii, "label"] <- as.character(ip) treeinfo.tr3[jj, "label"] <- as.character(ip) treeinfo[ii, "length"] <- treeinfo.tr3[jj, "branch.length"] pNode <- c(pNode, ip) } treeinfo[ii, "visited"] <- TRUE } } currentNode <- unique(pNode) } phylo <- with(treeinfo, list(edge= cbind(as.numeric(parent), as.numeric(node)), edge.length = length, tip.label = label[order(node)][1:ntip], Nnode = tr3$Nnode, node.label = c(root, label[order(node)][-c(1:ntip)]) ) ) class(phylo) <- "phylo" phylo <- reorder.phylo(phylo, "cladewise") return(phylo) } ##' @importFrom ape reorder.phylo read.phylo_paml_rst <- function(rstfile) { parent <- node <- label <- NULL ## works fine with baseml and codeml rst <- readLines(rstfile) tr.idx <- get_tree_index_paml(rst) tr1 <- read.tree(text=rst[tr.idx][1]) tr3 <- read.tree(text=rst[tr.idx][3]) edge <- get_tree_edge_paml(rst) label=c(tr3$tip.label, tr3$node.label) root <- getRoot(tr3) label %<>% `[`(. != root) node.length <- data.frame(label=label, length=tr1$edge.length) ## node.length$node <- sub("_\\w+", "", node.length$label node.length$node <- gsub("^(\\d+)_.*", "\\1", node.length$label) node.length$label %<>% sub("\\d+_", "", .) edge <- as.data.frame(edge) colnames(edge) <- c("parent", "node") treeinfo <- merge(edge, node.length, by.x="node", by.y="node") edge2 <- treeinfo[, c("parent", "node")] edge2 %<>% as.matrix ntip <- Ntip(tr3) phylo <- with(treeinfo, list(edge= cbind(as.numeric(parent), as.numeric(node)), edge.length = length, tip.label = label[order(node)][1:ntip], Nnode = tr3$Nnode, node.label = c(root, label[order(node)][-c(1:ntip)]) ) ) class(phylo) <- "phylo" phylo <- reorder.phylo(phylo, "cladewise") return(phylo) } read.ancseq_paml_rst <- function(rstfile, by="Marginal") { ## works fine with baseml and codeml rst <- readLines(rstfile) by <- match.arg(by, c("Marginal", "Joint")) query <- paste(by, "reconstruction of ancestral sequences") idx <- grep(query, rst) if(length(idx) == 0) { ## in some paml setting, joint_ancseq are not available. return("") } si <- grep("reconstructed sequences", rst) idx <- si[which.min(abs(si-idx))] nl <- strsplit(rst[idx+2], split=" ") %>% unlist %<>% `[`(nzchar(.)) N <- as.numeric(nl[1]) seq.leng <- as.numeric(nl[2]) seqs <- rst[(idx+4):(idx+3+N)] seq.name <- character(N) res <- character(N) for (i in 1:N) { ss <- gsub(" ", "", seqs[i]) nn <- base::nchar(ss) res[i] <- substring(ss, nn-seq.leng+1,nn) seq.name[i] <- substring(ss, 1, nn-seq.leng) } seq.name <- sub("\\w+#", "", seq.name) names(res) <- seq.name return(res) } get_tree_index_paml <- function(paml) { grep("\\)[ \\.0-9]*;", paml) } get_tree_edge_index_paml <- function(paml) { grep("\\d+\\.\\.\\d+", paml) } get_tree_edge_paml <- function(paml) { tr.idx <- get_tree_index_paml(paml) edge.idx <- get_tree_edge_index_paml(paml) edge.idx <- edge.idx[edge.idx < tr.idx[3]] nodeNum <- strsplit(paml[edge.idx], split="\\.\\.") %>% unlist %>% strsplit(split="[[:space:]]") %>% unlist nodeNum %<>% `[`(nzchar(.)) edge <- matrix(as.numeric(nodeNum), ncol=2, byrow = TRUE) return(edge) } set.paml_rst_ <- function(object) { if (!is(object, "paml_rst")) { stop("object should be an instance of 'paml_rst'") } if (length(object@tip_seq) == 0) { return(object) } types <- get.fields(object) for (type in types) { value <- subs_paml_rst(object, type) if (all(is.na(value))) next if (type == "marginal_subs") { object@marginal_subs <- value } else if (type == "marginal_AA_subs") { object@marginal_AA_subs <- value } else if (type == "joint_subs") { object@joint_subs <- value } else if (type == "joint_AA_subs") { object@joint_AA_subs <- value } } return(object) } get.subs_paml_rst <- function(object, type) { if (!is(object, "paml_rst")) { stop("object should be an instance of 'paml_rst'") } if (type == "marginal_subs") { res <- object@marginal_subs } else if (type == "marginal_AA_subs") { res <- object@marginal_AA_subs } else if (type == "joint_subs") { res <- object@joint_subs } else if (type == "joint_AA_subs") { res <- object@joint_AA_subs } else { stop("type should be one of 'marginal_subs', 'marginal_AA_subs', 'joint_subs' or 'joint_AA_subs'. ") } return(res) } subs_paml_rst <- function(x, type, ...) { if (class(x) != "paml_rst") { stop("x should be an object of paml_rst...") } seqs <- x@tip_seq if (length(seqs) == 0) { stop("tip sequences is not available...") } if (type %in% c("marginal_subs", "marginal_AA_subs")) { ancseq <- x@marginal_ancseq ## seqs <- c(seqs, x@marginal_ancseq) } else if (type %in% c("joint_subs", "joint_AA_subs")){ ancseq <- x@joint_ancseq ## seqs <- c(seqs, x@joint_ancseq) } else { stop("type should be one of 'marginal_subs', 'marginal_AA_subs', 'joint_subs' or 'joint_AA_subs'. ") } if( type %in% c("marginal_subs", "joint_subs")) { translate <- FALSE } else { translate <- TRUE } if (all(ancseq == "")) { return(NA) } seqs <- c(seqs, ancseq) get.subs_(x@phylo, seqs, translate=translate, ...) }