#' Annotate KEGG edge mappings with user data #' @description Add data column[s] to object created from function #' expand_KEGG_edges #' @export #' @importFrom gtools smartbind #' @importFrom plyr rename #' @param expanded_edges The data frame object generated via the function #' expand_KEGG_edges #' @param KEGG_mappings KEGG_mappings The data.frame object generated by the #' function expand_KEGG_mappings #' @param user_data A data frame where in which the first two columns contain #' gene symbols representing an edge and any/all other column[s] contain #' corresponding edge data. #' @param data_column_no The column index for desired user data to be added #' @param map_type If the genes in your data set are left untranslated #' set to "NUMBER" (assuming numbers are gene accession numbers) #' @param only_mapped A logical indicator; if set to FALSE will return 'de-novo' #' edges that 'exist' in data but are not documented in KEGG #' @return A data frame object with detailed KEGG edge mappings annotated with #' user data #' @examples #' p53_KGML <- get_KGML('hsa04115') #' p53_KEGG_mappings <- expand_KEGG_mappings(p53_KGML) #' p53_edges <- expand_KEGG_edges(p53_KGML, p53_KEGG_mappings) #' p53_HA1E_data <- overlap_info(p53_KGML, p53_KEGG_mappings, 'HA1E', #' data_type = '100_bing', only_mapped = FALSE) #' #' p53_edges_HA1E_ALL <- add_edge_data(p53_edges, p53_KEGG_mappings, #' p53_HA1E_data, c(3, 10,12)) #' p53_edges_HA1E_MAPPED <- add_edge_data(p53_edges, p53_KEGG_mappings, #' p53_HA1E_data, c(3, 10,12), #' only_mapped = TRUE) add_edge_data <- function(expanded_edges, KEGG_mappings, user_data, map_type = "SYMBOL",data_column_no = 3, only_mapped = FALSE) { expanded_edges <- expanded_edges[expanded_edges$type != "maplink", ] if (nrow(expanded_edges) > 0) { if (map_type == "SYMBOL"){ expanded_edges$unique_ID = paste0(expanded_edges$entry1symbol, ",", expanded_edges$entry2symbol) expanded_edges$unique_IDR = paste0(expanded_edges$entry2symbol, ",", expanded_edges$entry1symbol) } if (map_type == "NUMBER"){ expanded_edges$unique_ID = paste0(expanded_edges$entry1accession, ",", expanded_edges$entry2accession) expanded_edges$unique_IDR = paste0(expanded_edges$entry2accession, ",", expanded_edges$entry1accession) } user_data$unique_ID = paste0(user_data[,1], ",", user_data[,2]) pre_mapped1 <- subset(user_data, user_data$unique_ID %in% expanded_edges$unique_ID) pre_mapped2 <- subset(user_data, user_data$unique_ID %in% expanded_edges$unique_IDR) pre_mapped2 <- pre_mapped2[, c(2, 1, 3:ncol(pre_mapped2))] names(pre_mapped2) = names(pre_mapped1) if (nrow(pre_mapped2) >= 1 & nrow(pre_mapped1) >= 1) { pre_mapped2$unique_ID <- paste0(pre_mapped2[,1], ",", pre_mapped2[,2]) pre_mapped <- rbind(pre_mapped1, pre_mapped2) } if (nrow(pre_mapped2) == 0 & (nrow(pre_mapped1) == 0)) { pre_mapped <- data.frame(unique_ID = NA) } if (nrow(pre_mapped1) == 0 & nrow(pre_mapped2) >= 1) { pre_mapped2$unique_ID <- paste0(pre_mapped2[,1], ",", pre_mapped2[,2]) pre_mapped <- pre_mapped2 } if (nrow(pre_mapped2) == 0 & nrow(pre_mapped1) >= 1) { pre_mapped <- pre_mapped1 } if (!is.na(pre_mapped[1, 1])) { expanded_edges_1 <- subset(expanded_edges, expanded_edges$unique_ID %in% pre_mapped$unique_ID) expanded_edges_1 <- expanded_edges[expanded_edges$unique_ID %in% pre_mapped$unique_ID, ] expanded_edges_2 <- expanded_edges[!expanded_edges$unique_ID %in% pre_mapped$unique_ID, ] expanded_edges_1$has_data = 1 testval <- nrow(expanded_edges_2) if (testval > 0) { expanded_edges_2$has_data = 0 edge_set <- rbind(expanded_edges_1, expanded_edges_2) } else { edge_set <- expanded_edges_1 } edge_set <- edge_set[order(edge_set$unique_ID), ] data_to_add <- data.frame("unique_ID" = pre_mapped$unique_ID, stringsAsFactors = FALSE) data_to_add <- cbind(data_to_add, pre_mapped[, data_column_no]) data_to_add <- data_to_add[order(data_to_add$unique_ID), ] colnames(data_to_add)[2] <- "summary_score" annotated_edges <- merge(edge_set, data_to_add, "unique_ID", all.x = TRUE) drops <- c("unique_ID", "unique_IDR") annotated_edges <- annotated_edges[, !(names(annotated_edges) %in% drops)] cat(paste0("Number of edges documented in selected pathway = ", nrow(annotated_edges)), "\n") cat(paste0("Number of edges with corresponding user data = ", sum(annotated_edges$has_data), "\n")) cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/ nrow(annotated_edges) * 100, 2), "%", "\n")) annotated_edges$premapped <- 1 # un_mapped <- subset(user_data, !user_data$unique_ID %in% # expanded_edges$unique_ID & # !user_data$unique_ID %in% expanded_edges$unique_IDR) # un_mapped_edges <- un_mapped[, c(1:2, data_column_no, # ncol(un_mapped))] if (only_mapped) { return(annotated_edges) } } else { annotated_edges <- expanded_edges annotated_edges$premapped <- 1 annotated_edges$has_data <- 0 un_mapped <- user_data un_mapped$unique_ID <- paste0(un_mapped$knockout1, un_mapped$knockout2) un_mapped_edges <- un_mapped[, c(1:2, data_column_no, ncol(un_mapped))] un_mapped_edges$premapped <- 0 un_mapped_edges$has_data <- 0 cat(paste0("Number of edges documented in selected pathway = ", nrow(annotated_edges)), "\n") cat(paste0("Number of edges with corresponding user data = ", sum(annotated_edges$has_data), "\n")) cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/ nrow(annotated_edges) * 100, 2), "%", "\n")) if (only_mapped) { cat(paste0("No documented edges are found in data; only data for de-novo edges can be mapped \n")) return(annotated_edges) } } } else if (nrow(expanded_edges) == 0) { annotated_edges <- expanded_edges annotated_edges$premapped <- 1 annotated_edges$has_data <- 0 un_mapped <- user_data un_mapped$unique_ID <- paste0(un_mapped$knockout1, un_mapped$knockout2) un_mapped_edges <- un_mapped[, c(1:2, data_column_no, ncol(un_mapped))] un_mapped_edges$premapped <- 0 un_mapped_edges$has_data <- 0 cat(paste0("Number of edges documented in selected pathway = ", nrow(annotated_edges)), "\n") cat(paste0("Number of edges with corresponding user data = ", sum(annotated_edges$has_data), "\n")) cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/ nrow(annotated_edges) * 100, 2), "%", "\n")) if (only_mapped) { return(annotated_edges) } } names(un_mapped_edges)[1:2] <- c("entryNAME_1", "entryNAME_2") for (i in 1:nrow(un_mapped_edges)) { un_mapped_edges$Source_eid[i] <- list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL == un_mapped_edges$entryNAME_1[i])]) un_mapped_edges$Target_eid[i] <- list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL == un_mapped_edges$entryNAME_2[i])]) x <- length(unlist(un_mapped_edges$Source_eid[i])) y <- length(unlist(un_mapped_edges$Target_eid[i])) if (x > 1 | y > 1) { un_mapped_edges$simple[i] <- FALSE } else { un_mapped_edges$simple[i] <- TRUE } } simple_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == TRUE, -ncol(un_mapped_edges)] complex_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == FALSE, -ncol(un_mapped_edges)] test_val <- nrow(complex_un_mapped_edges) if (test_val == 0) { un_mapped_edges <- simple_un_mapped_edges } else { keeps <- c("Source_eid", "Target_eid", "unique_ID") c_temp <- complex_un_mapped_edges[, (names(complex_un_mapped_edges) %in% keeps)] for (i in 1:nrow(c_temp)) { x <- length(unlist(c_temp$Source_eid[i])) y <- length(unlist(c_temp$Target_eid[i])) l <- x * y c_temp$Source_eid[i] <- list(sort(unlist(rep(c_temp$Source_eid[i], y)))) c_temp$Target_eid[i] <- list(unlist(rep(c_temp$Target_eid[i], x))) c_temp$unique_ID[i] <- list(rep(c_temp$unique_ID[i], l)) } c_temp <- data.frame(unique_ID = unlist(c_temp$unique_ID), Source_eid = unlist(c_temp$Source_eid), Target_eid = unlist(c_temp$Target_eid)) drops <- c("Source_eid", "Target_eid") complex_un_mapped_edges <- complex_un_mapped_edges[, !(names(complex_un_mapped_edges) %in% drops)] complex_un_mapped_edges <- merge(complex_un_mapped_edges, c_temp) un_mapped_edges <- rbind(simple_un_mapped_edges, complex_un_mapped_edges) } un_mapped_edges$Source_eid <- unlist(un_mapped_edges$Source_eid) un_mapped_edges$Target_eid <- unlist(un_mapped_edges$Target_eid) un_mapped_edges <- plyr::rename(un_mapped_edges, c(Source_eid = "entry1", Target_eid = "entry2", entryNAME_1 = "entry1symbol", entryNAME_2 = "entry2symbol")) un_mapped_edges$subtype1 <- "de_novo" drops <- c("unique_ID") un_mapped_edges <- un_mapped_edges[, !(names(un_mapped_edges) %in% drops)] un_mapped_edges$has_data <- 1 un_mapped_edges$premapped <- 0 for (i in 1:nrow(un_mapped_edges)) { un_mapped_edges$entry1accession[i] <- KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL == un_mapped_edges$entry1symbol[i])][1] un_mapped_edges$entry2accession[i] <- KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL == un_mapped_edges$entry2symbol[i])][1] un_mapped_edges$entry1type[i] <- KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL == un_mapped_edges$entry1symbol[i])][1] un_mapped_edges$entry2type[i] <- KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL == un_mapped_edges$entry2symbol[i])][1] } if (nrow(annotated_edges) > 0 & nrow(un_mapped_edges) > 0) { all_edges <- gtools::smartbind(annotated_edges, un_mapped_edges) } else if (nrow(annotated_edges) > 0) all_edges <- annotated_edges else { all_edges <- un_mapped_edges all_edges$value <- NA all_edges$subtype2 <- NA all_edges$value2 <- NA all_edges$specific_subtype <- NA all_edges$type <- NA all_edges$is_direct <- 1 all_edges$edgeID <- seq(1:nrow(un_mapped_edges)) refcols <- c("edgeID", "entry1accession", "entry2accession", "entry1", "entry2") all_edges <- all_edges[, c(refcols, setdiff(names(all_edges), refcols))] cat(paste0("All documented edges are of type maplink; only data for de-novo edges can be mapped \n")) } for (i in 1:nrow(all_edges)) { if (all_edges$entry1[i] == all_edges$entry2[i]) { all_edges$paralogs[i] <- 1 } else { all_edges$paralogs[i] <- 0 } } return(all_edges) }