Browse code

update add_edge_data so that users can map entries genes by accession number

Shana White authored on 27/09/2017 17:23:00
Showing 4 changed files

... ...
@@ -12,6 +12,8 @@
12 12
 #' gene symbols representing an edge and any/all other column[s] contain 
13 13
 #' corresponding edge data.
14 14
 #' @param data_column_no The column index for desired user data to be added
15
+#' @param map_type If the genes in your data set are left untranslated 
16
+#' set to "NUMBER" (assuming numbers are gene accession numbers) 
15 17
 #' @param only_mapped A logical indicator; if set to FALSE will return 'de-novo'
16 18
 #' edges that 'exist' in data but are not documented in KEGG 
17 19
 #' @return A data frame object with detailed KEGG edge mappings annotated with 
... ...
@@ -30,40 +32,48 @@
30 32
 #'                                         only_mapped = TRUE)
31 33
 
32 34
 
33
-add_edge_data <- function(expanded_edges, KEGG_mappings, 
34
-                        user_data, data_column_no = 3, only_mapped = FALSE) {
35
+add_edge_data <- function(expanded_edges, KEGG_mappings,
36
+                          user_data, map_type = "SYMBOL",data_column_no = 3,
37
+                          only_mapped = FALSE) {
35 38
     
36 39
     expanded_edges <- expanded_edges[expanded_edges$type != "maplink", ]
37
-    
38
-    # if ("pre_mapped" %in% names(user_data)) {
39
-    #     user_data <- user_data[, -c(which(colnames(user_data) == "pre_mapped"))]
40
-    # }
41
-    
42 40
     if (nrow(expanded_edges) > 0) {
41
+        if (map_type == "SYMBOL"){
42
+            expanded_edges$unique_ID = 
43
+                paste0(expanded_edges$entry1symbol, ",",
44
+                expanded_edges$entry2symbol)
45
+            expanded_edges$unique_IDR = 
46
+                paste0(expanded_edges$entry2symbol, ",",
47
+                expanded_edges$entry1symbol)
48
+        }
49
+        if (map_type == "NUMBER"){
50
+            expanded_edges$unique_ID = 
51
+                paste0(expanded_edges$entry1accession, ",",
52
+                expanded_edges$entry2accession)
53
+            expanded_edges$unique_IDR = 
54
+                paste0(expanded_edges$entry2accession, ",",
55
+                expanded_edges$entry1accession)
56
+        }
43 57
         
44
-        expanded_edges$unique_ID = paste0(expanded_edges$entry1symbol, ",", 
45
-            expanded_edges$entry2symbol)
46
-        expanded_edges$unique_IDR = paste0(expanded_edges$entry2symbol, ",", 
47
-            expanded_edges$entry1symbol)
48
-        user_data$unique_ID = paste0(user_data$knockout1, ",", 
49
-                                        user_data$knockout2)
50
-        pre_mapped1 <- subset(user_data, user_data$unique_ID %in% 
51
-                                    expanded_edges$unique_ID)
52
-        pre_mapped2 <- subset(user_data, user_data$unique_ID %in% 
53
-                                    expanded_edges$unique_IDR)
58
+        user_data$unique_ID = paste0(user_data[,1], ",",
59
+                                     user_data[,2])
60
+        pre_mapped1 <- subset(user_data, user_data$unique_ID %in%
61
+                                  expanded_edges$unique_ID)
62
+        pre_mapped2 <- subset(user_data, user_data$unique_ID %in%
63
+                                  expanded_edges$unique_IDR)
54 64
         pre_mapped2 <- pre_mapped2[, c(2, 1, 3:ncol(pre_mapped2))]
55 65
         names(pre_mapped2) = names(pre_mapped1)
56 66
         if (nrow(pre_mapped2) >= 1 & nrow(pre_mapped1) >= 1) {
57
-            pre_mapped2$unique_ID <- paste0(pre_mapped2$knockout1, ",", 
58
-                                            pre_mapped2$knockout2)
67
+            pre_mapped2$unique_ID <- paste0(pre_mapped2[,1], ",",
68
+                                            pre_mapped2[,2])
59 69
             pre_mapped <- rbind(pre_mapped1, pre_mapped2)
60 70
         }
61 71
         if (nrow(pre_mapped2) == 0 & (nrow(pre_mapped1) == 0)) {
62 72
             pre_mapped <- data.frame(unique_ID = NA)
63 73
         }
64 74
         if (nrow(pre_mapped1) == 0 & nrow(pre_mapped2) >= 1) {
65
-            pre_mapped2$unique_ID <- paste0(pre_mapped2$knockout1, ",", 
66
-                                            pre_mapped2$knockout2)
75
+            pre_mapped2$unique_ID <- paste0(pre_mapped2[,1], ",",
76
+                                            pre_mapped2[,2])
67 77
             pre_mapped <- pre_mapped2
68 78
         }
69 79
         if (nrow(pre_mapped2) == 0 & nrow(pre_mapped1) >= 1) {
... ...
@@ -71,72 +81,76 @@ add_edge_data <- function(expanded_edges, KEGG_mappings,
71 81
         }
72 82
         if (!is.na(pre_mapped[1, 1])) {
73 83
             expanded_edges_1 <- subset(expanded_edges, expanded_edges$unique_ID
74
-                                        %in% pre_mapped$unique_ID)
75
-            expanded_edges_1 <- expanded_edges[expanded_edges$unique_ID %in% 
76
-                pre_mapped$unique_ID, ]
77
-            expanded_edges_2 <- expanded_edges[!expanded_edges$unique_ID %in% 
78
-                pre_mapped$unique_ID, ]
84
+                                       %in% pre_mapped$unique_ID)
85
+            expanded_edges_1 <- expanded_edges[expanded_edges$unique_ID %in%
86
+                                                   pre_mapped$unique_ID, ]
87
+            expanded_edges_2 <- expanded_edges[!expanded_edges$unique_ID %in%
88
+                                                   pre_mapped$unique_ID, ]
79 89
             expanded_edges_1$has_data = 1
80 90
             testval <- nrow(expanded_edges_2)
81 91
             if (testval > 0) {
82 92
                 expanded_edges_2$has_data = 0
83 93
                 edge_set <- rbind(expanded_edges_1, expanded_edges_2)
84
-            } else {
94
+            }
95
+            else {
85 96
                 edge_set <- expanded_edges_1
86 97
             }
87
-
98
+            
88 99
             edge_set <- edge_set[order(edge_set$unique_ID), ]
89 100
             
90 101
             data_to_add <- data.frame("unique_ID" = pre_mapped$unique_ID, stringsAsFactors = FALSE)
91 102
             data_to_add <- cbind(data_to_add, pre_mapped[, data_column_no])
92
-
103
+            
93 104
             data_to_add <- data_to_add[order(data_to_add$unique_ID), ]
105
+            colnames(data_to_add)[2] <- "summary_score"
94 106
             
95
-            annotated_edges <- merge(edge_set, data_to_add, "unique_ID", 
107
+            annotated_edges <- merge(edge_set, data_to_add, "unique_ID",
96 108
                                      all.x = TRUE)
97 109
             drops <- c("unique_ID", "unique_IDR")
98
-            annotated_edges <- annotated_edges[, !(names(annotated_edges) %in% 
99
-                drops)]
100
-            cat(paste0("Number of edges documented in selected pathway = ", 
101
-                nrow(annotated_edges)), "\n")
102
-            cat(paste0("Number of edges with corresponding user data = ", 
103
-                        sum(annotated_edges$has_data), "\n"))
110
+            annotated_edges <- annotated_edges[, !(names(annotated_edges) %in%
111
+                                                       drops)]
112
+            cat(paste0("Number of edges documented in selected pathway = ",
113
+                       nrow(annotated_edges)), "\n")
114
+            cat(paste0("Number of edges with corresponding user data = ",
115
+                       sum(annotated_edges$has_data), "\n"))
104 116
             cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/
105
-                        nrow(annotated_edges) * 100, 2), "%", "\n"))
117
+                                                nrow(annotated_edges) * 100, 2), "%", "\n"))
106 118
             annotated_edges$premapped <- 1
107
-            un_mapped <- subset(user_data, !user_data$unique_ID %in% 
108
-                                expanded_edges$unique_ID & 
109
-                            !user_data$unique_ID %in% expanded_edges$unique_IDR)
110
-            un_mapped_edges <- un_mapped[, c(1:2, data_column_no,
111
-                                            ncol(un_mapped))]
119
+            # un_mapped <- subset(user_data, !user_data$unique_ID %in%
120
+            #                       expanded_edges$unique_ID &
121
+            #                       !user_data$unique_ID %in% expanded_edges$unique_IDR)
122
+            # un_mapped_edges <- un_mapped[, c(1:2, data_column_no,
123
+            #                                  ncol(un_mapped))]
112 124
             
113 125
             if (only_mapped) {
114 126
                 return(annotated_edges)
115 127
             }
116
-        } else {
128
+        }
129
+        else {
117 130
             annotated_edges <- expanded_edges
118 131
             annotated_edges$premapped <- 1
119 132
             annotated_edges$has_data <- 0
120 133
             un_mapped <- user_data
121
-            un_mapped$unique_ID <- paste0(un_mapped$knockout1, 
122
-                                        un_mapped$knockout2)
123
-            un_mapped_edges <- un_mapped[, c(1:2, data_column_no, 
124
-                                            ncol(un_mapped))]
134
+            un_mapped$unique_ID <- paste0(un_mapped$knockout1,
135
+                                          un_mapped$knockout2)
136
+            un_mapped_edges <- un_mapped[, c(1:2, data_column_no,
137
+                                             ncol(un_mapped))]
125 138
             un_mapped_edges$premapped <- 0
126 139
             un_mapped_edges$has_data <- 0
127
-            cat(paste0("Number of edges documented in selected pathway = ", 
128
-                nrow(annotated_edges)), "\n")
129
-            cat(paste0("Number of edges with corresponding user data = ", 
130
-                        sum(annotated_edges$has_data), "\n"))
140
+            cat(paste0("Number of edges documented in selected pathway = ",
141
+                       nrow(annotated_edges)), "\n")
142
+            cat(paste0("Number of edges with corresponding user data = ",
143
+                       sum(annotated_edges$has_data), "\n"))
131 144
             cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/
132
-                        nrow(annotated_edges) * 100, 2), "%", "\n"))
145
+                                                nrow(annotated_edges) * 100, 2), "%", "\n"))
133 146
             if (only_mapped) {
134
-                cat(paste0("No documented edges are found in data; 
135
-                    only data for de-novo edges can be mapped \n"))
147
+                cat(paste0("No documented edges are found in data;
148
+                           only data for de-novo edges can be mapped \n"))
136 149
                 return(annotated_edges)
137 150
             }
138 151
         }
139
-    } else if (nrow(expanded_edges) == 0) {
152
+        }
153
+    else if (nrow(expanded_edges) == 0) {
140 154
         annotated_edges <- expanded_edges
141 155
         annotated_edges$premapped <- 1
142 156
         annotated_edges$has_data <- 0
... ...
@@ -145,24 +159,24 @@ add_edge_data <- function(expanded_edges, KEGG_mappings,
145 159
         un_mapped_edges <- un_mapped[, c(1:2, data_column_no, ncol(un_mapped))]
146 160
         un_mapped_edges$premapped <- 0
147 161
         un_mapped_edges$has_data <- 0
148
-        cat(paste0("Number of edges documented in selected pathway = ", 
149
-                    nrow(annotated_edges)), "\n")
150
-        cat(paste0("Number of edges with corresponding user data = ", 
151
-                    sum(annotated_edges$has_data), "\n"))
162
+        cat(paste0("Number of edges documented in selected pathway = ",
163
+                   nrow(annotated_edges)), "\n")
164
+        cat(paste0("Number of edges with corresponding user data = ",
165
+                   sum(annotated_edges$has_data), "\n"))
152 166
         cat(paste0("Coverage = ", round(sum(annotated_edges$has_data)/
153
-                    nrow(annotated_edges) * 100, 2), "%", "\n"))
167
+                                            nrow(annotated_edges) * 100, 2), "%", "\n"))
154 168
         if (only_mapped) {
155 169
             return(annotated_edges)
156 170
         }
157 171
     }
158 172
     names(un_mapped_edges)[1:2] <- c("entryNAME_1", "entryNAME_2")
159 173
     for (i in 1:nrow(un_mapped_edges)) {
160
-        un_mapped_edges$Source_eid[i] <- 
161
-            list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL == 
162
-            un_mapped_edges$entryNAME_1[i])])
163
-        un_mapped_edges$Target_eid[i] <- 
164
-            list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL == 
165
-            un_mapped_edges$entryNAME_2[i])])
174
+        un_mapped_edges$Source_eid[i] <-
175
+            list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL ==
176
+                                                 un_mapped_edges$entryNAME_1[i])])
177
+        un_mapped_edges$Target_eid[i] <-
178
+            list(KEGG_mappings$entryID[which(KEGG_mappings$entrySYMBOL ==
179
+                                                 un_mapped_edges$entryNAME_2[i])])
166 180
         x <- length(unlist(un_mapped_edges$Source_eid[i]))
167 181
         y <- length(unlist(un_mapped_edges$Target_eid[i]))
168 182
         if (x > 1 | y > 1) {
... ...
@@ -171,45 +185,45 @@ add_edge_data <- function(expanded_edges, KEGG_mappings,
171 185
             un_mapped_edges$simple[i] <- TRUE
172 186
         }
173 187
     }
174
-    simple_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == TRUE, 
175
-        -ncol(un_mapped_edges)]
176
-    complex_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == FALSE, 
177
-        -ncol(un_mapped_edges)]
188
+    simple_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == TRUE,
189
+                                              -ncol(un_mapped_edges)]
190
+    complex_un_mapped_edges <- un_mapped_edges[un_mapped_edges$simple == FALSE,
191
+                                               -ncol(un_mapped_edges)]
178 192
     test_val <- nrow(complex_un_mapped_edges)
179 193
     if (test_val == 0) {
180 194
         un_mapped_edges <- simple_un_mapped_edges
181 195
     } else {
182 196
         keeps <- c("Source_eid", "Target_eid", "unique_ID")
183
-        c_temp <- complex_un_mapped_edges[, (names(complex_un_mapped_edges) %in% 
184
-            keeps)]
197
+        c_temp <- complex_un_mapped_edges[, (names(complex_un_mapped_edges) %in%
198
+                                                 keeps)]
185 199
         
186 200
         for (i in 1:nrow(c_temp)) {
187 201
             x <- length(unlist(c_temp$Source_eid[i]))
188 202
             y <- length(unlist(c_temp$Target_eid[i]))
189 203
             l <- x * y
190 204
             
191
-            c_temp$Source_eid[i] <- list(sort(unlist(rep(c_temp$Source_eid[i], 
192
-                y))))
205
+            c_temp$Source_eid[i] <- list(sort(unlist(rep(c_temp$Source_eid[i],
206
+                                                         y))))
193 207
             c_temp$Target_eid[i] <- list(unlist(rep(c_temp$Target_eid[i], x)))
194 208
             c_temp$unique_ID[i] <- list(rep(c_temp$unique_ID[i], l))
195 209
         }
196 210
         
197
-        c_temp <- data.frame(unique_ID = unlist(c_temp$unique_ID), Source_eid = 
198
-                            unlist(c_temp$Source_eid),
199
-                            Target_eid = unlist(c_temp$Target_eid))
211
+        c_temp <- data.frame(unique_ID = unlist(c_temp$unique_ID), Source_eid =
212
+                                 unlist(c_temp$Source_eid),
213
+                             Target_eid = unlist(c_temp$Target_eid))
200 214
         drops <- c("Source_eid", "Target_eid")
201
-        complex_un_mapped_edges <- 
202
-            complex_un_mapped_edges[, !(names(complex_un_mapped_edges) %in% 
203
-            drops)]
215
+        complex_un_mapped_edges <-
216
+            complex_un_mapped_edges[, !(names(complex_un_mapped_edges) %in%
217
+                                            drops)]
204 218
         complex_un_mapped_edges <- merge(complex_un_mapped_edges, c_temp)
205
-        un_mapped_edges <- rbind(simple_un_mapped_edges, 
219
+        un_mapped_edges <- rbind(simple_un_mapped_edges,
206 220
                                  complex_un_mapped_edges)
207 221
     }
208 222
     un_mapped_edges$Source_eid <- unlist(un_mapped_edges$Source_eid)
209 223
     un_mapped_edges$Target_eid <- unlist(un_mapped_edges$Target_eid)
210
-    un_mapped_edges <- plyr::rename(un_mapped_edges, c(Source_eid = "entry1", 
211
-        Target_eid = "entry2", entryNAME_1 = "entry1symbol", 
212
-        entryNAME_2 = "entry2symbol"))
224
+    un_mapped_edges <- plyr::rename(un_mapped_edges, c(Source_eid = "entry1",
225
+                                                       Target_eid = "entry2", entryNAME_1 = "entry1symbol",
226
+                                                       entryNAME_2 = "entry2symbol"))
213 227
     un_mapped_edges$subtype1 <- "de_novo"
214 228
     
215 229
     drops <- c("unique_ID")
... ...
@@ -217,38 +231,38 @@ add_edge_data <- function(expanded_edges, KEGG_mappings,
217 231
     un_mapped_edges$has_data <- 1
218 232
     un_mapped_edges$premapped <- 0
219 233
     for (i in 1:nrow(un_mapped_edges)) {
220
-        un_mapped_edges$entry1accession[i] <- 
221
-            KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL == 
222
-            un_mapped_edges$entry1symbol[i])][1]
223
-        un_mapped_edges$entry2accession[i] <- 
224
-            KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL == 
225
-            un_mapped_edges$entry2symbol[i])][1]
226
-        un_mapped_edges$entry1type[i] <- 
227
-            KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL == 
228
-            un_mapped_edges$entry1symbol[i])][1]
229
-        un_mapped_edges$entry2type[i] <- 
230
-            KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL == 
231
-            un_mapped_edges$entry2symbol[i])][1]
234
+        un_mapped_edges$entry1accession[i] <-
235
+            KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL ==
236
+                                                   un_mapped_edges$entry1symbol[i])][1]
237
+        un_mapped_edges$entry2accession[i] <-
238
+            KEGG_mappings$entryACCESSION[which(KEGG_mappings$entrySYMBOL ==
239
+                                                   un_mapped_edges$entry2symbol[i])][1]
240
+        un_mapped_edges$entry1type[i] <-
241
+            KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL ==
242
+                                              un_mapped_edges$entry1symbol[i])][1]
243
+        un_mapped_edges$entry2type[i] <-
244
+            KEGG_mappings$entryTYPE[which(KEGG_mappings$entrySYMBOL ==
245
+                                              un_mapped_edges$entry2symbol[i])][1]
232 246
     }
233 247
     
234 248
     if (nrow(annotated_edges) > 0 & nrow(un_mapped_edges) > 0) {
235 249
         all_edges <- gtools::smartbind(annotated_edges, un_mapped_edges)
236
-    } else if (nrow(annotated_edges) > 0) 
250
+    } else if (nrow(annotated_edges) > 0)
237 251
         all_edges <- annotated_edges else {
238
-        all_edges <- un_mapped_edges
239
-        all_edges$value <- NA
240
-        all_edges$subtype2 <- NA
241
-        all_edges$value2 <- NA
242
-        all_edges$specific_subtype <- NA
243
-        all_edges$type <- NA
244
-        all_edges$is_direct <- 1
245
-        all_edges$edgeID <- seq(1:nrow(un_mapped_edges))
246
-        refcols <- c("edgeID", "entry1accession", "entry2accession", "entry1", 
247
-            "entry2")
248
-        all_edges <- all_edges[, c(refcols, setdiff(names(all_edges), refcols))]
249
-        cat(paste0("All documented edges are of type maplink; only data for 
250
-                    de-novo edges can be mapped \n"))
251
-    }
252
+            all_edges <- un_mapped_edges
253
+            all_edges$value <- NA
254
+            all_edges$subtype2 <- NA
255
+            all_edges$value2 <- NA
256
+            all_edges$specific_subtype <- NA
257
+            all_edges$type <- NA
258
+            all_edges$is_direct <- 1
259
+            all_edges$edgeID <- seq(1:nrow(un_mapped_edges))
260
+            refcols <- c("edgeID", "entry1accession", "entry2accession", "entry1",
261
+                         "entry2")
262
+            all_edges <- all_edges[, c(refcols, setdiff(names(all_edges), refcols))]
263
+            cat(paste0("All documented edges are of type maplink; only data for
264
+                       de-novo edges can be mapped \n"))
265
+        }
252 266
     
253 267
     for (i in 1:nrow(all_edges)) {
254 268
         if (all_edges$entry1[i] == all_edges$entry2[i]) {
... ...
@@ -258,4 +272,4 @@ add_edge_data <- function(expanded_edges, KEGG_mappings,
258 272
         }
259 273
     }
260 274
     return(all_edges)
261
-}
275
+        }
... ...
@@ -27,8 +27,9 @@
27 27
 #'                                "HA1E", data_type = "100_bing")
28 28
 #' 
29 29
 #' p53_edges_HA1E_data_MAPPED <- add_edge_data(p53_edges, p53_KEGG_mappings, 
30
-#'                                                p53_HA1E_data, c(3, 10,12),
31
-#'                                                only_mapped = TRUE)
30
+#'                                             p53_HA1E_data, 
31
+#'                                             data_column_no = c(3, 10,12),
32
+#'                                             only_mapped = TRUE)
32 33
 #'                                                
33 34
 #' p53_edge_mapping_HA1E <- edge_mapping_info(p53_edges_HA1E_data_MAPPED, 
34 35
 #'                                                       data_added = TRUE)
... ...
@@ -4,8 +4,8 @@
4 4
 \alias{add_edge_data}
5 5
 \title{Annotate KEGG edge mappings with user data}
6 6
 \usage{
7
-add_edge_data(expanded_edges, KEGG_mappings, user_data, data_column_no = 3,
8
-  only_mapped = FALSE)
7
+add_edge_data(expanded_edges, KEGG_mappings, user_data, map_type = "SYMBOL",
8
+  data_column_no = 3, only_mapped = FALSE)
9 9
 }
10 10
 \arguments{
11 11
 \item{expanded_edges}{The data frame object generated via the function 
... ...
@@ -18,6 +18,9 @@ function expand_KEGG_mappings}
18 18
 gene symbols representing an edge and any/all other column[s] contain 
19 19
 corresponding edge data.}
20 20
 
21
+\item{map_type}{If the genes in your data set are left untranslated 
22
+set to "NUMBER" (assuming numbers are gene accession numbers)}
23
+
21 24
 \item{data_column_no}{The column index for desired user data to be added}
22 25
 
23 26
 \item{only_mapped}{A logical indicator; if set to FALSE will return 'de-novo'
... ...
@@ -43,8 +43,9 @@ p53_HA1E_data <- overlap_info(p53_KGML, p53_KEGG_mappings,
43 43
                                "HA1E", data_type = "100_bing")
44 44
 
45 45
 p53_edges_HA1E_data_MAPPED <- add_edge_data(p53_edges, p53_KEGG_mappings, 
46
-                                               p53_HA1E_data, c(3, 10,12),
47
-                                               only_mapped = TRUE)
46
+                                            p53_HA1E_data, 
47
+                                            data_column_no = c(3, 10,12),
48
+                                            only_mapped = TRUE)
48 49
                                                
49 50
 p53_edge_mapping_HA1E <- edge_mapping_info(p53_edges_HA1E_data_MAPPED, 
50 51
                                                       data_added = TRUE)