Browse code

Update Coding Practice

Yeah that's right, just formating based on devtools and bioccheck to commit to bioconductor, not particularly sexy

Nick authored on 16/06/2021 14:31:31
Showing1 changed files
... ...
@@ -1,11 +1,13 @@
1 1
 #' Clustering T cell receptors
2 2
 #'
3 3
 #' This function uses edit distances of either the nucleotide or amino acid 
4
-#' sequences of the CDR3 to cluster similar TCRs together. The distance clustering
5
-#' will then be ammended to the end of the list of combined contigs with the corresponding Vgene. The
6
-#' cluster will appear as CHAIN.num if a unique sequence or CHAIN:LD.num if clustered together.
7
-#' This function will only two chains recovered, multiple chains will automatically be reduced. This 
8
-#' function also underlies the combineBCR() function and therefore not needed for B cells. This may
4
+#' sequences of the CDR3 to cluster similar TCRs together. The distance 
5
+#' clustering will then be amended to the end of the list of combined 
6
+#' contigs with the corresponding Vgene. The cluster will appear as 
7
+#' CHAIN.num if a unique sequence or CHAIN:LD.num if clustered together.
8
+#' This function will only two chains recovered, multiple chains will 
9
+#' automatically be reduced. This function also underlies the combineBCR() 
10
+#' function and therefore not needed for B cells. This may
9 11
 #' take some time to calculate the distances and cluster. 
10 12
 #' 
11 13
 #' @examples
... ...
@@ -18,8 +20,8 @@
18 20
 #' @param df The product of CombineTCR() or CombineBCR().
19 21
 #' @param chain The TCR to cluster
20 22
 #' @param sequence Clustering based on either "aa" or "nt"
21
-#' @param threshold The normalized edit distance to consider. The higher the number the more 
22
-#' similarity of sequence will be used for clustering.
23
+#' @param threshold The normalized edit distance to consider. The higher 
24
+#' the number the more similarity of sequence will be used for clustering.
23 25
 #' @param group The column header used for to calculate the cluster by.
24 26
 #' @importFrom stringdist stringdistmatrix
25 27
 #' @importFrom igraph graph_from_data_frame components
... ...
@@ -28,91 +30,79 @@
28 30
 #' @export
29 31
 #' @return List of clonotypes for individual cell barcodes
30 32
 
31
-clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85, group = NULL) {
33
+clusterTCR <- function(df, chain = NULL, sequence = NULL, 
34
+                        threshold = 0.85, group = NULL) {
32 35
     output.list <- list()
33 36
     `%!in%` = Negate(`%in%`)
34 37
     df <- checkList(df)
35 38
     if(chain %in% c("TCRA", "TCRG")) {
36 39
         ref <- 1
37 40
     } else if(chain %in% c("TCRB", "TCRD")) {
38
-        ref <- 2
39
-    }
41
+        ref <- 2 }
40 42
     ref2 <- paste0("cdr3_", sequence, ref)
41 43
     bound <- bind_rows(df)
42 44
     #Should make it work as either grouped or non-grouped
43 45
     if (!is.null(group)) {
44
-      bound <- split(bound, bound[,group])
45
-      list.length <- length(bound)
46
+        bound <- split(bound, bound[,group])
47
+        list.length <- length(bound)
46 48
     } else {
47
-      bound <- list(bound)
48
-      list.length <- 1
49
-    }
50
-    
51
-    for (x in seq_along(bound)) {
52
-      dictionary <- na.omit(unique(bound[[x]][,ref2]))
53
-      dictionary <- str_split(dictionary, ";", simplify = TRUE)[,1]
54
-      length <- nchar(dictionary)
55
-      matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
56
-      out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
57
-      for (j in seq_len(ncol(matrix))) {
58
-          for (k in seq_len(nrow(matrix))) {
59
-              if (j == k) {
60
-                  out_matrix[j,k] <- NA
61
-              } else{
62
-                  if (length[j] - length[k] >= round(mean(length)/2)) {
63
-                      out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
64
-                      out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
65
-                  } else {
66
-                  out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
67
-                  out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
68
-                  }
69
-              }
70
-          }
71
-      }
72
-      filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
73
-      if (nrow(filtered) > 0) { 
74
-          for (i in 1:nrow(filtered)) {
75
-              max <- max(filtered[i,])
76
-              min <- min(filtered[i,])
77
-              filtered[i,1] <- max
78
-              filtered[i,2] <- min
79
-          }
80
-          filtered <- unique(filtered) #removing redundant comparisons
81
-          out <- NULL
82
-          colnames(filtered) <- c("To", "From")
83
-          
84
-          g <- graph_from_data_frame(filtered)
85
-          components <- components(g, mode = c("weak"))
86
-          out <- data.frame("cluster" = components$membership, 
87
-                            "filtered" = names(components$membership))
88
-          if (list.length == 1) {
89
-            out$cluster <- paste0(chain, ":LD", ".", out$cluster)
90
-          } else {
91
-            out$cluster <- paste0(names(bound)[x], ".", chain, ":LD", ".", out$cluster)
92
-          }
93
-          out$filtered <- dictionary[as.numeric(out$filtered)]
94
-          
95
-          uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
96
-          colnames(uni_IG) <- "filtered"
97
-          if (list.length == 1) {
98
-            uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG))) 
99
-          } else {
100
-            uni_IG$cluster <- paste0(names(bound)[x], ".", chain, ".", seq_len(nrow(uni_IG))) 
101
-          }
102
-      }
103
-      output <- rbind.data.frame(out, uni_IG)
104
-      colname <- paste0(chain, "_cluster")
105
-      colnames(output) <- c(colname, ref2)
106
-      output.list[[x]] <- output
107
-    }
108
-      for (i in seq_along(df)) {
109
-          tmp <- df[[i]]
110
-          output <- bind_rows(output.list)
111
-          tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = TRUE)[,1]
112
-          output2 <- output[output[,2] %in% tmp[,ref2],]
113
-          
114
-          tmp <-  suppressMessages(join(tmp,  output2))
115
-          df[[i]] <- tmp
116
-      }
49
+        bound <- list(bound)
50
+        list.length <- 1 }
51
+    for (x in seq_along(bound)) {    
52
+        dictionary <- na.omit(unique(bound[[x]][,ref2]))
53
+        dictionary <- str_split(dictionary, ";", simplify = TRUE)[,1]
54
+        length <- nchar(dictionary)
55
+        matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
56
+        out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
57
+        for (j in seq_len(ncol(matrix))) {
58
+            for (k in seq_len(nrow(matrix))) {
59
+                if (j == k) {
60
+                    out_matrix[j,k] <- NA
61
+                } else{
62
+                    if (length[j] - length[k] >= round(mean(length)/2)) {
63
+                        out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
64
+                        out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
65
+                    } else {
66
+                      out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
67
+                      out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
68
+        }}}}
69
+        filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
70
+        if (nrow(filtered) > 0) { 
71
+            for (i in seq_len(nrow(filtered))) {
72
+                max <- max(filtered[i,])
73
+                min <- min(filtered[i,])
74
+                filtered[i,1] <- max
75
+                filtered[i,2] <- min }
76
+            filtered <- unique(filtered) #removing redundant comparisons
77
+            out <- NULL
78
+            colnames(filtered) <- c("To", "From")
79
+            g <- graph_from_data_frame(filtered)
80
+            components <- components(g, mode = c("weak"))
81
+            out <- data.frame("cluster" = components$membership, 
82
+                              "filtered" = names(components$membership))
83
+            if (list.length == 1) {
84
+              out$cluster <- paste0(chain, ":LD", ".", out$cluster)
85
+            } else {
86
+              out$cluster <- paste0(names(bound)[x], ".", chain, ":LD", ".", 
87
+                                    out$cluster)}
88
+            out$filtered <- dictionary[as.numeric(out$filtered)]
89
+            uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
90
+            colnames(uni_IG) <- "filtered"
91
+            if (list.length == 1) {
92
+              uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG))) 
93
+            } else {
94
+              uni_IG$cluster <- paste0(names(bound)[x], ".", chain, ".", 
95
+                                       seq_len(nrow(uni_IG)))}}
96
+        output <- rbind.data.frame(out, uni_IG)
97
+        colname <- paste0(chain, "_cluster")
98
+        colnames(output) <- c(colname, ref2)
99
+        output.list[[x]] <- output }
100
+        for (i in seq_along(df)) {
101
+            tmp <- df[[i]]
102
+            output <- bind_rows(output.list)
103
+            tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = TRUE)[,1]
104
+            output2 <- output[output[,2] %in% tmp[,ref2],]
105
+            tmp <-  suppressMessages(join(tmp,  output2))
106
+            df[[i]] <- tmp }
117 107
     return(df)
118 108
 }  
Browse code

Update clusterTCR.R

Add group variable, reflect new annotation

theHumanBorch authored on 25/05/2021 12:49:59 • GitHub committed on 25/05/2021 12:49:59
Showing1 changed files
... ...
@@ -13,20 +13,23 @@
13 13
 #' combined <- combineTCR(contig_list, rep(c("PX", "PY", "PZ"), each=2), 
14 14
 #' rep(c("P", "T"), 3), cells ="T-AB")
15 15
 #' 
16
-#' sub_combined <- clusterTCR(combined[[1]], chain = "TCRA", sequence = "aa")
16
+#' sub_combined <- clusterTCR(combined[[2]], chain = "TCRA", sequence = "aa")
17 17
 #' 
18 18
 #' @param df The product of CombineTCR() or CombineBCR().
19 19
 #' @param chain The TCR to cluster
20 20
 #' @param sequence Clustering based on either "aa" or "nt"
21 21
 #' @param threshold The normalized edit distance to consider. The higher the number the more 
22 22
 #' similarity of sequence will be used for clustering.
23
+#' @param group The column header used for to calculate the cluster by.
23 24
 #' @importFrom stringdist stringdistmatrix
24 25
 #' @importFrom igraph graph_from_data_frame components
25 26
 #' @importFrom plyr join
27
+#' @importFrom dplyr bind_rows
26 28
 #' @export
27 29
 #' @return List of clonotypes for individual cell barcodes
28 30
 
29
-clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
31
+clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85, group = NULL) {
32
+    output.list <- list()
30 33
     `%!in%` = Negate(`%in%`)
31 34
     df <- checkList(df)
32 35
     if(chain %in% c("TCRA", "TCRG")) {
... ...
@@ -35,61 +38,81 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
35 38
         ref <- 2
36 39
     }
37 40
     ref2 <- paste0("cdr3_", sequence, ref)
38
-    bound <- dplyr::bind_rows(df)
39
-    dictionary <- na.omit(unique(bound[,ref2]))
40
-    dictionary <- str_split(dictionary, ";", simplify = TRUE)[,1]
41
-    length <- nchar(dictionary)
42
-    matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
43
-    out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
44
-    for (j in seq_len(ncol(matrix))) {
45
-        for (k in seq_len(nrow(matrix))) {
46
-            if (j == k) {
47
-                out_matrix[j,k] <- NA
48
-            } else{
49
-                if (length[j] - length[k] >= round(mean(length)/2)) {
50
-                    out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
51
-                    out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
52
-                } else {
53
-                out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
54
-                out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
55
-                }
56
-            }
57
-        }
41
+    bound <- bind_rows(df)
42
+    #Should make it work as either grouped or non-grouped
43
+    if (!is.null(group)) {
44
+      bound <- split(bound, bound[,group])
45
+      list.length <- length(bound)
46
+    } else {
47
+      bound <- list(bound)
48
+      list.length <- 1
58 49
     }
59
-    filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
60
-    if (nrow(filtered) > 0) { 
61
-        for (i in 1:nrow(filtered)) {
62
-            max <- max(filtered[i,])
63
-            min <- min(filtered[i,])
64
-            filtered[i,1] <- max
65
-            filtered[i,2] <- min
66
-        }
67
-        filtered <- unique(filtered) #removing redundant comparisons
68
-        out <- NULL
69
-        colnames(filtered) <- c("To", "From")
70
-        
71
-        g <- graph_from_data_frame(filtered)
72
-        components <- components(g, mode = c("weak"))
73
-        out <- data.frame("cluster" = components$membership, 
74
-                          "filtered" = names(components$membership))
75
-        out$cluster <- paste0(chain, ":LD", ".", out$cluster)
76
-        out$filtered <- dictionary[as.numeric(out$filtered)]
77
-        
78
-        uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
79
-        colnames(uni_IG) <- "filtered"
80
-        uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG)))
81
-    }
82
-    output <- rbind.data.frame(out, uni_IG)
83
-    colname <- paste0(chain, "_cluster")
84
-    colnames(output) <- c(colname, ref2)
85
-    for (i in seq_along(df)) {
86
-        tmp <- df[[i]]
87
-        tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = TRUE)[,1]
88
-        output2 <- output[output[,2] %in% tmp[,ref2],]
89
-        
90
-        tmp <-  suppressMessages(join(tmp,  output2))
91
-        df[[i]] <- tmp
50
+    
51
+    for (x in seq_along(bound)) {
52
+      dictionary <- na.omit(unique(bound[[x]][,ref2]))
53
+      dictionary <- str_split(dictionary, ";", simplify = TRUE)[,1]
54
+      length <- nchar(dictionary)
55
+      matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
56
+      out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
57
+      for (j in seq_len(ncol(matrix))) {
58
+          for (k in seq_len(nrow(matrix))) {
59
+              if (j == k) {
60
+                  out_matrix[j,k] <- NA
61
+              } else{
62
+                  if (length[j] - length[k] >= round(mean(length)/2)) {
63
+                      out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
64
+                      out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
65
+                  } else {
66
+                  out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
67
+                  out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
68
+                  }
69
+              }
70
+          }
71
+      }
72
+      filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
73
+      if (nrow(filtered) > 0) { 
74
+          for (i in 1:nrow(filtered)) {
75
+              max <- max(filtered[i,])
76
+              min <- min(filtered[i,])
77
+              filtered[i,1] <- max
78
+              filtered[i,2] <- min
79
+          }
80
+          filtered <- unique(filtered) #removing redundant comparisons
81
+          out <- NULL
82
+          colnames(filtered) <- c("To", "From")
83
+          
84
+          g <- graph_from_data_frame(filtered)
85
+          components <- components(g, mode = c("weak"))
86
+          out <- data.frame("cluster" = components$membership, 
87
+                            "filtered" = names(components$membership))
88
+          if (list.length == 1) {
89
+            out$cluster <- paste0(chain, ":LD", ".", out$cluster)
90
+          } else {
91
+            out$cluster <- paste0(names(bound)[x], ".", chain, ":LD", ".", out$cluster)
92
+          }
93
+          out$filtered <- dictionary[as.numeric(out$filtered)]
94
+          
95
+          uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
96
+          colnames(uni_IG) <- "filtered"
97
+          if (list.length == 1) {
98
+            uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG))) 
99
+          } else {
100
+            uni_IG$cluster <- paste0(names(bound)[x], ".", chain, ".", seq_len(nrow(uni_IG))) 
101
+          }
102
+      }
103
+      output <- rbind.data.frame(out, uni_IG)
104
+      colname <- paste0(chain, "_cluster")
105
+      colnames(output) <- c(colname, ref2)
106
+      output.list[[x]] <- output
92 107
     }
108
+      for (i in seq_along(df)) {
109
+          tmp <- df[[i]]
110
+          output <- bind_rows(output.list)
111
+          tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = TRUE)[,1]
112
+          output2 <- output[output[,2] %in% tmp[,ref2],]
113
+          
114
+          tmp <-  suppressMessages(join(tmp,  output2))
115
+          df[[i]] <- tmp
116
+      }
93 117
     return(df)
94 118
 }  
95
-
Browse code

Merge branch 'dev' into master

theHumanBorch authored on 15/05/2021 18:09:46 • GitHub committed on 15/05/2021 18:09:46
Showing0 changed files
Browse code

CombineTCR() issue

Issue with situation in which given two contigs of a barcode - one is assumed TCRA and the other is assumed TCRB

Nick authored on 09/05/2021 12:38:47
Showing1 changed files
... ...
@@ -37,7 +37,7 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
37 37
     ref2 <- paste0("cdr3_", sequence, ref)
38 38
     bound <- dplyr::bind_rows(df)
39 39
     dictionary <- na.omit(unique(bound[,ref2]))
40
-    dictionary <- str_split(dictionary, ";", simplify = T)[,1]
40
+    dictionary <- str_split(dictionary, ";", simplify = TRUE)[,1]
41 41
     length <- nchar(dictionary)
42 42
     matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
43 43
     out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
... ...
@@ -84,7 +84,7 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
84 84
     colnames(output) <- c(colname, ref2)
85 85
     for (i in seq_along(df)) {
86 86
         tmp <- df[[i]]
87
-        tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = T)[,1]
87
+        tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = TRUE)[,1]
88 88
         output2 <- output[output[,2] %in% tmp[,ref2],]
89 89
         
90 90
         tmp <-  suppressMessages(join(tmp,  output2))
Browse code

adding else

combineBCR and clusterTCR add else to generate mean edit distance by length of the two cdr3 sequence lengths

Nick authored on 07/05/2021 11:01:39
Showing1 changed files
... ...
@@ -49,9 +49,10 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
49 49
                 if (length[j] - length[k] >= round(mean(length)/2)) {
50 50
                     out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
51 51
                     out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
52
-                }
52
+                } else {
53 53
                 out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
54 54
                 out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
55
+                }
55 56
             }
56 57
         }
57 58
     }
Browse code

Update clusterTCR.R

Removing V genes
Fixed issue with multiple chains

Nick authored on 09/04/2021 17:49:26
Showing1 changed files
... ...
@@ -27,67 +27,67 @@
27 27
 #' @return List of clonotypes for individual cell barcodes
28 28
 
29 29
 clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
30
-    `%!in%` = Negate(`%in%`)
31
-    df <- checkList(df)
32
-    if(chain %in% c("TCRA", "TCRG")) {
33
-        ref <- 1
34
-    } else if(chain %in% c("TCRB", "TCRD")) {
35
-        ref <- 2
36
-    }
37
-    ref2 <- paste0("cdr3_", sequence, ref)
38
-    bound <- dplyr::bind_rows(df)
39
-    dictionary <- na.omit(unique(bound[,ref2]))
40
-    dictionary <- str_split(dictionary, ";", simplify = T)[,1]
41
-    length <- nchar(dictionary)
42
-    matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
43
-    out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
44
-    for (j in seq_len(ncol(matrix))) {
45
-        for (k in seq_len(nrow(matrix))) {
46
-            if (j == k) {
47
-                out_matrix[j,k] <- NA
48
-            } else{
49
-                if (length[j] - length[k] >= round(mean(length)/2)) {
50
-                    out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
51
-                    out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
52
-                }
53
-                out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
54
-                out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
55
-            }
56
-        }
57
-    }
58
-    filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
59
-    if (nrow(filtered) > 0) { 
60
-        for (i in 1:nrow(filtered)) {
61
-            max <- max(filtered[i,])
62
-            min <- min(filtered[i,])
63
-            filtered[i,1] <- max
64
-            filtered[i,2] <- min
30
+  `%!in%` = Negate(`%in%`)
31
+  df <- checkList(df)
32
+  if(chain %in% c("TCRA", "TCRG")) {
33
+    ref <- 1
34
+  } else if(chain %in% c("TCRB", "TCRD")) {
35
+    ref <- 2
36
+  }
37
+  ref2 <- paste0("cdr3_", sequence, ref)
38
+  bound <- dplyr::bind_rows(df)
39
+  dictionary <- na.omit(unique(bound[,ref2]))
40
+  dictionary <- str_split(dictionary, ";", simplify = T)[,1]
41
+  length <- nchar(dictionary)
42
+  matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
43
+  out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
44
+  for (j in seq_len(ncol(matrix))) {
45
+    for (k in seq_len(nrow(matrix))) {
46
+      if (j == k) {
47
+        out_matrix[j,k] <- NA
48
+      } else{
49
+        if (length[j] - length[k] >= round(mean(length)/2)) {
50
+          out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
51
+          out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
65 52
         }
66
-        filtered <- unique(filtered) #removing redundant comparisons
67
-        out <- NULL
68
-        colnames(filtered) <- c("To", "From")
69
-        
70
-        g <- graph_from_data_frame(filtered)
71
-        components <- components(g, mode = c("weak"))
72
-        out <- data.frame("cluster" = components$membership, 
73
-                          "filtered" = names(components$membership))
74
-        out$cluster <- paste0(chain, ":LD", ".", out$cluster)
75
-        out$filtered <- dictionary[as.numeric(out$filtered)]
76
-        
77
-        uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
78
-        colnames(uni_IG) <- "filtered"
79
-        uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG)))
53
+        out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
54
+        out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
55
+      }
80 56
     }
81
-    output <- rbind.data.frame(out, uni_IG)
82
-    colname <- paste0(chain, "_cluster")
83
-    colnames(output) <- c(colname, ref2)
84
-    for (i in seq_along(df)) {
85
-        tmp <- df[[i]]
86
-        output2 <- output[output[,2] %in% tmp[,ref2],]
87
-        tmp <-  suppressMessages(join(tmp,output2))
88
-        tmp[,colname] <- paste0(tmp[,colname], ".", str_split(tmp[,paste0("TCR", ref)], "[.]", simplify = TRUE)[,1])
89
-        tmp[,colname][which(tmp[,colname] == "NA.NA")] <- NA ###here
90
-         df[[i]] <- tmp
57
+  }
58
+  filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
59
+  if (nrow(filtered) > 0) { 
60
+    for (i in 1:nrow(filtered)) {
61
+      max <- max(filtered[i,])
62
+      min <- min(filtered[i,])
63
+      filtered[i,1] <- max
64
+      filtered[i,2] <- min
91 65
     }
92
-    return(df)
93
-}  
94 66
\ No newline at end of file
67
+    filtered <- unique(filtered) #removing redundant comparisons
68
+    out <- NULL
69
+    colnames(filtered) <- c("To", "From")
70
+    
71
+    g <- graph_from_data_frame(filtered)
72
+    components <- components(g, mode = c("weak"))
73
+    out <- data.frame("cluster" = components$membership, 
74
+                      "filtered" = names(components$membership))
75
+    out$cluster <- paste0(chain, ":LD", ".", out$cluster)
76
+    out$filtered <- dictionary[as.numeric(out$filtered)]
77
+    
78
+    uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
79
+    colnames(uni_IG) <- "filtered"
80
+    uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG)))
81
+  }
82
+  output <- rbind.data.frame(out, uni_IG)
83
+  colname <- paste0(chain, "_cluster")
84
+  colnames(output) <- c(colname, ref2)
85
+  for (i in seq_along(df)) {
86
+    tmp <- df[[i]]
87
+    tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = T)[,1]
88
+    output2 <- output[output[,2] %in% tmp[,ref2],]
89
+    
90
+    tmp <-  suppressMessages(join(tmp,  output2))
91
+    df[[i]] <- tmp
92
+  }
93
+  return(df)
94
+} 
95 95
\ No newline at end of file
Browse code

Update clusterTCR.R

Removal of Vgene addition

Nick authored on 09/04/2021 13:58:19
Showing1 changed files
... ...
@@ -83,12 +83,11 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
83 83
     colnames(output) <- c(colname, ref2)
84 84
     for (i in seq_along(df)) {
85 85
         tmp <- df[[i]]
86
-        output2 <- output[output[,2] %in% tmp[,ref2],]
87 86
         tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = T)[,1]
87
+        output2 <- output[output[,2] %in% tmp[,ref2],]
88
+        
88 89
         tmp <-  suppressMessages(join(tmp,  output2))
89
-        tmp[,colname] <- paste0(tmp[,colname], ".", str_split(tmp[,paste0("TCR", ref)], "[.]", simplify = TRUE)[,1])
90
-        tmp[,colname][which(tmp[,colname] == "NA.NA")] <- NA ###here
91
-         df[[i]] <- tmp
90
+        df[[i]] <- tmp
92 91
     }
93 92
     return(df)
94
-}  
95 93
\ No newline at end of file
94
+}  
Browse code

Update clusterTCR.R

Handling of multiple chains

Nick authored on 09/04/2021 13:24:58
Showing1 changed files
... ...
@@ -84,7 +84,8 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
84 84
     for (i in seq_along(df)) {
85 85
         tmp <- df[[i]]
86 86
         output2 <- output[output[,2] %in% tmp[,ref2],]
87
-        tmp <-  suppressMessages(join(tmp,output2))
87
+        tmp[,ref2] <- str_split(tmp[,ref2], ";", simplify = T)[,1]
88
+        tmp <-  suppressMessages(join(tmp,  output2))
88 89
         tmp[,colname] <- paste0(tmp[,colname], ".", str_split(tmp[,paste0("TCR", ref)], "[.]", simplify = TRUE)[,1])
89 90
         tmp[,colname][which(tmp[,colname] == "NA.NA")] <- NA ###here
90 91
          df[[i]] <- tmp
Browse code

v1.0.3 Commit

Nick authored on 08/04/2021 17:40:20
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,93 @@
1
+#' Clustering T cell receptors
2
+#'
3
+#' This function uses edit distances of either the nucleotide or amino acid 
4
+#' sequences of the CDR3 to cluster similar TCRs together. The distance clustering
5
+#' will then be ammended to the end of the list of combined contigs with the corresponding Vgene. The
6
+#' cluster will appear as CHAIN.num if a unique sequence or CHAIN:LD.num if clustered together.
7
+#' This function will only two chains recovered, multiple chains will automatically be reduced. This 
8
+#' function also underlies the combineBCR() function and therefore not needed for B cells. This may
9
+#' take some time to calculate the distances and cluster. 
10
+#' 
11
+#' @examples
12
+# Getting the combined contigs
13
+#' combined <- combineTCR(contig_list, rep(c("PX", "PY", "PZ"), each=2), 
14
+#' rep(c("P", "T"), 3), cells ="T-AB")
15
+#' 
16
+#' sub_combined <- clusterTCR(combined[[1]], chain = "TCRA", sequence = "aa")
17
+#' 
18
+#' @param df The product of CombineTCR() or CombineBCR().
19
+#' @param chain The TCR to cluster
20
+#' @param sequence Clustering based on either "aa" or "nt"
21
+#' @param threshold The normalized edit distance to consider. The higher the number the more 
22
+#' similarity of sequence will be used for clustering.
23
+#' @importFrom stringdist stringdistmatrix
24
+#' @importFrom igraph graph_from_data_frame components
25
+#' @importFrom plyr join
26
+#' @export
27
+#' @return List of clonotypes for individual cell barcodes
28
+
29
+clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
30
+    `%!in%` = Negate(`%in%`)
31
+    df <- checkList(df)
32
+    if(chain %in% c("TCRA", "TCRG")) {
33
+        ref <- 1
34
+    } else if(chain %in% c("TCRB", "TCRD")) {
35
+        ref <- 2
36
+    }
37
+    ref2 <- paste0("cdr3_", sequence, ref)
38
+    bound <- dplyr::bind_rows(df)
39
+    dictionary <- na.omit(unique(bound[,ref2]))
40
+    dictionary <- str_split(dictionary, ";", simplify = T)[,1]
41
+    length <- nchar(dictionary)
42
+    matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
43
+    out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
44
+    for (j in seq_len(ncol(matrix))) {
45
+        for (k in seq_len(nrow(matrix))) {
46
+            if (j == k) {
47
+                out_matrix[j,k] <- NA
48
+            } else{
49
+                if (length[j] - length[k] >= round(mean(length)/2)) {
50
+                    out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
51
+                    out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
52
+                }
53
+                out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
54
+                out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
55
+            }
56
+        }
57
+    }
58
+    filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
59
+    if (nrow(filtered) > 0) { 
60
+        for (i in 1:nrow(filtered)) {
61
+            max <- max(filtered[i,])
62
+            min <- min(filtered[i,])
63
+            filtered[i,1] <- max
64
+            filtered[i,2] <- min
65
+        }
66
+        filtered <- unique(filtered) #removing redundant comparisons
67
+        out <- NULL
68
+        colnames(filtered) <- c("To", "From")
69
+        
70
+        g <- graph_from_data_frame(filtered)
71
+        components <- components(g, mode = c("weak"))
72
+        out <- data.frame("cluster" = components$membership, 
73
+                          "filtered" = names(components$membership))
74
+        out$cluster <- paste0(chain, ":LD", ".", out$cluster)
75
+        out$filtered <- dictionary[as.numeric(out$filtered)]
76
+        
77
+        uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
78
+        colnames(uni_IG) <- "filtered"
79
+        uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG)))
80
+    }
81
+    output <- rbind.data.frame(out, uni_IG)
82
+    colname <- paste0(chain, "_cluster")
83
+    colnames(output) <- c(colname, ref2)
84
+    for (i in seq_along(df)) {
85
+        tmp <- df[[i]]
86
+        output2 <- output[output[,2] %in% tmp[,ref2],]
87
+        tmp <-  suppressMessages(join(tmp,output2))
88
+        tmp[,colname] <- paste0(tmp[,colname], ".", str_split(tmp[,paste0("TCR", ref)], "[.]", simplify = TRUE)[,1])
89
+        tmp[,colname][which(tmp[,colname] == "NA.NA")] <- NA ###here
90
+         df[[i]] <- tmp
91
+    }
92
+    return(df)
93
+}  
0 94
\ No newline at end of file
Browse code

Update vignette

Nick authored on 26/02/2021 15:05:21
Showing1 changed files
... ...
@@ -90,6 +90,4 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
90 90
          df[[i]] <- tmp
91 91
     }
92 92
     return(df)
93
-}  
94
-    
95
-    
96 93
\ No newline at end of file
94
+}  
97 95
\ No newline at end of file
Browse code

Update clusterTCR.R

Nick authored on 25/02/2021 22:50:55
Showing1 changed files
... ...
@@ -89,6 +89,7 @@ clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
89 89
         tmp[,colname][which(tmp[,colname] == "NA.NA")] <- NA ###here
90 90
          df[[i]] <- tmp
91 91
     }
92
+    return(df)
92 93
 }  
93 94
     
94 95
     
95 96
\ No newline at end of file
Browse code

Update clusterTCR.R

Nick authored on 19/02/2021 18:47:43
Showing1 changed files
... ...
@@ -13,7 +13,7 @@
13 13
 #' combined <- combineTCR(contig_list, rep(c("PX", "PY", "PZ"), each=2), 
14 14
 #' rep(c("P", "T"), 3), cells ="T-AB")
15 15
 #' 
16
-#' sub_combined <- combineTCR(combined[[1]], chain = "TCRA", sequence = "aa")
16
+#' sub_combined <- clusterTCR(combined[[1]], chain = "TCRA", sequence = "aa")
17 17
 #' 
18 18
 #' @param df The product of CombineTCR() or CombineBCR().
19 19
 #' @param chain The TCR to cluster
Browse code

Update clusterTCR.R

Update documentation

Nick authored on 19/02/2021 18:16:59
Showing1 changed files
... ...
@@ -9,7 +9,11 @@
9 9
 #' take some time to calculate the distances and cluster. 
10 10
 #' 
11 11
 #' @examples
12
-#' clusterTCR(contig_list, chain = "TCRA", sequence = "aa")
12
+# Getting the combined contigs
13
+#' combined <- combineTCR(contig_list, rep(c("PX", "PY", "PZ"), each=2), 
14
+#' rep(c("P", "T"), 3), cells ="T-AB")
15
+#' 
16
+#' sub_combined <- combineTCR(combined[[1]], chain = "TCRA", sequence = "aa")
13 17
 #' 
14 18
 #' @param df The product of CombineTCR() or CombineBCR().
15 19
 #' @param chain The TCR to cluster
... ...
@@ -24,6 +28,7 @@
24 28
 
25 29
 clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
26 30
     `%!in%` = Negate(`%in%`)
31
+    df <- checkList(df)
27 32
     if(chain %in% c("TCRA", "TCRG")) {
28 33
         ref <- 1
29 34
     } else if(chain %in% c("TCRB", "TCRD")) {
Browse code

Create clusterTCR.R

Use to cluster TCR chains by edit distance.

Nick authored on 19/02/2021 18:08:18
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,89 @@
1
+#' Clustering T cell receptors
2
+#'
3
+#' This function uses edit distances of either the nucleotide or amino acid 
4
+#' sequences of the CDR3 to cluster similar TCRs together. The distance clustering
5
+#' will then be ammended to the end of the list of combined contigs with the corresponding Vgene. The
6
+#' cluster will appear as CHAIN.num if a unique sequence or CHAIN:LD.num if clustered together.
7
+#' This function will only two chains recovered, multiple chains will automatically be reduced. This 
8
+#' function also underlies the combineBCR() function and therefore not needed for B cells. This may
9
+#' take some time to calculate the distances and cluster. 
10
+#' 
11
+#' @examples
12
+#' clusterTCR(contig_list, chain = "TCRA", sequence = "aa")
13
+#' 
14
+#' @param df The product of CombineTCR() or CombineBCR().
15
+#' @param chain The TCR to cluster
16
+#' @param sequence Clustering based on either "aa" or "nt"
17
+#' @param threshold The normalized edit distance to consider. The higher the number the more 
18
+#' similarity of sequence will be used for clustering.
19
+#' @importFrom stringdist stringdistmatrix
20
+#' @importFrom igraph graph_from_data_frame components
21
+#' @importFrom plyr join
22
+#' @export
23
+#' @return List of clonotypes for individual cell barcodes
24
+
25
+clusterTCR <- function(df, chain = NULL, sequence = NULL, threshold = 0.85) {
26
+    `%!in%` = Negate(`%in%`)
27
+    if(chain %in% c("TCRA", "TCRG")) {
28
+        ref <- 1
29
+    } else if(chain %in% c("TCRB", "TCRD")) {
30
+        ref <- 2
31
+    }
32
+    ref2 <- paste0("cdr3_", sequence, ref)
33
+    bound <- dplyr::bind_rows(df)
34
+    dictionary <- na.omit(unique(bound[,ref2]))
35
+    dictionary <- str_split(dictionary, ";", simplify = T)[,1]
36
+    length <- nchar(dictionary)
37
+    matrix <- as.matrix(stringdistmatrix(dictionary, method = "lv"))    
38
+    out_matrix <- matrix(ncol = ncol(matrix), nrow=ncol(matrix))
39
+    for (j in seq_len(ncol(matrix))) {
40
+        for (k in seq_len(nrow(matrix))) {
41
+            if (j == k) {
42
+                out_matrix[j,k] <- NA
43
+            } else{
44
+                if (length[j] - length[k] >= round(mean(length)/2)) {
45
+                    out_matrix[j,k] <- matrix[j,k]/(max(length[j], length[k]))
46
+                    out_matrix[k,j] <- matrix[k,j]/(max(length[j], length[k]))
47
+                }
48
+                out_matrix[j,k] <- matrix[j,k]/((length[j]+ length[k])/2)
49
+                out_matrix[k,j] <- matrix[k,j]/((length[j]+ length[k])/2)
50
+            }
51
+        }
52
+    }
53
+    filtered <- which(out_matrix <= (1-threshold), arr.ind = TRUE)
54
+    if (nrow(filtered) > 0) { 
55
+        for (i in 1:nrow(filtered)) {
56
+            max <- max(filtered[i,])
57
+            min <- min(filtered[i,])
58
+            filtered[i,1] <- max
59
+            filtered[i,2] <- min
60
+        }
61
+        filtered <- unique(filtered) #removing redundant comparisons
62
+        out <- NULL
63
+        colnames(filtered) <- c("To", "From")
64
+        
65
+        g <- graph_from_data_frame(filtered)
66
+        components <- components(g, mode = c("weak"))
67
+        out <- data.frame("cluster" = components$membership, 
68
+                          "filtered" = names(components$membership))
69
+        out$cluster <- paste0(chain, ":LD", ".", out$cluster)
70
+        out$filtered <- dictionary[as.numeric(out$filtered)]
71
+        
72
+        uni_IG <- as.data.frame(unique(dictionary[dictionary %!in% out$filtered]))
73
+        colnames(uni_IG) <- "filtered"
74
+        uni_IG$cluster <- paste0(chain, ".", seq_len(nrow(uni_IG)))
75
+    }
76
+    output <- rbind.data.frame(out, uni_IG)
77
+    colname <- paste0(chain, "_cluster")
78
+    colnames(output) <- c(colname, ref2)
79
+    for (i in seq_along(df)) {
80
+        tmp <- df[[i]]
81
+        output2 <- output[output[,2] %in% tmp[,ref2],]
82
+        tmp <-  suppressMessages(join(tmp,output2))
83
+        tmp[,colname] <- paste0(tmp[,colname], ".", str_split(tmp[,paste0("TCR", ref)], "[.]", simplify = TRUE)[,1])
84
+        tmp[,colname][which(tmp[,colname] == "NA.NA")] <- NA ###here
85
+         df[[i]] <- tmp
86
+    }
87
+}  
88
+    
89
+    
0 90
\ No newline at end of file