Browse code

Update dedup functionality for import data functions & update tutorial with VAM

Yichen Wang authored on 22/11/2021 17:57:17
Showing 1 changed file
... ...
@@ -74,7 +74,8 @@
74 74
     delayedArray,
75 75
     cbNotFirstCol,
76 76
     feNotFirstCol,
77
-    combinedSample) {
77
+    combinedSample,
78
+    rowNamesDedup) {
78 79
 
79 80
     if (length(seqcDirs) != length(samples)) {
80 81
         stop("'seqcDirs' and 'samples' have unequal lengths!")
... ...
@@ -121,6 +122,13 @@
121 122
 
122 123
         }
123 124
         sce <- do.call(SingleCellExperiment::cbind, res)
125
+        if (isTRUE(rowNamesDedup)) {
126
+            if (any(duplicated(rownames(sce)))) {
127
+                message("Duplicated gene names found, adding '-1', '-2', ",
128
+                        "... suffix to them.")
129
+            }
130
+            sce <- dedupRowNames(sce)
131
+        }
124 132
         return(sce)
125 133
 
126 134
     } else {
... ...
@@ -133,7 +141,15 @@
133 141
             res[[i]] <- scei
134 142
         }
135 143
         if (length(seqcDirs) == 1) {
136
-            return(res[[1]])
144
+            sce <- res[[1]]
145
+            if (isTRUE(rowNamesDedup)) {
146
+                if (any(duplicated(rownames(sce)))) {
147
+                    message("Duplicated gene names found, adding '-1', '-2', ",
148
+                            "... suffix to them.")
149
+                }
150
+                sce <- dedupRowNames(sce)
151
+            }
152
+            return(sce)
137 153
         } else {
138 154
             return(res)
139 155
         }
... ...
@@ -145,12 +161,11 @@
145 161
 #' @rdname importSEQC
146 162
 #' @title Construct SCE object from seqc output
147 163
 #' @description Read the filtered barcodes, features, and matrices for all
148
-#'  samples from (preferably a single run of) seqc output. Import and
149
-#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
150
-#'  object.
164
+#'  samples from (preferably a single run of) seqc output. Import and combine
165
+#'  them as one big \link[SingleCellExperiment]{SingleCellExperiment} object.
151 166
 #' @param seqcDirs A vector of paths to seqc output files. Each sample
152
-#'  should have its own path. For example: \code{./pbmc_1k_50x50}.
153
-#'  Must have the same length as \code{samples}.
167
+#'  should have its own path. For example: \code{"./pbmc_1k_50x50"}. Must have 
168
+#'  the same length as \code{samples}.
154 169
 #' @param samples A vector of user-defined sample names for the samples to be
155 170
 #'  imported. Must have the same length as \code{seqcDirs}.
156 171
 #' @param prefix A vector containing the prefix of file names within each
... ...
@@ -158,44 +173,39 @@
158 173
 #'  length as \emph{samples}.
159 174
 #' @param gzipped Boolean. \code{TRUE} if the seqc output files
160 175
 #'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
161
-#'  sparse_molecule_counts.mtx)
162
-#'  were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
163
-#'  not gzipped.
164
-#' Default \code{FALSE}.
176
+#'  sparse_molecule_counts.mtx) were gzip compressed. \code{FALSE} otherwise. 
177
+#'  Default seqc outputs are not gzipped. Default \code{FALSE}.
165 178
 #' @param class Character. The class of the expression matrix stored in the SCE
166
-#'  object. Can be one of "Matrix" (as returned by
167
-#'  \link{readMM} function), or "matrix" (as returned by
168
-#'  \link[base]{matrix} function). Default "Matrix".
179
+#'  object. Can be one of \code{"Matrix"} (as returned by \link{readMM} 
180
+#'  function), or \code{"matrix"} (as returned by \link[base]{matrix} function).
181
+#'  Default \code{"Matrix"}.
169 182
 #' @param delayedArray Boolean. Whether to read the expression matrix as
170 183
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
171 184
 #' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172
-#'  sparse_counts_genes.csv
173
-#' is row index and it will be removed. \code{FALSE} the first column will
174
-#'  be kept.
185
+#'  sparse_counts_genes.csv is row index and it will be removed. \code{FALSE} 
186
+#'  the first column will be kept.
175 187
 #' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
176
-#'  sparse_counts_barcode.csv
177
-#' is row index and it will be removed. \code{FALSE} the first column will
178
-#'  be kept.
188
+#'  sparse_counts_barcode.csv is row index and it will be removed. \code{FALSE} 
189
+#'  the first column will be kept.
179 190
 #' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
180 191
 #' \code{SingleCellExperiment} object containing the combined count matrix,
181
-#'  feature annotations
182
-#'  and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
183
-#'  list containing multiple
192
+#'  feature annotations and the cell annotations. If \code{FALSE}, 
193
+#'  \code{importSEQC} returns a list containing multiple
184 194
 #'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
185 195
 #'  contains count matrix, feature annotations and cell annotations for
186 196
 #'  each sample.
197
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Only applied 
198
+#' if \code{combinedSample} is \code{TRUE} or only one \code{seqcDirs} 
199
+#' specified. Default \code{TRUE}.
187 200
 #' @details
188
-#' \code{importSEQC} imports output from seqc.
189
-#'  The default sparse_counts_barcode.csv or sparse_counts_genes.csv from
190
-#'  seqc output
201
+#' \code{importSEQC} imports output from seqc. The default 
202
+#'  sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
191 203
 #'  contains two columns. The first column is row index and the second column
192
-#'  is cell-barcode
193
-#'  or gene symbol. \code{importSEQC} will remove first column. Alternatively,
194
-#'  user can call
204
+#'  is cell-barcode or gene symbol. \code{importSEQC} will remove first column. 
205
+#'  Alternatively, user can call
195 206
 #'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
196
-#'  column of these files.
197
-#'  When \code{combinedSample} is TRUE, \code{importSEQC} will combined count
198
-#'  matrix with genes detected in at least one sample.
207
+#'  column of these files. When \code{combinedSample} is TRUE, \code{importSEQC}
208
+#'  will combined count matrix with genes detected in at least one sample.
199 209
 #' @return A \code{SingleCellExperiment} object containing the combined count
200 210
 #'  matrix, the feature annotations, and the cell annotation.
201 211
 #' @examples
... ...
@@ -220,7 +230,8 @@ importSEQC <- function(
220 230
     delayedArray = FALSE,
221 231
     cbNotFirstCol = TRUE,
222 232
     feNotFirstCol = TRUE,
223
-    combinedSample = TRUE) {
233
+    combinedSample = TRUE,
234
+    rowNamesDedup = TRUE) {
224 235
 
225 236
     class <- match.arg(class)
226 237
 
... ...
@@ -232,5 +243,6 @@ importSEQC <- function(
232 243
         delayedArray = delayedArray,
233 244
         cbNotFirstCol = cbNotFirstCol,
234 245
         feNotFirstCol = feNotFirstCol,
235
-        combinedSample = combinedSample)
246
+        combinedSample = combinedSample,
247
+        rowNamesDedup = rowNamesDedup)
236 248
 }
Browse code

Update importMitoGeneSet function. Change delayedArray=False for all import functions. Minor fixes in QC HTML report

rz2333 authored on 30/12/2020 17:40:20
Showing 1 changed file
... ...
@@ -167,7 +167,7 @@
167 167
 #'  \link{readMM} function), or "matrix" (as returned by
168 168
 #'  \link[base]{matrix} function). Default "Matrix".
169 169
 #' @param delayedArray Boolean. Whether to read the expression matrix as
170
-#'  \link{DelayedArray} object or not. Default \code{TRUE}.
170
+#'  \link{DelayedArray} object or not. Default \code{FALSE}.
171 171
 #' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172 172
 #'  sparse_counts_genes.csv
173 173
 #' is row index and it will be removed. \code{FALSE} the first column will
... ...
@@ -217,7 +217,7 @@ importSEQC <- function(
217 217
     prefix = NULL,
218 218
     gzipped = FALSE,
219 219
     class = c("Matrix", "matrix"),
220
-    delayedArray = TRUE,
220
+    delayedArray = FALSE,
221 221
     cbNotFirstCol = TRUE,
222 222
     feNotFirstCol = TRUE,
223 223
     combinedSample = TRUE) {
Browse code

Edit links to documentation

unknown authored on 22/10/2020 03:39:09
Showing 1 changed file
... ...
@@ -164,10 +164,10 @@
164 164
 #' Default \code{FALSE}.
165 165
 #' @param class Character. The class of the expression matrix stored in the SCE
166 166
 #'  object. Can be one of "Matrix" (as returned by
167
-#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
167
+#'  \link{readMM} function), or "matrix" (as returned by
168 168
 #'  \link[base]{matrix} function). Default "Matrix".
169 169
 #' @param delayedArray Boolean. Whether to read the expression matrix as
170
-#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
170
+#'  \link{DelayedArray} object or not. Default \code{TRUE}.
171 171
 #' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172 172
 #'  sparse_counts_genes.csv
173 173
 #' is row index and it will be removed. \code{FALSE} the first column will
Browse code

Merge devel branch (Oct 5) into master branch

Yusuke Koga authored on 09/10/2020 17:57:06
Showing 1 changed file
1 1
new file mode 100644
... ...
@@ -0,0 +1,236 @@
1
+.constructSCEFromSeqcOutputs <- function(
2
+    sampleName,
3
+    matrix,
4
+    features,
5
+    barcodes) {
6
+
7
+    coln <- paste(sampleName, barcodes[[1]], sep = "_")
8
+    rownames(matrix) <- features[[1]]
9
+
10
+    sce <- SingleCellExperiment::SingleCellExperiment(
11
+        assays = list(counts = matrix))
12
+    SummarizedExperiment::rowData(sce) <- features
13
+    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
14
+        cell_barcode = as.character(barcodes[[1]]),
15
+        column_name = coln,
16
+        sample = sampleName,
17
+        row.names = coln)
18
+
19
+    return(sce)
20
+}
21
+
22
+
23
+.unionGeneMatrix <- function(geneUnion, matrix){
24
+    missGene <- geneUnion[!geneUnion %in% rownames(matrix)]
25
+    missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
26
+        dimnames = list(missGene, NULL))
27
+
28
+    matb <- methods::as(matrix, "dgCMatrix")
29
+    rownames(matb) <- rownames(matrix)
30
+
31
+    mat <- rbind(matb, missMat)
32
+    if (anyDuplicated(rownames(mat))) {
33
+        mat <- mat[!duplicated(rownames(mat)), ]
34
+        warning("Duplicated genes exist in count matrix. Filtered",
35
+            " duplicated genes.")
36
+    }
37
+    return(mat)
38
+}
39
+
40
+
41
+.getGeneUnion <- function(geneList){
42
+    gene <- geneList
43
+    for (i in seq_along(geneList)){
44
+        gene[[i]] <- geneList[[i]][[1]]
45
+    }
46
+
47
+    geneUnion <- base::Reduce(union, gene)
48
+    return(geneUnion)
49
+}
50
+
51
+
52
+.readBarcodesSEQC <- function(path) {
53
+    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
54
+    res <- res[,-1,drop = FALSE]
55
+    colnames(res) <- "cell_barcode"
56
+    return(res)
57
+}
58
+
59
+
60
+.readFeaturesSEQC <- function(path) {
61
+    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
62
+    res <- res[,-1,drop = FALSE]
63
+    colnames(res) <- "feature_name"
64
+    return(res)
65
+}
66
+
67
+
68
+.importSEQC <- function(
69
+    seqcDirs,
70
+    samples,
71
+    prefix,
72
+    gzipped,
73
+    class,
74
+    delayedArray,
75
+    cbNotFirstCol,
76
+    feNotFirstCol,
77
+    combinedSample) {
78
+
79
+    if (length(seqcDirs) != length(samples)) {
80
+        stop("'seqcDirs' and 'samples' have unequal lengths!")
81
+    }
82
+
83
+    if (length(seqcDirs) != length(prefix)) {
84
+        stop("'seqcDirs' and 'prefix' have unequal lengths!")
85
+    }
86
+
87
+    res <- vector("list", length = length(seqcDirs))
88
+    cb <- vector("list", length = length(seqcDirs))
89
+    fe <- vector("list", length = length(seqcDirs))
90
+    mat <- vector("list", length = length(seqcDirs))
91
+
92
+    for (i in seq_along(seqcDirs)) {
93
+        dir <- seqcDirs[i]
94
+        matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_")
95
+        featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
96
+        barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv',
97
+            sep = "_")
98
+
99
+        cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile))
100
+        fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile))
101
+
102
+        mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
103
+            gzipped = gzipped, class = class, delayedArray = delayedArray)
104
+        mat[[i]] <- t(mat[[i]])
105
+        rownames(mat[[i]]) <- fe[[i]][[1]]
106
+    }
107
+
108
+    if (isTRUE(combinedSample) & length(seqcDirs) > 1) {
109
+        geneUnion <- .getGeneUnion(fe)
110
+        for (i in seq_along(seqcDirs)) {
111
+            matrix <- .unionGeneMatrix(geneUnion = geneUnion, matrix = mat[[i]])
112
+            matrix <- matrix[geneUnion, ]
113
+            feature <- S4Vectors::DataFrame('feature_name' = rownames(matrix))
114
+
115
+            scei <- .constructSCEFromSeqcOutputs(
116
+                sampleName = samples[i],
117
+                matrix = matrix,
118
+                features = feature,
119
+                barcodes = cb[[i]])
120
+            res[[i]] <- scei
121
+
122
+        }
123
+        sce <- do.call(SingleCellExperiment::cbind, res)
124
+        return(sce)
125
+
126
+    } else {
127
+        for (i in seq_along(seqcDirs)) {
128
+            scei <- .constructSCEFromSeqcOutputs(
129
+                sampleName = samples[i],
130
+                matrix = mat[[i]],
131
+                features = fe[[i]],
132
+                barcodes = cb[[i]])
133
+            res[[i]] <- scei
134
+        }
135
+        if (length(seqcDirs) == 1) {
136
+            return(res[[1]])
137
+        } else {
138
+            return(res)
139
+        }
140
+    }
141
+}
142
+
143
+
144
+#' @name importSEQC
145
+#' @rdname importSEQC
146
+#' @title Construct SCE object from seqc output
147
+#' @description Read the filtered barcodes, features, and matrices for all
148
+#'  samples from (preferably a single run of) seqc output. Import and
149
+#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
150
+#'  object.
151
+#' @param seqcDirs A vector of paths to seqc output files. Each sample
152
+#'  should have its own path. For example: \code{./pbmc_1k_50x50}.
153
+#'  Must have the same length as \code{samples}.
154
+#' @param samples A vector of user-defined sample names for the samples to be
155
+#'  imported. Must have the same length as \code{seqcDirs}.
156
+#' @param prefix A vector containing the prefix of file names within each
157
+#'  sample directory. It cannot be null and the vector should have the same
158
+#'  length as \emph{samples}.
159
+#' @param gzipped Boolean. \code{TRUE} if the seqc output files
160
+#'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
161
+#'  sparse_molecule_counts.mtx)
162
+#'  were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
163
+#'  not gzipped.
164
+#' Default \code{FALSE}.
165
+#' @param class Character. The class of the expression matrix stored in the SCE
166
+#'  object. Can be one of "Matrix" (as returned by
167
+#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
168
+#'  \link[base]{matrix} function). Default "Matrix".
169
+#' @param delayedArray Boolean. Whether to read the expression matrix as
170
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
171
+#' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172
+#'  sparse_counts_genes.csv
173
+#' is row index and it will be removed. \code{FALSE} the first column will
174
+#'  be kept.
175
+#' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
176
+#'  sparse_counts_barcode.csv
177
+#' is row index and it will be removed. \code{FALSE} the first column will
178
+#'  be kept.
179
+#' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
180
+#' \code{SingleCellExperiment} object containing the combined count matrix,
181
+#'  feature annotations
182
+#'  and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
183
+#'  list containing multiple
184
+#'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
185
+#'  contains count matrix, feature annotations and cell annotations for
186
+#'  each sample.
187
+#' @details
188
+#' \code{importSEQC} imports output from seqc.
189
+#'  The default sparse_counts_barcode.csv or sparse_counts_genes.csv from
190
+#'  seqc output
191
+#'  contains two columns. The first column is row index and the second column
192
+#'  is cell-barcode
193
+#'  or gene symbol. \code{importSEQC} will remove first column. Alternatively,
194
+#'  user can call
195
+#'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
196
+#'  column of these files.
197
+#'  When \code{combinedSample} is TRUE, \code{importSEQC} will combined count
198
+#'  matrix with genes detected in at least one sample.
199
+#' @return A \code{SingleCellExperiment} object containing the combined count
200
+#'  matrix, the feature annotations, and the cell annotation.
201
+#' @examples
202
+#' # Example #1
203
+#' # The following filtered feature, cell, and matrix files were downloaded from
204
+#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/
205
+#' # 3.0.0/pbmc_1k_v3
206
+#' # The top 50 hg38 genes are included in this example.
207
+#' # Only the top 50 cells are included.
208
+#' sce <- importSEQC(
209
+#'     seqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"),
210
+#'     samples = "pbmc_1k_50x50",
211
+#'     prefix = "pbmc_1k",
212
+#'     combinedSample = FALSE)
213
+#' @export
214
+importSEQC <- function(
215
+    seqcDirs = NULL,
216
+    samples = NULL,
217
+    prefix = NULL,
218
+    gzipped = FALSE,
219
+    class = c("Matrix", "matrix"),
220
+    delayedArray = TRUE,
221
+    cbNotFirstCol = TRUE,
222
+    feNotFirstCol = TRUE,
223
+    combinedSample = TRUE) {
224
+
225
+    class <- match.arg(class)
226
+
227
+    .importSEQC(seqcDirs = seqcDirs,
228
+        samples = samples,
229
+        prefix = prefix,
230
+        gzipped = gzipped,
231
+        class = class,
232
+        delayedArray = delayedArray,
233
+        cbNotFirstCol = cbNotFirstCol,
234
+        feNotFirstCol = feNotFirstCol,
235
+        combinedSample = combinedSample)
236
+}
Browse code

Revert "Sctk documentation "

Joshua D. Campbell authored on 09/06/2020 23:22:05 • GitHub committed on 09/06/2020 23:22:05
Showing 1 changed file
1 1
deleted file mode 100644
... ...
@@ -1,236 +0,0 @@
1
-.constructSCEFromSeqcOutputs <- function(
2
-    sampleName,
3
-    matrix,
4
-    features,
5
-    barcodes) {
6
-
7
-    coln <- paste(sampleName, barcodes[[1]], sep = "_")
8
-    rownames(matrix) <- features[[1]]
9
-
10
-    sce <- SingleCellExperiment::SingleCellExperiment(
11
-        assays = list(counts = matrix))
12
-    SummarizedExperiment::rowData(sce) <- features
13
-    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
14
-        cell_barcode = as.character(barcodes[[1]]),
15
-        column_name = coln,
16
-        sample = sampleName,
17
-        row.names = coln)
18
-
19
-    return(sce)
20
-}
21
-
22
-
23
-.unionGeneMatrix <- function(geneUnion, matrix){
24
-    missGene <- geneUnion[!geneUnion %in% rownames(matrix)]
25
-    missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
26
-        dimnames = list(missGene, NULL))
27
-
28
-    matb <- methods::as(matrix, "dgCMatrix")
29
-    rownames(matb) <- rownames(matrix)
30
-
31
-    mat <- rbind(matb, missMat)
32
-    if (anyDuplicated(rownames(mat))) {
33
-        mat <- mat[!duplicated(rownames(mat)), ]
34
-        warning("Duplicated genes exist in count matrix. Filtered",
35
-            " duplicated genes.")
36
-    }
37
-    return(mat)
38
-}
39
-
40
-
41
-.getGeneUnion <- function(geneList){
42
-    gene <- geneList
43
-    for (i in seq_along(geneList)){
44
-        gene[[i]] <- geneList[[i]][[1]]
45
-    }
46
-
47
-    geneUnion <- base::Reduce(union, gene)
48
-    return(geneUnion)
49
-}
50
-
51
-
52
-.readBarcodesSEQC <- function(path) {
53
-    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
54
-    res <- res[,-1,drop = FALSE]
55
-    colnames(res) <- "cell_barcode"
56
-    return(res)
57
-}
58
-
59
-
60
-.readFeaturesSEQC <- function(path) {
61
-    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
62
-    res <- res[,-1,drop = FALSE]
63
-    colnames(res) <- "feature_name"
64
-    return(res)
65
-}
66
-
67
-
68
-.importSEQC <- function(
69
-    seqcDirs,
70
-    samples,
71
-    prefix,
72
-    gzipped,
73
-    class,
74
-    delayedArray,
75
-    cbNotFirstCol,
76
-    feNotFirstCol,
77
-    combinedSample) {
78
-
79
-    if (length(seqcDirs) != length(samples)) {
80
-        stop("'seqcDirs' and 'samples' have unequal lengths!")
81
-    }
82
-
83
-    if (length(seqcDirs) != length(prefix)) {
84
-        stop("'seqcDirs' and 'prefix' have unequal lengths!")
85
-    }
86
-
87
-    res <- vector("list", length = length(seqcDirs))
88
-    cb <- vector("list", length = length(seqcDirs))
89
-    fe <- vector("list", length = length(seqcDirs))
90
-    mat <- vector("list", length = length(seqcDirs))
91
-
92
-    for (i in seq_along(seqcDirs)) {
93
-        dir <- seqcDirs[i]
94
-        matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_")
95
-        featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
96
-        barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv',
97
-            sep = "_")
98
-
99
-        cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile))
100
-        fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile))
101
-
102
-        mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
103
-            gzipped = gzipped, class = class, delayedArray = delayedArray)
104
-        mat[[i]] <- t(mat[[i]])
105
-        rownames(mat[[i]]) <- fe[[i]][[1]]
106
-    }
107
-
108
-    if (isTRUE(combinedSample) & length(seqcDirs) > 1) {
109
-        geneUnion <- .getGeneUnion(fe)
110
-        for (i in seq_along(seqcDirs)) {
111
-            matrix <- .unionGeneMatrix(geneUnion = geneUnion, matrix = mat[[i]])
112
-            matrix <- matrix[geneUnion, ]
113
-            feature <- S4Vectors::DataFrame('feature_name' = rownames(matrix))
114
-
115
-            scei <- .constructSCEFromSeqcOutputs(
116
-                sampleName = samples[i],
117
-                matrix = matrix,
118
-                features = feature,
119
-                barcodes = cb[[i]])
120
-            res[[i]] <- scei
121
-
122
-        }
123
-        sce <- do.call(SingleCellExperiment::cbind, res)
124
-        return(sce)
125
-
126
-    } else {
127
-        for (i in seq_along(seqcDirs)) {
128
-            scei <- .constructSCEFromSeqcOutputs(
129
-                sampleName = samples[i],
130
-                matrix = mat[[i]],
131
-                features = fe[[i]],
132
-                barcodes = cb[[i]])
133
-            res[[i]] <- scei
134
-        }
135
-        if (length(seqcDirs) == 1) {
136
-            return(res[[1]])
137
-        } else {
138
-            return(res)
139
-        }
140
-    }
141
-}
142
-
143
-
144
-#' @name importSEQC
145
-#' @rdname importSEQC
146
-#' @title Construct SCE object from seqc output
147
-#' @description Read the filtered barcodes, features, and matrices for all
148
-#'  samples from (preferably a single run of) seqc output. Import and
149
-#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
150
-#'  object.
151
-#' @param seqcDirs A vector of paths to seqc output files. Each sample
152
-#'  should have its own path. For example: \code{./pbmc_1k_50x50}.
153
-#'  Must have the same length as \code{samples}.
154
-#' @param samples A vector of user-defined sample names for the samples to be
155
-#'  imported. Must have the same length as \code{seqcDirs}.
156
-#' @param prefix A vector containing the prefix of file names within each
157
-#'  sample directory. It cannot be null and the vector should have the same
158
-#'  length as \emph{samples}.
159
-#' @param gzipped Boolean. \code{TRUE} if the seqc output files
160
-#'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
161
-#'  sparse_molecule_counts.mtx)
162
-#'  were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
163
-#'  not gzipped.
164
-#' Default \code{FALSE}.
165
-#' @param class Character. The class of the expression matrix stored in the SCE
166
-#'  object. Can be one of "Matrix" (as returned by
167
-#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
168
-#'  \link[base]{matrix} function). Default "Matrix".
169
-#' @param delayedArray Boolean. Whether to read the expression matrix as
170
-#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
171
-#' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172
-#'  sparse_counts_genes.csv
173
-#' is row index and it will be removed. \code{FALSE} the first column will
174
-#'  be kept.
175
-#' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
176
-#'  sparse_counts_barcode.csv
177
-#' is row index and it will be removed. \code{FALSE} the first column will
178
-#'  be kept.
179
-#' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
180
-#' \code{SingleCellExperiment} object containing the combined count matrix,
181
-#'  feature annotations
182
-#'  and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
183
-#'  list containing multiple
184
-#'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
185
-#'  contains count matrix, feature annotations and cell annotations for
186
-#'  each sample.
187
-#' @details
188
-#' \code{importSEQC} imports output from seqc.
189
-#'  The default sparse_counts_barcode.csv or sparse_counts_genes.csv from
190
-#'  seqc output
191
-#'  contains two columns. The first column is row index and the second column
192
-#'  is cell-barcode
193
-#'  or gene symbol. \code{importSEQC} will remove first column. Alternatively,
194
-#'  user can call
195
-#'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
196
-#'  column of these files.
197
-#'  When \code{combinedSample} is TRUE, \code{importSEQC} will combined count
198
-#'  matrix with genes detected in at least one sample.
199
-#' @return A \code{SingleCellExperiment} object containing the combined count
200
-#'  matrix, the feature annotations, and the cell annotation.
201
-#' @examples
202
-#' # Example #1
203
-#' # The following filtered feature, cell, and matrix files were downloaded from
204
-#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/
205
-#' # 3.0.0/pbmc_1k_v3
206
-#' # The top 50 hg38 genes are included in this example.
207
-#' # Only the top 50 cells are included.
208
-#' sce <- importSEQC(
209
-#'     seqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"),
210
-#'     samples = "pbmc_1k_50x50",
211
-#'     prefix = "pbmc_1k",
212
-#'     combinedSample = FALSE)
213
-#' @export
214
-importSEQC <- function(
215
-    seqcDirs = NULL,
216
-    samples = NULL,
217
-    prefix = NULL,
218
-    gzipped = FALSE,
219
-    class = c("Matrix", "matrix"),
220
-    delayedArray = TRUE,
221
-    cbNotFirstCol = TRUE,
222
-    feNotFirstCol = TRUE,
223
-    combinedSample = TRUE) {
224
-
225
-    class <- match.arg(class)
226
-
227
-    .importSEQC(seqcDirs = seqcDirs,
228
-        samples = samples,
229
-        prefix = prefix,
230
-        gzipped = gzipped,
231
-        class = class,
232
-        delayedArray = delayedArray,
233
-        cbNotFirstCol = cbNotFirstCol,
234
-        feNotFirstCol = feNotFirstCol,
235
-        combinedSample = combinedSample)
236
-}
Browse code

Merge branch 'importQC' of github.com:joshua-d-campbell/singleCellTK into importQC

zhewa authored on 11/02/2020 01:16:21
Showing 0 changed files
Browse code

fix bug

zhewa authored on 06/02/2020 04:32:28
Showing 1 changed file
... ...
@@ -76,8 +76,6 @@
76 76
     feNotFirstCol,
77 77
     combinedSample) {
78 78
 
79
-    class <- match.arg(class)
80
-
81 79
     if (length(seqcDirs) != length(samples)) {
82 80
         stop("'seqcDirs' and 'samples' have unequal lengths!")
83 81
     }
... ...
@@ -224,6 +222,8 @@ importSEQC <- function(
224 222
     feNotFirstCol = TRUE,
225 223
     combinedSample = TRUE) {
226 224
 
225
+    class <- match.arg(class)
226
+
227 227
     .importSEQC(seqcDirs = seqcDirs,
228 228
         samples = samples,
229 229
         prefix = prefix,
Browse code

various updates

add dataType = c("raw", "filtered") for importCellRangerV2 & V3
delayedArray = TRUE
add importSingleCellMatrix

zhewa authored on 06/02/2020 04:08:26
Showing 1 changed file
... ...
@@ -31,7 +31,8 @@
31 31
     mat <- rbind(matb, missMat)
32 32
     if (anyDuplicated(rownames(mat))) {
33 33
         mat <- mat[!duplicated(rownames(mat)), ]
34
-        warning('Duplicated genes exist in count matrix. Filtered duplicated genes.')
34
+        warning("Duplicated genes exist in count matrix. Filtered",
35
+            " duplicated genes.")
35 36
     }
36 37
     return(mat)
37 38
 }
... ...
@@ -70,10 +71,13 @@
70 71
     prefix,
71 72
     gzipped,
72 73
     class,
74
+    delayedArray,
73 75
     cbNotFirstCol,
74 76
     feNotFirstCol,
75 77
     combinedSample) {
76 78
 
79
+    class <- match.arg(class)
80
+
77 81
     if (length(seqcDirs) != length(samples)) {
78 82
         stop("'seqcDirs' and 'samples' have unequal lengths!")
79 83
     }
... ...
@@ -91,13 +95,14 @@
91 95
         dir <- seqcDirs[i]
92 96
         matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_")
93 97
         featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
94
-        barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv', sep = "_")
98
+        barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv',
99
+            sep = "_")
95 100
 
96 101
         cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile))
97 102
         fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile))
98 103
 
99 104
         mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
100
-            gzipped = gzipped, class = class)
105
+            gzipped = gzipped, class = class, delayedArray = delayedArray)
101 106
         mat[[i]] <- t(mat[[i]])
102 107
         rownames(mat[[i]]) <- fe[[i]][[1]]
103 108
     }
... ...
@@ -122,12 +127,6 @@
122 127
 
123 128
     } else {
124 129
         for (i in seq_along(seqcDirs)) {
125
-            if (class == 'DelayedArray') {
126
-                mat[[i]] <- DelayedArray::DelayedArray(mat[[i]])
127
-            } else if (class == 'matrix') {
128
-                mat[[i]] <- as.matrix(mat[[i]])
129
-            }
130
-
131 130
             scei <- .constructSCEFromSeqcOutputs(
132 131
                 sampleName = samples[i],
133 132
                 matrix = mat[[i]],
... ...
@@ -135,7 +134,7 @@
135 134
                 barcodes = cb[[i]])
136 135
             res[[i]] <- scei
137 136
         }
138
-        if (length(seqcDirs) == 1){
137
+        if (length(seqcDirs) == 1) {
139 138
             return(res[[1]])
140 139
         } else {
141 140
             return(res)
... ...
@@ -149,41 +148,56 @@
149 148
 #' @title Construct SCE object from seqc output
150 149
 #' @description Read the filtered barcodes, features, and matrices for all
151 150
 #'  samples from (preferably a single run of) seqc output. Import and
152
-#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment} object.
151
+#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
152
+#'  object.
153 153
 #' @param seqcDirs A vector of paths to seqc output files. Each sample
154 154
 #'  should have its own path. For example: \code{./pbmc_1k_50x50}.
155 155
 #'  Must have the same length as \code{samples}.
156 156
 #' @param samples A vector of user-defined sample names for the samples to be
157 157
 #'  imported. Must have the same length as \code{seqcDirs}.
158
-#' @param prefix A vector containing the prefix of file names within each sample directory.
159
-#' It cannot be null and the vector should have the same length as \emph{samples}.
158
+#' @param prefix A vector containing the prefix of file names within each
159
+#'  sample directory. It cannot be null and the vector should have the same
160
+#'  length as \emph{samples}.
160 161
 #' @param gzipped Boolean. \code{TRUE} if the seqc output files
161
-#' (sparse_counts_barcode.csv, sparse_counts_genes.csv, and sparse_molecule_counts.mtx)
162
-#' were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are not gzipped.
162
+#'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
163
+#'  sparse_molecule_counts.mtx)
164
+#'  were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
165
+#'  not gzipped.
163 166
 #' Default \code{FALSE}.
164 167
 #' @param class Character. The class of the expression matrix stored in the SCE
165
-#' object. Can be one of "DelayedArray" (as returned by
166
-#' \link[DelayedArray]{DelayedArray} function), "Matrix" (as returned by
167
-#' \link[Matrix]{readMM} function), or "matrix" (as returned by
168
-#' \link[base]{matrix} function). Default "DelayedArray".
169
-#' @param feNotFirstCol Boolean. \code{TRUE} if first column of sparse_counts_genes.csv
170
-#' is row index and it will be removed. \code{FALSE} the first column will be kept.
171
-#' @param cbNotFirstCol Boolean. \code{TRUE} if first column of sparse_counts_barcode.csv
172
-#' is row index and it will be removed. \code{FALSE} the first column will be kept.
168
+#'  object. Can be one of "Matrix" (as returned by
169
+#'  \link[Matrix]{readMM} function), or "matrix" (as returned by
170
+#'  \link[base]{matrix} function). Default "Matrix".
171
+#' @param delayedArray Boolean. Whether to read the expression matrix as
172
+#'  \link[DelayedArray]{DelayedArray} object or not. Default \code{TRUE}.
173
+#' @param feNotFirstCol Boolean. \code{TRUE} if first column of
174
+#'  sparse_counts_genes.csv
175
+#' is row index and it will be removed. \code{FALSE} the first column will
176
+#'  be kept.
177
+#' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
178
+#'  sparse_counts_barcode.csv
179
+#' is row index and it will be removed. \code{FALSE} the first column will
180
+#'  be kept.
173 181
 #' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
174
-#' \code{SingleCellExperiment} object containing the combined count matrix, feature annotations
175
-#' and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a list containing multiple
176
-#' \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment} contains count matrix
177
-#' , feature annotations and cell annotations for each sample.
182
+#' \code{SingleCellExperiment} object containing the combined count matrix,
183
+#'  feature annotations
184
+#'  and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
185
+#'  list containing multiple
186
+#'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
187
+#'  contains count matrix, feature annotations and cell annotations for
188
+#'  each sample.
178 189
 #' @details
179 190
 #' \code{importSEQC} imports output from seqc.
180
-#' The default sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
181
-#' contains two columns. The first column is row index and the second column is cell-barcode
182
-#' or gene symbol. \code{importSEQC} will remove first column. Alternatively, user can call
183
-#' \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first column
184
-#' of these files.
185
-#' When \code{combinedSample} is TRUE, \code{importSEQC} will combined count matrix
186
-#' with genes detected in at least one sample.
191
+#'  The default sparse_counts_barcode.csv or sparse_counts_genes.csv from
192
+#'  seqc output
193
+#'  contains two columns. The first column is row index and the second column
194
+#'  is cell-barcode
195
+#'  or gene symbol. \code{importSEQC} will remove first column. Alternatively,
196
+#'  user can call
197
+#'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
198
+#'  column of these files.
199
+#'  When \code{combinedSample} is TRUE, \code{importSEQC} will combined count
200
+#'  matrix with genes detected in at least one sample.
187 201
 #' @return A \code{SingleCellExperiment} object containing the combined count
188 202
 #'  matrix, the feature annotations, and the cell annotation.
189 203
 #' @examples
... ...
@@ -204,7 +218,8 @@ importSEQC <- function(
204 218
     samples = NULL,
205 219
     prefix = NULL,
206 220
     gzipped = FALSE,
207
-    class = "DelayedArray",
221
+    class = c("Matrix", "matrix"),
222
+    delayedArray = TRUE,
208 223
     cbNotFirstCol = TRUE,
209 224
     feNotFirstCol = TRUE,
210 225
     combinedSample = TRUE) {
... ...
@@ -214,6 +229,7 @@ importSEQC <- function(
214 229
         prefix = prefix,
215 230
         gzipped = gzipped,
216 231
         class = class,
232
+        delayedArray = delayedArray,
217 233
         cbNotFirstCol = cbNotFirstCol,
218 234
         feNotFirstCol = feNotFirstCol,
219 235
         combinedSample = combinedSample)
Browse code

Fixed package/function references

Joshua D. Campbell authored on 03/02/2020 21:03:41
Showing1 changed files
... ...
@@ -25,7 +25,7 @@
25 25
     missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
26 26
         dimnames = list(missGene, NULL))
27 27
 
28
-    matb <- as(matrix, "dgCMatrix")
28
+    matb <- methods::as(matrix, "dgCMatrix")
29 29
     rownames(matb) <- rownames(matrix)
30 30
 
31 31
     mat <- rbind(matb, missMat)
... ...
@@ -117,7 +117,7 @@
117 117
             res[[i]] <- scei
118 118
 
119 119
         }
120
-        sce <- do.call(BiocGenerics::cbind, res)
120
+        sce <- do.call(SingleCellExperiment::cbind, res)
121 121
         return(sce)
122 122
 
123 123
     } else {
Browse code

Fixed errors in docs

Joshua D. Campbell authored on 24/01/2020 21:31:15
Showing1 changed files
... ...
@@ -174,7 +174,7 @@
174 174
 #' \code{SingleCellExperiment} object containing the combined count matrix, feature annotations
175 175
 #' and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a list containing multiple
176 176
 #' \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment} contains count matrix
177
-#' , feature anotations and cell annotations for each sample.
177
+#' , feature annotations and cell annotations for each sample.
178 178
 #' @details
179 179
 #' \code{importSEQC} imports output from seqc.
180 180
 #' The default sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
Browse code

Fixed column_name header

Joshua D. Campbell authored on 24/01/2020 03:11:21
Showing1 changed files
... ...
@@ -11,7 +11,7 @@
11 11
         assays = list(counts = matrix))
12 12
     SummarizedExperiment::rowData(sce) <- features
13 13
     SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
14
-        cell_barcode = barcodes[[1]],
14
+        cell_barcode = as.character(barcodes[[1]]),
15 15
         column_name = coln,
16 16
         sample = sampleName,
17 17
         row.names = coln)
Browse code

Fixed column_name header

Joshua D. Campbell authored on 24/01/2020 03:00:21
Showing1 changed files
... ...
@@ -11,7 +11,7 @@
11 11
         assays = list(counts = matrix))
12 12
     SummarizedExperiment::rowData(sce) <- features
13 13
     SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
14
-        cell_barcode = barcodes,
14
+        cell_barcode = barcodes[[1]],
15 15
         column_name = coln,
16 16
         sample = sampleName,
17 17
         row.names = coln)
Browse code

Updated SEQC function for importing features and cell barcodes

Joshua D. Campbell authored on 24/01/2020 02:52:57
Showing1 changed files
... ...
@@ -48,6 +48,22 @@
48 48
 }
49 49
 
50 50
 
51
+.readBarcodesSEQC <- function(path) {
52
+    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
53
+    res <- res[,-1,drop = FALSE]
54
+    colnames(res) <- "cell_barcode"
55
+    return(res)
56
+}
57
+
58
+
59
+.readFeaturesSEQC <- function(path) {
60
+    res <- data.table::fread(path, header = FALSE, sep=",", colClasses = "character")
61
+    res <- res[,-1,drop = FALSE]
62
+    colnames(res) <- "feature_name"
63
+    return(res)
64
+}
65
+
66
+
51 67
 .importSEQC <- function(
52 68
     seqcDirs,
53 69
     samples,
... ...
@@ -77,15 +93,8 @@
77 93
         featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
78 94
         barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv', sep = "_")
79 95
 
80
-        cb[[i]] <- .readBarcodes(file.path(dir, barcodesFile))
81
-        if (isTRUE(cbNotFirstCol)) {
82
-            cb[[i]] <- cb[[i]][, -1, drop = FALSE]
83
-        }
84
-
85
-        fe[[i]] <- .readFeatures(file.path(dir, featuresFile))
86
-        if (isTRUE(feNotFirstCol)) {
87
-            fe[[i]] <- fe[[i]][, -1, drop = FALSE]
88
-        }
96
+        cb[[i]] <- .readBarcodesSEQC(file.path(dir, barcodesFile))
97
+        fe[[i]] <- .readFeaturesSEQC(file.path(dir, featuresFile))
89 98
 
90 99
         mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
91 100
             gzipped = gzipped, class = class)
Browse code

Changed colname in output to match the rest

Joshua D. Campbell authored on 24/01/2020 01:04:35
Showing1 changed files
... ...
@@ -10,7 +10,8 @@
10 10
     sce <- SingleCellExperiment::SingleCellExperiment(
11 11
         assays = list(counts = matrix))
12 12
     SummarizedExperiment::rowData(sce) <- features
13
-    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(barcodes,
13
+    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(
14
+        cell_barcode = barcodes,
14 15
         column_name = coln,
15 16
         sample = sampleName,
16 17
         row.names = coln)
Browse code

removed unnecessary messages

Joshua D. Campbell authored on 24/01/2020 00:23:09
Showing1 changed files
... ...
@@ -78,13 +78,11 @@
78 78
 
79 79
         cb[[i]] <- .readBarcodes(file.path(dir, barcodesFile))
80 80
         if (isTRUE(cbNotFirstCol)) {
81
-            message("First column of barcode file was row index and it was removed.")
82 81
             cb[[i]] <- cb[[i]][, -1, drop = FALSE]
83 82
         }
84 83
 
85 84
         fe[[i]] <- .readFeatures(file.path(dir, featuresFile))
86 85
         if (isTRUE(feNotFirstCol)) {
87
-            message("First column of gene file was row index and it was removed.")
88 86
             fe[[i]] <- fe[[i]][, -1, drop = FALSE]
89 87
         }
90 88
 
Browse code

Merge branch 'importQC' of github.com:ykoga07/singleCellTK into importQC

zhewa authored on 18/01/2020 22:03:36
Showing0 changed files
Browse code

add Optimus

zhewa authored on 18/01/2020 21:54:49
Showing1 changed files
... ...
@@ -18,6 +18,7 @@
18 18
     return(sce)
19 19
 }
20 20
 
21
+
21 22
 .unionGeneMatrix <- function(geneUnion, matrix){
22 23
     missGene <- geneUnion[!geneUnion %in% rownames(matrix)]
23 24
     missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
... ...
@@ -34,6 +35,7 @@
34 35
     return(mat)
35 36
 }
36 37
 
38
+
37 39
 .getGeneUnion <- function(geneList){
38 40
     gene <- geneList
39 41
     for (i in seq_along(geneList)){
... ...
@@ -44,7 +46,8 @@
44 46
     return(geneUnion)
45 47
 }
46 48
 
47
-.importSeqc <- function(
49
+
50
+.importSEQC <- function(
48 51
     SeqcDirs,
49 52
     samples,
50 53
     prefix,
... ...
@@ -132,8 +135,9 @@
132 135
     }
133 136
 }
134 137
 
135
-#' @name importSeqc
136
-#' @rdname importSeqc
138
+
139
+#' @name importSEQC
140
+#' @rdname importSEQC
137 141
 #' @title Construct SCE object from seqc output
138 142
 #' @description Read the filtered barcodes, features, and matrices for all
139 143
 #'  samples from (preferably a single run of) seqc output. Import and
... ...
@@ -158,19 +162,19 @@
158 162
 #' is row index and it will be removed. \code{FALSE} the first column will be kept.
159 163
 #' @param cbNotFirstCol Boolean. \code{TRUE} if first column of sparse_counts_barcode.csv
160 164
 #' is row index and it will be removed. \code{FALSE} the first column will be kept.
161
-#' @param combinedSample Boolean. If \code{TRUE}, \code{importSeqc} returns a
165
+#' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
162 166
 #' \code{SingleCellExperiment} object containing the combined count matrix, feature annotations
163
-#' and the cell annotations. If \code{FALSE}, \code{importSeqc} returns a list containing multiple
167
+#' and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a list containing multiple
164 168
 #' \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment} contains count matrix
165 169
 #' , feature anotations and cell annotations for each sample.
166 170
 #' @details
167
-#' \code{importSeqc} imports output from seqc.
171
+#' \code{importSEQC} imports output from seqc.
168 172
 #' The default sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
169 173
 #' contains two columns. The first column is row index and the second column is cell-barcode
170
-#' or gene symbol. \code{importSeqc} will remove first column. Alternatively, user can call
174
+#' or gene symbol. \code{importSEQC} will remove first column. Alternatively, user can call
171 175
 #' \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first column
172 176
 #' of these files.
173
-#' When \code{combinedSample} is TRUE, \code{importSeqc} will combined count matrix
177
+#' When \code{combinedSample} is TRUE, \code{importSEQC} will combined count matrix
174 178
 #' with genes detected in at least one sample.
175 179
 #' @return A \code{SingleCellExperiment} object containing the combined count
176 180
 #'  matrix, the feature annotations, and the cell annotation.
... ...
@@ -181,13 +185,13 @@
181 185
 #' # 3.0.0/pbmc_1k_v3
182 186
 #' # The top 50 hg38 genes are included in this example.
183 187
 #' # Only the top 50 cells are included.
184
-#' sce <- importSeqc(
188
+#' sce <- importSEQC(
185 189
 #'     SeqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"),
186 190
 #'     samples = "pbmc_1k_50x50",
187 191
 #'     prefix = "pbmc_1k",
188 192
 #'     combinedSample = FALSE)
189 193
 #' @export
190
-importSeqc <- function(
194
+importSEQC <- function(
191 195
     SeqcDirs = NULL,
192 196
     samples = NULL,
193 197
     prefix = NULL,
... ...
@@ -197,7 +201,7 @@ importSeqc <- function(
197 201
     feNotFirstCol = TRUE,
198 202
     combinedSample = TRUE) {
199 203
 
200
-    .importSeqc(SeqcDirs = SeqcDirs,
204
+    .importSEQC(SeqcDirs = SeqcDirs,
201 205
         samples = samples,
202 206
         prefix = prefix,
203 207
         gzipped = gzipped,
Browse code

drop = FALSE

zhewa authored on 08/01/2020 18:26:51
Showing1 changed files
... ...
@@ -23,7 +23,10 @@
23 23
     missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
24 24
         dimnames = list(missGene, NULL))
25 25
 
26
-    mat <- rbind(matrix,missMat)
26
+    matb <- as(matrix, "dgCMatrix")
27
+    rownames(matb) <- rownames(matrix)
28
+
29
+    mat <- rbind(matb, missMat)
27 30
     if (anyDuplicated(rownames(mat))) {
28 31
         mat <- mat[!duplicated(rownames(mat)), ]
29 32
         warning('Duplicated genes exist in count matrix. Filtered duplicated genes.')
... ...
@@ -36,7 +39,7 @@
36 39
     for (i in seq_along(geneList)){
37 40
         gene[[i]] <- geneList[[i]][[1]]
38 41
     }
39
-    
42
+
40 43
     geneUnion <- base::Reduce(union, gene)
41 44
     return(geneUnion)
42 45
 }
... ...
@@ -44,7 +47,7 @@
44 47
 .importSeqc <- function(
45 48
     SeqcDirs,
46 49
     samples,
47
-    prefix, 
50
+    prefix,
48 51
     gzipped,
49 52
     class,
50 53
     cbNotFirstCol,
... ...
@@ -73,17 +76,17 @@
73 76
         cb[[i]] <- .readBarcodes(file.path(dir, barcodesFile))
74 77
         if (isTRUE(cbNotFirstCol)) {
75 78
             message("First column of barcode file was row index and it was removed.")
76
-            cb[[i]] <- cb[[i]][, -1]            
79
+            cb[[i]] <- cb[[i]][, -1, drop = FALSE]
77 80
         }
78 81
 
79 82
         fe[[i]] <- .readFeatures(file.path(dir, featuresFile))
80 83
         if (isTRUE(feNotFirstCol)) {
81 84
             message("First column of gene file was row index and it was removed.")
82
-            fe[[i]] <- fe[[i]][, -1]            
85
+            fe[[i]] <- fe[[i]][, -1, drop = FALSE]
83 86
         }
84 87
 
85
-        mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile), 
86
-            gzipped = gzipped, class = 'Matrix')
88
+        mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile),
89
+            gzipped = gzipped, class = class)
87 90
         mat[[i]] <- t(mat[[i]])
88 91
         rownames(mat[[i]]) <- fe[[i]][[1]]
89 92
     }
... ...
@@ -124,7 +127,7 @@
124 127
         if (length(SeqcDirs) == 1){
125 128
             return(res[[1]])
126 129
         } else {
127
-            return(res) 
130
+            return(res)
128 131
         }
129 132
     }
130 133
 }
... ...
@@ -140,10 +143,10 @@
140 143
 #'  Must have the same length as \code{samples}.
141 144
 #' @param samples A vector of user-defined sample names for the samples to be
142 145
 #'  imported. Must have the same length as \code{SeqcDirs}.
143
-#' @param prefix A vector containing the prefix of file names within each sample directory. 
146
+#' @param prefix A vector containing the prefix of file names within each sample directory.
144 147
 #' It cannot be null and the vector should have the same length as \emph{samples}.
145 148
 #' @param gzipped Boolean. \code{TRUE} if the seqc output files
146
-#' (sparse_counts_barcode.csv, sparse_counts_genes.csv, and sparse_molecule_counts.mtx) 
149
+#' (sparse_counts_barcode.csv, sparse_counts_genes.csv, and sparse_molecule_counts.mtx)
147 150
 #' were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are not gzipped.
148 151
 #' Default \code{FALSE}.
149 152
 #' @param class Character. The class of the expression matrix stored in the SCE
... ...
@@ -152,23 +155,23 @@
152 155
 #' \link[Matrix]{readMM} function), or "matrix" (as returned by
153 156
 #' \link[base]{matrix} function). Default "DelayedArray".
154 157
 #' @param feNotFirstCol Boolean. \code{TRUE} if first column of sparse_counts_genes.csv
155
-#' is row index and it will be removed. \code{FALSE} the first column will be kept. 
158
+#' is row index and it will be removed. \code{FALSE} the first column will be kept.
156 159
 #' @param cbNotFirstCol Boolean. \code{TRUE} if first column of sparse_counts_barcode.csv
157
-#' is row index and it will be removed. \code{FALSE} the first column will be kept. 
158
-#' @param combinedSample Boolean. If \code{TRUE}, \code{importSeqc} returns a 
159
-#' \code{SingleCellExperiment} object containing the combined count matrix, feature annotations 
160
-#' and the cell annotations. If \code{FALSE}, \code{importSeqc} returns a list containing multiple 
160
+#' is row index and it will be removed. \code{FALSE} the first column will be kept.
161
+#' @param combinedSample Boolean. If \code{TRUE}, \code{importSeqc} returns a
162
+#' \code{SingleCellExperiment} object containing the combined count matrix, feature annotations
163
+#' and the cell annotations. If \code{FALSE}, \code{importSeqc} returns a list containing multiple
161 164
 #' \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment} contains count matrix
162
-#' , feature anotations and cell annotations for each sample. 
165
+#' , feature anotations and cell annotations for each sample.
163 166
 #' @details
164 167
 #' \code{importSeqc} imports output from seqc.
165 168
 #' The default sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
166
-#' contains two columns. The first column is row index and the second column is cell-barcode 
167
-#' or gene symbol. \code{importSeqc} will remove first column. Alternatively, user can call 
169
+#' contains two columns. The first column is row index and the second column is cell-barcode
170
+#' or gene symbol. \code{importSeqc} will remove first column. Alternatively, user can call
168 171
 #' \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first column
169
-#' of these files.  
172
+#' of these files.
170 173
 #' When \code{combinedSample} is TRUE, \code{importSeqc} will combined count matrix
171
-#' with genes detected in at least one sample. 
174
+#' with genes detected in at least one sample.
172 175
 #' @return A \code{SingleCellExperiment} object containing the combined count
173 176
 #'  matrix, the feature annotations, and the cell annotation.
174 177
 #' @examples
... ...
@@ -180,27 +183,26 @@
180 183
 #' # Only the top 50 cells are included.
181 184
 #' sce <- importSeqc(
182 185
 #'     SeqcDirs = system.file("extdata/pbmc_1k_50x50", package = "singleCellTK"),
183
-#'     samples = "pbmc_1k_50x50", 
184
-#'     prefix = "pbmc_1k", 
186
+#'     samples = "pbmc_1k_50x50",
187
+#'     prefix = "pbmc_1k",
185 188
 #'     combinedSample = FALSE)
186 189
 #' @export
187
-
188 190
 importSeqc <- function(
189 191
     SeqcDirs = NULL,
190 192
     samples = NULL,
191
-    prefix = NULL, 
193
+    prefix = NULL,
192 194
     gzipped = FALSE,
193
-    class = "DelayedArray", 
195
+    class = "DelayedArray",
194 196
     cbNotFirstCol = TRUE,
195 197
     feNotFirstCol = TRUE,
196 198
     combinedSample = TRUE) {
197 199
 
198 200
     .importSeqc(SeqcDirs = SeqcDirs,
199 201
         samples = samples,
200
-        prefix = prefix, 
202
+        prefix = prefix,
201 203
         gzipped = gzipped,
202
-        class = class, 
204
+        class = class,
203 205
         cbNotFirstCol = cbNotFirstCol,
204
-        feNotFirstCol = feNotFirstCol, 
206
+        feNotFirstCol = feNotFirstCol,
205 207
         combinedSample = combinedSample)
206 208
 }
Browse code

add importSeqc.R

rz2333 authored on 07/01/2020 22:24:41
Showing1 changed files
... ...
@@ -1,69 +1,3 @@
1
-# .readBarcodes <- function(path,
2
-#     header = FALSE,
3
-#     colname = "cell_barcode", 
4
-#     removeFirstCol = TRUE) {
5
-
6
-#     res <- data.table::fread(path, header = header)
7
-
8
-#     if (ncol(res) == 1) {
9
-#         colnames(res) <- colname
10
-#     } else {
11
-#         if (ncol(res) == 2) {
12
-#             if (removeFirstCol) {
13
-#                 message("First column of barcode file was row index and it was removed.")
14
-#                 res <- res[, -1]
15
-#                 colnames(res) <- colname
16
-#             }
17
-#         } else if (ncol(res) > 2) {
18
-#             warning("'barcodes' file contains >2 columns!",
19
-#             " The column names are kept as is. ")
20
-#         }
21
-#     }
22
-#     return(res)
23
-# }
24
-
25
-# .readFeatures <- function(path,
26
-#     header = FALSE,
27
-#     colname = "feature_name",
28
-#     removeFirstCol = TRUE) {
29
-
30
-#     res <- data.table::fread(path, header = header)
31
-#     if (ncol(res) == 1) {
32
-#         colnames(res) <- colname
33
-#     } else {
34
-#         if (ncol(res) == 2) {
35
-#             if (removeFirstCol) {
36
-#                 message("First column of gene file was row index and it was removed.")
37
-#                 res <- res[, -1]
38
-#                 colnames(res) <- colname
39
-#             }
40
-#         } else if (ncol(res) > 2) {
41
-#             warning("'barcodes' file contains >2 columns!",
42
-#             " The column names are kept as is. ")
43
-#         }
44
-#     }
45
-#     return(res)
46
-# }
47
-
48
-
49
-# .readMatrixMM <- function(path, gzipped = FALSE, class = "DelayedArray") {
50
-#     if (isTRUE(gzipped)) {
51
-#         path <- gzfile(path)
52
-#     }
53
-
54
-#     res <- Matrix::readMM(path)
55
-#     res <- t(res)
56
-#     if (class == "Matrix") {
57
-#         return(res)
58
-#     } else if (class == "DelayedArray") {
59
-#         res <- DelayedArray::DelayedArray(res)
60
-#         return(res)
61
-#     } else if (class == "matrix") {
62
-#         res <- as.matrix(res)
63
-#         return(res)
64
-#     }
65
-# }
66
-
67 1
 .constructSCEFromSeqcOutputs <- function(
68 2
     sampleName,
69 3
     matrix,
... ...
@@ -84,79 +18,6 @@
84 18
     return(sce)
85 19
 }
86 20
 
87
-## We don't need .getOutputFolderPath for seqc output. 
88
-
89
-.checkArgsImportSeqc <- function(SeqcDirs, samples, class, prefix) {
90
-    if (is.null(SeqcDirs)) {
91
-        if (is.null(samples)) {
92
-            stop("samples can not be NULL if SeqcDirs is NULL!")
93
-        }
94
-        for (i in seq_along(samples)) {
95
-            if (!dir.exists(samples[i])) {
96
-                stop("Sample folder does not exist!\n", samples[i])
97
-            }
98
-        }
99
-    } else {
100
-        if (is.null(samples)) {
101
-            for (i in seq_along(SeqcDirs)) {
102
-                if (length(list.dirs(SeqcDirs[i],
103
-                    recursive = FALSE)) == 0) {
104
-                    warning("Empty folder. Skipping SeqcDirs ",
105
-                        SeqcDirs[i])
106
-                }
107
-            }
108
-        } else {
109
-            if (!(length(samples) == length(SeqcDirs))) {
110
-                stop("Length of samples is not equal to length of ",
111
-                    "SeqcDirs!")
112
-            } else {
113
-                for (i in seq_along(SeqcDirs)) {
114
-                    paths <- file.path(SeqcDirs[i], samples[i])
115
-                    ## why need a for loop below
116
-                    for (j in seq_along(paths)) {  
117
-                        if (!dir.exists(paths[j])) {
118
-                            stop("Sample folder does not exist!\n",
119
-                                paths[j])
120
-                        }
121
-                    }
122
-                }
123
-            }
124
-        }
125
-    }
126
-
127
-    if (!(class %in% c("DelayedArray", "Matrix", "matrix"))) {
128
-        stop("Invalid 'class' argument! ", "Only accept 'DelayedArray', 'Matric' or 'matrix'")
129
-    }
130
-
131
-    if (is.null(prefix)) {
132
-        stop("prefix of output files could not be null ")
133
-    }
134
-
135
-}
136
-
137
-# .getSamplesPaths <- function(SeqcDirs, samples){
138
-#     if (is.null(SeqcDirs)){
139
-#         res <- samples
140
-#     } else {
141
-#         if (is.null(samples)){
142
-#             ## We assume there are only sample directories udner SeqcDirs
143
-#             res <- list.dirs(SeqcDirs, recursive = FALSE)
144
-#         } else {
145
-#             res <- vector("list", length = length(SeqcDirs))
146
-#             for (i in seq_along(SeqcDirs)) {
147
-#                 res[[i]] <- file.path(SeqcDirs[i], samples[i])
148
-#             }
149
-#             res <- unlist(res)
150
-#         }
151
-#     }
152
-#     return(res)
153
-# }
154
-
155
-# .getSampleNames <- function(samplesDir) {
156
-#     res <- basename(samplesDir)
157
-#     return(res)
158
-# }
159
-
160 21
 .unionGeneMatrix <- function(geneUnion, matrix){
161 22
     missGene <- geneUnion[!geneUnion %in% rownames(matrix)]
162 23
     missMat <- Matrix::Matrix(0, nrow = length(missGene), ncol = ncol(matrix),
... ...
@@ -190,46 +51,52 @@
190 51
     feNotFirstCol,
191 52
     combinedSample) {
192 53
 
193
-    .checkArgsImportSeqc(SeqcDirs, samples, class, prefix)
194
-    sampleDirs <- .getSamplesPaths(SeqcDirs, samples)
54
+    if (length(SeqcDirs) != length(samples)) {
55
+        stop("'SeqcDirs' and 'samples' have unequal lengths!")
56
+    }
57
+
58
+    if (length(SeqcDirs) != length(prefix)) {
59
+        stop("'SeqcDirs' and 'prefix' have unequal lengths!")
60
+    }
195 61
 
196
-    res <- vector("list", length = length(sampleDirs))
197
-    cb <- vector("list", length = length(sampleDirs))
198
-    fe <- vector("list", length = length(sampleDirs))
199
-    mat <- vector("list", length = length(sampleDirs))
62
+    res <- vector("list", length = length(SeqcDirs))
63
+    cb <- vector("list", length = length(SeqcDirs))
64
+    fe <- vector("list", length = length(SeqcDirs))
65
+    mat <- vector("list", length = length(SeqcDirs))
200 66
 
201
-    for (i in seq_along(sampleDirs)) {
67
+    for (i in seq_along(SeqcDirs)) {
68
+        dir <- SeqcDirs[i]
202 69
         matrixFile <- paste(prefix[i], 'sparse_molecule_counts.mtx', sep = "_")
203 70
         featuresFile <- paste(prefix[i], 'sparse_counts_genes.csv', sep = "_")
204 71
         barcodesFile <- paste(prefix[i], 'sparse_counts_barcodes.csv', sep = "_")
205 72
 
206
-        cb[[i]] <- .readBarcodes(file.path(sampleDirs[i], barcodesFile))
73
+        cb[[i]] <- .readBarcodes(file.path(dir, barcodesFile))
207 74
         if (isTRUE(cbNotFirstCol)) {
208 75
             message("First column of barcode file was row index and it was removed.")
209 76
             cb[[i]] <- cb[[i]][, -1]            
210 77
         }
211 78
 
212
-        fe[[i]] <- .readFeatures(file.path(sampleDirs[i], featuresFile))
79
+        fe[[i]] <- .readFeatures(file.path(dir, featuresFile))
213 80
         if (isTRUE(feNotFirstCol)) {
214 81
             message("First column of gene file was row index and it was removed.")
215 82
             fe[[i]] <- fe[[i]][, -1]            
216 83
         }
217 84
 
218
-        mat[[i]] <- .readMatrixMM(file.path(sampleDirs[i], matrixFile), 
85
+        mat[[i]] <- .readMatrixMM(file.path(dir, matrixFile), 
219 86
             gzipped = gzipped, class = 'Matrix')
220 87
         mat[[i]] <- t(mat[[i]])
221 88
         rownames(mat[[i]]) <- fe[[i]][[1]]
222 89
     }
223 90
 
224
-    if (isTRUE(combinedSample) & length(sampleDirs) > 1) {
91
+    if (isTRUE(combinedSample) & length(SeqcDirs) > 1) {
225 92
         geneUnion <- .getGeneUnion(fe)
226
-        for (i in seq_along(sampleDirs)) {
93
+        for (i in seq_along(SeqcDirs)) {
227 94
             matrix <- .unionGeneMatrix(geneUnion = geneUnion, matrix = mat[[i]])
228 95
             matrix <- matrix[geneUnion, ]
229 96
             feature <- S4Vectors::DataFrame('feature_name' = rownames(matrix))
230 97
 
231 98
             scei <- .constructSCEFromSeqcOutputs(
232
-                sampleName = .getSampleNames(sampleDirs[i]),