Browse code

Update dedup functionality for import data functions & update tutorial with VAM

Yichen Wang authored on 22/11/2021 17:57:17
Showing 28 changed files

... ...
@@ -150,7 +150,7 @@ export(sctkPythonInstallConda)
150 150
 export(sctkPythonInstallVirtualEnv)
151 151
 export(selectSCTKConda)
152 152
 export(selectSCTKVirtualEnvironment)
153
-export(setSCERowNames)
153
+export(setRowNames)
154 154
 export(setSCTKDisplayRow)
155 155
 export(seuratComputeHeatmap)
156 156
 export(seuratComputeJackStraw)
... ...
@@ -14,6 +14,8 @@
14 14
 #'  object. Can be one of "Matrix" (as returned by
15 15
 #'  \link{readMM} function), or "matrix" (as returned by
16 16
 #'  \link[base]{matrix} function). Default "Matrix".
17
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
18
+#'  \code{TRUE}.
17 19
 #' @return A \code{SingleCellExperiment} object containing the count
18 20
 #'  matrix, the feature annotations, and the cell annotation
19 21
 #'  (which includes QC metrics stored in 'featureDump.txt').
... ...
@@ -24,7 +26,8 @@ importAlevin <- function(
24 26
   alevinDir = NULL,
25 27
   sampleName = 'sample',
26 28
   delayedArray = FALSE,
27
-  class = c("Matrix", "matrix")) {
29
+  class = c("Matrix", "matrix"),
30
+  rowNamesDedup = TRUE) {
28 31
 
29 32
   class <- match.arg(class)
30 33
 
... ...
@@ -45,6 +48,15 @@ importAlevin <- function(
45 48
   if (delayedArray) {
46 49
     mat <- DelayedArray::DelayedArray(mat)
47 50
   }
51
+  
52
+  if (isTRUE(rowNamesDedup)) {
53
+    if (any(duplicated(rownames(mat)))) {
54
+      message("Duplicated gene names found, adding '-1', '-2', ",
55
+              "... suffix to them.")
56
+    }
57
+    mat <- dedupRowNames(mat)
58
+  }
59
+  
48 60
   genes <- rownames(mat)
49 61
   cb <- .readBarcodes(file.path(alevinDir, 'alevin/featureDump.txt'),
50 62
                       header = 'auto',
... ...
@@ -62,6 +74,6 @@ importAlevin <- function(
62 74
     column_name = coln,
63 75
     sample = sampleName,
64 76
     row.names = coln)
65
-
77
+  
66 78
   return(sce)
67 79
 }
... ...
@@ -120,7 +120,8 @@
120 120
 importAnnData <- function(sampleDirs = NULL,
121 121
                           sampleNames = NULL,
122 122
                           delayedArray = FALSE,
123
-                          class = c("Matrix", "matrix")) {
123
+                          class = c("Matrix", "matrix"),
124
+                          rowNamesDedup = TRUE) {
124 125
 
125 126
   if (length(sampleDirs)!=length(sampleNames)){
126 127
     stop("Number of sampleDirs must be equal to number of SampleNames. Please provide sample names for all input directories")
... ...
@@ -138,6 +139,15 @@ importAnnData <- function(sampleDirs = NULL,
138 139
     res[[i]] <- scei
139 140
   }
140 141
   sce <- do.call(SingleCellExperiment::cbind, res)
142
+  
143
+  if (isTRUE(rowNamesDedup)) {
144
+    if (any(duplicated(rownames(sce)))) {
145
+      message("Duplicated gene names found, adding '-1', '-2', ",
146
+              "... suffix to them.")
147
+    }
148
+    sce <- dedupRowNames(sce)
149
+  }
150
+  
141 151
   return(sce)
142 152
 }
143 153
 
... ...
@@ -27,7 +27,7 @@
27 27
         column_name = coln,
28 28
         sample = sample,
29 29
         row.names = coln)
30
-
30
+    
31 31
     return(sce)
32 32
 }
33 33
 
... ...
@@ -41,7 +41,8 @@
41 41
     barcodesFileNames,
42 42
     gzipped,
43 43
     class,
44
-    delayedArray) {
44
+    delayedArray,
45
+    rowNamesDedup) {
45 46
 
46 47
     if (length(BUStoolsDirs) != length(samples)) {
47 48
         stop("'BUStoolsDirs' and 'samples' have unequal lengths!")
... ...
@@ -68,6 +69,15 @@
68 69
     }
69 70
 
70 71
     sce <- do.call(SingleCellExperiment::cbind, res)
72
+    
73
+    if (isTRUE(rowNamesDedup)) {
74
+        if (any(duplicated(rownames(sce)))) {
75
+            message("Duplicated gene names found, adding '-1', '-2', ",
76
+                    "... suffix to them.")
77
+        }
78
+        sce <- dedupRowNames(sce)
79
+    }
80
+    
71 81
     return(sce)
72 82
 }
73 83
 
... ...
@@ -103,6 +113,8 @@
103 113
 #'  \link[base]{matrix} function). Default "Matrix".
104 114
 #' @param delayedArray Boolean. Whether to read the expression matrix as
105 115
 #'  \link[DelayedArray]{DelayedArray-class} object or not. Default \code{FALSE}.
116
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
117
+#'  \code{TRUE}.
106 118
 #' @return A \code{SingleCellExperiment} object containing the count
107 119
 #'  matrix, the gene annotation, and the cell annotation.
108 120
 #' @examples
... ...
@@ -140,7 +152,8 @@ importBUStools <- function(
140 152
     barcodesFileNames = "genes.barcodes.txt",
141 153
     gzipped = "auto",
142 154
     class = c("Matrix", "matrix"),
143
-    delayedArray = FALSE) {
155
+    delayedArray = FALSE,
156
+    rowNamesDedup = TRUE) {
144 157
 
145 158
     class <- match.arg(class)
146 159
 
... ...
@@ -152,5 +165,6 @@ importBUStools <- function(
152 165
         barcodesFileNames = barcodesFileNames,
153 166
         gzipped = gzipped,
154 167
         class = class,
155
-        delayedArray = delayedArray)
168
+        delayedArray = delayedArray,
169
+        rowNamesDedup = rowNamesDedup)
156 170
 }
... ...
@@ -347,7 +347,8 @@
347 347
     barcodesFileNames,
348 348
     gzipped,
349 349
     class,
350
-    delayedArray) {
350
+    delayedArray,
351
+    rowNamesDedup) {
351 352
 
352 353
     .checkArgsImportCellRanger(cellRangerDirs,
353 354
         sampleDirs,
... ...
@@ -400,6 +401,14 @@
400 401
     }
401 402
 
402 403
     sce <- do.call(SingleCellExperiment::cbind, res)
404
+    
405
+    if (isTRUE(rowNamesDedup)) {
406
+        if (any(duplicated(rownames(sce)))) {
407
+            message("Duplicated gene names found, adding '-1', '-2', ",
408
+                    "... suffix to them.")
409
+        }
410
+        sce <- dedupRowNames(sce)
411
+    }
403 412
     return(sce)
404 413
 }
405 414
 
... ...
@@ -531,6 +540,8 @@
531 540
 #'  \code{dataTypeV2} will be used to determine Cell Ranger output directory. If it has
532 541
 #'  length 1, it assumes that all samples use the same genome reference and
533 542
 #'  the function will load only filtered or raw data.
543
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
544
+#'  \code{TRUE}.
534 545
 #' @details
535 546
 #'  \code{importCellRangerV2} imports output from Cell Ranger V2.
536 547
 #'  \code{importCellRangerV2Sample} imports output from one sample from Cell
... ...
@@ -571,7 +582,8 @@ importCellRanger <- function(
571 582
     barcodesFileNames = "barcodes.tsv.gz",
572 583
     gzipped = "auto",
573 584
     class = c("Matrix", "matrix"),
574
-    delayedArray = FALSE) {
585
+    delayedArray = FALSE,
586
+    rowNamesDedup = TRUE) {
575 587
 
576 588
     class <- match.arg(class)
577 589
     dataType <- match.arg(dataType)
... ...
@@ -586,7 +598,8 @@ importCellRanger <- function(
586 598
         barcodesFileNames = barcodesFileNames,
587 599
         gzipped = gzipped,
588 600
         class = class,
589
-        delayedArray = delayedArray)
601
+        delayedArray = delayedArray,
602
+        rowNamesDedup = rowNamesDedup)
590 603
 }
591 604
 
592 605
 
... ...
@@ -611,7 +624,8 @@ importCellRangerV2 <- function(
611 624
     class = c("Matrix", "matrix"),
612 625
     delayedArray = FALSE,
613 626
     reference = NULL,
614
-    cellRangerOutsV2 = NULL) {
627
+    cellRangerOutsV2 = NULL,
628
+    rowNamesDedup = TRUE) {
615 629
 
616 630
     class <- match.arg(class)
617 631
     dataTypeV2 <- match.arg(dataTypeV2)
... ...
@@ -628,7 +642,6 @@ importCellRangerV2 <- function(
628 642
         cellRangerOutsV2 <- .getCellRangerOutV2(dataTypeV2, reference)
629 643
     }
630 644
 
631
-
632 645
     .importCellRanger(cellRangerDirs = cellRangerDirs,
633 646
         sampleDirs = sampleDirs,
634 647
         sampleNames = sampleNames,
... ...
@@ -639,8 +652,8 @@ importCellRangerV2 <- function(
639 652
         barcodesFileNames = "barcodes.tsv",
640 653
         gzipped = FALSE,
641 654
         class = class,
642
-        delayedArray = delayedArray)
643
-
655
+        delayedArray = delayedArray,
656
+        rowNamesDedup = rowNamesDedup)
644 657
 }
645 658
 
646 659
 
... ...
@@ -658,6 +671,8 @@ importCellRangerV2 <- function(
658 671
 #'  \link[base]{matrix} function). Default "Matrix".
659 672
 #' @param delayedArray Boolean. Whether to read the expression matrix as
660 673
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
674
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
675
+#'  \code{TRUE}.
661 676
 #' @return A \code{SingleCellExperiment} object containing the count
662 677
 #'  matrix, the feature annotations, and the cell annotation for the sample.
663 678
 #' @examples
... ...
@@ -670,7 +685,8 @@ importCellRangerV2Sample <- function(
670 685
     dataDir = NULL,
671 686
     sampleName = NULL,
672 687
     class = c("Matrix", "matrix"),
673
-    delayedArray = FALSE) {
688
+    delayedArray = FALSE,
689
+    rowNamesDedup = TRUE) {
674 690
 
675 691
     class <- match.arg(class)
676 692
 
... ...
@@ -684,7 +700,8 @@ importCellRangerV2Sample <- function(
684 700
         barcodesFileNames = "barcodes.tsv",
685 701
         gzipped = FALSE,
686 702
         class = class,
687
-        delayedArray = delayedArray)
703
+        delayedArray = delayedArray,
704
+        rowNamesDedup = rowNamesDedup)
688 705
 }
689 706
 
690 707
 
... ...
@@ -702,7 +719,8 @@ importCellRangerV3 <- function(
702 719
     sampleNames = NULL,
703 720
     dataType = c("filtered", "raw"),
704 721
     class = c("Matrix", "matrix"),
705
-    delayedArray = FALSE) {
722
+    delayedArray = FALSE,
723
+    rowNamesDedup = TRUE) {
706 724
 
707 725
     class <- match.arg(class)
708 726
     dataType <- match.arg(dataType)
... ...
@@ -723,7 +741,8 @@ importCellRangerV3 <- function(
723 741
         barcodesFileNames = "barcodes.tsv.gz",
724 742
         gzipped = TRUE,
725 743
         class = class,
726
-        delayedArray = delayedArray)
744
+        delayedArray = delayedArray,
745
+        rowNamesDedup = rowNamesDedup)
727 746
 
728 747
 }
729 748
 
... ...
@@ -742,6 +761,8 @@ importCellRangerV3 <- function(
742 761
 #'  \link[base]{matrix} function). Default "Matrix".
743 762
 #' @param delayedArray Boolean. Whether to read the expression matrix as
744 763
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
764
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
765
+#'  \code{TRUE}.
745 766
 #' @return A \code{SingleCellExperiment} object containing the count
746 767
 #'  matrix, the feature annotations, and the cell annotation for the sample.
747 768
 #' @examples
... ...
@@ -754,7 +775,8 @@ importCellRangerV3Sample <- function(
754 775
     dataDir = "./",
755 776
     sampleName = "sample",
756 777
     class = c("Matrix", "matrix"),
757
-    delayedArray = FALSE) {
778
+    delayedArray = FALSE,
779
+    rowNamesDedup = TRUE) {
758 780
 
759 781
     class <- match.arg(class)
760 782
 
... ...
@@ -768,5 +790,6 @@ importCellRangerV3Sample <- function(
768 790
         barcodesFileNames = "barcodes.tsv.gz",
769 791
         gzipped = TRUE,
770 792
         class = class,
771
-        delayedArray = delayedArray)
793
+        delayedArray = delayedArray,
794
+        rowNamesDedup = rowNamesDedup)
772 795
 }
... ...
@@ -102,7 +102,8 @@
102 102
 #'  object. Can be one of "Matrix" (as returned by
103 103
 #'  \link{readMM} function), or "matrix" (as returned by
104 104
 #'  \link[base]{matrix} function). Default \code{"Matrix"}.
105
-
105
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
106
+#'  \code{TRUE}.
106 107
 #' @details
107 108
 #' \code{importDropEst} expects either raw counts matrix stored as "cm_raw" or filtered
108 109
 #' counts matrix stored as "cm" in the DropEst rds output.
... ...
@@ -125,7 +126,8 @@ importDropEst <- function(sampleDirs = NULL,
125 126
                           rdsFileName = 'cell.counts',
126 127
                           sampleNames = NULL,
127 128
                           delayedArray = FALSE,
128
-                          class = c("Matrix", "matrix")) {
129
+                          class = c("Matrix", "matrix"),
130
+                          rowNamesDedup = TRUE) {
129 131
   dataType <- match.arg(dataType)
130 132
   class <- match.arg(class)
131 133
 
... ...
@@ -145,6 +147,15 @@ importDropEst <- function(sampleDirs = NULL,
145 147
     res[[i]] <- scei
146 148
   }
147 149
   sce <- do.call(SingleCellExperiment::cbind, res)
150
+  
151
+  if (isTRUE(rowNamesDedup)) {
152
+    if (any(duplicated(rownames(sce)))) {
153
+      message("Duplicated gene names found, adding '-1', '-2', ",
154
+              "... suffix to them.")
155
+    }
156
+    sce <- dedupRowNames(sce)
157
+  }
158
+  
148 159
   return(sce)
149 160
 }
150 161
 
... ...
@@ -13,6 +13,8 @@
13 13
 #'   \code{"Matrix"}.
14 14
 #' @param delayedArray Boolean. Whether to read the expression matrix as
15 15
 #'   \link{DelayedArray} object or not. Default \code{FALSE}.
16
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
17
+#' \code{TRUE}.
16 18
 #' @details See the list below for the available datasets and their
17 19
 #' descriptions.
18 20
 #' \describe{
... ...
@@ -43,7 +45,7 @@
43 45
 #' @export
44 46
 #' @importFrom SummarizedExperiment colData rowData colData<- assay assays
45 47
 importExampleData <- function(dataset, class = c("Matrix", "matrix"),
46
-                              delayedArray = FALSE) {
48
+                              delayedArray = FALSE, rowNamesDedup = TRUE) {
47 49
   class <- match.arg(class)
48 50
 
49 51
   scRNAseqDatasets <- c("fluidigm_pollen", "allen_tasic")
... ...
@@ -62,13 +64,16 @@ importExampleData <- function(dataset, class = c("Matrix", "matrix"),
62 64
       temp <- scRNAseq::ReprocessedAllenData()
63 65
       temp$sample <- paste0(colData(temp)$driver_1_s, "_", colData(temp)$dissection_s)
64 66
     }
67
+    if (isTRUE(rowNamesDedup)) {
68
+      temp <- dedupRowNames(temp)
69
+    }
65 70
   } else if (dataset %in% tenxPbmcDatasets) {
66 71
     if(!("TENxPBMCData" %in% rownames(utils::installed.packages()))) {
67 72
       stop(paste0("Package 'TENxPBMCData' is not installed. Please install to load dataset '", dataset, "'."))
68 73
     }
69 74
     temp <- TENxPBMCData::TENxPBMCData(dataset = dataset)
70 75
     colnames(temp) <- paste(temp$Sample, temp$Barcode, sep="_")
71
-    rownames(temp) <- rowData(temp)$Symbol_TENx
76
+    temp <- setRowNames(temp, "Symbol_TENx", dedup = rowNamesDedup)
72 77
     colData(temp)$sample <- colData(temp)$Sample
73 78
   } else {
74 79
     stop("'dataset' must be one of: ", paste(c(scRNAseqDatasets, tenxPbmcDatasets), collapse = ","))
... ...
@@ -1,121 +1,147 @@
1
-.checkGzip <- function(path, gzipped){
2
-    if (gzipped == "auto") {
3
-      ext <- tools::file_ext(path)
4
-      if (ext == "gz") {
5
-            path <- gzfile(path)
6
-        }
7
-    } else if (isTRUE(gzipped)) {
8
-        path <- gzfile(path)
9
-    }
10
-
11
-    return(path)
12
-}
13
-
14
-#' Create a SingleCellExperiment object from files
15
-#'
16
-#' Creates a SingleCellExperiment object from a counts file in various formats.
17
-#' and a file of annotation information, .
18
-#'
19
-#' @param assayFile The path to a  file in .mtx, .txt, .csv, .tab, or .tsv format.
20
-#' @param annotFile The path to a text file that contains columns of annotation
21
-#' information for each sample in the assayFile. This file should have the same
22
-#' number of rows as there are columns in the assayFile. If multiple samples are
23
-#' represented in these files, this should be denoted by a column called \code{'sample'}
24
-#' within the \code{annotFile}.
25
-#' @param featureFile The path to a text file that contains columns of
26
-#' annotation information for each gene in the count matrix. This file should
27
-#' have the same genes in the same order as assayFile. This is optional.
28
-#' @param assayName The name of the assay that you are uploading. The default
29
-#' is "counts".
30
-#' @param inputDataFrames If TRUE, assayFile and annotFile are read as data
31
-#' frames instead of file paths. The default is FALSE.
32
-#' @param class Character. The class of the expression matrix stored in the SCE
33
-#'  object. Can be one of "Matrix" (as returned by
34
-#'  \link{readMM} function), or "matrix" (as returned by
35
-#'  \link[base]{matrix} function). Default "Matrix".
36
-#' @param annotFileHeader Whether there's a header (colnames) in the cell annotation file. Default is FALSE
37
-#' @param annotFileRowName Which column is used as the rownames for the cell annotation file. Default is 1 (first column).
38
-#' @param annotFileSep Separater used for the cell annotation file. Default is "\\t".
39
-#' @param featureHeader Whether there's a header (colnames) in the feature annotation file. Default is FALSE
40
-#' @param featureRowName Which column is used as the rownames for the feature annotation file. Default is 1 (first column).
41
-#' @param featureSep Separater used for the feature annotation file. Default is "\\t".
42
-#' @param gzipped Whether the input file is gzipped. Default is "auto" and it will automatically detect whether the file is gzipped. Other options is TRUE or FALSE.
43
-#' @param delayedArray Boolean. Whether to read the expression matrix as
44
-#'  \link{DelayedArray} object or not. Default \code{FALSE}.
45
-#' @return a SingleCellExperiment object
46
-#' @export
47
-
48
-importFromFiles <- function(assayFile, annotFile = NULL, featureFile = NULL,
49
-                            assayName = "counts", inputDataFrames = FALSE,
50
-                            class = c("Matrix", "matrix"), delayedArray = FALSE,
51
-                            annotFileHeader = FALSE, annotFileRowName = 1,
52
-                            annotFileSep = "\t", featureHeader = FALSE,
53
-                            featureRowName = 1, featureSep = "\t", gzipped = "auto"
54
-                            ){
55
-
56
-  class <- match.arg(class)
57
-
58
-  if (inputDataFrames){
59
-    countsin <- assayFile
60
-    annotin <- annotFile
61
-    featurein <- featureFile
62
-  } else{
63
-    countsin <- readSingleCellMatrix(assayFile, class = class, delayedArray = delayedArray)
64
-    if (!is.null(annotFile)){
65
-      annotFile <- .checkGzip(annotFile, gzipped = gzipped)
66
-      annotin <- utils::read.table(annotFile, sep = annotFileSep, header = annotFileHeader,
67
-                                   row.names = annotFileRowName, stringsAsFactors = FALSE)
68
-    }
69
-    if (!is.null(featureFile)){
70
-      featureFile <- .checkGzip(featureFile, gzipped = gzipped)
71
-      featurein <- utils::read.table(featureFile, sep = featureSep, header = featureHeader,
72
-                                     row.names = featureRowName, stringsAsFactors = FALSE)
73
-    }
74
-  }
75
-  if (is.null(annotFile)){
76
-    annotin <- data.frame(row.names = colnames(countsin))
77
-    annotin <- S4Vectors::DataFrame(annotin)
78
-  }
79
-  if (is.null(featureFile)){
80
-    featurein <- data.frame(Gene = rownames(countsin))
81
-    rownames(featurein) <- featurein$Gene
82
-    featurein <- S4Vectors::DataFrame(featurein)
83
-  }
84
-
85
-  if (nrow(annotin) != ncol(countsin)){
86
-    stop("Different number of samples in input matrix and annotations: annot: ",
87
-         nrow(annotin), ", counts: ", ncol(countsin))
88
-  }
89
-  if (nrow(featurein) != nrow(countsin)){
90
-    stop("Different number of samples in input matrix and feature annotation",
91
-         nrow(featurein), ", counts: ", nrow(countsin))
92
-  }
93
-  if (any(rownames(annotin) != colnames(countsin))){
94
-    stop("Sample names in input matrix and annotation do not match!\nExample: ",
95
-         rownames(annotin)[rownames(annotin) != colnames(countsin)][1], " vs. ",
96
-         colnames(countsin)[rownames(annotin) != colnames(countsin)][1])
97
-  }
98
-  if (any(rownames(featurein) != rownames(countsin))){
99
-    stop("Sample names in input matrix and feature annotation do not match!")
100
-  }
101
-
102
-  assaylist <- list()
103
-  if (is.null(rownames(countsin))){
104
-    rownames(countsin) <- rownames(featurein)
105
-  }
106
-  if (is.null(colnames(countsin))){
107
-    colnames(countsin) <- rownames(annotin)
108
-  }
109
-  #assaylist[[assayName]] <- methods::as(countsin, "dgCMatrix")
110
-  assaylist[[assayName]] <- .convertToMatrix(countsin)
111
-
112
-  newassay <- SingleCellExperiment::SingleCellExperiment(assays = assaylist,
113
-                                                         colData = annotin,
114
-                                                         rowData = featurein)
115
-
116
-  if(is.null(newassay$sample)) {
117
-    newassay$sample <- "sample"
118
-  }
119
-
120
-  return(newassay)
121
-}
1
+.checkGzip <- function(path, gzipped){
2
+    if (gzipped == "auto") {
3
+      ext <- tools::file_ext(path)
4
+      if (ext == "gz") {
5
+            path <- gzfile(path)
6
+        }
7
+    } else if (isTRUE(gzipped)) {
8
+        path <- gzfile(path)
9
+    }
10
+
11
+    return(path)
12
+}
13
+
14
+#' Create a SingleCellExperiment object from files
15
+#'
16
+#' @details Creates a \linkS4class{SingleCellExperiment} object from a counts 
17
+#' file in various formats, and files of cell and feature annotation.
18
+#' @param assayFile The path to a file in .mtx, .txt, .csv, .tab, or .tsv 
19
+#' format.
20
+#' @param annotFile The path to a text file that contains columns of annotation
21
+#' information for each cell in the \code{assayFile}. This file should have the 
22
+#' same number of rows as there are columns in the \code{assayFile}. If multiple
23
+#' samples are represented in the dataset, this should be denoted by a column 
24
+#' called \code{'sample'} within the \code{annotFile}.
25
+#' @param featureFile The path to a text file that contains columns of
26
+#' annotation information for each gene in the count matrix. This file should
27
+#' have the same genes in the same order as \code{assayFile}. This is optional.
28
+#' @param assayName The name of the assay that you are uploading. The default
29
+#' is \code{"counts"}.
30
+#' @param inputDataFrames If \code{TRUE}, \code{assayFile}, \code{annotFile} and
31
+#' \code{featureFile} should be \code{data.frames} object (or its inheritance) 
32
+#' instead of file paths. The default is \code{FALSE}.
33
+#' @param class Character. The class of the expression matrix stored in the SCE
34
+#'  object. Can be one of \code{"Matrix"} (as returned by
35
+#'  \link{readMM} function), or \code{"matrix"} (as returned by
36
+#'  \link[base]{matrix} function). Default \code{"Matrix"}.
37
+#' @param annotFileHeader Whether there's a header (colnames) in the cell 
38
+#' annotation file. Default is \code{FALSE}.
39
+#' @param annotFileRowName Which column is used as the rownames for the cell 
40
+#' annotation file. This should match to the colnames of the \code{assayFile}. 
41
+#' Default is \code{1} (first column).
42
+#' @param annotFileSep Separater used for the cell annotation file. Default is 
43
+#' \code{"\\t"}.
44
+#' @param featureHeader Whether there's a header (colnames) in the feature 
45
+#' annotation file. Default is \code{FALSE}.
46
+#' @param featureRowName Which column is used as the rownames for the feature 
47
+#' annotation file. This should match to the rownames of the \code{assayFile}. 
48
+#' Default is \code{1}. (first column).
49
+#' @param featureSep Separater used for the feature annotation file. Default is 
50
+#' \code{"\\t"}.
51
+#' @param gzipped Whether the input file is gzipped. Default is \code{"auto"} 
52
+#' and it will automatically detect whether the file is gzipped. Other options 
53
+#' are \code{TRUE} or \code{FALSE}.
54
+#' @param delayedArray Boolean. Whether to read the expression matrix as
55
+#'  \link{DelayedArray} object or not. Default \code{FALSE}.
56
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
57
+#'  \code{TRUE}.
58
+#' @return a \linkS4class{SingleCellExperiment} object
59
+#' @export
60
+
61
+importFromFiles <- function(assayFile, annotFile = NULL, featureFile = NULL,
62
+                            assayName = "counts", inputDataFrames = FALSE,
63
+                            class = c("Matrix", "matrix"), delayedArray = FALSE,
64
+                            annotFileHeader = FALSE, annotFileRowName = 1,
65
+                            annotFileSep = "\t", featureHeader = FALSE,
66
+                            featureRowName = 1, featureSep = "\t", 
67
+                            gzipped = "auto", rowNamesDedup = TRUE){
68
+
69
+  class <- match.arg(class)
70
+
71
+  if (inputDataFrames){
72
+    countsin <- assayFile
73
+    annotin <- annotFile
74
+    featurein <- featureFile
75
+  } else{
76
+    countsin <- readSingleCellMatrix(assayFile, class = class, 
77
+                                     delayedArray = delayedArray)
78
+    if (!is.null(annotFile)){
79
+      annotFile <- .checkGzip(annotFile, gzipped = gzipped)
80
+      annotin <- utils::read.table(annotFile, sep = annotFileSep, 
81
+                                   header = annotFileHeader,
82
+                                   row.names = annotFileRowName, 
83
+                                   stringsAsFactors = FALSE)
84
+    }
85
+    if (!is.null(featureFile)){
86
+      featureFile <- .checkGzip(featureFile, gzipped = gzipped)
87
+      featurein <- utils::read.table(featureFile, sep = featureSep, 
88
+                                     header = featureHeader,
89
+                                     row.names = featureRowName, 
90
+                                     stringsAsFactors = FALSE)
91
+    }
92
+  }
93
+  if (is.null(annotFile)){
94
+    annotin <- data.frame(row.names = colnames(countsin))
95
+    annotin <- S4Vectors::DataFrame(annotin)
96
+  }
97
+  if (is.null(featureFile)){
98
+    featurein <- data.frame(Gene = rownames(countsin))
99
+    rownames(featurein) <- featurein$Gene
100
+    featurein <- S4Vectors::DataFrame(featurein)
101
+  }
102
+
103
+  if (nrow(annotin) != ncol(countsin)){
104
+    stop("Different number of cells in input matrix and annotations: annot: ",
105
+         nrow(annotin), ", counts: ", ncol(countsin))
106
+  }
107
+  if (nrow(featurein) != nrow(countsin)){
108
+    stop("Different number of features in input matrix and feature annotation",
109
+         nrow(featurein), ", counts: ", nrow(countsin))
110
+  }
111
+  if (any(rownames(annotin) != colnames(countsin))){
112
+    stop("Cell names in input matrix and annotation do not match!\nExample: ",
113
+         rownames(annotin)[rownames(annotin) != colnames(countsin)][1], " vs. ",
114
+         colnames(countsin)[rownames(annotin) != colnames(countsin)][1])
115
+  }
116
+  if (any(rownames(featurein) != rownames(countsin))){
117
+    stop("Feature names in input matrix and feature annotation do not match!")
118
+  }
119
+
120
+  assaylist <- list()
121
+  if (is.null(rownames(countsin))){
122
+    rownames(countsin) <- rownames(featurein)
123
+  }
124
+  if (is.null(colnames(countsin))){
125
+    colnames(countsin) <- rownames(annotin)
126
+  }
127
+  #assaylist[[assayName]] <- methods::as(countsin, "dgCMatrix")
128
+  assaylist[[assayName]] <- .convertToMatrix(countsin)
129
+
130
+  newassay <- SingleCellExperiment::SingleCellExperiment(assays = assaylist,
131
+                                                         colData = annotin,
132
+                                                         rowData = featurein)
133
+
134
+  if(is.null(newassay$sample)) {
135
+    newassay$sample <- "sample"
136
+  }
137
+
138
+  if (isTRUE(rowNamesDedup)) {
139
+    if (any(duplicated(rownames(newassay)))) {
140
+      message("Duplicated gene names found, adding '-1', '-2', ",
141
+              "... suffix to them.")
142
+    }
143
+    newassay <- dedupRowNames(newassay)
144
+  }
145
+  
146
+  return(newassay)
147
+}
... ...
@@ -10,20 +10,26 @@
10 10
   #  sparse <- reticulate::import("scipy.sparse")
11 11
   #  numpy <- reticulate::import("numpy")
12 12
   if (!reticulate::py_module_available(module = "scipy.sparse")) {
13
-    stop("Error!", "Cannot find python module 'scipy.sparse', please install Conda and run sctkPythonInstallConda()
14
-         or run sctkPythonInstallVirtualEnv(). If one of these have been previously run to install the modules,
15
-         make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), respectively, if R has been
16
-         restarted since the module installation. Alternatively, scipy can be installed on the local machine
17
-         with pip (e.g. pip install scipy) and then the 'use_python()' function from the 'reticulate' package
18
-         can be used to select the correct Python environment.")
13
+    stop("Error!", "Cannot find python module 'scipy.sparse', please install 
14
+          Conda and run sctkPythonInstallConda() or run 
15
+          sctkPythonInstallVirtualEnv(). If one of these have been previously 
16
+          run to install the modules, make sure to run selectSCTKConda() or 
17
+          selectSCTKVirtualEnvironment(), respectively, if R has been restarted 
18
+          since the module installation. Alternatively, scipy can be installed 
19
+          on the local machine with pip (e.g. pip install scipy) and then the 
20
+          'use_python()' function from the 'reticulate' package can be used to 
21
+         select the correct Python environment.")
19 22
   }
20 23
   if (!reticulate::py_module_available(module = "numpy")) {
21
-    stop("Error!", "Cannot find python module 'numpy', please install Conda and run sctkPythonInstallConda()
22
-         or run sctkPythonInstallVirtualEnv(). If one of these have been previously run to install the modules,
23
-         make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), respectively, if R has been
24
-         restarted since the module installation. Alternatively, numpy can be installed on the local machine
25
-         with pip (e.g. pip install numpy) and then the 'use_python()' function from the 'reticulate' package
26
-         can be used to select the correct Python environment.")
24
+    stop("Error!", "Cannot find python module 'numpy', please install Conda and 
25
+          run sctkPythonInstallConda() or run sctkPythonInstallVirtualEnv(). If 
26
+          one of these have been previously run to install the modules, make 
27
+          sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(), 
28
+          respectively, if R has been restarted since the module installation. 
29
+          Alternatively, numpy can be installed on the local machine with pip 
30
+          (e.g. pip install numpy) and then the 'use_python()' function from the
31
+          'reticulate' package can be used to select the correct Python 
32
+          environment.")
27 33
   }
28 34
 
29 35
   error <- try({
... ...
@@ -56,7 +62,9 @@
56 62
   }, silent = TRUE)
57 63
 
58 64
   if(inherits(error, "try-error")) {
59
-    stop(paste0("importOptimus did not complete successfully. SCE could not be generated. Error given during the import process: \n\n", error))
65
+    stop(paste0("importOptimus did not complete successfully. SCE could not be",
66
+                "generated. Error given during the import process: \n\n", 
67
+                error))
60 68
   }
61 69
 
62 70
   if (class == "matrix") {
... ...
@@ -180,7 +188,8 @@
180 188
   geneMetricsLocation,
181 189
   emptyDropsLocation,
182 190
   class,
183
-  delayedArray) {
191
+  delayedArray,
192
+  rowNamesDedup) {
184 193
 
185 194
   .checkArgsImportOptimus(OptimusDirs, samples)
186 195
 
... ...
@@ -201,6 +210,15 @@
201 210
   }
202 211
 
203 212
   sce <- do.call(SingleCellExperiment::cbind, res)
213
+  
214
+  if (isTRUE(rowNamesDedup)) {
215
+    if (any(duplicated(rownames(sce)))) {
216
+      message("Duplicated gene names found, adding '-1', '-2', ",
217
+              "... suffix to them.")
218
+    }
219
+    sce <- dedupRowNames(sce)
220
+  }
221
+  
204 222
   return(sce)
205 223
 }
206 224
 
... ...
@@ -247,6 +265,8 @@
247 265
 #'  \link[base]{matrix} function). Default "Matrix".
248 266
 #' @param delayedArray Boolean. Whether to read the expression matrix as
249 267
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
268
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
269
+#'  \code{TRUE}.
250 270
 #' @return A \link[SingleCellExperiment]{SingleCellExperiment} object
251 271
 #'  containing the count
252 272
 #'  matrix, the gene annotation, and the cell annotation.
... ...
@@ -267,7 +287,8 @@ importOptimus <- function(OptimusDirs,
267 287
   geneMetricsLocation = "call-MergeGeneMetrics/merged-gene-metrics.csv.gz",
268 288
   emptyDropsLocation = "call-RunEmptyDrops/empty_drops_result.csv",
269 289
   class = c("Matrix", "matrix"),
270
-  delayedArray = FALSE) {
290
+  delayedArray = FALSE,
291
+  rowNamesDedup = TRUE) {
271 292
 
272 293
   class <- match.arg(class)
273 294
 
... ...
@@ -280,6 +301,7 @@ importOptimus <- function(OptimusDirs,
280 301
     geneMetricsLocation = geneMetricsLocation,
281 302
     emptyDropsLocation = emptyDropsLocation,
282 303
     class = class,
283
-    delayedArray = delayedArray)
304
+    delayedArray = delayedArray,
305
+    rowNamesDedup = rowNamesDedup)
284 306
 
285 307
 }
... ...
@@ -40,7 +40,8 @@
40 40
     barcodesFileNames,
41 41
     gzipped,
42 42
     class,
43
-    delayedArray) {
43
+    delayedArray,
44
+    rowNamesDedup) {
44 45
 
45 46
     if (length(STARsoloDirs) != length(samples)) {
46 47
         stop("'STARsoloDirs' and 'samples' have unequal lengths!")
... ...
@@ -68,6 +69,14 @@
68 69
     }
69 70
 
70 71
     sce <- do.call(SingleCellExperiment::cbind, res)
72
+    
73
+    if (isTRUE(rowNamesDedup)) {
74
+        if (any(duplicated(rownames(sce)))) {
75
+            message("Duplicated gene names found, adding '-1', '-2', ",
76
+                    "... suffix to them.")
77
+        }
78
+        sce <- dedupRowNames(sce)
79
+    }
71 80
     return(sce)
72 81
 }
73 82
 
... ...
@@ -111,6 +120,8 @@
111 120
 #'  \link[base]{matrix} function). Default "Matrix".
112 121
 #' @param delayedArray Boolean. Whether to read the expression matrix as
113 122
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
123
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
124
+#'  \code{TRUE}.
114 125
 #' @return A \code{SingleCellExperiment} object containing the count
115 126
 #'  matrix, the gene annotation, and the cell annotation.
116 127
 #' @examples
... ...
@@ -151,7 +162,8 @@ importSTARsolo <- function(
151 162
     barcodesFileNames = "barcodes.tsv",
152 163
     gzipped = "auto",
153 164
     class = c("Matrix", "matrix"),
154
-    delayedArray = FALSE) {
165
+    delayedArray = FALSE,
166
+    rowNamesDedup = TRUE) {
155 167
 
156 168
     class <- match.arg(class)
157 169
 
... ...
@@ -164,5 +176,6 @@ importSTARsolo <- function(
164 176
         barcodesFileNames = barcodesFileNames,
165 177
         gzipped = gzipped,
166 178
         class = class,
167
-        delayedArray = delayedArray)
179
+        delayedArray = delayedArray,
180
+        rowNamesDedup = rowNamesDedup)
168 181
 }
... ...
@@ -74,7 +74,8 @@
74 74
     delayedArray,
75 75
     cbNotFirstCol,
76 76
     feNotFirstCol,
77
-    combinedSample) {
77
+    combinedSample,
78
+    rowNamesDedup) {
78 79
 
79 80
     if (length(seqcDirs) != length(samples)) {
80 81
         stop("'seqcDirs' and 'samples' have unequal lengths!")
... ...
@@ -121,6 +122,13 @@
121 122
 
122 123
         }
123 124
         sce <- do.call(SingleCellExperiment::cbind, res)
125
+        if (isTRUE(rowNamesDedup)) {
126
+            if (any(duplicated(rownames(sce)))) {
127
+                message("Duplicated gene names found, adding '-1', '-2', ",
128
+                        "... suffix to them.")
129
+            }
130
+            sce <- dedupRowNames(sce)
131
+        }
124 132
         return(sce)
125 133
 
126 134
     } else {
... ...
@@ -133,7 +141,15 @@
133 141
             res[[i]] <- scei
134 142
         }
135 143
         if (length(seqcDirs) == 1) {
136
-            return(res[[1]])
144
+            sce <- res[[1]]
145
+            if (isTRUE(rowNamesDedup)) {
146
+                if (any(duplicated(rownames(sce)))) {
147
+                    message("Duplicated gene names found, adding '-1', '-2', ",
148
+                            "... suffix to them.")
149
+                }
150
+                sce <- dedupRowNames(sce)
151
+            }
152
+            return(sce)
137 153
         } else {
138 154
             return(res)
139 155
         }
... ...
@@ -145,12 +161,11 @@
145 161
 #' @rdname importSEQC
146 162
 #' @title Construct SCE object from seqc output
147 163
 #' @description Read the filtered barcodes, features, and matrices for all
148
-#'  samples from (preferably a single run of) seqc output. Import and
149
-#'  combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
150
-#'  object.
164
+#'  samples from (preferably a single run of) seqc output. Import and combine
165
+#'  them as one big \link[SingleCellExperiment]{SingleCellExperiment} object.
151 166
 #' @param seqcDirs A vector of paths to seqc output files. Each sample
152
-#'  should have its own path. For example: \code{./pbmc_1k_50x50}.
153
-#'  Must have the same length as \code{samples}.
167
+#'  should have its own path. For example: \code{"./pbmc_1k_50x50"}. Must have 
168
+#'  the same length as \code{samples}.
154 169
 #' @param samples A vector of user-defined sample names for the samples to be
155 170
 #'  imported. Must have the same length as \code{seqcDirs}.
156 171
 #' @param prefix A vector containing the prefix of file names within each
... ...
@@ -158,44 +173,39 @@
158 173
 #'  length as \emph{samples}.
159 174
 #' @param gzipped Boolean. \code{TRUE} if the seqc output files
160 175
 #'  (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
161
-#'  sparse_molecule_counts.mtx)
162
-#'  were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
163
-#'  not gzipped.
164
-#' Default \code{FALSE}.
176
+#'  sparse_molecule_counts.mtx) were gzip compressed. \code{FALSE} otherwise. 
177
+#'  Default seqc outputs are not gzipped. Default \code{FALSE}.
165 178
 #' @param class Character. The class of the expression matrix stored in the SCE
166
-#'  object. Can be one of "Matrix" (as returned by
167
-#'  \link{readMM} function), or "matrix" (as returned by
168
-#'  \link[base]{matrix} function). Default "Matrix".
179
+#'  object. Can be one of \code{"Matrix"} (as returned by \link{readMM} 
180
+#'  function), or \code{"matrix"} (as returned by \link[base]{matrix} function).
181
+#'  Default \code{"Matrix"}.
169 182
 #' @param delayedArray Boolean. Whether to read the expression matrix as
170 183
 #'  \link{DelayedArray} object or not. Default \code{FALSE}.
171 184
 #' @param feNotFirstCol Boolean. \code{TRUE} if first column of
172
-#'  sparse_counts_genes.csv
173
-#' is row index and it will be removed. \code{FALSE} the first column will
174
-#'  be kept.
185
+#'  sparse_counts_genes.csv is row index and it will be removed. \code{FALSE} 
186
+#'  the first column will be kept.
175 187
 #' @param cbNotFirstCol Boolean. \code{TRUE} if first column of
176
-#'  sparse_counts_barcode.csv
177
-#' is row index and it will be removed. \code{FALSE} the first column will
178
-#'  be kept.
188
+#'  sparse_counts_barcode.csv is row index and it will be removed. \code{FALSE} 
189
+#'  the first column will be kept.
179 190
 #' @param combinedSample Boolean. If \code{TRUE}, \code{importSEQC} returns a
180 191
 #' \code{SingleCellExperiment} object containing the combined count matrix,
181
-#'  feature annotations
182
-#'  and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
183
-#'  list containing multiple
192
+#'  feature annotations and the cell annotations. If \code{FALSE}, 
193
+#'  \code{importSEQC} returns a list containing multiple
184 194
 #'  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
185 195
 #'  contains count matrix, feature annotations and cell annotations for
186 196
 #'  each sample.
197
+#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Only applied 
198
+#' if \code{combinedSample} is \code{TRUE} or only one \code{seqcDirs} 
199
+#' specified. Default \code{TRUE}.
187 200
 #' @details
188
-#' \code{importSEQC} imports output from seqc.
189
-#'  The default sparse_counts_barcode.csv or sparse_counts_genes.csv from
190
-#'  seqc output
201
+#' \code{importSEQC} imports output from seqc. The default 
202
+#'  sparse_counts_barcode.csv or sparse_counts_genes.csv from seqc output
191 203
 #'  contains two columns. The first column is row index and the second column
192
-#'  is cell-barcode
193
-#'  or gene symbol. \code{importSEQC} will remove first column. Alternatively,
194
-#'  user can call
204
+#'  is cell-barcode or gene symbol. \code{importSEQC} will remove first column. 
205
+#'  Alternatively, user can call
195 206
 #'  \code{cbNotFirstCol} or \code{feNotFirstCol} as FALSE to keep the first
196
-#'  column of these files.
197
-#'  When \code{combinedSample} is TRUE, \code{importSEQC} will combined count
198
-#'  matrix with genes detected in at least one sample.
207
+#'  column of these files. When \code{combinedSample} is TRUE, \code{importSEQC}
208
+#'  will combined count matrix with genes detected in at least one sample.
199 209
 #' @return A \code{SingleCellExperiment} object containing the combined count
200 210
 #'  matrix, the feature annotations, and the cell annotation.
201 211
 #' @examples
... ...
@@ -220,7 +230,8 @@ importSEQC <- function(
220 230
     delayedArray = FALSE,
221 231
     cbNotFirstCol = TRUE,
222 232
     feNotFirstCol = TRUE,
223
-    combinedSample = TRUE) {
233
+    combinedSample = TRUE,
234
+    rowNamesDedup = TRUE) {
224 235
 
225 236
     class <- match.arg(class)
226 237
 
... ...
@@ -232,5 +243,6 @@ importSEQC <- function(
232 243
         delayedArray = delayedArray,
233 244
         cbNotFirstCol = cbNotFirstCol,
234 245
         feNotFirstCol = feNotFirstCol,
235
-        combinedSample = combinedSample)
246
+        combinedSample = combinedSample,
247
+        rowNamesDedup = rowNamesDedup)
236 248
 }
... ...
@@ -204,7 +204,7 @@ discreteColorPalette <- function(n, palette = c("random", "ggplot", "celda"),
204 204
 #' insert a new column called \code{"rownames.uniq"} to \code{rowData(x)}, with
205 205
 #' the deduplicated rownames.
206 206
 #' @param return.list When set to \code{TRUE}, will return a character vector
207
-#' with deduplicated rownames.
207
+#' of the deduplicated rownames.
208 208
 #' @export
209 209
 #' @return By default, a matrix or /linkS4class{SingleCellExperiment} object
210 210
 #' with rownames deduplicated.
... ...
@@ -240,14 +240,16 @@ dedupRowNames <- function(x, as.rowData = FALSE, return.list = FALSE){
240 240
 
241 241
 #' Set rownames of SCE with a character vector or a rowData column
242 242
 #' @description Users can set rownames of an SCE object with either a character
243
-#' vector where the length equals to \code{nrow(inSCE)}, or a single character 
244
-#' specifying a column in \code{rowData(inSCE)}. Users can set 
245
-#' \code{dedup = TRUE} to remove duplicated entries in the specification, by 
246
-#' adding \code{-1, -2, ..., -i} suffix to the duplication of the same 
247
-#' identifier. 
248
-#' @param inSCE Input \linkS4class{SingleCellExperiment} object
249
-#' @param rowNames Character. Either of length equal to \code{nrow(inSCE)}, or
250
-#' a single character specifying a column in \code{rowData(inSCE)}. 
243
+#' vector where the length equals to \code{nrow(x)}, or a single character 
244
+#' specifying a column in \code{rowData(x)}. Also applicable to matrix like 
245
+#' object where \code{rownames<-} method works, but only allows full size name 
246
+#' vector. Users can set \code{dedup = TRUE} to remove duplicated entries in the
247
+#' specification, by adding \code{-1, -2, ..., -i} suffix to the duplication of 
248
+#' the same identifier. 
249
+#' @param x Input object where the rownames will be modified. 
250
+#' @param rowNames Character vector of the rownames. If \code{x} is an 
251
+#' \linkS4class{SingleCellExperiment} object, a single character specifying a 
252
+#' column in \code{rowData(x)}. 
251 253
 #' @param dedup Logical. Whether to deduplicate the specified rowNames. Default
252 254
 #' \code{TRUE}
253 255
 #' @return The input SCE object with rownames updated.
... ...
@@ -255,31 +257,33 @@ dedupRowNames <- function(x, as.rowData = FALSE, return.list = FALSE){
255 257
 #' @examples 
256 258
 #' data("scExample", package = "singleCellTK")
257 259
 #' head(rownames(sce))
258
-#' sce <- setSCERowNames(sce, "feature_name")
260
+#' sce <- setRowNames(sce, "feature_name")
259 261
 #' head(rownames(sce))
260
-setSCERowNames <- function(inSCE, rowNames, dedup = TRUE) {
261
-  if (!inherits(inSCE, "SingleCellExperiment")) {
262
-    stop("inSCE should be a SingleCellExperiment object")
263
-  }
262
+setRowNames <- function(x, rowNames, dedup = TRUE) {
264 263
   if (!inherits(rowNames, "character")) {
265 264
     stop("rowNames should be of character class")
266 265
   }
267
-  if (length(rowNames) == 1) {
268
-    if (rowNames %in% names(SummarizedExperiment::rowData(inSCE))) {
269
-      rows <- SummarizedExperiment::rowData(inSCE)[[rowNames]]
266
+  if (inherits(x, "SingleCellExperiment")) {
267
+    if (length(rowNames) == 1) {
268
+      if (rowNames %in% names(SummarizedExperiment::rowData(x))) {
269
+        rows <- SummarizedExperiment::rowData(x)[[rowNames]]
270
+      } else {
271
+        stop("Single rowNames specification not found in rowData(x)")
272
+      }
273
+    } else if (length(rowNames) == nrow(x)) {
274
+      rows <- rowNames
270 275
     } else {
271
-      stop("Single rowNames specification not found in rowData(inSCE)")
276
+      stop("Length of rowNames does not match nrow(x)")
272 277
     }
273
-  } else if (length(rowNames) == nrow(inSCE)) {
274
-    rows <- rowNames
278
+    rownames(x) <- rows
275 279
   } else {
276
-    stop("Length of rowNames does not match nrow(inSCE)")
280
+    rownames(x) <- rowNames
277 281
   }
278
-  rownames(inSCE) <- rows
282
+  
279 283
   if (isTRUE(dedup)) {
280
-    inSCE <- dedupRowNames(inSCE)
284
+    x <- dedupRowNames(x)
281 285
   }
282
-  return(inSCE)
286
+  return(x)
283 287
 }
284 288
 
285 289
 #' Retrieve cell/feature index by giving identifiers saved in col/rowData
... ...
@@ -19,7 +19,7 @@ insert a new column called \code{"rownames.uniq"} to \code{rowData(x)}, with
19 19
 the deduplicated rownames.}
20 20
 
21 21
 \item{return.list}{When set to \code{TRUE}, will return a character vector
22
-with deduplicated rownames.}
22
+of the deduplicated rownames.}
23 23
 }
24 24
 \value{
25 25
 By default, a matrix or /linkS4class{SingleCellExperiment} object
... ...
@@ -8,7 +8,8 @@ importAlevin(
8 8
   alevinDir = NULL,
9 9
   sampleName = "sample",
10 10
   delayedArray = FALSE,
11
-  class = c("Matrix", "matrix")
11
+  class = c("Matrix", "matrix"),
12
+  rowNamesDedup = TRUE
12 13
 )
13 14
 }
14 15
 \arguments{
... ...
@@ -28,6 +29,9 @@ barcodes. Default is 'sample'.}
28 29
 object. Can be one of "Matrix" (as returned by
29 30
 \link{readMM} function), or "matrix" (as returned by
30 31
 \link[base]{matrix} function). Default "Matrix".}
32
+
33
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
34
+\code{TRUE}.}
31 35
 }
32 36
 \value{
33 37
 A \code{SingleCellExperiment} object containing the count
... ...
@@ -8,7 +8,8 @@ importAnnData(
8 8
   sampleDirs = NULL,
9 9
   sampleNames = NULL,
10 10
   delayedArray = FALSE,
11
-  class = c("Matrix", "matrix")
11
+  class = c("Matrix", "matrix"),
12
+  rowNamesDedup = TRUE
12 13
 )
13 14
 }
14 15
 \arguments{
... ...
@@ -12,7 +12,8 @@ importBUStools(
12 12
   barcodesFileNames = "genes.barcodes.txt",
13 13
   gzipped = "auto",
14 14
   class = c("Matrix", "matrix"),
15
-  delayedArray = FALSE
15
+  delayedArray = FALSE,
16
+  rowNamesDedup = TRUE
16 17
 )
17 18
 }
18 19
 \arguments{
... ...
@@ -47,6 +48,9 @@ object. Can be one of "Matrix" (as returned by
47 48
 
48 49
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
49 50
 \link[DelayedArray]{DelayedArray-class} object or not. Default \code{FALSE}.}
51
+
52
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
53
+\code{TRUE}.}
50 54
 }
51 55
 \value{
52 56
 A \code{SingleCellExperiment} object containing the count
... ...
@@ -17,7 +17,8 @@ importCellRanger(
17 17
   barcodesFileNames = "barcodes.tsv.gz",
18 18
   gzipped = "auto",
19 19
   class = c("Matrix", "matrix"),
20
-  delayedArray = FALSE
20
+  delayedArray = FALSE,
21
+  rowNamesDedup = TRUE
21 22
 )
22 23
 
23 24
 importCellRangerV2(
... ...
@@ -28,7 +29,8 @@ importCellRangerV2(
28 29
   class = c("Matrix", "matrix"),
29 30
   delayedArray = FALSE,
30 31
   reference = NULL,
31
-  cellRangerOutsV2 = NULL
32
+  cellRangerOutsV2 = NULL,
33
+  rowNamesDedup = TRUE
32 34
 )
33 35
 
34 36
 importCellRangerV3(
... ...
@@ -37,7 +39,8 @@ importCellRangerV3(
37 39
   sampleNames = NULL,
38 40
   dataType = c("filtered", "raw"),
39 41
   class = c("Matrix", "matrix"),
40
-  delayedArray = FALSE
42
+  delayedArray = FALSE,
43
+  rowNamesDedup = TRUE
41 44
 )
42 45
 }
43 46
 \arguments{
... ...
@@ -140,6 +143,9 @@ object. Can be one of "Matrix" (as returned by
140 143
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
141 144
 \link{DelayedArray} object or not. Default \code{FALSE}.}
142 145
 
146
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
147
+\code{TRUE}.}
148
+
143 149
 \item{dataTypeV2}{Character. The type of output to import for
144 150
 Cellranger version below 3.0.0. Whether to import the filtered or the
145 151
 raw data. Can be one of 'filtered' or 'raw'. Default 'filtered'. When
... ...
@@ -8,7 +8,8 @@ importCellRangerV2Sample(
8 8
   dataDir = NULL,
9 9
   sampleName = NULL,
10 10
   class = c("Matrix", "matrix"),
11
-  delayedArray = FALSE
11
+  delayedArray = FALSE,
12
+  rowNamesDedup = TRUE
12 13
 )
13 14
 }
14 15
 \arguments{
... ...
@@ -24,6 +25,9 @@ object. Can be one of "Matrix" (as returned by
24 25
 
25 26
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
26 27
 \link{DelayedArray} object or not. Default \code{FALSE}.}
28
+
29
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
30
+\code{TRUE}.}
27 31
 }
28 32
 \value{
29 33
 A \code{SingleCellExperiment} object containing the count
... ...
@@ -8,7 +8,8 @@ importCellRangerV3Sample(
8 8
   dataDir = "./",
9 9
   sampleName = "sample",
10 10
   class = c("Matrix", "matrix"),
11
-  delayedArray = FALSE
11
+  delayedArray = FALSE,
12
+  rowNamesDedup = TRUE
12 13
 )
13 14
 }
14 15
 \arguments{
... ...
@@ -24,6 +25,9 @@ object. Can be one of "Matrix" (as returned by
24 25
 
25 26
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
26 27
 \link{DelayedArray} object or not. Default \code{FALSE}.}
28
+
29
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
30
+\code{TRUE}.}
27 31
 }
28 32
 \value{
29 33
 A \code{SingleCellExperiment} object containing the count
... ...
@@ -10,7 +10,8 @@ importDropEst(
10 10
   rdsFileName = "cell.counts",
11 11
   sampleNames = NULL,
12 12
   delayedArray = FALSE,
13
-  class = c("Matrix", "matrix")
13
+  class = c("Matrix", "matrix"),
14
+  rowNamesDedup = TRUE
14 15
 )
15 16
 }
16 17
 \arguments{
... ...
@@ -30,6 +31,9 @@ Default "sample".}
30 31
 object. Can be one of "Matrix" (as returned by
31 32
 \link{readMM} function), or "matrix" (as returned by
32 33
 \link[base]{matrix} function). Default \code{"Matrix"}.}
34
+
35
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
36
+\code{TRUE}.}
33 37
 }
34 38
 \value{
35 39
 A \code{SingleCellExperiment} object containing the count matrix,
... ...
@@ -4,7 +4,12 @@
4 4
 \alias{importExampleData}
5 5
 \title{Retrieve example datasets}
6 6
 \usage{
7
-importExampleData(dataset, class = c("Matrix", "matrix"), delayedArray = FALSE)
7
+importExampleData(
8
+  dataset,
9
+  class = c("Matrix", "matrix"),
10
+  delayedArray = FALSE,
11
+  rowNamesDedup = TRUE
12
+)
8 13
 }
9 14
 \arguments{
10 15
 \item{dataset}{Character. Name of the dataset to retrieve.}
... ...
@@ -17,6 +22,9 @@ will store the data as a sparse matrix from package \link{Matrix} while
17 22
 
18 23
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
19 24
 \link{DelayedArray} object or not. Default \code{FALSE}.}
25
+
26
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
27
+\code{TRUE}.}
20 28
 }
21 29
 \value{
22 30
 The specified \link[SingleCellExperiment]{SingleCellExperiment} object.
... ...
@@ -18,54 +18,73 @@ importFromFiles(
18 18
   featureHeader = FALSE,
19 19
   featureRowName = 1,
20 20
   featureSep = "\\t",
21
-  gzipped = "auto"
21
+  gzipped = "auto",
22
+  rowNamesDedup = TRUE
22 23
 )
23 24
 }
24 25
 \arguments{
25
-\item{assayFile}{The path to a  file in .mtx, .txt, .csv, .tab, or .tsv format.}
26
+\item{assayFile}{The path to a file in .mtx, .txt, .csv, .tab, or .tsv 
27
+format.}
26 28
 
27 29
 \item{annotFile}{The path to a text file that contains columns of annotation
28
-information for each sample in the assayFile. This file should have the same
29
-number of rows as there are columns in the assayFile. If multiple samples are
30
-represented in these files, this should be denoted by a column called \code{'sample'}
31
-within the \code{annotFile}.}
30
+information for each cell in the \code{assayFile}. This file should have the 
31
+same number of rows as there are columns in the \code{assayFile}. If multiple
32
+samples are represented in the dataset, this should be denoted by a column 
33
+called \code{'sample'} within the \code{annotFile}.}
32 34
 
33 35
 \item{featureFile}{The path to a text file that contains columns of
34 36
 annotation information for each gene in the count matrix. This file should
35
-have the same genes in the same order as assayFile. This is optional.}
37
+have the same genes in the same order as \code{assayFile}. This is optional.}
36 38
 
37 39
 \item{assayName}{The name of the assay that you are uploading. The default
38
-is "counts".}
40
+is \code{"counts"}.}
39 41
 
40
-\item{inputDataFrames}{If TRUE, assayFile and annotFile are read as data
41
-frames instead of file paths. The default is FALSE.}
42
+\item{inputDataFrames}{If \code{TRUE}, \code{assayFile}, \code{annotFile} and
43
+\code{featureFile} should be \code{data.frames} object (or its inheritance) 
44
+instead of file paths. The default is \code{FALSE}.}
42 45
 
43 46
 \item{class}{Character. The class of the expression matrix stored in the SCE
44
-object. Can be one of "Matrix" (as returned by
45
-\link{readMM} function), or "matrix" (as returned by
46
-\link[base]{matrix} function). Default "Matrix".}
47
+object. Can be one of \code{"Matrix"} (as returned by
48
+\link{readMM} function), or \code{"matrix"} (as returned by
49
+\link[base]{matrix} function). Default \code{"Matrix"}.}
47 50
 
48 51
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
49 52
 \link{DelayedArray} object or not. Default \code{FALSE}.}
50 53
 
51
-\item{annotFileHeader}{Whether there's a header (colnames) in the cell annotation file. Default is FALSE}
54
+\item{annotFileHeader}{Whether there's a header (colnames) in the cell 
55
+annotation file. Default is \code{FALSE}.}
52 56
 
53
-\item{annotFileRowName}{Which column is used as the rownames for the cell annotation file. Default is 1 (first column).}
57
+\item{annotFileRowName}{Which column is used as the rownames for the cell 
58
+annotation file. This should match to the colnames of the \code{assayFile}. 
59
+Default is \code{1} (first column).}
54 60
 
55
-\item{annotFileSep}{Separater used for the cell annotation file. Default is "\\t".}
61
+\item{annotFileSep}{Separater used for the cell annotation file. Default is 
62
+\code{"\\t"}.}
56 63
 
57
-\item{featureHeader}{Whether there's a header (colnames) in the feature annotation file. Default is FALSE}
64
+\item{featureHeader}{Whether there's a header (colnames) in the feature 
65
+annotation file. Default is \code{FALSE}.}
58 66
 
59
-\item{featureRowName}{Which column is used as the rownames for the feature annotation file. Default is 1 (first column).}
67
+\item{featureRowName}{Which column is used as the rownames for the feature 
68
+annotation file. This should match to the rownames of the \code{assayFile}. 
69
+Default is \code{1}. (first column).}
60 70
 
61
-\item{featureSep}{Separater used for the feature annotation file. Default is "\\t".}
71
+\item{featureSep}{Separater used for the feature annotation file. Default is 
72
+\code{"\\t"}.}
62 73
 
63
-\item{gzipped}{Whether the input file is gzipped. Default is "auto" and it will automatically detect whether the file is gzipped. Other options is TRUE or FALSE.}
74
+\item{gzipped}{Whether the input file is gzipped. Default is \code{"auto"} 
75
+and it will automatically detect whether the file is gzipped. Other options 
76
+are \code{TRUE} or \code{FALSE}.}
77
+
78
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
79
+\code{TRUE}.}
64 80
 }
65 81
 \value{
66
-a SingleCellExperiment object
82
+a \linkS4class{SingleCellExperiment} object
67 83
 }
68 84
 \description{
69
-Creates a SingleCellExperiment object from a counts file in various formats.
70
-and a file of annotation information, .
85
+Create a SingleCellExperiment object from files
86
+}
87
+\details{
88
+Creates a \linkS4class{SingleCellExperiment} object from a counts 
89
+file in various formats, and files of cell and feature annotation.
71 90
 }
... ...
@@ -14,7 +14,8 @@ importOptimus(
14 14
   geneMetricsLocation = "call-MergeGeneMetrics/merged-gene-metrics.csv.gz",
15 15
   emptyDropsLocation = "call-RunEmptyDrops/empty_drops_result.csv",
16 16
   class = c("Matrix", "matrix"),
17
-  delayedArray = FALSE
17
+  delayedArray = FALSE,
18
+  rowNamesDedup = TRUE
18 19
 )
19 20
 }
20 21
 \arguments{
... ...
@@ -63,6 +64,9 @@ object. Can be one of "Matrix" (as returned by
63 64
 
64 65
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
65 66
 \link{DelayedArray} object or not. Default \code{FALSE}.}
67
+
68
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Default 
69
+\code{TRUE}.}
66 70
 }
67 71
 \value{
68 72
 A \link[SingleCellExperiment]{SingleCellExperiment} object
... ...
@@ -13,13 +13,14 @@ importSEQC(
13 13
   delayedArray = FALSE,
14 14
   cbNotFirstCol = TRUE,
15 15
   feNotFirstCol = TRUE,
16
-  combinedSample = TRUE
16
+  combinedSample = TRUE,
17
+  rowNamesDedup = TRUE
17 18
 )
18 19
 }
19 20
 \arguments{
20 21
 \item{seqcDirs}{A vector of paths to seqc output files. Each sample
21
-should have its own path. For example: \code{./pbmc_1k_50x50}.
22
-Must have the same length as \code{samples}.}
22
+should have its own path. For example: \code{"./pbmc_1k_50x50"}. Must have 
23
+the same length as \code{samples}.}
23 24
 
24 25
 \item{samples}{A vector of user-defined sample names for the samples to be
25 26
 imported. Must have the same length as \code{seqcDirs}.}
... ...
@@ -29,38 +30,37 @@ sample directory. It cannot be null and the vector should have the same
29 30
 length as \emph{samples}.}
30 31
 
31 32
 \item{gzipped}{Boolean. \code{TRUE} if the seqc output files
32
- (sparse_counts_barcode.csv, sparse_counts_genes.csv, and
33
- sparse_molecule_counts.mtx)
34
- were gzip compressed. \code{FALSE} otherwise. Default seqc outputs are
35
- not gzipped.
36
-Default \code{FALSE}.}
33
+(sparse_counts_barcode.csv, sparse_counts_genes.csv, and
34
+sparse_molecule_counts.mtx) were gzip compressed. \code{FALSE} otherwise. 
35
+Default seqc outputs are not gzipped. Default \code{FALSE}.}
37 36
 
38 37
 \item{class}{Character. The class of the expression matrix stored in the SCE
39
-object. Can be one of "Matrix" (as returned by
40
-\link{readMM} function), or "matrix" (as returned by
41
-\link[base]{matrix} function). Default "Matrix".}
38
+object. Can be one of \code{"Matrix"} (as returned by \link{readMM} 
39
+function), or \code{"matrix"} (as returned by \link[base]{matrix} function).
40
+Default \code{"Matrix"}.}
42 41
 
43 42
 \item{delayedArray}{Boolean. Whether to read the expression matrix as
44 43
 \link{DelayedArray} object or not. Default \code{FALSE}.}
45 44
 
46 45
 \item{cbNotFirstCol}{Boolean. \code{TRUE} if first column of
47
- sparse_counts_barcode.csv
48
-is row index and it will be removed. \code{FALSE} the first column will
49
- be kept.}
46
+sparse_counts_barcode.csv is row index and it will be removed. \code{FALSE} 
47
+the first column will be kept.}
50 48
 
51 49
 \item{feNotFirstCol}{Boolean. \code{TRUE} if first column of
52
- sparse_counts_genes.csv
53
-is row index and it will be removed. \code{FALSE} the first column will
54
- be kept.}
50
+sparse_counts_genes.csv is row index and it will be removed. \code{FALSE} 
51
+the first column will be kept.}
55 52
 
56 53
 \item{combinedSample}{Boolean. If \code{TRUE}, \code{importSEQC} returns a
57 54
 \code{SingleCellExperiment} object containing the combined count matrix,
58
- feature annotations
59
- and the cell annotations. If \code{FALSE}, \code{importSEQC} returns a
60
- list containing multiple
55
+ feature annotations and the cell annotations. If \code{FALSE}, 
56
+ \code{importSEQC} returns a list containing multiple
61 57
  \code{SingleCellExperiment} objects. Each \code{SingleCellExperiment}
62 58
  contains count matrix, feature annotations and cell annotations for
63 59
  each sample.}
60
+
61
+\item{rowNamesDedup}{Boolean. Whether to deduplicate rownames. Only applied 
62
+if \code{combinedSample} is \code{TRUE} or only one \code{seqcDirs} 
63
+specified. Default \code{TRUE}.}
64 64
 }
65 65
 \value{
66 66
 A \code{SingleCellExperiment} object containing the combined count
... ...
@@ -68,22 +68,18 @@ A \code{SingleCellExperiment} object containing the combined count
68 68
 }
69 69
 \description{
70 70
 Read the filtered barcodes, features, and matrices for all
71
- samples from (preferably a single run of) seqc output. Import and
72
- combine them as one big \link[SingleCellExperiment]{SingleCellExperiment}
73
- object.
71
+ samples from (preferably a single run of) seqc output. Import and combine
72
+ them as one big \link[SingleCellExperiment]{SingleCellExperiment} object.
74 73
 }
75 74