Browse code

Bugfixes

Christian Arnold authored on 29/11/2022 12:33:46
Showing3 changed files

... ...
@@ -140,7 +140,7 @@ initializeGRN <- function(objectMetadata = list(),
140 140
 #' @param counts_peaks Data frame. No default. Counts for the peaks, with raw or normalized counts for each peak (rows) across all samples (columns). 
141 141
 #' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. 
142 142
 #' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
143
-#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactor}. Normalization procedure for peak data. 
143
+#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactors}. Normalization procedure for peak data. 
144 144
 #' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, 
145 145
 #' \code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}.
146 146
 #' @param idColumn_peaks Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. 
... ...
@@ -149,7 +149,7 @@ initializeGRN <- function(objectMetadata = list(),
149 149
 #' @param counts_rna Data frame. No default. Counts for the RNA-seq data, with raw or normalized counts for each gene (rows) across all samples (columns). 
150 150
 #' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_rna} below. 
151 151
 #' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table.
152
-#' @param normalization_rna Character. Default \code{quantile}. Normalization procedure for peak data. 
152
+#' @param normalization_rna Character. Default \code{limma_quantile}. Normalization procedure for peak data. 
153 153
 #' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}.
154 154
 #' @param idColumn_RNA Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs.
155 155
 #' @param sampleMetadata Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. 
... ...
@@ -176,8 +176,8 @@ initializeGRN <- function(objectMetadata = list(),
176 176
 #' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise
177 177
 #' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE)
178 178
 
179
-addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor", idColumn_peaks = "peakID", 
180
-                    counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
179
+addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors", idColumn_peaks = "peakID", 
180
+                    counts_rna, normalization_rna = "limma_quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL,
181 181
                     additionalParams.l = list(),
182 182
                     allowOverlappingPeaks= FALSE,
183 183
                     keepOriginalReadCounts = FALSE,
... ...
@@ -389,10 +389,15 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor"
389 389
     futile.logger::flog.info(paste0("Adding peak and gene annotation..."))
390 390
     
391 391
     GRN = .populatePeakAnnotation(GRN)
392
-    GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
393 392
     
394
-    # Additional GC statistics, not used at the moment currently
395
-    GRN = .calcAdditionalGCStatistics(GRN, GC.data.df)
393
+    if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) {
394
+        GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") 
395
+        
396
+        # Additional GC statistics, not used at the moment currently
397
+        GRN = .calcAdditionalGCStatistics(GRN, GC.data.df)
398
+    }
399
+    
400
+
396 401
 
397 402
     GRN = .populateGeneAnnotation(GRN)
398 403
     
... ...
@@ -982,6 +982,10 @@ plotDiagnosticPlots_peakGene <- function(GRN,
982 982
       
983 983
       class_levels = c(paste0("real_",range), paste0("random_",range))
984 984
       
985
+      if (! "peak.GC.perc" %in% colnames(GRN@annotation$peaks)) {
986
+          GRN@annotation$peaks$peak.GC.perc = NA
987
+      }
988
+      
985 989
       peakGeneCorrelations.all = 
986 990
         rbind(
987 991
           dplyr::select(GRN@connections$peak_genes[["0"]], tidyselect::all_of(cols_keep)) %>%
... ...
@@ -1250,13 +1254,12 @@ plotDiagnosticPlots_peakGene <- function(GRN,
1250 1254
       xlabel = paste0("Correlation raw\np-value (binned)")
1251 1255
       mainTitle = paste0("Summary QC (TF: ", TFCur, ", gene type: ", paste0(geneTypesSelected, collapse = "+"), ", ", .prettyNum(range), " bp promoter range)")
1252 1256
       
1253
-      allVars = c("peak.GC.class",  "peak.width", "peak.mean","peak.median",
1257
+      allVars = c("peak.annotation", "peak.GC.class", "peak.width", "peak.mean","peak.median",
1254 1258
                   "peak.CV", "gene.median",  "gene.mean", "gene.CV", "peak.gene.combined.CV")
1255 1259
       
1256
-      # If ChIPseeker is not installed, remove peak.annotation from it
1257
-      if ("peak.annotation" %in% colnames(peakGeneCorrelations.all)) {
1258
-        allVars = c(allVars, "peak.annotation")
1259
-      }
1260
+      # Only use those actually available, as some packages may not be available
1261
+      allVars = intersect(allVars, colnames(peakGeneCorrelations.all))
1262
+
1260 1263
       
1261 1264
       
1262 1265
       for (varCur in allVars) {
... ...
@@ -67,12 +67,12 @@ Before we start with the package, let's retrieve some example data! For the purp
67 67
 * RNA-Seq data, raw counts (originally for around 35,000 genes, here filtered to around 19,000)
68 68
 * sample metadata with additional sample-specific information
69 69
 
70
-In general, the dataset is from human macrophages (both naive and IFNg primed) of healthy individuals and various stimulations / infections (naive vs primed and infected with SL1344 vs not), with 4 groups in total: control/infected(SL1344) and naive/primed(IFNg)
70
+In general, the dataset is from human macrophages (both naive and IFNg primed) of healthy individuals and various stimulations / infections (naive vs primed and infected with SL1344 vs not), with 4 groups in total: control/infected(SL1344) and naive/primed(IFNg). However, here for the example data, all ~30 samples are from IFNg primed and infected cells (as summarized as `IFNg_SL1344` in the sample metadata column `condition`).
71 71
 
72 72
 
73 73
 Furthermore, the example dataset is accompanied by the following files:
74 74
 
75
-* genome-wide transcription factor binding site predictions for 75 selected TFs, along with a translation table to link TF names to their corresponding Ensembl IDs. For each TF, a gzipped BED file has been created with predicted TF binding sites. The files have been generated with `PWMScan` and the `HOCOMOCO` database, see [2] for details.
75
+* genome-wide transcription factor binding site predictions for 6 selected TFs, along with a translation table to link TF names to their corresponding Ensembl IDs. For each TF, a gzipped BED file has been created with predicted TF binding sites. The files have been generated with `PWMScan` and the `HOCOMOCO` database, see [2] for details.
76 76
 
77 77
 # Example Workflow
78 78
 <a name="section1"></a>
... ...
@@ -193,13 +193,13 @@ At any time point, we can simply "print" a `GRaNIE` object by typing its name an
193 193
 
194 194
 We are now ready to fill our empty object with data! After preparing the data beforehand, we can now use the data import function `addData()` to import both enhancers and RNA-seq data to the `GRaNIE` object. In addition to the count tables, we explicitly specify the name of the ID columns. As mentioned before, the sample metadata is optional but recommended if available.
195 195
 
196
-An important consideration is data normalization for RNA and ATAC. We currently support three choices of normalization: `quantile`, `DESeq_sizeFactor` and `none` and refer to the R help for more details (`?addData`). The default for RNA-Seq is a quantile normalization, while for the open chromatin enhancer data, it is `DESeq_sizeFactor` (i.e., a "regular" `DESeq` size factor normalization). Importantly, `DESeq_sizeFactor` requires raw data, while `quantile` does not necessarily. We nevertheless recommend raw data as input, although it is also possible to provide pre-normalized data as input and then topping this up with another normalization method or "none". 
196
+An important consideration is data normalization for RNA and ATAC. We support many different choices of normalization, the selection of which also depends on whether RNA or peaks is considered, and possible choices are: `limma_quantile`, `DESeq2_sizeFactors` and `none` and refer to the R help for more details (`?addData`). The default for RNA-Seq is a quantile normalization, while for the open chromatin enhancer data, it is `DESeq2_sizeFactor2` (i.e., a "regular" `DESeq` size factor normalization). Importantly, `DESeq2_sizeFactors` requires raw data, while `quantile` does not necessarily. We nevertheless recommend raw data as input, although it is also possible to provide pre-normalized data as input and then topping this up with another normalization method or "none". 
197 197
 
198 198
 
199 199
 ```{r addData, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200"}
200 200
 GRN = addData(GRN, 
201
-              counts_peaks = countsPeaks.df, normalization_peaks = "DESeq_sizeFactor", idColumn_peaks = idColumn_peaks,
202
-              counts_rna = countsRNA.df, normalization_rna = "quantile", idColumn_RNA = idColumn_RNA,
201
+              counts_peaks = countsPeaks.df, normalization_peaks = "DESeq2_sizeFactors", idColumn_peaks = idColumn_peaks,
202
+              counts_rna = countsRNA.df, normalization_rna = "limma_quantile", idColumn_RNA = idColumn_RNA,
203 203
               sampleMetadata = sampleMetadata.df,
204 204
               forceRerun = TRUE)
205 205
 ```
... ...
@@ -235,10 +235,13 @@ For more parameter details, see the R help (`?addTFBS` and `?overlapPeaksAndTFBS
235 235
 
236 236
 ```{r addTFBS, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200", results='hold'}
237 237
 
238
-folder_TFBS_first50 = "https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip"
238
+folder_TFBS_6TFs = "https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip"
239 239
 # Download the zip of all TFBS files. Takes too long here, not executed therefore
240
-# download.file(folder_TFBS_first50, file.path("TFBS_selected.zip"), quiet = FALSE)
240
+
241
+# download.file(folder_TFBS_6TFs, file.path("TFBS_selected.zip"), quiet = FALSE)
242
+
241 243
 # unzip(file.path("TFBS_selected.zip"), overwrite = TRUE)
244
+
242 245
 # motifFolder = tools::file_path_as_absolute("TFBS_selected")
243 246
 
244 247
 GRN = addTFBS(GRN, motifFolder = motifFolder, TFs = "all", filesTFBSPattern = "_TFBS", fileEnding = ".bed.gz", forceRerun = TRUE)
... ...
@@ -307,7 +310,7 @@ This function may run a while, and each time-consuming step has a built-in progr
307 310
 For reasons of brevity and organization, we fully describe their interpretation and meaning in detail elsewhere, however. In summary, TF-enhancer diagnostic plots are available for each TF, and each page summarizes the QC for each TF in two plots:  
308 311
 
309 312
 
310
-```{r, echo=FALSE, include=TRUE, eval = TRUE, class.output="scroll-200", results='hold', fig.cap="<i>TF-enhancer diagnostic plots for EGR1.0.A</i>", results='hold', fig.height = 10}
313
+```{r, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200", results='hold', fig.cap="<i>TF-enhancer diagnostic plots for EGR1.0.A</i>", results='hold', fig.height = 10}
311 314
 GRN = plotDiagnosticPlots_TFPeaks(GRN, dataType = c("real", "permuted"), plotAsPDF = FALSE, pages = 4)
312 315
 ```
313 316