... | ... |
@@ -140,7 +140,7 @@ initializeGRN <- function(objectMetadata = list(), |
140 | 140 |
#' @param counts_peaks Data frame. No default. Counts for the peaks, with raw or normalized counts for each peak (rows) across all samples (columns). |
141 | 141 |
#' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_peaks} below. |
142 | 142 |
#' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table. |
143 |
-#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactor}. Normalization procedure for peak data. |
|
143 |
+#' @param normalization_peaks Character. Default \code{DESeq2_sizeFactors}. Normalization procedure for peak data. |
|
144 | 144 |
#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, |
145 | 145 |
#' \code{EDASeq_GC_peaks}, \code{gcqn_peaks}, \code{DESeq2_sizeFactors}, \code{none}. |
146 | 146 |
#' @param idColumn_peaks Character. Default \code{peakID}. Name of the column in the counts_peaks data frame that contains peak IDs. |
... | ... |
@@ -149,7 +149,7 @@ initializeGRN <- function(objectMetadata = list(), |
149 | 149 |
#' @param counts_rna Data frame. No default. Counts for the RNA-seq data, with raw or normalized counts for each gene (rows) across all samples (columns). |
150 | 150 |
#' In addition to the count data, it must also contain one ID column with a particular format, see the argument \code{idColumn_rna} below. |
151 | 151 |
#' Row names are ignored, column names must be set to the sample names and must match those from the RNA counts and the sample metadata table. |
152 |
-#' @param normalization_rna Character. Default \code{quantile}. Normalization procedure for peak data. |
|
152 |
+#' @param normalization_rna Character. Default \code{limma_quantile}. Normalization procedure for peak data. |
|
153 | 153 |
#' Must be one of \code{limma_cyclicloess}, \code{limma_quantile}, \code{limma_scale}, \code{csaw_cyclicLoess_orig}, \code{csaw_TMM}, \code{DESeq2_sizeFactors}, \code{none}. |
154 | 154 |
#' @param idColumn_RNA Character. Default \code{ENSEMBL}. Name of the column in the \code{counts_rna} data frame that contains Ensembl IDs. |
155 | 155 |
#' @param sampleMetadata Data frame. Default \code{NULL}. Optional, additional metadata for the samples, such as age, sex, gender etc. |
... | ... |
@@ -176,8 +176,8 @@ initializeGRN <- function(objectMetadata = list(), |
176 | 176 |
#' # We omit sampleMetadata = meta.df in the following line, becomes too long otherwise |
177 | 177 |
#' # GRN = addData(GRN, counts_peaks = peaks.df, counts_rna = rna.df, forceRerun = FALSE) |
178 | 178 |
|
179 |
-addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor", idColumn_peaks = "peakID", |
|
180 |
- counts_rna, normalization_rna = "quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL, |
|
179 |
+addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactors", idColumn_peaks = "peakID", |
|
180 |
+ counts_rna, normalization_rna = "limma_quantile", idColumn_RNA = "ENSEMBL", sampleMetadata = NULL, |
|
181 | 181 |
additionalParams.l = list(), |
182 | 182 |
allowOverlappingPeaks= FALSE, |
183 | 183 |
keepOriginalReadCounts = FALSE, |
... | ... |
@@ -389,10 +389,15 @@ addData <- function(GRN, counts_peaks, normalization_peaks = "DESeq2_sizeFactor" |
389 | 389 |
futile.logger::flog.info(paste0("Adding peak and gene annotation...")) |
390 | 390 |
|
391 | 391 |
GRN = .populatePeakAnnotation(GRN) |
392 |
- GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") |
|
393 | 392 |
|
394 |
- # Additional GC statistics, not used at the moment currently |
|
395 |
- GRN = .calcAdditionalGCStatistics(GRN, GC.data.df) |
|
393 |
+ if (normalization_peaks %in% c("EDASeq_GC_peaks", "gcqn_peaks")) { |
|
394 |
+ GRN@annotation$peaks = dplyr::left_join(GRN@annotation$peaks, GC.data.df, by = "peak.ID") |
|
395 |
+ |
|
396 |
+ # Additional GC statistics, not used at the moment currently |
|
397 |
+ GRN = .calcAdditionalGCStatistics(GRN, GC.data.df) |
|
398 |
+ } |
|
399 |
+ |
|
400 |
+ |
|
396 | 401 |
|
397 | 402 |
GRN = .populateGeneAnnotation(GRN) |
398 | 403 |
|
... | ... |
@@ -982,6 +982,10 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
982 | 982 |
|
983 | 983 |
class_levels = c(paste0("real_",range), paste0("random_",range)) |
984 | 984 |
|
985 |
+ if (! "peak.GC.perc" %in% colnames(GRN@annotation$peaks)) { |
|
986 |
+ GRN@annotation$peaks$peak.GC.perc = NA |
|
987 |
+ } |
|
988 |
+ |
|
985 | 989 |
peakGeneCorrelations.all = |
986 | 990 |
rbind( |
987 | 991 |
dplyr::select(GRN@connections$peak_genes[["0"]], tidyselect::all_of(cols_keep)) %>% |
... | ... |
@@ -1250,13 +1254,12 @@ plotDiagnosticPlots_peakGene <- function(GRN, |
1250 | 1254 |
xlabel = paste0("Correlation raw\np-value (binned)") |
1251 | 1255 |
mainTitle = paste0("Summary QC (TF: ", TFCur, ", gene type: ", paste0(geneTypesSelected, collapse = "+"), ", ", .prettyNum(range), " bp promoter range)") |
1252 | 1256 |
|
1253 |
- allVars = c("peak.GC.class", "peak.width", "peak.mean","peak.median", |
|
1257 |
+ allVars = c("peak.annotation", "peak.GC.class", "peak.width", "peak.mean","peak.median", |
|
1254 | 1258 |
"peak.CV", "gene.median", "gene.mean", "gene.CV", "peak.gene.combined.CV") |
1255 | 1259 |
|
1256 |
- # If ChIPseeker is not installed, remove peak.annotation from it |
|
1257 |
- if ("peak.annotation" %in% colnames(peakGeneCorrelations.all)) { |
|
1258 |
- allVars = c(allVars, "peak.annotation") |
|
1259 |
- } |
|
1260 |
+ # Only use those actually available, as some packages may not be available |
|
1261 |
+ allVars = intersect(allVars, colnames(peakGeneCorrelations.all)) |
|
1262 |
+ |
|
1260 | 1263 |
|
1261 | 1264 |
|
1262 | 1265 |
for (varCur in allVars) { |
... | ... |
@@ -67,12 +67,12 @@ Before we start with the package, let's retrieve some example data! For the purp |
67 | 67 |
* RNA-Seq data, raw counts (originally for around 35,000 genes, here filtered to around 19,000) |
68 | 68 |
* sample metadata with additional sample-specific information |
69 | 69 |
|
70 |
-In general, the dataset is from human macrophages (both naive and IFNg primed) of healthy individuals and various stimulations / infections (naive vs primed and infected with SL1344 vs not), with 4 groups in total: control/infected(SL1344) and naive/primed(IFNg) |
|
70 |
+In general, the dataset is from human macrophages (both naive and IFNg primed) of healthy individuals and various stimulations / infections (naive vs primed and infected with SL1344 vs not), with 4 groups in total: control/infected(SL1344) and naive/primed(IFNg). However, here for the example data, all ~30 samples are from IFNg primed and infected cells (as summarized as `IFNg_SL1344` in the sample metadata column `condition`). |
|
71 | 71 |
|
72 | 72 |
|
73 | 73 |
Furthermore, the example dataset is accompanied by the following files: |
74 | 74 |
|
75 |
-* genome-wide transcription factor binding site predictions for 75 selected TFs, along with a translation table to link TF names to their corresponding Ensembl IDs. For each TF, a gzipped BED file has been created with predicted TF binding sites. The files have been generated with `PWMScan` and the `HOCOMOCO` database, see [2] for details. |
|
75 |
+* genome-wide transcription factor binding site predictions for 6 selected TFs, along with a translation table to link TF names to their corresponding Ensembl IDs. For each TF, a gzipped BED file has been created with predicted TF binding sites. The files have been generated with `PWMScan` and the `HOCOMOCO` database, see [2] for details. |
|
76 | 76 |
|
77 | 77 |
# Example Workflow |
78 | 78 |
<a name="section1"></a> |
... | ... |
@@ -193,13 +193,13 @@ At any time point, we can simply "print" a `GRaNIE` object by typing its name an |
193 | 193 |
|
194 | 194 |
We are now ready to fill our empty object with data! After preparing the data beforehand, we can now use the data import function `addData()` to import both enhancers and RNA-seq data to the `GRaNIE` object. In addition to the count tables, we explicitly specify the name of the ID columns. As mentioned before, the sample metadata is optional but recommended if available. |
195 | 195 |
|
196 |
-An important consideration is data normalization for RNA and ATAC. We currently support three choices of normalization: `quantile`, `DESeq_sizeFactor` and `none` and refer to the R help for more details (`?addData`). The default for RNA-Seq is a quantile normalization, while for the open chromatin enhancer data, it is `DESeq_sizeFactor` (i.e., a "regular" `DESeq` size factor normalization). Importantly, `DESeq_sizeFactor` requires raw data, while `quantile` does not necessarily. We nevertheless recommend raw data as input, although it is also possible to provide pre-normalized data as input and then topping this up with another normalization method or "none". |
|
196 |
+An important consideration is data normalization for RNA and ATAC. We support many different choices of normalization, the selection of which also depends on whether RNA or peaks is considered, and possible choices are: `limma_quantile`, `DESeq2_sizeFactors` and `none` and refer to the R help for more details (`?addData`). The default for RNA-Seq is a quantile normalization, while for the open chromatin enhancer data, it is `DESeq2_sizeFactor2` (i.e., a "regular" `DESeq` size factor normalization). Importantly, `DESeq2_sizeFactors` requires raw data, while `quantile` does not necessarily. We nevertheless recommend raw data as input, although it is also possible to provide pre-normalized data as input and then topping this up with another normalization method or "none". |
|
197 | 197 |
|
198 | 198 |
|
199 | 199 |
```{r addData, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200"} |
200 | 200 |
GRN = addData(GRN, |
201 |
- counts_peaks = countsPeaks.df, normalization_peaks = "DESeq_sizeFactor", idColumn_peaks = idColumn_peaks, |
|
202 |
- counts_rna = countsRNA.df, normalization_rna = "quantile", idColumn_RNA = idColumn_RNA, |
|
201 |
+ counts_peaks = countsPeaks.df, normalization_peaks = "DESeq2_sizeFactors", idColumn_peaks = idColumn_peaks, |
|
202 |
+ counts_rna = countsRNA.df, normalization_rna = "limma_quantile", idColumn_RNA = idColumn_RNA, |
|
203 | 203 |
sampleMetadata = sampleMetadata.df, |
204 | 204 |
forceRerun = TRUE) |
205 | 205 |
``` |
... | ... |
@@ -235,10 +235,13 @@ For more parameter details, see the R help (`?addTFBS` and `?overlapPeaksAndTFBS |
235 | 235 |
|
236 | 236 |
```{r addTFBS, echo=TRUE, include=TRUE, eval = FALSE, class.output="scroll-200", results='hold'} |
237 | 237 |
|
238 |
-folder_TFBS_first50 = "https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip" |
|
238 |
+folder_TFBS_6TFs = "https://www.embl.de/download/zaugg/GRaNIE/TFBS_selected.zip" |
|
239 | 239 |
# Download the zip of all TFBS files. Takes too long here, not executed therefore |
240 |
-# download.file(folder_TFBS_first50, file.path("TFBS_selected.zip"), quiet = FALSE) |
|
240 |
+ |
|
241 |
+# download.file(folder_TFBS_6TFs, file.path("TFBS_selected.zip"), quiet = FALSE) |
|
242 |
+ |
|
241 | 243 |
# unzip(file.path("TFBS_selected.zip"), overwrite = TRUE) |
244 |
+ |
|
242 | 245 |
# motifFolder = tools::file_path_as_absolute("TFBS_selected") |
243 | 246 |
|
244 | 247 |
GRN = addTFBS(GRN, motifFolder = motifFolder, TFs = "all", filesTFBSPattern = "_TFBS", fileEnding = ".bed.gz", forceRerun = TRUE) |
... | ... |
@@ -307,7 +310,7 @@ This function may run a while, and each time-consuming step has a built-in progr |
307 | 310 |
For reasons of brevity and organization, we fully describe their interpretation and meaning in detail elsewhere, however. In summary, TF-enhancer diagnostic plots are available for each TF, and each page summarizes the QC for each TF in two plots: |
308 | 311 |
|
309 | 312 |
|
310 |
-```{r, echo=FALSE, include=TRUE, eval = TRUE, class.output="scroll-200", results='hold', fig.cap="<i>TF-enhancer diagnostic plots for EGR1.0.A</i>", results='hold', fig.height = 10} |
|
313 |
+```{r, echo=TRUE, include=TRUE, eval = TRUE, class.output="scroll-200", results='hold', fig.cap="<i>TF-enhancer diagnostic plots for EGR1.0.A</i>", results='hold', fig.height = 10} |
|
311 | 314 |
GRN = plotDiagnosticPlots_TFPeaks(GRN, dataType = c("real", "permuted"), plotAsPDF = FALSE, pages = 4) |
312 | 315 |
``` |
313 | 316 |
|