#' @title Select regions with variations in DNA methylation #' levels above a threshold #' @description #' For each region, computes the interquartile range (IQR) of the DNA methylation (DNAm) #' levels and requires the IQR to be above a threshold #' @param dnam DNA methylation matrix or SummarizedExperiment object #' @param min.IQR.threshold #' Threshold for minimal interquantile range (difference between the 75th and 25th percentiles) #' of the DNAm #' @param cores #' Number of CPU cores to be used in the analysis. Default: 1 #' @export #' @examples #' data("dna.met.chr21") #' dna.met.chr21.filtered <- filter_dnam_by_quant_diff( #' dna.met.chr21 #' ) #' @return #' A subset of the original matrix only with the #' rows passing the filter threshold. #' @importFrom SummarizedExperiment assay<- filter_dnam_by_quant_diff <- function( dnam, min.IQR.threshold = 0.2, cores = 1 ){ is.se <- is(dnam,"SummarizedExperiment") if (is.se) { matrix <- assay(dnam) } else { matrix <- dnam } # remove all NA rows keep.rows <- which(rowSums(is.na(matrix)) != ncol(matrix)) if(length(keep.rows) < nrow(matrix)){ message("Removing rows with NAs for all samples") matrix <- matrix[keep.rows,] } IQR <- calculate_IQR(matrix) tab <- plyr::count(IQR$IQR > min.IQR.threshold) colnames(tab)[1] <- "Status" tab$Status[which(tab$Status == FALSE)] <- "Regions below threshold" tab$Status[which(tab$Status == TRUE)] <- "Regions above threshold" print(tab) diff.regions <- c( IQR %>% filter(IQR > min.IQR.threshold) %>% pull(.data$ID) %>% as.character() ) matrix <- matrix[diff.regions,,drop = FALSE] if (is.se) { dnam <- dnam[rownames(matrix),] assay(dnam) <- matrix } else { dnam <- matrix } dnam } #' @examples #' library(dplyr) #' 10 %>% rnorm %>% #' matrix(nrow = 1,dimnames = list(c("row1"), LETTERS[1:10])) %>% #' calculate_IQR #' @noRd calculate_IQR <- function(matrix){ check_package("matrixStats") tibble::tibble( "ID" = rownames(matrix), "IQR" = matrixStats::rowIQRs(matrix, na.rm = TRUE) ) } #' @examples #' library(dplyr) #' 10 %>% rnorm %>% #' matrix(nrow = 1,dimnames = list(c("row1"), LETTERS[1:10])) %>% #' calculate_mean_q4_minus_mean_q1 #' @noRd calculate_mean_q4_minus_mean_q1 <- function( matrix, cores = 1, dnam.group.threshold = 0.25 ){ parallel <- register_cores(cores) plyr::adply(.data = matrix,.margins = 1,.fun = function(row){ qs <- quantile(row, probs = c(dnam.group.threshold,1 - dnam.group.threshold), na.rm = TRUE) qs.mean <- tapply(row, findInterval(row, qs), mean, rm.na = TRUE) tibble::tibble("diff.mean" = max(qs.mean) - min(qs.mean)) },.parallel = parallel, .progress = "time",.id = "ID") } #' @title Select genes with variations above a threshold #' @description For each gene, compares the mean gene expression #' levels in samples in high expression (Q4) #' vs. samples with low gene expression (Q1), #' and requires the fold change to be above a certain threshold. #' @param exp Gene expression matrix or SumarizedExperiment object #' @param fold.change #' Threshold for fold change of mean gene #' expression levels in samples with high #' (Q4) and low (Q1) gene expression levels. Defaults to 1.5. #' @param cores #' Number of CPU cores to be used in the analysis. Default: 1 #' @export #' @examples #' data("gene.exp.chr21.log2") #' gene.exp.chr21.log2.filtered <- filter_exp_by_quant_mean_FC( #' gene.exp.chr21.log2 #' ) #' @return #' A subset of the original matrix only with the rows passing #' the filter threshold. filter_exp_by_quant_mean_FC <- function( exp, fold.change = 1.5, cores = 1 ){ if(is(exp,"SummarizedExperiment")){ matrix <- assay(exp) } else { matrix <- exp } parallel <- register_cores(cores) diff.genes <- plyr::adply(matrix,.margins = 1,.fun = function(row){ quant <- quantile(row, na.rm = TRUE) quant.fold.change <- data.frame("q4_div_q1" = quant[4] / quant[2]) low.cutoff <- quant[2] upper.cutoff <- quant[4] mean.q1 <- row[row <= low.cutoff] %>% mean(na.rm = TRUE) mean.q4 <- row[row >= upper.cutoff] %>% mean(na.rm = TRUE) data.frame( "diff_fold_change" = mean.q4 / mean.q1, stringsAsFactors = FALSE ) }, .progress = "time",.parallel = parallel) tab <- plyr::count(diff.genes$diff_fold_change > fold.change) colnames(tab)[1] <- "Status" tab$Status[which(tab$Status == FALSE)] <- "Genes below threshold" tab$Status[which(tab$Status == TRUE)] <- "Genes above threshold" print(tab) diff.genes <- c( diff.genes %>% filter(.data$diff_fold_change > fold.change) %>% pull(.data$X1) %>% as.character() ) exp[diff.genes,,drop = FALSE] } #' @title Remove genes with gene expression level equal to 0 in a #' substantial percentage of the samples #' @param exp Gene expression matrix or SumarizedExperiment object #' @param max.samples.percentage Max percentage of samples with gene #' expression as 0, for genes to be selected. #' If max.samples.percentage 100, remove genes with 0 for 100\% samples. #' If max.samples.percentage 25, remove genes with 0 for more #' than 25\% of the samples. #' #' @export #' @return A subset of the original matrix only with the rows #' passing the filter threshold. filter_genes_zero_expression <- function( exp, max.samples.percentage = 0.25 ){ if (is(exp,"SummarizedExperiment")) { matrix <- assay(exp) } else { matrix <- exp } na.or.zeros <- matrix == 0 | is.na(matrix) percent.na.or.zeros <- rowSums(na.or.zeros) / ncol(matrix) genes.keep <- (percent.na.or.zeros < max.samples.percentage) %>% which %>% names message("Removing ", nrow(matrix) - length(genes.keep), " out of ", nrow(matrix), " genes") exp[genes.keep,, drop = FALSE] } #' @title #' Remove genes with gene expression level equal to 0 or NA in a all samples #' @param exp Gene expression matrix or a Summarized Experiment object #' @noRd #' @examples #' data("gene.exp.chr21.log2") #' gene.exp.chr21.log2.filtered <- filter_genes_zero_expression_all_samples( #' gene.exp.chr21.log2 #' ) #' @return #' A subset of the original matrix only with the rows #' passing the filter threshold. filter_genes_zero_expression_all_samples <- function( exp ){ if(is(exp,"SummarizedExperiment")){ exp <- assay(exp) } idx.all.zero <- rowSums(exp == 0, na.rm = TRUE) == ncol(exp) idx.all.na <- rowSums(is.na(exp)) == ncol(exp) # do not keep if it is all zero or all NA genes.keep <- rownames(exp)[!(idx.all.zero | idx.all.na)] %>% na.omit() if(length(genes.keep) < nrow(exp) & length(genes.keep) > 0){ message( "Removing ", nrow(exp) - length(genes.keep), " out of ", nrow(exp), " genes" ) } exp <- exp[genes.keep,,drop = FALSE] } #' @description Give dnam and expression outputs the % of samples with #' 0 in Q1 and Q4. This function is useful if the analysis is performed using #' residuals. This function will add the following column to the input data #' "% of 0 target genes (Q1 and Q4)". #' @noRd add_percent_zero_q1_q4 <- function( region.target, dnam, exp ){ if (is(exp,"SummarizedExperiment")) { exp <- assay(exp) } if (is(dnam,"SummarizedExperiment")) { dnam <- assay(dnam) } aux <- plyr::adply( unique(region.target[,c("regionID","target")]), .margins = 1, .fun = function(row) { rna.target <- exp[rownames(exp) == row$target, , drop = FALSE] met <- dnam[rownames(dnam) == as.character(row$regionID), ] data <- data.frame( rna.target = rna.target %>% as.numeric, met = met %>% as.numeric ) quant.met <- quantile(data$met,na.rm = TRUE) low.cutoff <- quant.met[2] upper.cutoff <- quant.met[4] data.high.low <- data %>% filter(.data$met <= low.cutoff | .data$met >= upper.cutoff) data.high.low$metGrp <- ifelse(data.high.low$met <= low.cutoff, 0, 1) pct.zeros.in.quant.samples <- sum( data.high.low$rna.target == 0, na.rm = TRUE) / nrow(data.high.low) data.frame("% of 0 target genes (Q1 and Q4)" = paste0( round(pct.zeros.in.quant.samples * 100,digits = 2), " %") ) } ) region.target$`% of 0 target genes (Q1 and Q4)` <- aux$X..of.0.target.genes..Q1.and.Q4.[ match(paste0(region.target$regionID,region.target$target),paste0(aux$regionID,aux$target)) ] return(region.target) }