R/SequenceData-class.R
1f8fe8c2
 #' @include RNAmodR.R
a44d3dbf
 #' @include normalization.R
09f9831a
 #' @include SequenceDataFrame-class.R
57698a41
 #' @include settings.R
1f8fe8c2
 NULL
 
a4b215eb
 #' @name SequenceData-class
21b04006
 #' @aliases SequenceData
14ccc6b7
 #' 
a4b215eb
 #' @title The SequenceData class
14ccc6b7
 #' 
45929b71
 #' @md
 #' 
1f8fe8c2
 #' @description 
e7831bc9
 #' The \code{SequenceData} class is implemented to contain data on each position
f2c595d7
 #' along transcripts and holds the corresponding annotation data and
e7831bc9
 #' nucleotide sequence of these transcripts. To access this data several
 #' \code{\link[=SequenceData-functions]{functions}} are available. The
f2c595d7
 #' \code{SequenceData} class is a virtual class, from which specific classes can
e7831bc9
 #' be extended. Currently the following classes are implemented:
fa2b7e6e
 #' 
e7831bc9
 #' \itemize{
 #' \item{\code{\link[=CoverageSequenceData-class]{CoverageSequenceData}}} 
 #' \item{\code{\link[=EndSequenceData-class]{End5SequenceData}}, 
 #' \code{\link[=EndSequenceData-class]{End3SequenceData}}, 
 #' \code{\link[=EndSequenceData-class]{EndSequenceData}}}
 #' \item{\code{\link[=NormEndSequenceData-class]{NormEnd5SequenceData}}, 
 #' \code{\link[=NormEndSequenceData-class]{NormEnd5SequenceData}}}
 #' \item{\code{\link[=PileupSequenceData-class]{PileupSequenceData}}}
 #' \item{\code{\link[=ProtectedEndSequenceData-class]{ProtectedEndSequenceData}}}
 #' }
 #' 
f2c595d7
 #' The annotation and sequence data can be accessed through the functions
 #' \code{ranges} and \code{sequences}, respectively. Beaware, that the data is
 #' always provided according to genomic positions with increasing
 #' \code{rownames}, but the sequence is given as the actual sequence of the
 #' transcript. Therefore, it is necessary to treat the minus strand accordingly.
32a4fed9
 #' 
f2c595d7
 #' The \code{SequenceData} class is derived from the
e7831bc9
 #' \code{\link[IRanges:DataFrameList-class]{CompressedSplitDataFrameList}} class
 #' with additional slots for annotation and sequence data. Some functionality is
45929b71
 #' not inherited and might not available to full extend, e.g.\code{relist}.
 #' 
 #' **SequenceDataFrame**
 #' 
f19515bb
 #' The \code{SequenceDataFrame} class is a virtual class and  contains data for
 #' positions along a single transcript. In addition to being used for returning
 #' elements from a \code{SequenceData} object, the SequenceDataFrame class is
 #' used to store the unlisted data within a
 #' \code{\link[=SequenceData-class]{SequenceData}} object. Therefore, a matching
 #' \code{SequenceData} and \code{SequenceDataFrame} class must be implemented.
45929b71
 #' 
 #' The \code{SequenceDataFrame} class is derived from the
 #' \code{\link[S4Vectors:DataFrame-class]{DataFrame}} class.
 #' 
 #' Subsetting of a \code{SequenceDataFrame} returns a \code{SequenceDataFrame} or 
 #' \code{DataFrame}, if it is subset by a column or row, respectively. The 
 #' \code{drop} argument is ignored for column subsetting.
a4b215eb
 #' 
14ccc6b7
 #' @param dataType The prefix for construction the class name of the 
 #' \code{SequenceData} subclass to be constructed.
 #' @param bamfiles the input which can be of the following types
 #' \itemize{
 #' \item{\code{BamFileList}:} {a named \code{BamFileList}}
 #' \item{\code{character}:} {a \code{character} vector, which must be coercible
 #' to a named \code{BamFileList} referencing existing bam files. Valid names are
 #' \code{control} and \code{treated} to define conditions and replicates}
 #' }
 #' @param annotation annotation data, which must match the information contained
 #' in the BAM files.
 #' @param sequences sequences matching the target sequences the reads were 
 #' mapped onto. This must match the information contained in the BAM files.
 #' @param seqinfo optional \code{\link[GenomeInfoDb:Seqinfo]{Seqinfo}} to 
 #' subset the transcripts analyzed on a chromosome basis.
3191cb7e
 #' @param ... Optional arguments overwriting default values. Not all 
 #' \code{SequenceData} classes use all arguments. The arguments are:
fa2b7e6e
 #' \itemize{
3191cb7e
 #' \item{\code{minLength}} {single integer value setting a threshold for minimum
 #' read length. Shorther reads are discarded (default: \code{minLength = NA}).}
 #' \item{\code{maxLength}} {single integer value setting a threshold for maximum
 #' read length. Longer reads are discarded (default: \code{maxLength = NA}).}
 #' \item{\code{minQuality}} {single integer value setting a threshold for maximum
 #' read quality. Reads with a lower quality are discarded (default: 
 #' \code{minQuality = 5L}, but this is class dependent).}
 #' \item{\code{max_depth}} {maximum depth for pileup loading (default: 
 #' \code{max_depth = 10000L}).}
fa2b7e6e
 #' }
45929b71
 #' @param deparse.level See \code{\link[base:cbind]{base::cbind}} for a 
 #' description of this argument.
ff9162c0
 #' 
 #' @slot sequencesType a \code{character} value for the class name of 
8a505469
 #' \code{sequences}. Either \code{RNAStringSet}, \code{ModRNAStringSet}, 
 #' \code{DNAStringSet} or \code{ModDNAStringSet}.
ff9162c0
 #' @slot minQuality a \code{integer} value describing a threshold of the minimum
 #' quality of reads to be used.
f19515bb
 #' 
 #' @return A SequenceData object
1f8fe8c2
 NULL
 
00cb3236
 #' @name RNAmodR-development
 #' @export
09f9831a
 setClass("SequenceData",
ff9162c0
          contains = c("VIRTUAL", "CompressedSplitDataFrameList"),
8a505469
          slots = c(minQuality = "integer",
acf4e07e
                    unlistData = "SequenceDataFrame",
3191cb7e
                    unlistType = "character",
8a505469
                    dataDescription = "character"))
1f8fe8c2
 
67abf29d
 setMethod(
   f = "initialize",
   signature = signature(.Object = "SequenceData"),
   definition = function(.Object, ...){
c3cad704
     if(!.is_non_empty_string(.Object@dataDescription)){
67abf29d
       stop("'dataDescription' must be a single non empty character value.")
     }
     callNextMethod(.Object, ...)
   }
 )
 
500045c8
 # class names must be compatible with this class name generation function
 sequenceDataClass <- function(dataType){
   ans <- paste0(dataType,"SequenceData")
a44d3dbf
   tmp <- try(getClass(ans))
   if(is(tmp,"try-error")){
     stop("Class '",ans,"' not found: ",tmp)
   }
500045c8
   ans
 }
 
d0372463
 setMethod("classNameForDisplay", "SequenceData",
           function(x) class(x)
 )
 
a4b215eb
 #' @rdname SequenceData-functions
09f9831a
 setMethod("show", "SequenceData",
ff9162c0
   function(object){
     k <- length(object)
acf4e07e
     unlisted_object <- object@unlistData
     data_nc <- ncol(unlisted_object)
     unlisted_ranges <- unlist(ranges(object),use.names = FALSE)
     ranges_mcols <- mcols(unlisted_ranges, use.names = FALSE)
ff9162c0
     ranges_nmc <- if (is.null(ranges_mcols)) 0L else ncol(ranges_mcols)
     cat(classNameForDisplay(object), " with ", k, " elements ",
         "containing ",sep = "")
     cat(data_nc, ifelse(data_nc == 1L, " data column", " data columns"),
         " and ",ranges_nmc, ifelse(ranges_nmc == 1L, " metadata column",
                                    " metadata columns"),
         "\n",sep = "")
     out_data <- NULL
     # data
     if (data_nc > 0) {
acf4e07e
       data_col_names <- colnames(unlisted_object)
ff9162c0
       data_col_types <- 
acf4e07e
         lapply(unlisted_object, function(x) {
ff9162c0
           paste0("<", classNameForDisplay(x)[1],">")
         })
       out_data <- 
         matrix(unlist(data_col_types, use.names = FALSE), nrow = 1,
         dimnames = list("", data_col_names))
     }
     cat("- Data columns:\n")
     print(out_data, quote = FALSE, right = TRUE)
acf4e07e
     cat("-  ",class(seqinfo(object)), " object with ", 
         summary(seqinfo(object)), ":\n", sep = "")
ff9162c0
   }
1f8fe8c2
 )
 # validity ---------------------------------------------------------------------
 
ff9162c0
 .valid.SequenceData_elements <- function(x){
acf4e07e
   unlisted_x <- unlist(x, use.names=FALSE)
   nrow <- sum(width(ranges(unlisted_x)))
   if(nrow != nrow(unlisted_x)){
ef579737
     return("row number of data does not match position covered by annotation.")
   }
acf4e07e
   if(nrow != sum(width(sequences(x)))){
ef579737
     return("Length of sequences does not match position covered by annotation.")
   }
acf4e07e
   if(is.null(rownames(unlisted_x))){
ef579737
     return("rownames of data is not set.")
   } else {
32a4fed9
     seqs <- .seqs_rl_strand(ranges(x))
ef579737
     if(any(any(seqs != IRanges::IntegerList(rownames(x))))){
       return(paste0("Out of range rownames of data. The rownames do not match ",
                     "the ranges covered by the annotation data."))
     }
   }
1f8fe8c2
   NULL
 }
ff9162c0
 
 .valid.SequenceData <- function(x){
   c(.valid.SequenceData_elements(x),
065a57d9
     IRanges:::.valid.CompressedList(x))
ff9162c0
 }
67abf29d
 S4Vectors::setValidity2(Class = "SequenceData", .valid.SequenceData)
1f8fe8c2
 
acf4e07e
 # coercion ---------------------------------------------------------------------
ff9162c0
 
d0372463
 coerceSequenceDataToCompressedSplitDataFrameList <- function(className){
   setMethod("coerce",
     signature = c(from = className, to = "CompressedSplitDataFrameList"),
     function(from, to, strict = TRUE){
       if (strict) {
         from <- from
         {
           value <- new("CompressedSplitDataFrameList")
           for (what in c("elementType", "elementMetadata", 
                          "metadata", "unlistData", "partitioning"
           )) slot(value, what) <- slot(from, what)
8cd7f034
           value@unlistData <- as(value@unlistData,"DFrame")
d0372463
           value
         }
       } else from
     })
 }
 
45929b71
 coerceToSequenceData <- function(className) {
   function(from) {
     if(is.list(from)) {
       classes <- unlist(lapply(from,class))
       from <- from[classes == paste0(className,"Frame")]
       if(length(from) == 0) {
         FUN <- match.fun(className)
         from <- list(FUN())
       }
     } else {
       if(is(from,className)){
         return(from)
       } else if(is(from,paste0(className,"Frame"))) {
         from <- list(from)
       } else {
         stop("Cannot coerce ",class(from)," to ",className,".")
       }
     }
     IRanges:::coerceToCompressedList(from)
   }
1f8fe8c2
 }
 
45929b71
 setSequenceDataCoercions <- function(type) {
   className <- sequenceDataClass(type)
d0372463
   coerceSequenceDataToCompressedSplitDataFrameList(className)
45929b71
   setAs("ANY", className, coerceToSequenceData(className))
   setAs("list", className, coerceToSequenceData(className))
 }
acf4e07e
 
 # internals --------------------------------------------------------------------
1f8fe8c2
 
d0372463
 
 
 # Accessors --------------------------------------------------------------------
1f8fe8c2
 
acf4e07e
 setMethod("rownames", "SequenceData",
d0372463
   function (x){
     ans <- rownames(unlist(x,use.names = FALSE), do.NULL = TRUE)
     relist(ans,x)
   }
acf4e07e
 )
 
45929b71
 # Concatenation ----------------------------------------------------------------
1f8fe8c2
 
45929b71
 .check_ranges <- function(args){
   ranges <- lapply(args,ranges)
   ranges <- vapply(ranges[seq.int(2L,length(ranges))],
                    function(r){
                      all(all(r == ranges[[1L]]))
                    },
                    logical(1))
   if(!all(ranges)){
     stop("Inputs must have the same ranges.")
   }
 }
1f8fe8c2
 
45929b71
 .check_sequences <- function(args){
   sequences <- lapply(args,sequences)
   sequences <- vapply(sequences[seq.int(2L,length(sequences))],
                       function(s){
                         all(s == sequences[[1L]])
                       },
                       logical(1))
   if(!all(sequences)){
     stop("Inputs must have the same sequences.")
   }
 }
6932ce03
 
45929b71
 .check_bamfiles <- function(args){
   bamfiles <- lapply(args,bamfiles)
   bamfiles <- vapply(bamfiles[seq.int(2L,length(bamfiles))],
                      function(b){
                        all(path(b) == path(bamfiles[[1L]]))
                      },
                      logical(1))
   if(!all(bamfiles)){
     stop("Inputs must be derived from the same bamfiles.")
acf4e07e
   }
45929b71
 }
 
 #' @rdname SequenceData-class
 #' @export
 setMethod("cbind", "SequenceData",
           function(..., deparse.level = 1) 
           {
             args <- list(...)
             if(length(args) == 1L){
               return(args[[1L]])
             }
             # input checks
             classes <- lapply(args,class)
             if(length(unique(classes)) != 1L){
               stop("Inputs must be of the same SequenceDataFrame type.")
             }
             lengths <- vapply(args,function(a){sum(lengths(a))},integer(1))
             if(length(unique(lengths)) != 1L){
               stop("Inputs must have the same lengths.")
             }
             .check_ranges(args)
             .check_sequences(args)
             callNextMethod()
           }
1f8fe8c2
 )
45929b71
 
 #' @rdname SequenceData-class
 #' @export
09f9831a
 setMethod("rbind", "SequenceData",
45929b71
           function(..., deparse.level = 1) 
           {
             args <- list(...)
             if(length(args) == 1L){
               return(args[[1L]])
             }
             # input checks
             classes <- lapply(args,class)
             if(length(unique(classes)) != 1L){
               stop("Inputs must be of the same SequenceDataFrame type.")
             }
             lengths <- vapply(args,function(a){ncol(unlist(a))},integer(1))
             if(length(unique(lengths)) != 1L){
               stop("Inputs must have the same width.")
             }
             .check_bamfiles(args)
             callNextMethod()
           }
 )
 
 setMethod("bindROWS", "SequenceData",
           function (x, objects = list(), use.names = TRUE, ignore.mcols = FALSE, 
                     check = TRUE) 
           {
             objects <- S4Vectors:::prepare_objects_to_bind(x, objects)
             all_objects <- c(list(x), objects)
             names <- unlist(lapply(all_objects,names))
             if(any(duplicated(names))){
               stop("Input must have unique names.")
             }
             .check_bamfiles(all_objects)
             callNextMethod(x, objects, use.names = use.names, 
f20b77bb
                            ignore.mcols = ignore.mcols, check = check)
1f8fe8c2
           }
 )
 
45929b71
 setMethod("unlist", "SequenceData",
           function(x, recursive = TRUE, use.names = FALSE) 
           {
             callNextMethod(x, recursive = recursive, use.names = FALSE) 
           }
 )
 
 
67abf29d
 # constructor ------------------------------------------------------------------
 
57698a41
 .quality_settings <- data.frame(
   variable = c("minQuality"),
   testFUN = c(".not_integer_bigger_equal_than_one"),
   errorValue = c(TRUE),
   errorMessage = c("'minQuality' must be integer with a value higher than 1L."),
   stringsAsFactors = FALSE)
 
67abf29d
 .norm_min_quality <- function(input, minQuality){
57698a41
   .norm_settings(input, .quality_settings, minQuality)[["minQuality"]]
67abf29d
 }
 
45929b71
 .get_replicate_number <- function(conditions){
   control_rep <- seq_along(conditions[conditions == "control"])
   treated_rep <- seq_along(conditions[conditions == "treated"])
67abf29d
   rep <- c(control_rep,treated_rep)
   rep <- rep[c(which(conditions == "control"),
                which(conditions == "treated"))]
   factor(rep)
 }
 
32a4fed9
 .check_positions_in_data <- function(data, positions){
   data_rownames <- lapply(data,rownames)
   data_rownames_non_empty <- !vapply(data_rownames,is.null,logical(1))
   # only check if data is present
   if(any(data_rownames_non_empty)){
     check <- lapply(data_rownames[data_rownames_non_empty],
                     "==", positions)
     check <- unlist(lapply(check,all))
     if(!all(check)){
       stop("rownames()/names() in data does not have the correct order. ",
            "rownames()/names() must be coercible to numeric and strictly ",
            "increasing.")
     }
   }
   NULL
 }
 
 # construct a result DataFrame by creating from Integer or NumericList
 # or mergeing DataFrames into one
 .norm_postprocess_read_data <- function(data){
   if(is(data[[1L]],"IntegerList") || is(data[[1L]],"NumericList")){
659d2e08
     m_data <- lapply(lapply(data, metadata),"[[","stats")
32a4fed9
     data <- lapply(data, unlist)
     if(length(unique(lengths(data))) != 1L){
       stop("Data is of unequal length and cannot be coerced to a DataFrame.",
            call. = FALSE)
     }
     data <- S4Vectors::DataFrame(data)
   } else if(is(data[[1L]],"DataFrameList")) {
659d2e08
     m_data <- lapply(lapply(data, metadata),"$","stats")
32a4fed9
     data <- lapply(data, unlist, use.names = FALSE)
     ncols <- unique(unlist(lapply(data, ncol)))
     if(length(ncols) != 1L){
       stop("Something went wrong. Width of data should be constant.")
     }
     if(length(unique(vapply(data,nrow,numeric(1)))) != 1L){
       stop("Data is of unequal length and cannot be merged into a DataFrame.",
            call. = FALSE)
     }
     data <- do.call(cbind,data)
   } else {
16803e67
     stop("")
32a4fed9
   }
659d2e08
   metadata(data) <- list(stats = m_data) 
32a4fed9
   data
 }
 
 # reverse order of data, if originating from minus strand
 .order_read_data_by_strand <- function(data, grl){
   # check if minus strand data is ordered in reverse
   strand_u <- .get_strand_u_GRangesList(grl)
   strand_minus <- strand_u == "-"
   if(any(strand_minus)){
     strand_minus_and_needs_rev <- strand_minus & 
       vapply(IRanges::IntegerList(rownames(data)),
              function(i){
                i[1] < i[2]
              },
              logical(1))
     if(any(strand_minus_and_needs_rev)){
       data[strand_minus_and_needs_rev] <- 
         lapply(data[strand_minus_and_needs_rev], rev)
     }
   }
   data
 }
 
67abf29d
 #' @importFrom IRanges PartitioningByWidth PartitioningByEnd
 #' @importClassesFrom IRanges PartitioningByWidth PartitioningByEnd
 .SequenceData <- function(className, bamfiles, ranges, sequences, seqinfo, args,
                           ...){
   ##############################################################################
   # setup additional variables
   ##############################################################################
16803e67
   if(!is.list(args)){
     args <- as.list(args)
67abf29d
   }
   proto <- new(className)
659d2e08
   proto@minQuality <- .norm_min_quality(args, proto@minQuality)
67abf29d
   condition <- factor(names(bamfiles))
45929b71
   replicate <- .get_replicate_number(condition)
c3cad704
   if(!.is_non_empty_string(proto@dataDescription)){
67abf29d
     stop("'dataDescription' must be a single non empty character value.")
   }
659d2e08
   if(is.null(proto@minQuality)){
67abf29d
     stop("Minimum quality is not set for '", className ,"'.",
          call. = FALSE)
   }
   param <- .assemble_scanBamParam(ranges, proto@minQuality, seqinfo)
32a4fed9
   positions <- .seqs_rl_strand(ranges, force_continous = TRUE)
67abf29d
   ##############################################################################
   # run the specific data aggregation function
   ##############################################################################
   message("Loading ", proto@dataDescription, " from BAM files ... ",
           appendLF = FALSE)
   data <- getData(proto, bamfiles, ranges, sequences, param, args)
   names(data) <- paste0(names(data),".",condition,".",replicate)
32a4fed9
   # check positions
   .check_positions_in_data(data, positions)
67abf29d
   ##############################################################################
   # post process the data
   ##############################################################################
   conditionsFmultiplier <- length(data)
   # work with the unlisted data and construct a CompressedSplitDataFrameList
   # from this
   data <- .norm_postprocess_read_data(data)
   # readjust replicate and condition to the normalized dimensions of the data
   conditionsFmultiplier <- ncol(data) / conditionsFmultiplier 
   replicate <- rep(replicate, each = conditionsFmultiplier)
   condition <- rep(condition, each = conditionsFmultiplier)
   # create partitioning object from ranges
   partitioning <- IRanges::PartitioningByWidth(sum(width(ranges)))
   if(sum(width(partitioning)) != nrow(data)){
     stop("Something went wrong. Length of data and Ranges do not match.")
   }
   # order data so that is matched the PartitioningByWidth object
   data <- relist(data, partitioning)
   rownames(data) <- IRanges::CharacterList(positions)
32a4fed9
   data <- .order_read_data_by_strand(data, ranges)
67abf29d
   # order sequences
32a4fed9
   sequences <- sequences[match(names(ranges), names(sequences))]
67abf29d
   # basic checks
   names(data) <- names(ranges)
   if(any(names(ranges) != names(sequences)) || 
      any(names(ranges) != names(data))){
16803e67
     stop("")
67abf29d
   }
   ##############################################################################
3118902c
   # Create SequenceData object
67abf29d
   ##############################################################################
45929b71
   unlist_data <- 
     .SequenceDataFrame(class = gsub("SequenceData","",className),
                        df = unlist(data, use.names = FALSE),
                        ranges = unlist(ranges, use.names = FALSE),
                        sequence = unlist(sequences, use.names = FALSE),
                        replicate = replicate,
                        condition = condition,
                        bamfiles = bamfiles,
                        seqinfo = seqinfo)
acf4e07e
   ans <- new(className, 
659d2e08
              minQuality = proto@minQuality,
45929b71
              unlistData = unlist_data,
acf4e07e
              partitioning = IRanges::PartitioningByEnd(data),
659d2e08
              metadata = metadata(unlist_data),
acf4e07e
              ...)
   message("OK")
   ans
67abf29d
 }
fa2b7e6e
 
3118902c
 .SequenceData_settings <- data.frame(
   variable = c("max_depth",
                "minLength",
8a505469
                "maxLength",
                "seqtype"),
3118902c
   testFUN = c(".not_integer_bigger_than_10",
               ".not_integer_bigger_equal_than_zero_nor_na",
8a505469
               ".not_integer_bigger_equal_than_one_nor_na",
               ".is_valid_nucleotide_seqtype"),
3118902c
   errorValue = c(TRUE,
                  TRUE,
8a505469
                  TRUE,
                  FALSE),
3118902c
   errorMessage = c("'max_depth' must be integer with a value higher than 10L.",
                    "'minLength' must be integer with a value higher than 0L or NA.",
8a505469
                    "'maxLength' must be integer with a value higher than 1L or NA.",
                    paste0("'seqtype' must be either '",seqtype(RNAString()) ,"' or '",seqtype(DNAString()) ,"'.")),
3118902c
   stringsAsFactors = FALSE)
 
 .get_SequenceData_args <- function(input){
ef579737
   minQuality <- .norm_min_quality(input, NULL)
14ccc6b7
   max_depth <- 10000L # the default is 250, which is to small
a1683e44
   minLength <- NA_integer_
   maxLength <- NA_integer_
8a505469
   seqtype <- seqtype(RNAString()) 
3118902c
   args <- .norm_settings(input, .SequenceData_settings, max_depth, minLength,
8a505469
                          maxLength, seqtype)
a1683e44
   if(!is.na(args[["minLength"]]) && !is.na(args[["maxLength"]])){
     if(args[["minLength"]] > args[["maxLength"]]){
       stop("'minLength' must be smaller or equal to 'maxLength'.",
            call. = FALSE)
     }
   }
14ccc6b7
   #
3118902c
   args <- c(list(minQuality = minQuality),
             args)
14ccc6b7
   args
 }
 
500045c8
 # internal SequenceData constructor
14ccc6b7
 .new_SequenceData <- function(dataType, bamfiles, annotation, sequences, seqinfo,
473ca656
                               ...){
a44d3dbf
   if(is.null(dataType)){
     stop("Invalid data type.")
   }
3118902c
   args <- .get_SequenceData_args(list(...))
a44d3dbf
   className <- sequenceDataClass(dataType)
   # check bam files
14ccc6b7
   bamfiles <- .norm_bamfiles(bamfiles, className)
500045c8
   # get annotation and sequence data
e7831bc9
   annotation <- .norm_annotation(annotation, className)
a44d3dbf
   sequences <- .norm_sequences(sequences, className)
acf4e07e
   seqinfo_missing <- missing(seqinfo)
e7831bc9
   seqinfo <- .norm_seqnames(bamfiles, annotation, sequences, seqinfo, className)
67abf29d
   # load transcript data and sequence data
e7831bc9
   grl <- .load_annotation(annotation)
   grl <- .subset_by_seqinfo(grl, seqinfo)
acf4e07e
   if(length(grl) == 0L){
     if(seqinfo_missing){
       stop("No overlap between bamfiles and annotation.")
     } else {
       stop("No overlap between bamfiles, annotation and seqinfo.")
     }
   }
8a505469
   sequences <- .load_sequences(sequences, grl, args)
67abf29d
   # create the class
   .SequenceData(className, bamfiles, grl, sequences, seqinfo, args)
 }
 
 # constructor utility functions ------------------------------------------------
 # also used at other places
 
a1683e44
 # check for multiple seqnames per ranges
 .norm_unique_seqnames <- function(ranges){
   seqnames_ranges_u <- unique(GenomeInfoDb::seqnames(ranges))
   f <- lengths(seqnames_ranges_u) != 1L
   if(any(f)){
     message("Found transcript annotation with non unique seqnames. Removing ",
             "them ...")
     ranges <- ranges[!f]
     GenomeInfoDb::seqlevels(ranges) <- GenomeInfoDb::seqlevelsInUse(ranges)
   }
   ranges
 }
 
67abf29d
 # load annotation as GRangesList. one element per transcript
 .load_annotation <- function(annotation){
   if(is(annotation,"TxDb")){
     ranges <- GenomicFeatures::exonsBy(annotation, by = "tx")
a1683e44
     ranges <- .norm_unique_seqnames(ranges)
67abf29d
   } else if(is(annotation,"GRangesList")) {
     ranges <- annotation
a1683e44
     ranges <- .norm_unique_seqnames(ranges)
     # make sure, that the elements are reverse order if on minus strand
     unlisted_ranges <- unlist(ranges)
     strand <- strand(unlisted_ranges) == "+"
     unlisted_ranges <- 
       c(unlisted_ranges[strand][order(unlisted_ranges[strand])],
         unlisted_ranges[!strand][rev(order(unlisted_ranges[!strand]))])
     ranges <- split(unname(unlisted_ranges),
                     factor(names(unlisted_ranges),
                            levels = unique(names(ranges))))
67abf29d
   } else {
     stop("Annotation is not a 'TxDb' or a 'GRangesList'.")
   }
   ranges
 }
 
 #' @importFrom Biostrings xscat
 # load the transcript sequence per transcript aka. one sequence per GRangesList
 # element
8a505469
 .load_sequences <- function(sequences, grl, args){
67abf29d
   seq <- Biostrings::getSeq(sequences, unlist(grl))
a1683e44
   seq <- relist(unlist(seq),IRanges::PartitioningByWidth(sum(width(grl))))
67abf29d
   names(seq) <- names(grl)
8a505469
   seqtype(seq) <- args[["seqtype"]]
   seq
67abf29d
 }
 
 # remove any elements, which are not in the seqinfo
 .subset_by_seqinfo <- function(grl, seqinfo){
   grl <- grl[GenomicRanges::seqnames(grl) %in% GenomeInfoDb::seqnames(seqinfo)]
a1683e44
   grl <- grl[width(IRanges::PartitioningByWidth(grl)) != 0L]
   GenomeInfoDb::seqlevels(grl) <- GenomeInfoDb::seqlevelsInUse(grl)
67abf29d
   grl
 }
 
 ################################################################################
 
45929b71
 #' @rdname SequenceData-class
 #' @export
f2c595d7
 setGeneric( 
   name = "SequenceData",
   signature = c("annotation","sequences"),
   def = function(dataType, bamfiles, annotation, sequences, seqinfo, ...)
     standardGeneric("SequenceData")
 ) 
 
45929b71
 #' @rdname SequenceData-class
 #' @export
a44d3dbf
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "character", sequences = "character"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
a44d3dbf
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
a44d3dbf
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "character", sequences = "BSgenome"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
a44d3dbf
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
a44d3dbf
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "TxDb", sequences = "character"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
a44d3dbf
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
500045c8
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "TxDb", sequences = "BSgenome"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
500045c8
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
e7831bc9
 setMethod("SequenceData",
           signature = c(annotation = "GRangesList", sequences = "character"),
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
e7831bc9
 setMethod("SequenceData",
           signature = c(annotation = "GRangesList", sequences = "BSgenome"),
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
500045c8
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "GFF3File", sequences = "BSgenome"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
a44d3dbf
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
a44d3dbf
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "GFF3File", sequences = "character"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
a44d3dbf
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
a44d3dbf
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "character", sequences = "FaFile"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
500045c8
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
500045c8
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "GFF3File", sequences = "FaFile"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
500045c8
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
500045c8
 setMethod("SequenceData",
473ca656
           signature = c(annotation = "TxDb", sequences = "FaFile"),
14ccc6b7
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
500045c8
           })
45929b71
 #' @rdname SequenceData-class
 #' @export
e7831bc9
 setMethod("SequenceData",
           signature = c(annotation = "GRangesList", sequences = "FaFile"),
           function(dataType, bamfiles, annotation, sequences, seqinfo, ...){
             .new_SequenceData(dataType, bamfiles, annotation, sequences,
                               seqinfo, ...)
           })
500045c8
 
67abf29d
 # Constructor end
1f8fe8c2
 ################################################################################
fd560deb
 
e7831bc9
 #' @rdname SequenceData-functions
 #' @export
 setMethod("getData",
67abf29d
           signature = c(x = "SequenceData", bamfiles = "BamFileList", 
                         grl = "GRangesList", sequences = "XStringSet", 
                         param = "ScanBamParam"),
           definition = function(x, bamfiles, grl, sequences, param, args){
a4b215eb
             stop("This functions needs to be implemented by '",class(x),"'.",
                  call. = FALSE)
           }
 )
 
c70f0bc6
 # accessors --------------------------------------------------------------------
 
a4b215eb
 #' @rdname SequenceData-functions
09f9831a
 #' @export
c70f0bc6
 setMethod(f = "bamfiles", 
09f9831a
           signature = signature(x = "SequenceData"),
45929b71
           definition = function(x){bamfiles(unlist(x))})
8a505469
 
a4b215eb
 #' @rdname SequenceData-functions
09f9831a
 #' @export
c70f0bc6
 setMethod(f = "conditions", 
           signature = signature(object = "SequenceData"),
acf4e07e
           definition = function(object){conditions(unlist(object))})
8a505469
 
a4b215eb
 #' @rdname SequenceData-functions
09f9831a
 #' @export
acf4e07e
 setMethod(
   f = "ranges", 
   signature = signature(x = "SequenceData"),
   definition = 
     function(x){
       partitioning <- IRanges::PartitioningByEnd(x)
       unlisted_ranges <- ranges(unlist(x))
45929b71
       ends <- match(cumsum(width(partitioning)),cumsum(width(unlisted_ranges)))
       partitioning_relist <- IRanges::PartitioningByEnd(ends)
acf4e07e
       if(length(x) != length(partitioning_relist)){
         stop("ranges could not be relisted.")
       }
45929b71
       names(partitioning_relist) <- names(x)
acf4e07e
       relist(unlisted_ranges, partitioning_relist)
     })
8a505469
 
a4b215eb
 #' @rdname SequenceData-functions
09f9831a
 #' @export
c70f0bc6
 setMethod(f = "replicates", 
09f9831a
           signature = signature(x = "SequenceData"),
acf4e07e
           definition = function(x){replicates(unlist(x))})
8a505469
 
4a40e96c
 #' @rdname SequenceData-functions
 #' @export
c70f0bc6
 setMethod(f = "seqinfo", 
           signature = signature(x = "SequenceData"),
45929b71
           definition = function(x){seqinfo(unlist(x))})
8a505469
 
4a40e96c
 #' @rdname SequenceData-functions
 #' @export
c70f0bc6
 setMethod(f = "sequences", 
4a40e96c
           signature = signature(x = "SequenceData"),
acf4e07e
           definition = function(x){relist(sequences(unlist(x)),x)})
8a505469
 
 #' @rdname SequenceData-functions
 #' @export
 setMethod(f = "seqtype", 
           signature = signature(x = "SequenceData"),
           definition = function(x){seqtype(unlist(x))})
 
 #' @rdname SequenceData-functions
 #' @export
 setReplaceMethod(f = "seqtype", 
                  signature = signature(x = "SequenceData"),
                  definition = function(x, value){
                    unlisted_x <- unlist(x)
                    seqtype(unlisted_x) <- value
                    relist(unlisted_x,x)
                  })
 
d62f4d05
 #' @rdname SequenceData-functions
 #' @export
 setMethod(f = "dataType",
           signature = signature(x = "SequenceData"),
           definition = function(x){dataType(unlist(x))})
1e4ec5e6
 
 # dummy functions --------------------------------------------------------------
79434cbd
 # this needs to be implemented by each subclass
1e4ec5e6
 
478352fe
 .check_aggregate_seqdata <- function(data, x){
   seqs <- .seqs_rl_strand(ranges(x))
   if(any(any(seqs != IRanges::IntegerList(rownames(data))))){
     stop("rownames() of aggregate data is not named or not named along the ",
          "genomic coordinates. Contact the maintainer of the class used.",
          call. = FALSE)
   }
   data
 }
 
a4b215eb
 #' @rdname aggregate
1e4ec5e6
 #' @export
 setMethod(f = "aggregate", 
           signature = signature(x = "SequenceData"),
           definition = 
478352fe
             function(x, condition = c()){
               .check_aggregate_seqdata(aggregateData(x, condition), x)
1e4ec5e6
             }
 )
478352fe
 
 #' @rdname aggregate
 #' @export
 setMethod(f = "aggregateData", 
           signature = signature(x = "SequenceData"),
           definition = function(x, condition){
             stop("This functions needs to be implemented by '",class(x),"'.",
                  call. = FALSE)
           })