R/import_funcs.R
02d6d7ba
 .TPP_importFct_CheckDataFormat <- function (files, dataframes, expNames){
ff68e0d5
     # internal function copied from TPP package to avoid 
     # import of non-exported package functions
     . <- NULL
     isDF <- !is.null(dataframes)
     isF <- !is.null(files)
     isBoth <- isDF & isF
     isNone <- !(isDF | isF)
     if (isBoth) {
         stop("Data import function received a",
              " filename AND a dataframe object. \n",
              "Please specify only one.")
342fc172
     }
ff68e0d5
     else if (isNone) {
         stop("Data import function requires a", 
              " filename or a dataframe object. \n",
              "Please specify one.")
342fc172
     }
ff68e0d5
     if (isDF) {
         isClassList <- is.list(dataframes) && !is.data.frame(dataframes)
         isClassDF <- is.data.frame(dataframes)
         if (isClassList) {
             classesInList <- dataframes %>% 
             vapply(. %>% inherits(., "data.frame"), TRUE)
             if (!all(classesInList)) {
                 stop(paste("Argument 'dataframes' contains", 
                            "elements that are not of type", 
                            "'data.frame' at the following positions: "), 
                      which(!classesInList) %>% paste(collapse = ", "), ".")
             }
           }
           else if (isClassDF) {
               dataframes <- list(dataframes)
               names(dataframes) <- expNames
           }
           else {
               stop("Argument 'dataframes' must be either an object of class \n
                    'data.frame', or a list of such objects.")
           }
342fc172
     }
ff68e0d5
     if (isF) {
         files <- as.character(files)
         names(files) <- expNames
     }
     return(list(files = files, dataframes = dataframes))
342fc172
 }
 
 #' @importFrom utils read.delim
d96a18d0
 #' @importFrom RCurl url.exists
02d6d7ba
 .TPP_importFct_readFiles <- function (files, naStrs){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   expNames <- names(files)
   data <- vector("list", length(files))
   names(data) <- expNames
   for (expName in expNames) {
     fTmp <- files[[expName]]
     if (file.exists(fTmp) || url.exists(fTmp)) {
       data[[expName]] <- read.delim(fTmp, as.is = TRUE, 
                                     na.strings = naStrs, quote = "")
     }
     else {
       stop("File ", fTmp, " could not be found.")
     }
   }
   return(data)
 }
 
ac79cafb
 #' @import dplyr
02d6d7ba
 .TPP_importFct_removeDuplicates <- function(inDF, refColName, 
342fc172
                                            nonNAColNames, qualColName){
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   message("Removing duplicate identifiers using quality column '", 
           qualColName, "'...")
ac79cafb
   nonUniques <- unique(as_tibble(inDF)[duplicated(inDF[[refColName]]), 
342fc172
                            refColName])
02d6d7ba
   retDF <- subset(inDF, !(get(refColName) %in% nonUniques))
   if(nrow(nonUniques)){
       for (nU in nonUniques) {
           tmpDF <- subset(inDF, get(refColName) == nU)
           nonNArows <- NULL
           for (r in seq_len(nrow(tmpDF))) {
               if (any(!is.na(tmpDF[r, nonNAColNames]))) {
                   nonNArows <- c(nonNArows, r)
               }
           }
           if (length(nonNArows) > 1) {
               if (is.null(qualColName)) {
                   useRow <- 1
               }
               else {
                   qualVals <- tmpDF[nonNArows, qualColName]
                   useRow <- match(max(qualVals), qualVals)
               }
           }
           else {
               useRow <- nonNArows[1]
           }
           retDF <- rbind(retDF, tmpDF[useRow, ])
342fc172
       }
   }
ff68e0d5
   message(nrow(retDF), " out of ", nrow(inDF), 
           " rows kept for further analysis.")
342fc172
   return(retDF)
 }
 
02d6d7ba
 .TPP_replaceZeros <- function(x){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   x[which(x == 0)] <- NA
   return(x)
 }
 
02d6d7ba
 .TPP_importFct_rmZeroSias <- function(data.list, 
                                       intensityStr){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   out <- lapply(names(data.list), function(l.name) {
     datTmp <- data.list[[l.name]]
     colsTmp <- colnames(datTmp)
     intensity.cols <- grep(intensityStr, colsTmp, value = TRUE)
     intensity.df <- subset(datTmp, select = intensity.cols) %>% 
       mutate_all(as.character) %>% mutate_all(as.numeric)
02d6d7ba
     new.intensity.df <- intensity.df %>% mutate_all(.TPP_replaceZeros)
342fc172
     datTmp[, intensity.cols] <- new.intensity.df
     return(datTmp)
   })
   names(out) <- names(data.list)
   return(out)
 }
 
02d6d7ba
 .TPP_importFct_checkExperimentCol <- function(expCol){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   if (is.null(expCol)) {
ff68e0d5
     m <- paste("Config table needs an 'Experiment'", 
                "column with unique experiment IDs.")
342fc172
     stop(m, "\n")
   }
   oldExpNames <- expCol
   newExpNames <- gsub("([^[:alnum:]])", "_", expCol)
   iChanged <- oldExpNames != newExpNames
   if (any(iChanged)) {
ff68e0d5
     m1 <- paste("Replaced non-alphanumeric characters", 
                 "in the 'Experiment' column entries:")
342fc172
     m2 <- paste("'", paste(oldExpNames[iChanged], collapse = "', '"), 
                 "'\nby\n'", paste(newExpNames[iChanged], collapse = "', '"), 
                 sep = "")
     message(m1, "\n", m2, "\n")
   }
   return(newExpNames)
 }
 
02d6d7ba
 .TPP_importFct_checkComparisons <- function(confgTable){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   expConds <- confgTable$Condition
   expNames <- confgTable$Experiment
   compCols <- grep("Comparison", colnames(confgTable), ignore.case = TRUE, 
                    value = TRUE)
   compChars <- apply(confgTable[compCols], 2, function(x) {
     length(grep("[[:alnum:]]", x, value = TRUE))
   })
   comp_unequal_two <- compChars != 2
   if (any(comp_unequal_two)) {
     warning(paste("\nThe following comparison columns could not be evaluated", 
                   "because they did not contain exactly two entries:\n\t\t"), 
             paste(compCols[comp_unequal_two], collapse = ",\n\t\t"))
   }
   validCompCols <- compCols[!comp_unequal_two]
   allCompStrs <- c()
02d6d7ba
   if (length(validCompCols)) {
342fc172
     message("Comparisons will be performed between the following experiments:")
     for (colName in validCompCols) {
       current_compEntries <- confgTable[[colName]]
       current_compRows <- grep("[[:alnum:]]", current_compEntries)
       current_compExps <- expNames[current_compRows]
       compRef <- current_compExps[1]
       compTreatm <- current_compExps[2]
       if ("Condition" %in% names(confgTable)) {
         current_compConds <- expConds[current_compRows]
         if ("Vehicle" %in% current_compConds && "Treatment" %in% 
             current_compConds) {
           compRef <- current_compExps[current_compConds == 
                                         "Vehicle"]
           compTreatm <- current_compExps[current_compConds == 
                                            "Treatment"]
         }
       }
       compStr <- paste(compTreatm, "_vs_", compRef, sep = "")
       names(compStr) <- colName
       message(compStr)
       allCompStrs <- c(allCompStrs, compStr)
     }
     message("\n")
   }
   return(allCompStrs)
 }
 
 #' @importFrom stringr str_to_title
02d6d7ba
 .TPP_importFct_checkConditions <- function(condInfo, 
342fc172
                                           expectedLength){
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   flagGenerateConds <- FALSE
   if (is.null(condInfo)) {
ff68e0d5
     message("No information about experimental conditions given.", 
             "Assigning NA instead.\n",
             "Reminder: recognition of Vehicle and Treatment groups", 
             "during pairwise \n",
             "comparisons is only possible when they are specified ",
             "in the config table.\n")
342fc172
     condInfo <- rep(NA_character_, expectedLength)
   }
   else {
ff68e0d5
     condInfo <- as.character(condInfo) %>% 
       stringr::str_to_title()
342fc172
     condLevels <- unique(condInfo)
ff68e0d5
     invalidLevels = 
       setdiff(condLevels, c("Treatment", "Vehicle"))
342fc172
     if (length(invalidLevels) > 0) {
       stop("The entry '", invalidLevels, 
ff68e0d5
            paste("' in the condition column is invalid.", 
                  "Only the values 'Treatment' and", 
                  "'Vehicle' are allowed. Please correct", 
                  "this and start again."))
342fc172
     }
   }
   return(condInfo)
 }
 
02d6d7ba
 .TPP_checkFunctionArgs <- 
ff68e0d5
   function(functionCall, expectedArguments){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   myArgs <- names(functionCall)
d96a18d0
   lapply(expectedArguments, function(arg) {
342fc172
     if (!arg %in% myArgs) {
ff68e0d5
       stop("Error in ", paste(functionCall)[1], 
            ": argument '", 
            arg, "' is missing, with no default", 
            call. = FALSE)
342fc172
     }
   })
 }
 
02d6d7ba
 .TPP_nonLabelColumns <- function(){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   out <- data.frame(
     column = c("Experiment", "Experiment",
                "Experiment", "Path", "Path", 
                "Path", "Condition", "Replicate", 
                "Compound", "Temperature", "RefCol"), 
     type = c("TR", "CCR", "2D", "TR", "CCR", "2D", 
              "TR", "TR", "2D", "2D", "2D"), 
     obligatory = c(TRUE, TRUE, TRUE, FALSE, FALSE, 
                    FALSE, TRUE, FALSE, TRUE, TRUE, TRUE), 
     exclusive = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
                   FALSE, TRUE, TRUE, TRUE, TRUE, TRUE))
   return(out)
 }
 
02d6d7ba
 .TPP_detectLabelColumnsInConfigTable <- 
ff68e0d5
   function(allColumns){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
02d6d7ba
   .TPP_checkFunctionArgs(match.call(), c("allColumns"))
   noLabelCols <- .TPP_nonLabelColumns()$column %>% 
ff68e0d5
     as.character %>% 
342fc172
     unique
   compCols <- grep("comparison", allColumns, value = TRUE, 
                    ignore.case = TRUE)
   noLabelCols <- c(noLabelCols, compCols)
   labelCols <- setdiff(allColumns, noLabelCols)
   return(labelCols)
 }
 
02d6d7ba
 .TPP_importCheckTemperatures <- function(temp){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   tempMatrix <- as.matrix(temp)
   rownames(tempMatrix) <- NULL
   naRows <- apply(is.na(tempMatrix), 1, all)
   if (any(naRows)) {
     stop("Row(s) ", paste(which(naRows), collapse = ", "), 
ff68e0d5
          " in the configuration table contain", 
          " only missing temperature values.")
342fc172
   }
   return(tempMatrix)
 }
 
 #' @importFrom openxlsx read.xlsx
 #' @importFrom utils read.table
02d6d7ba
 .TPP_importFct_readConfigTable <- function(cfg){
342fc172
   # internal function copied from TPP package to avoid 
   # import of non-exported package functions
   if (is.character(cfg)) {
     if (file.exists(cfg)) {
       strChunks <- strsplit(cfg, "\\.")[[1]]
       fileExtension <- strChunks[length(strChunks)]
       if (fileExtension == "txt") {
ff68e0d5
         tab <- read.table(
           file = cfg, header = TRUE, 
           check.names = FALSE, stringsAsFactors = FALSE, 
           sep = "\t")
342fc172
       }
       else if (fileExtension == "csv") {
ff68e0d5
         tab <- read.table(
           file = cfg, header = TRUE, 
           check.names = FALSE, stringsAsFactors = FALSE, 
           sep = ",")
342fc172
       }
       else if (fileExtension == "xlsx") {
         tab <- openxlsx::read.xlsx(cfg)
       }
       else {
ff68e0d5
         stop("Error during data import: ", cfg, 
              " does not belong to a valid configuration file.")
342fc172
       }
     }
     else {
ff68e0d5
       stop("Error during data import: ", cfg, 
            " does not belong to a valid configuration file.")
342fc172
     }
     cfg <- tab
   }
   return(cfg)
 }
 
02d6d7ba
 #' Import and chech configuration table
 #' 
 #' @param infoTable character string of a file path to
 #' a config table (excel,txt or csv file) or data frame
 #' containing a config table
 #' @param type charater string indicating dataset type
 #' default is 2D
 #' 
 #' @return data frame with config table
 #' 
 #' @examples 
 #' data("config_tab")
 #' TPP_importCheckConfigTable(config_tab, type = "2D")
 #' @export
 TPP_importCheckConfigTable <- function(infoTable, type = "2D"){
   .TPP_checkFunctionArgs(match.call(), c("infoTable", "type"))
   Experiment <- Path <- Compound <- NULL
342fc172
   isValidDF <- FALSE
   if (is.data.frame(infoTable)) {
ff68e0d5
     if ((ncol(infoTable) > 1) & 
         ("Experiment" %in% colnames(infoTable))) {
342fc172
       isValidDF <- TRUE
     }
   }
   if (!is.character(infoTable) & !isValidDF) {
ff68e0d5
     stop("'infoTable' must either be a data frame", 
          " with an 'Experiment' column \n",
          "and at least one isobaric label column,", 
          "or a filename pointing at a \n",
          "table that fulfills the same criteria")
342fc172
   }
   isValidType <- type %in% c("2D")
   if (!isValidType) {
     stop("'type' must have this value: '2D'")
   }
02d6d7ba
   infoTable <- .TPP_importFct_readConfigTable(cfg = infoTable)
ff68e0d5
   infoTable$Experiment <- 
02d6d7ba
     .TPP_importFct_checkExperimentCol(infoTable$Experiment)
342fc172
   infoTable <- subset(infoTable, Experiment != "")
   givenPaths <- NULL
   if (any("Path" %in% colnames(infoTable))) {
     if (all(infoTable$Path == "") || all(is.na(infoTable$Path))) {
       message("Removing empty 'Path' column from config table")
       infoTable <- infoTable %>% select(-Path)
     }
     else {
       givenPaths <- infoTable$Path
     }
   }
d96a18d0
   compStrs <- NA
   infoTable$Condition <- NULL
342fc172
   allCols <- colnames(infoTable)
02d6d7ba
   labelCols <- .TPP_detectLabelColumnsInConfigTable(allColumns = allCols)
342fc172
   labelValues <- infoTable[, labelCols]
   labelValuesNum <- suppressWarnings(labelValues %>% apply(2, 
                                                            as.numeric))
   if (is.matrix(labelValuesNum)) {
     isInvalid <- labelValuesNum %>% apply(2, is.na) %>% apply(2, 
                                                               all)
   }
   else if (is.vector(labelValuesNum)) {
     isInvalid <- is.na(labelValuesNum)
   }
   invalidLabels <- labelCols[isInvalid]
   infoTable[, invalidLabels] <- NULL
   labelColsNew <- labelCols[!isInvalid]
   labelStr <- paste(labelColsNew, collapse = ", ")
   message("The following valid label columns were detected:\n", 
           labelStr, ".")
   if (type == "2D") {
     temperatures <- infoTable$Temperature
     if (is.null(temperatures) | length(temperatures) < 2) {
ff68e0d5
       m1 <- paste("Insufficient temperatures (<2)", 
                   "specified in config file.")
       m2 <- paste("Does your configuration table", 
                   "have the correct column names?")
342fc172
       stop(m1, "\n", m2)
     }
     else if (length(which(!infoTable$RefCol %in% labelColsNew)) != 
              0) {
5a6cf13e
       stop(paste("Labels in reference column not found", 
                  "in any of the label columns."))
342fc172
     }
     hasCompoundCol <- any(allCols == "Compound")
     if (!hasCompoundCol) {
ff68e0d5
       m <- paste("Config table of a 2D-TPP experiment", 
                  "needs a 'Compound' column.")
342fc172
       stop(m, "\n")
     }
     else {
ff68e0d5
       infoTable <- infoTable %>% 
         mutate(Compound = 
                  gsub("([^[:alnum:]])", "_", Compound))
342fc172
     }
     out <- infoTable
   }
   else {
     temperatures <- subset(infoTable, select = labelColsNew)
02d6d7ba
     tempMatrix <- .TPP_importCheckTemperatures(temp = temperatures)
ff68e0d5
     infoList <- list(
       expNames = as.character(infoTable$Experiment), 
       expCond = infoTable$Condition, files = givenPaths, 
       compStrs = compStrs, labels = labelColsNew, 
       tempMatrix = tempMatrix)
342fc172
     out <- infoList
   }
   return(out)
 }
 
02d6d7ba
 #' Import 2D-TPP dataset main function
 #' 
 #' @param configTable character string of a file path to a config table
 #' @param data possible list of datasets from different MS runs 
 #' corresponding to a 2D-TPP dataset, circumvents loading datasets 
 #' referencend in config table, default is NULL
 #' @param idVar character string indicating which data column provides the 
 #' unique identifiers for each protein.
 #' @param intensityStr character string indicating which columns contain 
 #' raw intensities measurements
 #' @param fcStr character string indicating which columns contain the actual 
 #' fold change values. Those column names containing the suffix \code{fcStr} 
 #' will be regarded as containing fold change values.
 #' @param naStrs character vector indicating missing values in the data table. 
 #' When reading data from file, this value will be passed on to the argument 
 #' \code{na.strings} in function \code{read.delim}.
 #' @param addCol character string indicating additional column to import
 #' @param nonZeroCols column like default qssm that should be imported and
 #' requested to be non-zero in analyzed data
 #' @param qualColName character string indicating which column can be used for 
 #' additional quality criteria when deciding between different non-unique 
 #' protein identifiers.
 #' 
 #' @return list of data frames containing different
 #' datasets
 #' 
 #' @examples 
 #' data("config_tab")
 #' data("raw_dat_list")
 #' dataList <- import2dMain(configTable = config_tab,
 #'                          data = raw_dat_list,
 #'                          idVar = "protein_id",
 #'                          fcStr = "rel_fc_",
 #'                          addCol = "gene_name",
 #'                          naStrs = NA,
 #'                          intensityStr = "signal_sum_",
 #'                          nonZeroCols = "qusm",
 #'                          qualColName = "qupm")
 #' @export
29b04a66
 import2dMain <- function(configTable, data, idVar, fcStr,
                          addCol, naStrs, intensityStr,
                          qualColName, nonZeroCols){
   
   # internal import main function, adapted from TPP package
   files <- configTable$Path
   if (!is.null(files)) {
     if (any(files == "")) {
       files <- NULL
     }
   }
02d6d7ba
   Experiment <- Compound <- Temperature <- RefCol <- NULL
29b04a66
   expNames <- configTable$Experiment
02d6d7ba
   argList <- .TPP_importFct_CheckDataFormat(dataframes = data, 
342fc172
                                            files = files,
                                            expNames = expNames)
29b04a66
   data <- argList[["dataframes"]]
   files <- argList[["files"]]
   if (!is.null(files)) {
     files2 <- files[!duplicated(names(files))]
02d6d7ba
     data <- .TPP_importFct_readFiles(files = files2, 
342fc172
                                     naStrs = naStrs)
29b04a66
   }
ec68edc5
   iVec <- seq_len(nrow(configTable))
29b04a66
   dataList <- lapply(iVec, function(iTmp) {
     rowTmp <- configTable[iTmp, ]
     expTmp <- rowTmp$Experiment
     message("Importing 2D-TPP dataset: ", expTmp)
     tTmp <- rowTmp$Temperature
     dataTmp <- data[[expTmp]]
     noFCCols <- c("Compound", "Experiment", "Temperature",
                   "RefCol", "Path", "Condition")
     allCols <- colnames(rowTmp)
     labelCols <- setdiff(allCols, noFCCols)
02d6d7ba
     labelValues <- suppressWarnings(rowTmp[, labelCols] %>%
29b04a66
                                       as.numeric)
     labelColsNum <- labelCols[!is.na(labelValues)]
     signalCols <- paste(intensityStr, labelColsNum, sep = "")
     relevant.cols <- c(idVar, qualColName, nonZeroCols, addCol,
                        signalCols) %>% unique
     if (!is.null(fcStr)) {
       fcCols <- paste(fcStr, labelColsNum, sep = "")
       relevant.cols <- c(relevant.cols, fcCols)
       dataCols <- fcCols
     }
     else {
       dataCols <- signalCols
     }
     if (!all(relevant.cols %in% colnames(dataTmp))) {
       notFound <- paste(setdiff(relevant.cols, colnames(dataTmp)),
                         collapse = "', '")
       stop("The following columns could not be found: '",
342fc172
            notFound, paste("'. Please check the suffices and the", 
                            "additional column names you have specified."))
29b04a66
     }
02d6d7ba
     dataFiltered <- .TPP_importFct_removeDuplicates(
342fc172
       inDF = dataTmp,refColName = idVar, 
       nonNAColNames = dataCols, 
       qualColName = qualColName[1])
29b04a66
     idsTmp <- as.character(dataFiltered[, idVar])
     idsAnnotated <- paste(expTmp, tTmp, idsTmp, sep = "_")
     dataFinal <- dataFiltered %>% subset(select = relevant.cols) %>%
       mutate(temperature = tTmp, experiment = expTmp, unique_ID = idsAnnotated)
     return(dataFinal)
   })
d96a18d0
   newNames <- vapply(seq(nrow(configTable)), function(iTmp) {
29b04a66
     rowTmp <- configTable[iTmp, ]
     tTmp <- rowTmp$Temperature
     expTmp <- rowTmp$Experiment
     newName <- paste(expTmp, tTmp, sep = "_")
     return(newName)
d96a18d0
   }, "")
29b04a66
   names(dataList) <- newNames
02d6d7ba
   out <- .TPP_importFct_rmZeroSias(data.list = dataList,
                                    intensityStr = intensityStr)
29b04a66
   return(out)
 }
 
02d6d7ba
 #' Tranform configuration table from wide to long
 #' 
 #' @param configWide data frame containing a config table
 #' @return data frame containing config table in long format 
 #' 
342fc172
 #' @importFrom tidyr gather
02d6d7ba
 #' @examples 
 #' data("config_tab")
 #' configWide2Long(configWide = config_tab)
 #' 
 #' @export
29b04a66
 configWide2Long <- function(configWide){
02d6d7ba
  Path <- label <- conc <- Compound <- Experiment <- 
d96a18d0
     Temperature <- RefCol <- NULL
   
29b04a66
   if(any(grepl("Path", colnames(configWide)))){
     configLong <- configWide %>%
       dplyr::select(-Path) %>%
d96a18d0
       gather(label, conc, -Compound, 
              -Experiment, -Temperature, -RefCol) %>%
29b04a66
       filter(conc != "-")
   }else{
     configLong <- configWide %>%
d96a18d0
       gather(label, conc, -Compound, 
              -Experiment, -Temperature, -RefCol) %>%
29b04a66
       filter(conc != "-")
   }
 }
 
02d6d7ba
 #' Annotate imported data list using a config table
 #' @param dataList list of datasets from different MS runs 
 #' corresponding to a 2D-TPP dataset
 #' @param geneNameVar character string of the column name that describes
 #' the gene name of a given protein in the raw data files
 #' @param configLong long formatted data frame of a corresponding
 #' config table
 #' @param intensityStr character string indicating which columns contain 
 #' raw intensities measurements
 #' @param fcStr character string indicating which columns contain the actual 
 #' fold change values. Those column names containing the suffix \code{fcStr} 
 #' will be regarded as containing fold change values.
 #' 
 #' @return data frame containing all data annotated
 #' by information supplied in the config table
 #'   
fa7d9b05
 #' @importFrom tidyr spread
02d6d7ba
 #' 
 #' @examples 
 #' data("config_tab")
 #' data("raw_dat_list")
 #' dataList <- import2dMain(configTable = config_tab,
 #'                          data = raw_dat_list,
 #'                          idVar = "protein_id",
 #'                          fcStr = "rel_fc_",
 #'                          addCol = "gene_name",
 #'                          naStrs = NA,
 #'                          intensityStr = "signal_sum_",
 #'                          nonZeroCols = "qusm",
 #'                          qualColName = "qupm")
 #' configLong <- configWide2Long(configWide = config_tab)
 #' annotateDataList(dataList = dataList,
 #'                  geneNameVar = "gene_name",
 #'                  configLong = configLong,
 #'                  intensityStr = "signal_sum_",
 #'                  fcStr = "rel_fc_")
 #' @export
29b04a66
 annotateDataList <- function(dataList, geneNameVar, configLong,
                              intensityStr, fcStr){
fa7d9b05
   channel <- signal <- Temperature <- RefCol <- label <- 
d96a18d0
     conc <- unique_ID <- spread_var <- NULL
fa7d9b05
   
29b04a66
   combinedTab <- bind_rows(lapply(dataList, function(dat){
71150341
     datLong <- dat %>% as_tibble() %>%
29b04a66
       gather(channel, signal, matches(intensityStr), matches(fcStr)) %>%
       mutate(label = gsub(fcStr, "", gsub(intensityStr, "", channel))) %>%
ff68e0d5
       left_join(configLong %>% 
                   dplyr::select(Temperature, RefCol, label, conc),
29b04a66
                 by = c("temperature" = "Temperature", "label")) %>%
ff68e0d5
       mutate(spread_var = 
                ifelse(grepl(fcStr, channel), "rel_value", "raw_value")) %>%
29b04a66
       dplyr::select(-channel, -unique_ID) %>%
fa7d9b05
       spread(spread_var, signal)
29b04a66
   }))
   return(combinedTab)
 }
 
02d6d7ba
 #' Filter out contaminants
 #' 
 #' @param dataLong long format data frame of imported dataset
 #' 
 #' @return data frame containing full dataset filtered to 
 #' contain no contaminants
 #' 
 #' @examples 
 #' data("simulated_cell_extract_df")
 #' filterOutContaminants(simulated_cell_extract_df)
 #' 
 #' @export
29b04a66
 filterOutContaminants <- function(dataLong){
   # internal function to filter out contaminants
d96a18d0
   representative <- NULL
29b04a66
   filter(dataLong, !grepl("##", representative))
 }
 
02d6d7ba
 .checkRatioRef <- function(dataLong, idVar, concFactor = 1e6){
ff68e0d5
   # internal function to check that protein 
   # fold changes are computed
29b04a66
   # relative to the correct TMT channel
d96a18d0
   label <- RefCol <- rel_value <- raw_value <- conc <- NULL
   
ff68e0d5
   if(!all(filter(dataLong, label == RefCol)$rel_value == 1, 
           na.rm = TRUE)){
29b04a66
     message("Recomputing ratios!")
     dataOut <- dataLong %>%
5a6cf13e
       group_by(.dots = c(idVar, "temperature")) %>%
29b04a66
       mutate(rel_value = rel_value/rel_value[label == RefCol]) %>%
       ungroup %>%
       filter(!is.na(raw_value)) %>%
       mutate(conc = as.numeric(conc)) %>%
       mutate(log_conc = log10(conc/concFactor))
     
     return(dataOut)
     
   }else{
     message("Ratios were correctly computed!")
     return(dataLong %>%
              filter(!is.na(raw_value)) %>%
              mutate(conc = as.numeric(conc)) %>%
              mutate(log_conc = log10(conc/concFactor)))
   }
 }
 
fa7d9b05
 #' @importFrom stats median
02d6d7ba
 .medianNormalizeRatios <- function(dataLong){
ff68e0d5
   # internal function to perform median normalization 
   # of ratios
18be4832
   rel_value <- temperature <- conc <- 
     raw_rel_value <- NULL
   
29b04a66
   dataOut <- dataLong %>%
     rename(raw_rel_value = rel_value) %>%
     group_by(temperature, conc) %>%
fa7d9b05
     mutate(rel_value = raw_rel_value / 
              median(raw_rel_value, na.rm = TRUE)) %>%
29b04a66
     ungroup()
   
   return(dataOut)
 }
 
02d6d7ba
 #' Rename columns of imported data frame
 #' 
 #' @param dataLong long format data frame of imported dataset
 #' @param idVar character string indicating which data column provides the 
 #' unique identifiers for each protein.
 #' @param geneNameVar character string of the column name that describes
 #' the gene name of a given protein in the raw data files
 #' 
 #' @return data frame containing imported data with renamed
 #' columns
 #' 
 #' @examples 
 #' data("config_tab")
 #' data("raw_dat_list")
 #' 
 #' dataList <- import2dMain(configTable = config_tab,
 #'                          data = raw_dat_list,
 #'                          idVar = "protein_id",
 #'                          fcStr = "rel_fc_",
 #'                          addCol = "gene_name",
 #'                          naStrs = NA,
 #'                          intensityStr = "signal_sum_",
 #'                          nonZeroCols = "qusm",
 #'                          qualColName = "qupm")
 #' configLong <- configWide2Long(configWide = config_tab)
 #' annoDat <- annotateDataList(dataList = dataList,
 #'                             geneNameVar = "gene_name",
 #'                             configLong = configLong,
 #'                             intensityStr = "signal_sum_",
 #'                             fcStr = "rel_fc_")
 #' renameColumns(annoDat, 
 #'               idVar = "protein_id", 
 #'               geneNameVar = "gene_name")
 #' @export
29b04a66
 renameColumns <- function(dataLong, idVar, geneNameVar){
18be4832
   clustername <- representative <- NULL
   
6bd41829
   dplyr::rename(dataLong, "representative" = idVar, 
                 "clustername" = geneNameVar) %>%
29b04a66
     group_by(clustername) %>%
     mutate(representative =
02d6d7ba
              .paste_rmNA(unique(unlist(strsplit(representative, 
ff68e0d5
                                                split = "\\|"))), 
29b04a66
                         sep = "|")) %>%
     ungroup()
 }
 
 #' Import 2D-TPP dataset using a config table
 #' 
 #' @param configTable character string of a file path to a config table
 #' @param data possible list of datasets from different MS runs 
 #' corresponding to a 2D-TPP dataset, circumvents loading datasets 
 #' referencend in config table, default is NULL
 #' @param idVar character string indicating which data column provides the 
 #'   unique identifiers for each protein.
 #' @param intensityStr character string indicating which columns contain 
 #'   raw intensities measurements
 #' @param fcStr character string indicating which columns contain the actual 
 #'   fold change values. Those column names containing the suffix \code{fcStr} 
 #'   will be regarded as containing fold change values.
 #' @param naStrs character vector indicating missing values in the data table. 
 #'   When reading data from file, this value will be passed on to the argument 
 #'   \code{na.strings} in function \code{read.delim}.
 #' @param qualColName character string indicating which column can be used for 
 #'   additional quality criteria when deciding between different non-unique 
 #'   protein identifiers.
 #' @param medianNormalizeFC perform median normalization (default: TRUE).
 #' @param addCol character string indicating additional column to import
 #' @param filterContaminants boolean variable indicating whether data 
 #' should be filtered to exclude contaminants (default: TRUE).
 #' @param nonZeroCols column like default qssm that should be imported and
 #' requested to be non-zero in analyzed data
 #' @param geneNameVar character string of the column name that describes
 #' the gene name of a given protein in the raw data files
 #' @param concFactor numeric value that indicates how concentrations need to 
 #' be adjusted to yield total unit e.g. default mmol - 1e6
 #' 
2158973b
 #' @return tidy data frame representing a 2D-TPP dataset
 #' 
ec68edc5
 #' @examples 
 #' data("config_tab")
 #' data("raw_dat_list")
ff68e0d5
 #' import_df <- import2dDataset(configTable = config_tab, 
 #'                              data = raw_dat_list,
ec68edc5
 #'                              idVar = "protein_id",
 #'                              intensityStr = "signal_sum_",
 #'                              fcStr = "rel_fc_",
 #'                              nonZeroCols = "qusm",
 #'                              geneNameVar = "gene_name",
 #'                              addCol = NULL,
 #'                              qualColName = "qupm",
 #'                              naStrs = c("NA", "n/d", "NaN"),
 #'                              concFactor = 1e6,
 #'                              medianNormalizeFC = TRUE,
 #'                              filterContaminants = TRUE)
2158973b
 #' 
29b04a66
 #' @export
 import2dDataset <- function(configTable, data,
                             idVar = "representative",
                             intensityStr = "sumionarea_protein_",
                             fcStr = "rel_fc_protein_",
                             nonZeroCols = "qssm",
                             geneNameVar = "clustername",
ec68edc5
                             addCol = NULL,
29b04a66
                             qualColName = "qupm",
                             naStrs = c("NA", "n/d", "NaN"),
                             concFactor = 1e6,
                             medianNormalizeFC = TRUE,
                             filterContaminants = TRUE){
   
ff68e0d5
   configWide <- TPP_importCheckConfigTable(
     infoTable = configTable, type = "2D")
29b04a66
   configLong <- configWide2Long(configWide = configWide)
   
   dataList <- import2dMain(configTable = configWide,
                            data = data,
                            idVar = idVar,
                            fcStr = fcStr,
                            addCol = c(geneNameVar, addCol),
                            naStrs = naStrs,
                            intensityStr = intensityStr,
                            nonZeroCols = nonZeroCols,
                            qualColName = qualColName)
   
   dataLong <- annotateDataList(dataList = dataList,
                                geneNameVar = geneNameVar,
                                configLong = configLong,
                                intensityStr = intensityStr,
                                fcStr = fcStr)
   
02d6d7ba
   dataRatioChecked <- .checkRatioRef(dataLong, idVar = idVar,
                                      concFactor = concFactor)
29b04a66
   
   if(medianNormalizeFC){
     message("Median normalizing fold changes...")
02d6d7ba
     dataNorm <- .medianNormalizeRatios(dataRatioChecked)
29b04a66
   }else{
     dataNorm <- dataRatioChecked
   }
   
   dataOut <- renameColumns(dataNorm,
                            idVar = idVar,
                            geneNameVar = geneNameVar)
   
   if(filterContaminants){
     dataOut <- filterOutContaminants(dataOut)
   }
   
   return(dataOut)
 }