#' Train the model needed to impute a GReX for each gene 
#'
#' @param exprMatrix A SummarizedExperiment object containing expression data
#' @param assay A string with the name of the object in
#' SummarizedExperiment::assays(exprMatrix) that contains expression values 
#' @param tbaPaths A vector of strings, which are the paths to
#' MultiAssayExperiment RDS files containing the tba values
#' @param regionAssoc A data.frame with the association between regulatory
#' regions and expressed genes and with colnames = c("REGULATORY_REGION",
#' "EXPRESSED_REGION")
#' @param cov Optional. A data.frame with covariates values for the population 
#' structure where the columns are the PCs and the rows are the individual IIDs
#' Default is NULL
#' @param varExplained An integer between 0 and 100; varExplained=80 means that
#' the principal components selected to fit the models must explain at least
#' 80 percent of variation of TBA values; default is 80
#' @param scale A logical; if scale=FALSE the TBA values will be only centered,
#' not scaled before performing PCA; default is TRUE
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#' @param kfold An integer. The k definition of k-fold cross-validation on the
#' training dataset. Default is 1. This argument controls the behavior of the
#' function in the following way:
#'      
#' \itemize{
#'  \item kfold<2: train the models on the whole dataset, not performing the
#'   cross-validation
#'  \item kfold>=2: perform the k-fold cross-validation
#' }
#'
#' @return The output depends on the parameter kfold
#' 
#' If kfold<2: a list containing three objects: pca, bs, regionsCount
#' \itemize{
#'  \item pca: A list containing lists named as the
#'  MultiAssayExperiment::experiments() found in the MultiAssayExperiment
#'  objects listed in the param tbaPaths. Each of these lists contain two
#'  objects:
#'  \itemize{
#'      \item eigenvectors: A matrix containing eigenvectors for those principal
#'      components of the TBA selected according to the param varExplained
#'      \item pcs: A matrix containing the principal components values of the 
#'      TBA selected according to the param varExplained
#'      \item eigenvalues: A vector containing eigenvalues for those principal
#'      components of the TBA selected according to the param varExplained
#'  }
#'  \item bs: A list containing lists named as the EXPRESSED_REGIONS found in
#'  the param regionAssoc that have a correspondent rowname in the expression
#'  values stored SummarizedExperiment::assays(exprMatrix)$assay.
#'  Each of the lists in bs contains three objects:
#'  \itemize{
#'      \item coefficients: The coefficients of the principal components used in the
#'      model, completely similar to the "coefficients" from the results of lm()
#'      \item p.val: The uncorrected anova pvalue of the model
#'      \item r.sq: The coefficient of determination between the real total expression
#'      values and the imputed GReX, retrived from summary(model)$r.squared
#'      \item corrected.p.val: The p-value of the model, corrected for multiple
#'      testing with benjamini-hochberg procedure
#'  }
#'  \item regionsCount: An integer, that is the number of genomic regions taken into
#'  account during the training phase
#' }
#'
#' If kfold>=2: a list containing k-fold objects, named from 1 to kfold and
#' corresponding to the different cross-validations [i]; each one of these
#' objects is a list containing lists named as the expressed gene IDs [y] (i.e.
#' the rownames() of the object in SummarizedExperiment::assays(exprMatrix)
#' containing the expression values), for which a GReX could be imputed. Each of
#' these inner lists contain two objects:
#' \itemize{     
#'  \item rho: the pearson's correlation coefficient (R) between the real
#'  expression values and the imputed GReX for the cross-validation i on
#'  the expressed gene y, computed with cor()
#'  \item rho.sq: the coefficient of determination (R^2) between the real
#'  expression values and the imputed GReX for the cross-validation i on
#'  the expressed gene y, computed as pearson^2
#'  \item cor.test.p.val: the p-value of the cor.test() between the real expression values
#'  and the imputed GReX for the cross-validation i on the expressed gene y
#'  \item model.p.val: The uncorrected anova pvalue of the model
#'  \item model.corrected.p.val: The p-value of the model, corrected for
#'  multiple testing with benjamini-hochberg procedure
#'  \item model.r.sq: the model's coefficient of determination (R^2) on the
#'  training data
#' }
#' @import MultiAssayExperiment SummarizedExperiment BiocParallel crayon
#' @importFrom stats anova cor cor.test lm p.adjust pf prcomp setNames var
#' @export
#'
#' @examples
#' if(interactive()) {
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#'
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#'
#' assay <- "values"
#'
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE, kfold=3)
#' }
affiXcanTrain <- function(exprMatrix, assay, tbaPaths, regionAssoc, cov=NULL,
    varExplained=80, scale=TRUE, BPPARAM=bpparam(), kfold=1) {
    regionsCount <- overlookRegions(tbaPaths)
    sampleNames <- colnames(exprMatrix)
    nSamples <- length(sampleNames)
    sampGroups <- subsetKFold(kfold, nSamples)
    for (i in seq(length(sampGroups))) {
        sampGroups[[i]] <- colnames(exprMatrix)[sampGroups[[i]]]
    }
    trainingOutput <- vector('list', length=kfold)
    
    for (i in seq(kfold)) {
        if (kfold > 1) {
            cat(green(bold("\nAffiXcan")),": ",cyan("Performing Training With",
                kfold, "Fold Cross-Validation (",i,"/",kfold,")\n"), sep="")
            testingSamples <- sampGroups[[i]]
            trainingSamples <- sampleNames[!sampleNames %in% testingSamples]
        }
        else {
            cat(green(bold("\nAffiXcan")),": ", cyan("Training The Models\n"),
                sep="")
            trainingSamples <- sampleNames
        }

        cat(bold(cyan("\t-->")),
            green("Performing Principal Components Analysis\n"))
        pca <- affiXcanPca(tbaPaths, varExplained, scale, regionsCount,
                           BPPARAM, trainingSamples)
        cat(bold(cyan("\t-->")), green("Training Linear Models\n"))
        bs <- affiXcanBs(exprMatrix, assay, regionAssoc, pca, cov, BPPARAM,
                         trainingSamples)
        affiXcanTraining <- list(pca=pca, bs=bs, regionsCount=regionsCount)

        if (kfold > 1) {
            cat(bold(cyan("\t-->")),
                green("Computing Principal Components On Validation Set\n"))
            pcs <- affiXcanPcs(tbaPaths, affiXcanTraining, scale, BPPARAM,
                               testingSamples)
            cat(bold(cyan("\t-->")), green("Imputing GReX Values\n"))
            imputedExpr <- affiXcanGReX(affiXcanTraining, pcs, BPPARAM)
            cat(bold(cyan("\t-->")), green("Computing Cross-Validated R^2\n"))
            correlation <- computeRSquared(exprMatrix, imputedExpr, assay,
                                         testingSamples, BPPARAM)
            for (gene in names(correlation)) {
                correlation[[gene]]$model.p.val <- bs[[gene]]$p.val
                correlation[[gene]]$model.corrected.p.val <-
                    bs[[gene]]$corrected.p.val
                correlation[[gene]]$model.r.sq <- bs[[gene]]$r.sq
            }
            trainingOutput[[i]] <- correlation
        }
        else {
            trainingOutput <- affiXcanTraining
        }
        cat(bold(cyan("\tDone\n")))
    }
    
    return(trainingOutput)
}

#' Split the numbers from 1 to n in k equally-sized lists
#'
#' @param k An integer. The number of groups in which the first n natural
#' numbers are to be splitted
#' @param n An integer. Defines the interval 1..n 
#'
#' @return A list of lists; the first natural n numbers equally distributed 
#' across k lists
#'
#' @export
#'
#' @examples
#' sampGroups <- subsetKFold(k=5, n=23)
subsetKFold <- function(k, n) {
    sampGroups <- as.vector(seq(n))
    sampGroups <- split(sampGroups, seq_along(sampGroups)%%k + 1)
    return(sampGroups)
}

#' Perform a PCA on each experiment found in MultiAssayExperiment objects
#'
#' @param tbaPaths A vector of strings, which are the paths to
#' MultiAssayExperiment RDS files containing the tba values
#' @param varExplained An integer between 0 and 100; varExplained=80 means that
#' the principal components selected to fit the models must explain at least
#' 80 percent of variation of TBA values; default is 80
#' @param scale A logical; if scale=FALSE the TBA values will be only centered,
#' not scaled before performing PCA; default is TRUE
#' @param regionsCount An integer, that is the summation of length(assays()) of
#' every MultiAssayExperiment RDS object indicated in the param tbaPaths; it is
#' the returning value from overlookRegions()
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#' @param trainingSamples A vector of strings. The identifiers (e.g. row names
#' of MultiAssayExperiment objects from tbaPaths) of the samples that have to
#' be considered in the training phase, and not used for the cross-validation 
#'
#' @return pca: A list containing lists named as the 
#' MultiAssayExperiment::experiments() found in the MultiAssayExperiment objects
#' listed in the param tbaPaths. Each of these lists contain two objects:
#' \itemize{
#'  \item eigenvectors: A matrix containing eigenvectors for those principal
#'  components selected according to the param varExplained
#'  \item pcs: A matrix containing the principal components values selected
#'  according to the param varExplained
#'  \item eigenvalues: A vector containing eigenvalues for those principal
#'  components selected according to the param varExplained
#' }
#' @import MultiAssayExperiment BiocParallel 
#' @export
#'
#' @examples
#' if (interactive()) {
#' data(exprMatrix)
#'
#' tbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#' regionsCount <- overlookRegions(tbaPaths)
#'
#' sampleNames <- colnames(exprMatrix)
#' nSamples <- length(sampleNames)
#' sampGroups <- subsetKFold(k=3, n=nSamples)
#' for (i in seq(length(sampGroups))) {
#'      sampGroups[[i]] <- colnames(exprMatrix)[sampGroups[[i]]]
#' }
#' 
#' testingSamples <- sampGroups[[1]]
#' trainingSamples <- sampleNames[!sampleNames %in% testingSamples]
#'
#' pca <- affiXcanPca(tbaPaths=tbaPaths, varExplained=80, scale=TRUE,
#' regionsCount=regionsCount, trainingSamples=trainingSamples)
#' }
affiXcanPca <- function(tbaPaths, varExplained=80, scale=TRUE, regionsCount,
                        BPPARAM=bpparam(), trainingSamples) {

    pca <- vector("list", regionsCount)
    index <- 0

    for(i in seq(1,length(tbaPaths))) {

        tbaMatrixMAE <- readRDS(tbaPaths[i])
        tbaMatrixMAE <- MultiAssayExperiment::subsetByRow(tbaMatrixMAE,
                                                          trainingSamples)
        tbaMatrix <- MultiAssayExperiment::experiments(tbaMatrixMAE)

        newPca <- BiocParallel::bplapply(X=tbaMatrix, FUN=computePca,
        varExplained, scale, BPPARAM=BPPARAM)

        for(l in seq(1,length(newPca))) {
            pca[index + l] <- newPca[l]
            names(pca)[index + l] <- names(newPca)[l]
        }

        index <- index + length(newPca)
    
    }

    return(pca)

}

#' Perform a PCA on a matrix where columns are variables
#'
#' @param data A matrix containing the TBA values for a certain genomic region;
#' columns are PWMs, rows are individuals IIDs
#' @param varExplained An integer between 0 and 100; varExplained=80 means that
#' the principal components selected to fit the models must explain at least
#' 80 percent of variation of TBA values; default is 80
#' @param scale A logical; if scale=FALSE the TBA values will be only centered,
#' not scaled before performing PCA; default is TRUE
#'
#' @return A list containing two objects:
#' \itemize{ 
#'  \item eigenvectors: A matrix containing eigenvectors for those principal
#'  components selected according to the param varExplained
#'  \item pcs: A matrix containing the principal components values selected
#'  according to the param varExplained
#'  \item eigenvalues: A vector containing eigenvalues for those principal
#'  components selected according to the param varExplained
#' }
#' @export
#'
#' @examples
#' if (interactive()) {
#' data(exprMatrix)
#' sampleNames <- colnames(exprMatrix)
#' nSamples <- length(sampleNames)
#' sampGroups <- subsetKFold(k=3, n=nSamples)
#' for (i in seq(length(sampGroups))) {
#'      sampGroups[[i]] <- colnames(exprMatrix)[sampGroups[[i]]]
#' }
#' testingSamples <- sampGroups[[i]]
#' trainingSamples <- sampleNames[!sampleNames %in% testingSamples]
#'
#' tbaMatrixMAE <- readRDS(system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan"))
#' tbaMatrixMAE <- MultiAssayExperiment::subsetByRow(tbaMatrixMAE,
#'                                                   trainingSamples)
#' tbaMatrix <- MultiAssayExperiment::experiments(tbaMatrixMAE)
#' tba <- tbaMatrix$ENSG00000256377.1
#'
#' pca <- computePca(data=tba, varExplained=80, scale=TRUE)
#' 
#' }
computePca <- function(data, varExplained=80, scale=TRUE) {

    tryCatch (
        {
            notScalablePwm <- vector('character')
            for(pwm in colnames(data)) {
                if(TRUE %in% is.na(data[[pwm]]) || var(data[[pwm]])==0) {        
                    notScalablePwm <- c(notScalablePwm, pwm)
                    data[[pwm]] <- NULL
                }
            }

            dataPrcomp <- prcomp(data, center = TRUE, scale. = scale)
            dataPcs <- dataPrcomp$x
            pcVar <- dataPrcomp$sdev^2
            totVar <- sum(pcVar)
            eigenvalues <- pcVar * 100 / totVar 
            eigenvectors <- dataPrcomp$rotation
            nWantedPcs <- which.max(cumsum(eigenvalues) >= varExplained)
            wantedPcs <- subset(dataPcs, select=colnames(dataPcs)[1:nWantedPcs])
            wantedEigenvectors <- subset(eigenvectors,
                select=colnames(eigenvectors)[1:nWantedPcs])

            for(pwm in notScalablePwm) {
                newLine <- matrix(rep(0, nWantedPcs), nrow=1)
                wantedEigenvectors <- rbind(wantedEigenvectors, newLine)
                rownames(wantedEigenvectors)[nrow(wantedEigenvectors)] <- pwm
            }

            wantedEigenvectors <- as.data.frame(wantedEigenvectors)
            wantedEigenvectors <- 
                wantedEigenvectors[order(rownames(wantedEigenvectors)),]
            wantedEigenvectors <- as.matrix(wantedEigenvectors)

            wantedEigenvalues <- eigenvalues[1:nWantedPcs]
            names(wantedEigenvalues) <- colnames(wantedEigenvectors)

            return(list(eigenvectors=wantedEigenvectors, pcs=wantedPcs,
                        eigenvalues=wantedEigenvalues))
        },
        error=function(e) {
            wantedEigenvectors <- NA
            wantedPcs <- NA
            wantedEigenvalues <- NA
            return(list(eigenvectors=wantedEigenvectors, pcs=wantedPcs,
                        eigenvalues=wantedEigenvalues))
        }
    )
}

#' Fit linear models and compute ANOVA p-values
#'
#' @param exprMatrix A SummarizedExperiment object containing expression data
#' @param assay A string with the name of the object in
#' SummarizedExperiment::assays(exprMatrix) that contains expression values
#' @param regionAssoc A data.frame with the associations between regulatory
#' regions and expressed genes, and with colnames = c("REGULATORY_REGION",
#' "EXPRESSED_REGION")
#' @param pca A list, which is the returningObject$pca from affiXcanPca()
#' @param cov Optional; a data.frame with covariates values for the population
#' structure where the columns are the PCs and the rows are the individual IDs;
#' default is NULL
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#' @param trainingSamples A vector of strings. The identifiers (e.g. row names
#' of MultiAssayExperiment objects from tbaPaths) of the samples that have to
#' be considered in the training phase, and not used for the cross-validation
#'
#' @return A list containing lists named as the EXPRESSED_REGIONS found in the
#' param regionAssoc.
#' Each of these lists contain three objects:
#' \itemize{
#'  \item coefficients: An object containing the coefficients of the principal
#'  components used in the model, completely similar to the "coefficients"
#'  from the results of lm()
#'  \item p.val: The uncorrected anova p-value of the model
#'  \item r.sq: The coefficient of determination between the real total expression
#'  values and the imputed GReX, retrived from summary(model)$r.squared
#'  \item corrected.p.val: The p-value of the model, corrected for multiple
#'  testing with benjamini-hochberg procedure 
#' }
#' @import SummarizedExperiment BiocParallel
#' @export
#'
#' @examples
#' if (interactive()) {
#' data(exprMatrix)
#' data(trainingCovariates)
#' data(regionAssoc)
#'
#' tbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#' regionsCount <- overlookRegions(tbaPaths)
#' assay <- "values"
#'
#' sampleNames <- colnames(exprMatrix)
#' nSamples <- length(sampleNames)
#' sampGroups <- subsetKFold(k=5, n=nSamples)
#' for (i in seq(length(sampGroups))) {
#'      sampGroups[[i]] <- colnames(exprMatrix)[sampGroups[[i]]]
#' }
#' 
#' testingSamples <- sampGroups[[1]]
#' trainingSamples <- sampleNames[!sampleNames %in% testingSamples]
#'
#' pca <- affiXcanPca(tbaPaths=tbaPaths, varExplained=80, scale=TRUE,
#' regionsCount=regionsCount, trainingSamples=trainingSamples)
#'
#' bs <- affiXcanBs(exprMatrix=exprMatrix, assay=assay, regionAssoc=regionAssoc,
#' pca=pca, cov=trainingCovariates, trainingSamples=trainingSamples)
#' }
affiXcanBs <- function(exprMatrix, assay, regionAssoc, pca, cov=NULL,
                       BPPARAM=bpparam(), trainingSamples) {

    if (is.null(cov)==FALSE) {
        cov <- cov[rownames(cov) %in% trainingSamples,]
    }
    
    expr <- SummarizedExperiment::assays(exprMatrix)[[assay]]
    expr <- expr[,colnames(expr) %in% trainingSamples]
    tDfExprMatrix<-t(as.data.frame(expr))

    expressedRegions <- as.vector(unique(regionAssoc$EXPRESSED_REGION))
    assocList <- BiocParallel::bplapply(X=expressedRegions, FUN=assoc2list,
        regionAssoc, BPPARAM=BPPARAM)
    names(assocList) <- expressedRegions

    bs <- BiocParallel::bplapply(X=assocList, FUN=computeBs, pca,
        tDfExprMatrix, cov, BPPARAM=BPPARAM)

    ## Remove ensg for which the model is NA
    for(i in names(bs)) {
        if(is.na(bs[[i]]$r.sq)) {
            bs[[i]] <- NULL
        }
    }

    ## Compute multitest pvalue correction (benjamini-hochberg)
    pvalues <- vector("numeric", length=length(bs))
    
    for(i in seq(1:length(pvalues))) {
        pvalues[[i]] <- bs[[i]]$p.val
    }
    
    multi.test.pvals <- p.adjust(pvalues, method="BH")
    
    for(i in seq(1:length(bs))) {
        bs[[i]]$corrected.p.val <- multi.test.pvals[[i]]
    }
    
    return(bs)
}

#' Reorganize associations table in a list
#'
#' @param gene A string; the name of an expressed gene
#' @param regionAssoc A data.frame with the associations between regulatory
#' regions and expressed genes, and with colnames = c("REGULATORY_REGION",
#' "EXPRESSED_REGION")
#'
#' @return A list of data frames, each of which has the same structure of the
#' param regionAssoc, except that contains the information relative to one
#' expressed gene
#' @export
#'
#' @examples
#' if (interactive()) {
#' data(regionAssoc)
#' expressedRegions <- as.list(as.vector(unique(regionAssoc$EXPRESSED_REGION)))
#' gene <- expressedRegions[[1]]
#' assocList <- assoc2list(gene, regionAssoc)
#' }
assoc2list <- function(gene, regionAssoc) {
    return(regionAssoc[regionAssoc$EXPRESSED_REGION==gene,])
}

#' Fit a linear model to impute a GReX for a certain gene
#'
#' @param assocRegions A data.frame with the associations between regulatory
#' regions and one expressed gene, and with colnames = c("REGULATORY_REGION",
#' "EXPRESSED_REGION")
#' @param pca The returningObject$pca from affiXcanTrain()
#' @param expr A matrix containing the real total expression values, where the
#' columns are genes and the rows are individual IIDs
#' @param covariates Optrional; a data.frame with covariates values for the
#' population structure where the columns are the PCs and the rows are the
#' individual IIDs; default is NULL
#'
#' @return A list containing three objects:
#' \itemize{
#'  \item coefficients: An object containing the coefficients of the principal
#'  components used in the model, completely similar to the "coefficients"
#'  from the results of lm()
#'  \item p.val: The uncorrected anova pvalue of the model
#'  \item r.sq: The coefficient of determination between the real total expression
#'  values and the imputed GReX, retrived from summary(model)$r.squared
#' }
#' @export
#'
#' @examples
#' if (interactive()) {
#' data(exprMatrix)
#' data(trainingCovariates)
#' data(regionAssoc)
#'
#' tbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#' regionsCount <- overlookRegions(tbaPaths)
#' assay <- "values"
#'
#' sampleNames <- colnames(exprMatrix)
#' nSamples <- length(sampleNames)
#' sampGroups <- subsetKFold(k=5, n=nSamples)
#' for (i in seq(length(sampGroups))) {
#'      sampGroups[[i]] <- colnames(exprMatrix)[sampGroups[[i]]]
#' }
#' 
#' testingSamples <- sampGroups[[1]]
#' trainingSamples <- sampleNames[!sampleNames %in% testingSamples]
#'
#' pca <- affiXcanPca(tbaPaths=tbaPaths, varExplained=80, scale=TRUE,
#' regionsCount=regionsCount, trainingSamples=trainingSamples)
#'
#' cov <- trainingCovariates
#' cov <- cov[rownames(cov) %in% trainingSamples,]
#'
#' expr <- SummarizedExperiment::assays(exprMatrix)[[assay]]
#' expr <- expr[,colnames(expr) %in% trainingSamples]
#' expr <- t(as.data.frame(expr))
#'
#' expressedRegions <- as.vector(unique(regionAssoc$EXPRESSED_REGION))
#' assocList <- BiocParallel::bplapply(X=expressedRegions, FUN=assoc2list,
#' regionAssoc)
#' names(assocList) <- expressedRegions
#' assocRegions <- assocList[[1]]
#'
#' bs <- computeBs(assocRegions=assocRegions, pca=pca, expr=expr,
#' covariates=cov)
#' }
computeBs <- function(assocRegions, pca, expr, covariates=NULL) {
    expressedRegion <- as.vector(unique(assocRegions$EXPRESSED_REGION))
    expression <- as.vector(as.data.frame(expr)[[expressedRegion]])
    tryCatch (
        {
            myPcs <- as.data.frame(matrix(nrow = nrow(expr)))
            myPcsColnames <- vector("character")

            for(regulRegion in assocRegions$REGULATORY_REGION) {

                ## Discards the pcs for those regions that are NA in the model
                if(is.null(pca[[regulRegion]]$pcs[[1]]) ||
                    is.na(pca[[regulRegion]]$pcs[[1]])) {

                    next

                } else {

                    nNames <- length(colnames(pca[[regulRegion]]$pcs))
                    regionColnames <- colnames(pca[[regulRegion]]$pcs)
                    regionColnames <- paste(regionColnames, regulRegion,
                                                                    sep="@")
                    myPcsColnames <- c(myPcsColnames, regionColnames)
                    regulRegionPcs <- pca[[regulRegion]]$pcs
                    myPcs <- cbind(myPcs, regulRegionPcs)

                }
            }

            if (is.null(covariates)==FALSE) {
                myPcsColnames <- c(myPcsColnames, names(covariates))
                myPcs <- cbind(myPcs, covariates)
            }

            myPcs <- as.matrix(subset(myPcs, select=-V1))
            colnames(myPcs) = myPcsColnames

            model <- lm(expression~myPcs)
            
            if (is.null(covariates)==FALSE) {
                modelReduced <- lm(expression~as.matrix(covariates))
                a <- anova(model, modelReduced, test="F")
                pval <- a$'Pr(>F)'[2]
            } else {
                f <- summary(model)$fstatistic
                pval <- pf(f[1],f[2],f[3],lower.tail=F)
                attributes(pval) <- NULL
            }

            if (is.null(covariates)==FALSE) {
                betas <- model$coefficients[seq(1,
                    (length(model$coefficients)-length(names(covariates))))]
            } else {
                betas <- model$coefficients
            }

            r.sq <- summary(model)$r.squared
            results <- list(coefficients=betas, p.val=pval, r.sq=r.sq)
            return(results)
        },
        error = function(e) {
            ## TODO: throw warning
            betas <- NA
            pval <- NA
            r.sq <- NA
            results <- list(coefficients=betas, p.val=pval, r.sq=r.sq)
            return(results)
        }
    )
}

#' Count the number of genomic regions on which the TBA was computed
#'
#' @param tbaPaths, A vector of strings, which are the paths to
#' MultiAssayExperiment RDS files containing the tba values
#'
#' @return An integer, that is the summation of length(assays()) of every
#' MultiAssayExperiment RDS object indicated in the param tbaPaths
#' @import MultiAssayExperiment
#' @export
#'
#' @examples
#' if (interactive()) {
#' testingTbaPaths <- system.file("extdata","testing.tba.toydata.rds",
#' package="AffiXcan")
#'
#' regionsCount <- overlookRegions(tbaPaths=testingTbaPaths)
#' }
overlookRegions <- function(tbaPaths) {

    regionsCount <- 0 
    
    for(path in tbaPaths) {
        tba <- readRDS(path)
        assaysNum <- length(assays(tba))
        regionsCount <- regionsCount + assaysNum
    }

    return(regionsCount)
    
}

#' Compute PCs in MultiAssayExperiment objects using eigenvectors given by user
#'
#' @param tbaPaths A vector of strings, which are the paths to
#' MultiAssayExperiment RDS files containing the tba values
#' @param affiXcanTraining The returning object from affiXcanTrain()
#' @param scale A logical; if scale=FALSE the TBA values will be only centered,
#' not scaled before performing PCA
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#' @param testingSamples A vector of strings. The identifiers (e.g. row names
#' of MultiAssayExperiment objects from tbaPaths) of the samples that have not
#' been considered in the training phase, to be used in the cross-validation;
#' default is NULL; if is.null(testingSamples)==TRUE then no filtering is
#' performed
#'
#' @return A list of matrices containing the principal components values of TBA
#' for each region; each object of the list is named after the
#' MultiAssayExperiment object from which it derives
#' @import MultiAssayExperiment BiocParallel
#' @export
#'
#' @examples
#' if (interactive()) {
#' data(exprMatrix)
#' data(trainingCovariates)
#' data(regionAssoc)
#'
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#' testingTbaPaths <- system.file("extdata","testing.tba.toydata.rds",
#' package="AffiXcan")
#'
#' assay <- "values"
#'
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE)
#'
#' pcs <- affiXcanPcs(tbaPaths=testingTbaPaths, affiXcanTraining=training,
#' scale=TRUE)
#' }
affiXcanPcs <- function(tbaPaths, affiXcanTraining, scale, BPPARAM=bpparam(),
                        testingSamples=NULL) {

    pcs <- vector("list", affiXcanTraining$regionsCount)
    index <- 0

    for(i in seq(1,length(tbaPaths))) {

        pca <- affiXcanTraining$pca
        tbaMatrixMAE <- readRDS(tbaPaths[i])
        
        if (is.null(testingSamples)==FALSE) {
            tbaMatrixMAE <- MultiAssayExperiment::subsetByRow(tbaMatrixMAE,
                                                              testingSamples)
        }
        
        tbaMatrix <- MultiAssayExperiment::experiments(tbaMatrixMAE)
        regionsList <- setNames(as.list(names(tbaMatrix)), names(tbaMatrix))
        
        newPcs <- BiocParallel::bplapply(X=regionsList, FUN=computePcs,
            tbaMatrix, scale, pca, BPPARAM=BPPARAM)
        
        for(l in seq(1,length(newPcs))) {
            pcs[index + l] <- newPcs[l]
            names(pcs)[index + l] <- names(newPcs)[l]
        }

        index <- index + length(newPcs)
    
    }

    return(pcs)
}

#' Compute a matrix product between variables and eigenvectors
#'
#' @param region A string, which is the name of the object in the list
#' MultiAssayExperiment::experiments(tbaMatrix) that contains the TBA values of
#' the genomic region of interest (see the param tbaMatrix)
#' @param tbaMatrix A MultiAssayExperiment object containing the TBA values
#' @param scale A logical; if scale=FALSE the TBA values will be only centered,
#' not scaled before performing PCA
#' @param pca The returningObject$pca from affiXcanTrain()
#'
#' @return A data.frame containing the principal components values of the TBA
#' in a certain genomic region, as the result of the matrix product between the
#' TBA values and the eigenvectors
#' @import MultiAssayExperiment
#' @export
#'
#' @examples
#' if (interactive()) {
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#'
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#'
#' assay <- "values"
#'
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE)
#'
#' region <- "ENSG00000256377.1"
#'
#' tbaMatrixMAE <- readRDS(system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan"))
#'
#' pca <- training$pca
#' pcs <- computePcs(region=region, tbaMatrix=tbaMatrixMAE, scale=TRUE, pca=pca)
#' }
computePcs <- function(region, tbaMatrix, scale, pca) {

    myRegion <- region[[1]]
    myTbaMatrix <- tbaMatrix[[myRegion]]

    tryCatch (
        {
            myEigenvectors <- as.matrix(pca[[myRegion]]$eigenvectors)
            dataScaled <- as.matrix(scale(myTbaMatrix, center = TRUE,
                scale = scale))
            myPcs <- as.data.frame(dataScaled %*% myEigenvectors)
            return(pcs=myPcs)
        },
        error = function(e) {
            myPcs <- NA
            return(pcs=myPcs)
        }
    )
}

#' Compute a GReX from variables and their coefficients for a set of genes
#'
#' @param affiXcanTraining The returning object from affiXcanTrain()
#' @param pcs A list, which is the returning object from affiXcanPcs()
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#'
#' @return A SummarizedExperiment object containing the imputed GReX values
#' @import SummarizedExperiment BiocParallel 
#' @export
#'
#' @examples
#' if (interactive()) {
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#'
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#'
#' assay <- "values"
#'
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE)
#'
#' testingTbaPaths <- system.file("extdata","testing.tba.toydata.rds",
#' package="AffiXcan")
#'
#' pcs <- affiXcanPcs(tbaPaths=testingTbaPaths, affiXcanTraining=training,
#' scale=TRUE)
#'
#' exprmatrix <- affiXcanGReX(affiXcanTraining=training, pcs=pcs)
#' }
affiXcanGReX <- function(affiXcanTraining, pcs, BPPARAM=bpparam()) {
    bs <- affiXcanTraining$bs
    
    expr <- BiocParallel::bplapply(X=bs, FUN=computeExpr, pcs, BPPARAM=BPPARAM)

    exprmatrix <- matrix(nrow = nrow(pcs[[1]]), ncol=0)
    rownames(exprmatrix) <- rownames(pcs[[1]])
    
    for(i in seq(1:length(expr))) {
        if(is.na(expr[[i]][[1]])==FALSE) {
            exprMatrixPart <- matrix(expr[[i]], ncol=1)
            colnames(exprMatrixPart)[[1]] <- ls(expr[i])
            exprMatrixPart <- as.matrix(exprMatrixPart, ncol=1)
            exprmatrix <- cbind(exprmatrix, exprMatrixPart)
        }
    }
    
    ## TODO: possibly include a colData given by the user in the SE object
    Summarized.GReX <- SummarizedExperiment(assays=list(GReX=t(exprmatrix)))
    
    return(Summarized.GReX)
}

#' Compute the imputed GReX for a certain gene on a set of individuals
#'
#' @param bs A list containing three objects:
#' \itemize{
#'  \item coefficients: An object containing the coefficients of the principal
#'  components used in the model, completely similar to the "coefficients"
#'  object from the results of lm()
#'  \item p.val: The uncorrected anova pvalue of the model
#'  \item r.sq: The coefficient of determination between the real total expression
#'  values and the imputed GReX, retrived from summary(model)$r.squared
#' }
#' @param pcs A list, which is the returning object of affiXcanPcs()
#'
#' @return A vector of imputed GReX values
#' @export
#'
#' @examples
#' if (interactive()) {
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#'
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#'
#' assay <- "values"
#'
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE)
#'
#' testingTbaPaths <- system.file("extdata","testing.tba.toydata.rds",
#' package="AffiXcan")
#' 
#' pcs <- affiXcanPcs(tbaPaths=testingTbaPaths, affiXcanTraining=training,
#' scale=TRUE)
#'
#' region <- "ENSG00000256377.1"
#' bs <- training$bs[[region]]
#'
#' exprmatrix <- computeExpr(bs=bs, pcs=pcs)
#' }
computeExpr <- function(bs, pcs) {

    tryCatch (
        {   
            ## ANOVA p-value is not used anymore (changes in version 1.3.1)
            #if(is.na(bs$correctedP) == TRUE || bs$correctedP > 0.05) {
            #    exprVector <- NA
            #    return(exprVector)
            #} else {
            
            ## create vector with all regulatory region names associated
            ## with the expressed region
            regulRegions <- names(bs$coefficients[-1])
            for(i in seq(1:length(regulRegions))) {

                name <- strsplit(regulRegions[[i]], split="@", fixed=TRUE)
                regulRegions[[i]] <- name[[1]][[2]]

            }
            regulRegions <- unique(regulRegions)
            ## create data frame with all the predictors 
            pcVals <- as.data.frame(matrix(nrow = nrow(pcs[[regulRegions]]),
                ncol=0))
            rownames(pcVals) <- as.vector(rownames(pcs[[regulRegions]]))
            for(region in regulRegions) {
                pcValsPart <- pcs[[region]]
                colnames(pcValsPart) <- paste(colnames(pcValsPart), "@",
                    region, sep="")
                pcVals <- cbind(pcVals, pcValsPart)
            }
            ## create vector with imputed expression values
            exprVector <- vector(mode="numeric", length=nrow(pcVals))
            intercept <- bs$coefficients[[1]]
            coefficients <- as.vector(bs$coefficients[-1])
            for(j in seq(1, nrow(pcVals))) {
                expr <- sum(coefficients * as.vector(pcVals[j, ]))
                    + intercept
                exprVector[j] <- expr
            }

            return(exprVector) 
        },
        error = function(e) {
            exprVector <- NA
            return(exprVector)
        }
    )
}

#' Compute R and R^2 between rows of two SummarizedExperiment assays
#'
#' @param realExpr A SummarizedExperiment object containing expression data
#' @param imputedExpr The returning object of affiXcanImpute()
#' @param assay A string with the name of the object in
#' SummarizedExperiment::assays(realExpr) that contains expression values
#' @param testingSamples A vector of strings. The identifiers of the samples
#' that have to be considered by the function; default is NULL; if 
#' is.null(testingSamples)==TRUE then no filtering is performed
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#'
#' @return A list of lists; inner lists are named after the rows for which the
#' correlation between realExpr and imputedExpr have been computed; inner
#' lists contain two objects:
#' \itemize{     
#'  \item rho: the pearson's correlation coefficient (R) between the real
#'  expression values and the imputed GReX for the cross-validation i on
#'  the expressed gene y, computed with cor()
#'  \item rho.sq: the coefficient of determination (R^2) between the real
#'  expression values and the imputed GReX for the cross-validation i on
#'  the expressed gene y, computed as pearson^2
#'  \item cor.test.p.val: the p-value of the cor.test() between the real expression values
#'  and the imputed GReX for the cross-validation i on the expressed gene y
#' }
#' @import SummarizedExperiment BiocParallel
#' @export
#'
#' @examples
#' if (interactive()) {
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#' 
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#' 
#' assay <- "values"
#' 
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE)
#' 
#' imputedExpr <- affiXcanImpute(tbaPaths=trainingTbaPaths,
#' affiXcanTraining=training, scale=TRUE)
#' realExpr <- exprMatrix
#' 
#' correlation <- computeRSquared(realExpr, imputedExpr, assay)
#' }
computeRSquared <- function(realExpr, imputedExpr, assay,
                            testingSamples=NULL, BPPARAM=bpparam()) {
    geneNames <- rownames(imputedExpr)
    names(geneNames) <- geneNames

    if (is.null(testingSamples)==FALSE) {
        realExpr <- realExpr[,colnames(exprMatrix) %in% testingSamples]
    }
   
    ## check/correct the consistency of the order of the samples
    if(is.na(table(colnames(imputedExpr)==colnames(realExpr))['FALSE'])==FALSE){
        imputedExpr <- imputedExpr[,colnames(realExpr)]
    }
    
    imputedExpr <- SummarizedExperiment::assays(imputedExpr)$GReX
    realExpr <- SummarizedExperiment::assays(realExpr)[[assay]] 
    correlation <- BiocParallel::bplapply(X=geneNames, FUN=computeCorrelation,
                                       realExpr, imputedExpr, BPPARAM=BPPARAM)
    return(correlation)
}

#' Compute R and R^2 on a particular row of two SummarizedExperiment assays
#'
#' @param geneName A string. The row name in realExpr and imputedExpr objects
#' that identifies the vectors between which R and R^2 have to be computed
#' @param realExpr A SummarizedExperiment object containing expression data
#' @param imputedExpr The returning object of affiXcanImpute()
#'
#' @return A list of two objects:
#' \itemize{     
#'  \item rho: the pearson's correlation coefficient (R) between the real
#'  expression values and the imputed GReX for the cross-validation i on
#'  the expressed gene y, computed with cor()
#'  \item rho.sq: the coefficient of determination (R^2) between the real
#'  expression values and the imputed GReX for the cross-validation i on
#'  the expressed gene y, computed as pearson^2
#'  \item cor.test.p.val: the p-value of the cor.test() between the real expression values
#'  and the imputed GReX for the cross-validation i on the expressed gene y
#' }
#' @export
#'
#' @examples
#' if (interactive()) {
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#' 
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#' 
#' assay <- "values"
#' 
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc, cov=trainingCovariates,
#' varExplained=80, scale=TRUE)
#' 
#' imputedExpr <- affiXcanImpute(tbaPaths=trainingTbaPaths,
#' affiXcanTraining=training, scale=TRUE)
#' realExpr <- exprMatrix
#' 
#' geneName <- "ENSG00000256377.1"
#' imputedExpr <- SummarizedExperiment::assays(imputedExpr)$GReX
#' realExpr <- SummarizedExperiment::assays(realExpr)[[assay]]
#'
#' correlation <- computeCorrelation(geneName, realExpr, imputedExpr) 
#' }
computeCorrelation <- function(geneName, realExpr, imputedExpr) {
    realExpr <- realExpr[rownames(realExpr)==geneName]
    imputedExpr <- imputedExpr[rownames(imputedExpr)==geneName]
    pearson <- cor(imputedExpr, realExpr)
    pearson.sq <- pearson^2
    pvalue <- cor.test(imputedExpr, realExpr)$p.value
    correlation <- list("rho"=pearson, "rho.sq"=pearson.sq, "cor.test.p.val"=pvalue)
    return(correlation)
}

#' Impute a GReX for each gene for which a model was generated
#'
#' @param tbaPaths A vector of strings, which are the paths to
#' MultiAssayExperiment RDS files containing the tba values
#' @param affiXcanTraining The returning object from affiXcanTrain()
#' @param scale A logical; if scale=FALSE the TBA values will be only centered,
#' not scaled before performing PCA; default is TRUE
#' @param BPPARAM A BiocParallelParam object. Default is bpparam(). For
#' details on BiocParallelParam virtual base class see 
#' browseVignettes("BiocParallel")
#'
#' @return A SummarizedExperiment object containing imputed GReX values
#' @import MultiAssayExperiment SummarizedExperiment BiocParallel crayon
#' @export
#'
#' @examples
#' trainingTbaPaths <- system.file("extdata","training.tba.toydata.rds",
#' package="AffiXcan")
#'
#' data(exprMatrix)
#' data(regionAssoc)
#' data(trainingCovariates)
#'
#' assay <- "values"
#'
#' training <- affiXcanTrain(exprMatrix=exprMatrix, assay=assay,
#' tbaPaths=trainingTbaPaths, regionAssoc=regionAssoc,
#' varExplained=80, scale=TRUE)
#'
#' testingTbaPaths <- system.file("extdata","testing.tba.toydata.rds",
#' package="AffiXcan")
#' 
#' exprmatrix <- affiXcanImpute(tbaPaths=testingTbaPaths,
#' affiXcanTraining=training, scale=TRUE)
affiXcanImpute <- function(tbaPaths, affiXcanTraining, scale=TRUE,
                           BPPARAM=bpparam()) {
    regionsCount <- overlookRegions(tbaPaths)
    if(regionsCount!=affiXcanTraining$regionsCount) {
        warning(cat("The amount of genomic regions included in the training
                    phase is different from those found in the TBA matrices:\n
                    TBA in the training set was computed on ",
                    affiXcanTraining$regionsCount, " regions\n TBA provided in
                    tbaPaths refers to ", regionsCount, " regions\n"))
    }
    cat(bold(green("\nAffiXcan")),": ",
        cyan("Imputing Genetically Regulated Expression (GReX)\n"), sep="")
    cat(bold(cyan("\t--> ")), green("Computing Principal Components\n"), sep="")
    pcs <- affiXcanPcs(tbaPaths, affiXcanTraining, scale, BPPARAM)
    cat(bold(cyan("\t--> ")), green("Imputing GReX values\n"), sep="")
    exprmatrix <- affiXcanGReX(affiXcanTraining, pcs, BPPARAM)
    cat(bold(cyan("\tDone\n")))
    return(exprmatrix)
}