... | ... |
@@ -1,18 +1,18 @@ |
1 | 1 |
|
2 |
-#' Lasso, Ridge and Elasticnet Regularized Generalized Linear Models for Binary Outcomes |
|
2 |
+#' Lasso, Ridge, and Elasticnet Regularized Generalized Linear Models for Binary Outcomes |
|
3 | 3 |
#' |
4 |
-#' @description PomaLasso() is an implementation of the lasso, ridge and elasticnet regression from `glmnet` package for binary outcomes. |
|
4 |
+#' @description `PomaLasso` performs LASSO, Ridge, and Elasticnet regression for feature selection and prediction purposes for binary outcomes. |
|
5 | 5 |
#' |
6 |
-#' @param data A SummarizedExperiment object. |
|
7 |
-#' @param alpha Elasticnet mixing parameter. alpha = 1 is the lasso penalty and alpha = 0 is the ridge penalty. This value must be between 0 and 1. |
|
8 |
-#' @param ntest Numeric indicating the percentage of observations that will be used as test set. Default is NULL (no test set). |
|
9 |
-#' @param nfolds Number of folds for CV (default is 10). Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. Smallest value allowable is nfolds = 3. |
|
10 |
-#' @param lambda A user supplied lambda sequence. Typical usage is to have the program compute its own lambda sequence based on `nlambda` and `lambda.min.ratio`. See `?glmnet::glmnet()`. |
|
11 |
-#' @param labels Logical indicating if feature names should be plotted in coefficient plot or not. Default is FALSE. |
|
6 |
+#' @param data A `SummarizedExperiment` object. |
|
7 |
+#' @param alpha Numeric. Indicates the elasticnet mixing parameter. alpha = 1 is the LASSO penalty and alpha = 0 is the Ridge penalty. This value must be between 0 and 1. |
|
8 |
+#' @param ntest Numeric. Indicates the percentage of observations that will be used as test set. This value must be between 5 and 50. Default is NULL (no test set). |
|
9 |
+#' @param nfolds Numeric. Indicates the number of folds for cross-validation (default is 10). Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. Smallest value allowable is nfolds = 3. |
|
10 |
+#' @param lambda Numeric. Indicates the user supplied lambda sequence. Typical usage is to have the program compute its own lambda sequence based on `nlambda` and `lambda.min.ratio`. See `?glmnet::glmnet()`. |
|
11 |
+#' @param labels Logical. Indicates if feature names should be plotted in coefficient plot or not. Default is FALSE. |
|
12 | 12 |
#' |
13 | 13 |
#' @export |
14 | 14 |
#' |
15 |
-#' @return A list with all results including plots, tables and the resulting prediction model. |
|
15 |
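+#' @return A `list` with the results, including the model coefficients at `lambda.min` (`coefficients`), the coefficients plot (`coefficients_plot`), the cross-validation plot (`cv_plot`), and the fitted model (`model`). If `ntest` is provided, the list also includes the confusion matrix (`confusion_matrix`) and the train and test sets. |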
|
16 | 16 |
#' @references Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010). Regularization Paths for Generalized Linear Models via Coordinate Descent. Journal of Statistical Software, 33(1), 1-22. URL http://www.jstatsoft.org/v33/i01/. |
17 | 17 |
#' @author Pol Castellano-Escuder |
18 | 18 |
#' |
... | ... |
@@ -25,21 +25,18 @@ |
25 | 25 |
#' st000336 %>% |
26 | 26 |
#' PomaImpute() %>% |
27 | 27 |
#' PomaNorm() %>% |
28 |
-#' PomaOutliers() %>% |
|
29 | 28 |
#' PomaLasso() |
30 | 29 |
#' |
31 | 30 |
#' # elasticnet |
32 | 31 |
#' st000336 %>% |
33 | 32 |
#' PomaImpute() %>% |
34 | 33 |
#' PomaNorm() %>% |
35 |
-#' PomaOutliers() %>% |
|
36 | 34 |
#' PomaLasso(alpha = 0.5) |
37 | 35 |
#' |
38 | 36 |
#' # ridge |
39 | 37 |
#' st000336 %>% |
40 | 38 |
#' PomaImpute() %>% |
41 | 39 |
#' PomaNorm() %>% |
42 |
-#' PomaOutliers() %>% |
|
43 | 40 |
#' PomaLasso(alpha = 0) |
44 | 41 |
PomaLasso <- function(data, |
45 | 42 |
alpha = 1, |
... | ... |
@@ -48,44 +45,37 @@ PomaLasso <- function(data, |
48 | 45 |
lambda = NULL, |
49 | 46 |
labels = FALSE){ |
50 | 47 |
|
51 |
- if (missing(data)) { |
|
52 |
- stop("data argument is empty!") |
|
53 |
- } |
|
54 |
- if(!is(data, "SummarizedExperiment")){ |
|
55 |
- stop("data is not a SummarizedExperiment object. \nSee POMA::PomaSummarizedExperiment or SummarizedExperiment::SummarizedExperiment") |
|
48 |
+ if (!is(data, "SummarizedExperiment")){ |
|
49 |
+ stop("data is not a SummarizedExperiment object. \nSee POMA::PomaCreateObject or SummarizedExperiment::SummarizedExperiment") |
|
56 | 50 |
} |
57 | 51 |
if (alpha > 1 | alpha < 0) { |
58 |
- stop("alpha must be a number between 0 and 1...") |
|
52 |
+ stop("alpha must be a number between 0 and 1") |
|
59 | 53 |
} |
60 | 54 |
if(!is.null(ntest)){ |
61 | 55 |
if (ntest > 50 | ntest < 5) { |
62 |
- stop("ntest must be a number between 5 and 50...") |
|
56 |
+ stop("ntest must be a number between 5 and 50 (%)") |
|
63 | 57 |
} |
64 | 58 |
} |
65 |
- if (length(levels(as.factor(SummarizedExperiment::colData(data)[,1]))) > 2) { |
|
66 |
- stop("Your data have more than two groups!") |
|
67 |
- } |
|
68 |
- if (length(levels(as.factor(SummarizedExperiment::colData(data)[,1]))) < 2) { |
|
69 |
- stop("Your data have less than two groups!") |
|
70 |
- } |
|
71 |
- |
|
59 |
+ |
|
72 | 60 |
features <- t(SummarizedExperiment::assay(data)) |
73 |
- response <- as.factor(SummarizedExperiment::colData(data)[,1]) |
|
74 |
- lasso_data <- cbind(response, features) |
|
75 |
- |
|
76 |
- n <- nrow(lasso_data) |
|
61 |
+ group_factor <- as.factor(SummarizedExperiment::colData(data)[,1]) |
|
62 |
+ to_lasso <- cbind(group_factor, features) |
|
63 |
+ |
|
64 |
+ if (length(table(group_factor)[table(group_factor) != 0]) != 2) { |
|
65 |
+ stop("Grouping factor must have exactly 2 levels (first column of the metadata file)") |
|
66 |
+ } |
|
77 | 67 |
|
78 |
- if(!is.null(ntest)){ |
|
68 |
+ if (!is.null(ntest)){ |
|
79 | 69 |
|
80 | 70 |
repeat { |
81 | 71 |
|
82 |
- idx_test <- sample(1:n, (ntest/100)*n, replace = FALSE) |
|
72 |
+ idx_test <- sample(1:nrow(to_lasso), (ntest/100) * nrow(to_lasso), replace = FALSE) |
|
83 | 73 |
|
84 |
- test <- lasso_data[idx_test ,] |
|
74 |
+ test <- to_lasso[idx_test ,] |
|
85 | 75 |
test_x <- test[,-1] |
86 | 76 |
test_y <- test[,1] |
87 | 77 |
|
88 |
- train <- lasso_data[-idx_test ,] |
|
78 |
+ train <- to_lasso[-idx_test ,] |
|
89 | 79 |
train_x <- train[,-1] |
90 | 80 |
train_y <- train[,1] |
91 | 81 |
|
... | ... |
@@ -103,7 +93,7 @@ PomaLasso <- function(data, |
103 | 93 |
|
104 | 94 |
} else { |
105 | 95 |
cv_fit <- glmnet::cv.glmnet(features, |
106 |
- response, |
|
96 |
+ group_factor, |
|
107 | 97 |
family = "binomial", |
108 | 98 |
nfolds = nfolds, |
109 | 99 |
lambda = lambda, |
... | ... |
@@ -120,13 +110,13 @@ PomaLasso <- function(data, |
120 | 110 |
ggplot2::labs(x = "log10(Lambda)", |
121 | 111 |
y = "Estimate") + |
122 | 112 |
ggplot2::geom_vline(xintercept = glance_cv$lambda.min, lty = 2) + |
123 |
- ggplot2::theme_bw() |
|
113 |
+ theme_poma() |
|
124 | 114 |
|
125 | 115 |
tmp_coeffs <- glmnet::coef.glmnet(cv_fit, s = "lambda.min") |
126 | 116 |
final_coef <- data.frame(feature = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = tmp_coeffs@x) %>% |
127 | 117 |
dplyr::as_tibble() |
128 | 118 |
|
129 |
- if(!is.null(ntest)){ |
|
119 |
+ if (!is.null(ntest)){ |
|
130 | 120 |
lasso_pred <- predict(cv_fit, s = cv_fit$lambda.min, newx = data.matrix(test_x), type = "class") |
131 | 121 |
cm <- caret::confusionMatrix(as.factor(lasso_pred), as.factor(test_y)) |
132 | 122 |
} |
... | ... |
@@ -145,14 +135,14 @@ PomaLasso <- function(data, |
145 | 135 |
ggplot2::geom_vline(xintercept = glance_cv$lambda.min, lty = 2) + |
146 | 136 |
ggplot2::theme_bw() + |
147 | 137 |
{if(labels)ggplot2::geom_label(data = tidied_cv2_names, ggplot2::aes(label = term))} + |
148 |
- ggplot2::theme(legend.position = "none") + |
|
149 |
- ggplot2::scale_color_viridis_d(option = "plasma", end = 0.8) |
|
138 |
+ theme_poma(legend_position = "none") + |
|
139 |
+ scale_color_poma_d() |
|
150 | 140 |
|
151 | 141 |
if(!is.null(ntest)){ |
152 | 142 |
return(list(coefficients = final_coef, |
153 |
- coefficientPlot = coefficientplot, |
|
154 |
- cvLassoPlot = cvlasso, |
|
155 |
- confusionMatrix = cm, |
|
143 |
+ coefficients_plot = coefficientplot, |
|
144 |
+ cv_plot = cvlasso, |
|
145 |
+ confusion_matrix = cm, |
|
156 | 146 |
train_x = train_x, |
157 | 147 |
train_y = train_y, |
158 | 148 |
test_x = test_x, |
... | ... |
@@ -160,10 +150,9 @@ PomaLasso <- function(data, |
160 | 150 |
model = cv_fit)) |
161 | 151 |
} else { |
162 | 152 |
return(list(coefficients = final_coef, |
163 |
- coefficientPlot = coefficientplot, |
|
164 |
- cvLassoPlot = cvlasso, |
|
153 |
+ coefficients_plot = coefficientplot, |
|
154 |
+ cv_plot = cvlasso, |
|
165 | 155 |
model = cv_fit)) |
166 | 156 |
} |
167 |
- |
|
168 | 157 |
} |
169 | 158 |
|
... | ... |
@@ -75,7 +75,7 @@ PomaUnivariate <- function(data, |
75 | 75 |
group_factor <- SummarizedExperiment::colData(data)[,1] |
76 | 76 |
to_univariate <- t(SummarizedExperiment::assay(data)) |
77 | 77 |
|
78 |
- # group mean and sd |
|
78 |
+ # group mean and SD |
|
79 | 79 |
group_means <- to_univariate %>% |
80 | 80 |
as.data.frame() %>% |
81 | 81 |
dplyr::mutate(group = group_factor) %>% |
... | ... |
@@ -16,7 +16,7 @@ output: github_document |
16 | 16 |
| _BioC_ branch | Status | Version | Dependencies | Rank | |
17 | 17 |
|- |- |- |- |- | |
18 | 18 |
| [Release](http://bioconductor.org/packages/release/bioc/html/POMA.html) | [](https://bioconductor.org/checkResults/release/bioc-LATEST/POMA/) | [](https://www.bioconductor.org/packages/POMA) | [](http://bioconductor.org/packages/release/bioc/html/POMA.html#since) | [](https://bioconductor.org/packages/stats/bioc/POMA) | |
19 |
-| [Devel](http://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](https://bioconductor.org/checkResults/devel/bioc-LATEST/POMA/) | [](https://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](http://bioconductor.org/packages/devel/bioc/html/POMA.html#since) | [](https://bioconductor.org/packages/stats/bioc/POMA) | |
|
19 |
+| [Devel](http://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](https://bioconductor.org/checkResults/devel/bioc-LATEST/POMA/) | [](https://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](http://bioconductor.org/packages/devel/bioc/html/POMA.html#since) | [](https://bioconductor.org/packages/stats/bioc/POMA) | |
|
20 | 20 |
|
21 | 21 |
<!-- badges: end --> |
22 | 22 |
|
... | ... |
@@ -18,7 +18,7 @@ v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/li |
18 | 18 |
| *BioC* branch | Status | Version | Dependencies | Rank | |
19 | 19 |
|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------| |
20 | 20 |
| [Release](http://bioconductor.org/packages/release/bioc/html/POMA.html) | [](https://bioconductor.org/checkResults/release/bioc-LATEST/POMA/) | [](https://www.bioconductor.org/packages/POMA) | [](http://bioconductor.org/packages/release/bioc/html/POMA.html#since) | [](https://bioconductor.org/packages/stats/bioc/POMA) | |
21 |
-| [Devel](http://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](https://bioconductor.org/checkResults/devel/bioc-LATEST/POMA/) | [](https://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](http://bioconductor.org/packages/devel/bioc/html/POMA.html#since) | [](https://bioconductor.org/packages/stats/bioc/POMA) | |
|
21 |
+| [Devel](http://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](https://bioconductor.org/checkResults/devel/bioc-LATEST/POMA/) | [](https://bioconductor.org/packages/devel/bioc/html/POMA.html) | [](http://bioconductor.org/packages/devel/bioc/html/POMA.html#since) | [](https://bioconductor.org/packages/stats/bioc/POMA) | |
|
22 | 22 |
|
23 | 23 |
<!-- badges: end --> |
24 | 24 |
|
... | ... |
@@ -2,7 +2,7 @@ |
2 | 2 |
% Please edit documentation in R/PomaLasso.R |
3 | 3 |
\name{PomaLasso} |
4 | 4 |
\alias{PomaLasso} |
5 |
-\title{Lasso, Ridge and Elasticnet Regularized Generalized Linear Models for Binary Outcomes} |
|
5 |
+\title{Lasso, Ridge, and Elasticnet Regularized Generalized Linear Models for Binary Outcomes} |
|
6 | 6 |
\usage{ |
7 | 7 |
PomaLasso( |
8 | 8 |
data, |
... | ... |
@@ -14,23 +14,23 @@ PomaLasso( |
14 | 14 |
) |
15 | 15 |
} |
16 | 16 |
\arguments{ |
17 |
-\item{data}{A SummarizedExperiment object.} |
|
17 |
+\item{data}{A \code{SummarizedExperiment} object.} |
|
18 | 18 |
|
19 |
-\item{alpha}{Elasticnet mixing parameter. alpha = 1 is the lasso penalty and alpha = 0 is the ridge penalty. This value must be between 0 and 1.} |
|
19 |
+\item{alpha}{Numeric. Indicates the elasticnet mixing parameter. alpha = 1 is the LASSO penalty and alpha = 0 is the Ridge penalty. This value must be between 0 and 1.} |
|
20 | 20 |
|
21 |
-\item{ntest}{Numeric indicating the percentage of observations that will be used as test set. Default is NULL (no test set).} |
|
21 |
+\item{ntest}{Numeric. Indicates the percentage of observations that will be used as test set. This value must be between 5 and 50. Default is NULL (no test set).} |
|
22 | 22 |
|
23 |
-\item{nfolds}{Number of folds for CV (default is 10). Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. Smallest value allowable is nfolds = 3.} |
|
23 |
+\item{nfolds}{Numeric. Indicates the number of folds for cross-validation (default is 10). Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. Smallest value allowable is nfolds = 3.} |
|
24 | 24 |
|
25 |
-\item{lambda}{A user supplied lambda sequence. Typical usage is to have the program compute its own lambda sequence based on \code{nlambda} and \code{lambda.min.ratio}. See \code{?glmnet::glmnet()}.} |
|
25 |
+\item{lambda}{Numeric. Indicates the user supplied lambda sequence. Typical usage is to have the program compute its own lambda sequence based on \code{nlambda} and \code{lambda.min.ratio}. See \code{?glmnet::glmnet()}.} |
|
26 | 26 |
|
27 |
-\item{labels}{Logical indicating if feature names should be plotted in coefficient plot or not. Default is FALSE.} |
|
27 |
+\item{labels}{Logical. Indicates if feature names should be plotted in coefficient plot or not. Default is FALSE.} |
|
28 | 28 |
} |
29 | 29 |
\value{ |
30 |
-A list with all results including plots, tables and the resulting prediction model. |
|
30 |
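+A \code{list} with the results, including the model coefficients at \code{lambda.min} (\code{coefficients}), the coefficients plot (\code{coefficients_plot}), the cross-validation plot (\code{cv_plot}), and the fitted model (\code{model}). If \code{ntest} is provided, the list also includes the confusion matrix (\code{confusion_matrix}) and the train and test sets. |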
|
31 | 31 |
} |
32 | 32 |
\description{ |
33 |
-PomaLasso() is an implementation of the lasso, ridge and elasticnet regression from \code{glmnet} package for binary outcomes. |
|
33 |
+\code{PomaLasso} performs LASSO, Ridge, and Elasticnet regression for feature selection and prediction purposes for binary outcomes. |
|
34 | 34 |
} |
35 | 35 |
\examples{ |
36 | 36 |
data("st000336") |
... | ... |
@@ -39,21 +39,18 @@ data("st000336") |
39 | 39 |
st000336 \%>\% |
40 | 40 |
PomaImpute() \%>\% |
41 | 41 |
PomaNorm() \%>\% |
42 |
- PomaOutliers() \%>\% |
|
43 | 42 |
PomaLasso() |
44 | 43 |
|
45 | 44 |
# elasticnet |
46 | 45 |
st000336 \%>\% |
47 | 46 |
PomaImpute() \%>\% |
48 | 47 |
PomaNorm() \%>\% |
49 |
- PomaOutliers() \%>\% |
|
50 | 48 |
PomaLasso(alpha = 0.5) |
51 | 49 |
|
52 | 50 |
# ridge |
53 | 51 |
st000336 \%>\% |
54 | 52 |
PomaImpute() \%>\% |
55 | 53 |
PomaNorm() \%>\% |
56 |
- PomaOutliers() \%>\% |
|
57 | 54 |
PomaLasso(alpha = 0) |
58 | 55 |
} |
59 | 56 |
\references{ |
... | ... |
@@ -13,7 +13,7 @@ PomaOddsRatio( |
13 | 13 |
) |
14 | 14 |
} |
15 | 15 |
\arguments{ |
16 |
-\item{data}{A SummarizedExperiment object.} |
|
16 |
+\item{data}{A \code{SummarizedExperiment} object.} |
|
17 | 17 |
|
18 | 18 |
\item{feature_name}{A vector with the name/s of feature/s that will be used to fit the model. If it's NULL (default), all variables will be included in the model.} |
19 | 19 |
|