Browse code

deleted bioc

Keniajin authored on 20/03/2022 07:38:05
Showing 40 changed files

1 1
deleted file mode 100644
... ...
@@ -1,6 +0,0 @@
1
-{
2
-    "source_window_id": "",
3
-    "Source": "Source",
4
-    "cursorPosition": "8,0",
5
-    "scrollLine": "0"
6
-}
7 0
\ No newline at end of file
8 1
deleted file mode 100644
... ...
@@ -1,4 +0,0 @@
1
-{
2
-    "source_window_id": "",
3
-    "Source": "Source"
4
-}
5 0
\ No newline at end of file
6 1
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "064EA836",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/launch_protGear_interactive.R",
4
-    "project_path": "R/launch_protGear_interactive.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647521182528.0,
10
-    "source_on_save": false,
11
-    "relative_order": 9,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "52,0",
16
-        "scrollLine": "47"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647760747,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647760747,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,64 +0,0 @@
1
-#' launch_protGear_interactive
2
-#'
3
-#' This is Function is to launch the shiny application
4
-#' @return launches the shiny interactive protGear app
5
-#' @import rmarkdown shiny GGally pheatmap  knitr
6
-#' grid styler factoextra FactoMineR   remotes
7
-#' @importFrom flexdashboard renderValueBox valueBoxOutput valueBox
8
-#' @importFrom shinydashboard renderInfoBox
9
-#' @importFrom  dplyr group_rows between first last
10
-#' @importFrom  kableExtra text_spec
11
-#' @export
12
-#' @examples
13
-#'launch_protGear_interactive()
14
-launch_protGear_interactive <- function() {
15
-  appDir <-
16
-    system.file("shiny-examples",
17
-                "protGear_interactive",
18
-                "protGear_interactive.Rmd",
19
-                package = "protGear")
20
-  if (appDir == "") {
21
-    stop("Could not find example directory. Try re-installing `protGear`.",
22
-         call. = FALSE)
23
-  }
24
-
25
-  rmarkdown::run(file = appDir)
26
-}
27
-
28
-#' launch_select
29
-#'
30
-#' This is Function is to launch mutiple shiny applications for protGear
31
-#' @param theApp accepts one of the folders containing the shiny appplication
32
-#' @return launches the app defined under theApp
33
-#' @export
34
-#' @examples
35
-#' launch_select('protGear_interactive')
36
-launch_select <- function(theApp) {
37
-  # locate all the shiny app examples that exist
38
-  validExamples <-
39
-    list.files(system.file("shiny-examples", package = "protGear"))
40
-
41
-  validExamplesMsg <-
42
-    paste0("The available apps in `protGear` are: '",
43
-           paste(validExamples, collapse = "', '"),
44
-           "'")
45
-
46
-  # if an invalid folder is given, throw an error
47
-  if (missing(theApp) || !nzchar(theApp) ||
48
-      !theApp %in% validExamples) {
49
-    stop(
50
-      'Please run `launch_select()` with a valid example app as an argument.\n',
51
-      validExamplesMsg,
52
-      call. = FALSE
53
-    )
54
-  }
55
-
56
-  # find and launch the app
57
-  appDir <-
58
-    system.file("shiny-examples", theApp, package = "protGear")
59
-  if (grepl('protGear_interactive', theApp)) {
60
-    file_rmd <- 'protGear_interactive.Rmd'
61
-  } else
62
-    file_rmd <- 'index.Rmd'
63
-  rmarkdown::run(file = paste0(appDir, "/", file_rmd))
64
-}
65 0
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-{
2
-    "id": "112F37F3",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/saved_functions.R",
4
-    "project_path": null,
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647626140637.0,
10
-    "source_on_save": false,
11
-    "relative_order": 15,
12
-    "properties": {
13
-        "tempName": "Untitled1",
14
-        "source_window_id": "",
15
-        "Source": "Source",
16
-        "cursorPosition": "62,0",
17
-        "scrollLine": "36"
18
-    },
19
-    "folds": "",
20
-    "lastKnownWriteTime": 1640081043,
21
-    "encoding": "UTF-8",
22
-    "collab_server": "",
23
-    "source_window": "",
24
-    "last_content_update": 1640081043,
25
-    "read_only": false,
26
-    "read_only_alternatives": []
27
-}
28 0
\ No newline at end of file
29 1
deleted file mode 100644
... ...
@@ -1,64 +0,0 @@
1
-
2
-#'
3
-#' @title Merge array data with the sample ID
4
-#' @description  merge several slide datasets
5
-#'
6
-#' @param dfs A character vector with the names of the
7
-#' @param filenames A vector or list of files names
8
-#' @param data_files A list with of data files
9
-#' @param totsamples The total number of samples per slide. defined in genepix vars
10
-#' @param blockspersample the number of samples in a block
11
-#' @param sampleID_path  location of the sample id files defined in the genepix vars
12
-#' @param bg A logical character indicating whether the background data is being merged or not
13
-#' @export
14
-#' @import purrr
15
-#' @return  A data frame of slide data merged together, for eith bg data or bg corrected data
16
-merge_datasets <- function(dfs,filenames , data_files ,totsamples, blockspersample ,
17
-                           sampleID_path,bg=FALSE){
18
-  if(bg==TRUE){
19
-    data1_transp <- purrr::map(.x=dfs, .f=transp_bg,
20
-                               totsamples=totsamples,blockspersample=blockspersample,
21
-                               sampleID_path=sampleID_path,data_files=data_files )
22
-  }else{
23
-    data1_transp <- purrr::map(.x=dfs, .f=transp,
24
-                               totsamples=totsamples,blockspersample=blockspersample,
25
-                               sampleID_path=sampleID_path,data_files=data_files )
26
-  }
27
-
28
-
29
-  data1_transp <- set_names(data1_transp, purrr::map(filenames, name_of_files))
30
-  return(data1_transp)
31
-}
32
-#'         \\\_End_function\\\         #
33
-#'
34
-#'
35
-
36
-#' Plot mean vs SD of normalised data
37
-#'
38
-#' @param exprs_normalised A normalised object of class data frame or matrix
39
-#'
40
-#' @return A ggplot of mean vs standard deviation
41
-#' @export
42
-#' @description A genereic function to plot \code{mean} vs \code{sd} after normalisation.
43
-#' @import ggplot2
44
-#' @examples
45
-#' matrix_antigen <- readr::read_csv(system.file("extdata", "matrix_antigen.csv", package="protGear"))
46
-#' normlise_vsn <- matrix_normalise(as.matrix(matrix_antigen),
47
-#' method = "vsn",
48
-#' return_plot = FALSE
49
-#' )
50
-#' plot_mean_sd(normlise_vsn)
51
-plot_mean_sd <- function(exprs_normalised){
52
-
53
-  ggplot(exprs_normalised ,  aes(x=rankMean, y=stdev)) +
54
-    geom_jitter( color="red") + theme_classic() +
55
-    ggtitle(paste0(toupper(unique(exprs_normalised$method)))) +
56
-    xlab("Mean rank normalised") +
57
-    ylab("SD normalised") +
58
-    theme_light()
59
-}
60
-
61
-
62
-
63
-
64
-
65 0
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-{
2
-    "id": "1237BDCB",
3
-    "path": null,
4
-    "project_path": null,
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": true,
9
-    "created": 1643379365839.0,
10
-    "source_on_save": false,
11
-    "relative_order": 16,
12
-    "properties": {
13
-        "tempName": "Untitled1",
14
-        "source_window_id": "",
15
-        "Source": "Source",
16
-        "cursorPosition": "37,0",
17
-        "scrollLine": "0"
18
-    },
19
-    "folds": "",
20
-    "lastKnownWriteTime": 8319874934561204079,
21
-    "encoding": "",
22
-    "collab_server": "",
23
-    "source_window": "",
24
-    "last_content_update": 1643379388859,
25
-    "read_only": false,
26
-    "read_only_alternatives": []
27
-}
28 0
\ No newline at end of file
29 1
deleted file mode 100644
... ...
@@ -1,37 +0,0 @@
1
-
2
-mk_test_trend <- function (x, alternative = c("two.sided", "greater", 
3
-                             "less"), continuity = TRUE) 
4
-{
5
-  if (!is.numeric(x)) {
6
-    stop("'x' must be a numeric vector")
7
-  }
8
-  n <- length(x)
9
-  if (n < 3) {
10
-    stop("'x' must have at least 3 elements")
11
-  }
12
-  na.fail(x)
13
-  alternative <- match.arg(alternative)
14
-  S <- .mkScore(x)
15
-  t <- table(x)
16
-  names(t) <- NULL
17
-  varS <- .varmk(t, n)
18
-  D <- .Dfn(t, n)
19
-  tau <- S/D
20
-  if (continuity) {
21
-    sg <- sign(S)
22
-    z <- sg * (abs(S) - 1)/sqrt(varS)
23
-  }
24
-  else {
25
-    z <- S/sqrt(varS)
26
-  }
27
-  pval <- switch(alternative, two.sided = 2 * min(0.5, pnorm(abs(z), 
28
-                                                             lower.tail = FALSE)), greater = pnorm(z, lower.tail = FALSE), 
29
-                 less = pnorm(z, lower.tail = TRUE))
30
-  DNAME <- deparse(substitute(x))
31
-  ans <- list(data.name = DNAME, p.value = pval, statistic = c(z = z), 
32
-              null.value = c(S = 0), parameter = c(n = n), estimates = c(S = S, 
33
-                                                                         varS = varS, tau = tau), alternative = alternative, 
34
-              method = "Mann-Kendall trend test", pvalg = pval)
35
-  class(ans) <- "htest"
36
-  return(ans)
37
-}
38 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "4AE107A5",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/buffer_spot_functions.R",
4
-    "project_path": "R/buffer_spot_functions.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1644591069314.0,
10
-    "source_on_save": false,
11
-    "relative_order": 10,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "23,9",
16
-        "scrollLine": "7"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647760747,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647760747,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,85 +0,0 @@
1
-#' Extract buffer spots of data
2
-#'
3
-#' @param Data1 An object of the class data frame
4
-#' @param buffer_spot A character string containing the name of the buffer
5
-#' spots.
6
-#' @import dplyr
7
-#' @importFrom dplyr select  filter
8
-#' @description A function to extract the buffer spots data. A buffer spot only
9
-#'  has the solution for
10
-#'  proprietary ingredients for stabilizing protein and minimizing evaporation.
11
-#' @return A data frame of the buffer control spots
12
-#' @export
13
-#'
14
-#' @examples
15
-#' bg_correct_df <- readr::read_csv(system.file("extdata", "Data1_sample.csv",
16
-#' package="protGear"))
17
-#' buffer_spots(Data1 = bg_correct_df)
18
-buffer_spots <- function(Data1 , buffer_spot = "buffer") {
19
-  Data2_buffer <- Data1 %>%
20
-    # within each Name count sampleID. We had grouped this earlier
21
-    # n() - gives number of observations in the current group
22
-    mutate(replicate = 1:n()) %>%
23
-    # Select only relevant variables
24
-    dplyr:::select(sampleID,
25
-                   antigen,
26
-                   replicate,
27
-                   FMedianBG_correct,
28
-                   Block,
29
-                   Column,
30
-                   Row)
31
-
32
-  if (buffer_spot == 'buffer') {
33
-    Data2_buffer <- Data2_buffer %>%
34
-      filter(grepl('^[bB][Uu][Ff][Ff][Ee][Rr]', antigen))
35
-  } else {
36
-    Data2_buffer <- Data2_buffer %>%
37
-      filter(grepl(paste0('^', buffer_spot), tolower(antigen)))
38
-  }
39
-
40
-
41
-  Data2_buffer <- Data2_buffer %>%
42
-    # combine Name and replicate
43
-    unite(antigen, antigen, replicate)
44
-  return(Data2_buffer)
45
-}
46
-
47
-
48
-#'  Plot the buffer values
49
-#'
50
-#' @param buffer_names A character string containing the name of the variable
51
-#' with buffer spots. Default set to 'antigen'.
52
-#' @param buffer_mfi A character string containing the name of the variable with
53
-#' MFI value.Assuming background correction is done already.
54
-#' Default to 'FMedianBG_correct'
55
-#' @param slide_id  A character string containing the name of the slide/array
56
-#' identifier variable.
57
-#' @param df A data frame to be used to plot
58
-#' @import dplyr ggplot2
59
-#' @importFrom dplyr select  filter
60
-#' @importFrom gtools mixedsort
61
-#' @importFrom ggplot2 ggplot
62
-#' @return plot of buffer spots
63
-#' @export
64
-#'
65
-#' @examples
66
-#'buffers <- readr::read_csv(system.file("extdata", "buffers_sample2.csv",
67
-#'package="protGear"))
68
-#' plot_buffer(df=buffers,buffer_names = "sampleID")
69
-plot_buffer <- function(df = buffers,
70
-                        buffer_names = "antigen",
71
-                        buffer_mfi = "FMedianBG_correct",
72
-                        slide_id = ".id") {
73
-  x <- buffer_names
74
-  y <- buffer_mfi
75
-  df[[buffer_names]] <- factor(df[[buffer_names]] ,
76
-                               levels = mixedsort(unique(as.character(df[[buffer_names]]))))
77
-
78
-
79
-  p <- ggplot(data = df, aes_string(x = x, y = y)) +
80
-    geom_jitter(aes_string(x = x, y = y, color = slide_id)) +
81
-    geom_boxplot(aes_string(x = x, y = y), alpha = 0.2) +
82
-    theme_light() +
83
-    theme(axis.text.x = element_text(angle = 45, hjust = 1))
84
-  return(p)
85
-}
86 0
deleted file mode 100644
... ...
@@ -1,24 +0,0 @@
1
-{
2
-    "id": "4D556D40",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/runthis.R",
4
-    "project_path": null,
5
-    "type": "r_source",
6
-    "hash": "3771495863",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647626114720.0,
10
-    "source_on_save": false,
11
-    "relative_order": 13,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source"
15
-    },
16
-    "folds": "",
17
-    "lastKnownWriteTime": 1598965142,
18
-    "encoding": "UTF-8",
19
-    "collab_server": "",
20
-    "source_window": "",
21
-    "last_content_update": 1598965142,
22
-    "read_only": false,
23
-    "read_only_alternatives": []
24
-}
25 0
\ No newline at end of file
26 1
deleted file mode 100644
... ...
@@ -1 +0,0 @@
1
-devtools::document()
2 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "5344A29F",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/NEWS.md",
4
-    "project_path": "NEWS.md",
5
-    "type": "markdown",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647521025022.0,
10
-    "source_on_save": false,
11
-    "relative_order": 8,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "7,38",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647521040,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647521040512,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,13 +0,0 @@
1
-# Changes in version 0.99.1 (2021-12-15)
2
-
3
-+ Submitted to Bioconductor
4
-
5
-# Changes in version 0.99.543 (2022-02-14)
6
-+ Submitted to Bioconductor for review
7
-
8
-# Changes in version 0.99.544 (2022-03-17)
9
-+ Removed unnecessary files
10
-+ Added Suggests in Description
11
-+ Updated R requirement to 4.2
12
-+ Added the data documentation
13
-
14 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "5B090C88",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/inst/shiny-examples/protGear_interactive/app.R",
4
-    "project_path": "inst/shiny-examples/protGear_interactive/app.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1644833056795.0,
10
-    "source_on_save": false,
11
-    "relative_order": 15,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "6,4",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1619183120,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1619183120,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,54 +0,0 @@
1
-library(shiny)
2
-library(shinyFiles)
3
-
4
-# Define UI for application that draws a histogram
5
-ui <- fluidPage(
6
-
7
-  # Application title
8
-  titlePanel("Example - Shiny Files Buttons Use"),
9
-
10
-  # Sidebar with a slider input for number of bins
11
-  sidebarLayout(
12
-    sidebarPanel(
13
-      p("First press this ShinyDir Button to locate an existing folder and set
14
-        it to be your new working directory"),
15
-      shinyDirButton("folderChoose","Choose Folder","Choose working directory"),
16
-
17
-      p("In this second step, press the ShinyFiles Button that should point to the
18
-        volumes (Just as the previous button does). In my R version, this is where it
19
-        renders a white interface (see attached picture"),
20
-      shinyFilesButton("filesChoose1","Files Chooser 1","Choose your files",
21
-                       multiple=TRUE),
22
-
23
-      p("In this third button, pI owuld like to link the ShinyFiles interface with
24
-        the selected folder in button 1. This part I have not been able to solve"),
25
-      shinyFilesButton("filesChoose2","Files Chooser 2","Choose your files",
26
-                       multiple=TRUE)
27
-
28
-      ),
29
-
30
-    # Show a plot of the generated distribution
31
-    mainPanel(
32
-      verbatimTextOutput("path")
33
-    )
34
-      )
35
-    )
36
-
37
-# Define server logic required to draw a histogram
38
-server <- function(input, output, session) {
39
-
40
-  volumes <-  getVolumes()
41
-  shinyDirChoose(input, "folderChoose", roots = volumes, session = session)
42
-  sel_path <- reactive({return(print(parseDirPath(volumes, input$folderChoose)))})
43
-  shinyFileChoose(input, "filesChoose1", roots = volumes, session = session)
44
-
45
-
46
-  setWorkingDir<-eventReactive(input$folderChoose,{
47
-    setwd(sel_path())
48
-  })
49
-
50
-  output$path<-renderText(sel_path())
51
-}
52
-
53
-# Run the application
54
-shinyApp(ui = ui, server = server)
55 0
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-{
2
-    "id": "68D04000",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/vignettes/vignette.Rmd",
4
-    "project_path": "vignettes/vignette.Rmd",
5
-    "type": "r_markdown",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": true,
9
-    "created": 1647516170123.0,
10
-    "source_on_save": false,
11
-    "relative_order": 7,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "258,0",
16
-        "scrollLine": "0",
17
-        "last_setup_crc32": ""
18
-    },
19
-    "folds": "",
20
-    "lastKnownWriteTime": 1647443184,
21
-    "encoding": "UTF-8",
22
-    "collab_server": "",
23
-    "source_window": "",
24
-    "last_content_update": 1647516586911,
25
-    "read_only": false,
26
-    "read_only_alternatives": []
27
-}
28 0
\ No newline at end of file
29 1
deleted file mode 100644
... ...
@@ -1,629 +0,0 @@
1
-title: protGear vignette
2
-  processing suite
3
-author: "Kennedy Mwai"
4
-header-includes:
5
-   - \usepackage[T1]{fontenc}
6
-date: '`r format(Sys.time(), "%d %B, %Y")`'
7
-package: protGear
8
-vignette: >
9
-  %\VignetteIndexEntry{protGear}
10
-  %\VignetteEncoding{UTF-8}
11
-  %\VignetteEngine{knitr::rmarkdown}
12
-link-citations: true
13
-output:
14
-  html_document:
15
-    df_print: paged
16
-    toc: true
17
-    toc_float:
18
-      collapsed: false
19
-    number_sections: false
20
-    theme: spacelab
21
-    highlight: tango
22
-  pdf_document: default
23
-
24
-
25
-
26
-```{r setup, message=FALSE, warning=FALSE}
27
-library(ggpubr)
28
-library(gtools)
29
-library(purrr)
30
-library(scales)
31
-library(pheatmap)
32
-library(data.table)
33
-library(kableExtra)
34
-library(gridExtra)
35
-library(png)
36
-library(knitr)
37
-library(grid)
38
-library(styler)
39
-library(FactoMineR)
40
-library(factoextra)
41
-library(magick)
42
-library(rlang)
43
-library(GGally)
44
-library(ggplotify)
45
-library(remotes)
46
-library(dplyr)
47
-library(tidyr)
48
-knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning = FALSE,
49
-                      fig.align = 'center',
50
-                      dev = "png", 
51
-                       tidy='styler', tidy.opts=list(strict=TRUE))
52
-
53
-
54
-```
55
-
56
-
57
-```{css ,echo=FALSE}
58
-.custom-inline {
59
-  color: red;
60
-  font-weight: 700
61
-}
62
-```
63
-
64
-## Introduction 
65
-
66
-### General information
67
-protGear is a package for protein micro data processing just before the main analysis. The package loads the '`gpr`' or '`txt`' file format extracted by the quantification software and merges this with the specific sample identifiers. The package processes multiple files extracted batch by batch with their corresponding sample identifier file. The sample identifier file has 2 variables '`v1`' and '`v2`' which indicate the mini-array or block number and sample identifier respectively. The '`gpr`' file and the corresponding sample identifier file have the same file name.  protGear also provides a web based $Shiny^{(R)}$ platform for real time visualization of the data processing. 
68
-
69
-In this vignette the general work-flow of protGear will be outlined by processing a sample dataset from a multicentre study __Plasmodium falciparum__ Merozoite Protein Microarray. The multicentre study design motivated the development of the protGear suite. 
70
-
71
-The details of the methodologies are published here https://doi.org/10.1016/j.csbj.2021.04.044
72
-
73
-### Analysis setup
74
-Create 2 folders that hold the '`.gpr`' files and the corresponding sample identifier files.
75
-
76
-![Folder structure of array and sample ID files](folder_structure.png){width=70%}
77
-
78
-
79
-
80
-## Sample identifier file
81
-
82
-![sample ID file structure](image_slide.png)
83
-
84
-### Installation 
85
-
86
-To install `protGear` from BioConductor the following commands in R
87
-
88
-```{r install_package, eval=FALSE}
89
-## install from BioConductor
90
-if (!require("BiocManager", quietly = TRUE))
91
-    install.packages("BiocManager")
92
-BiocManager::install('protGear')
93
-```
94
-
95
-
96
-## Importing data
97
-
98
-```{r call_protGear}
99
-## load the package 
100
-library(protGear)
101
-```
102
-
103
-The first part is to specify the parameters of the micro-array experiment to assist in processing the data.  The parameters specified are 
104
-
105
-  - channel - The scanner fluorescence output used to record the data. It can be  green,red,blue among others with a specified number. 
106
-  - chip_path - the folder where the sub folders of 'gpr' or 'txt' files are stored. This path contains sub folders with the array data, for example the sub folders for the different batches. 
107
-  - totsamples - the number of samples in a slide or array.  
108
-  - blockspersample - The number of blocks a sample it takes. In this example each sample occupies 2 blocks each with 384 spots. 
109
-  - sampleID_path - the folder where the sample identifiers files are stored
110
-  - machine - The indicator for which machine was used to hybridize the samples if the experiment had more than one machine. 
111
-  - date_process -the date of sample processing 
112
-  
113
-The parameters "`chip_path`", "`channel`" , "`totsamples`" and "`sampleID_path`" are mandatory. 
114
-
115
-```{r array_params}
116
-## specify the the parameters to process the data
117
-genepix_vars <- array_vars(channel="635" ,
118
-                           chip_path = system.file("extdata/array_data/", package="protGear"),
119
-                           totsamples = 21,
120
-                           blockspersample = 2,
121
-                           sampleID_path = system.file("extdata/array_sampleID/", package="protGear"),
122
-                           mig_prefix = "_first",
123
-                           machine =1,
124
-                           ## optional 
125
-                           date_process = "0520")
126
-```
127
-
128
-
129
-The exact channel used should be checked in the header of the file from the quantification software under `Wavelengths`. 
130
-
131
-```{r gpr_header}
132
-header_gpr <- readLines(system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"),
133
-                        n=40)
134
-header_gpr <- gsub("\"", "", header_gpr[1:32])
135
-header_gpr[1:32]
136
-```
137
-
138
-
139
-The function  `r text_spec("check_sampleID_files()",color="red")` helps to check whether each micro array file has a corresponding sample identifier file. The sample identifier files are generated from the lab plate maps to match the corresponding samples on a specific slide.If the sample identifier file is missing, protGear automatically generates the id's. 
140
-
141
-### Spatial structure of slide
142
-
143
-protGear offers a functionality to inspect the slide visually for any strong spatial biases when the scan image is not available. However, we recommend using the scanned image to visualize the spatial artefacts that might not be recorded in the `.gpr` file. We include the functions `r text_spec("visualize_slide()",color="red")` and `r text_spec("visualize_slide_2d()",color="red")`  to check the spatial structure. The functions are build on `r text_spec("structure_plot()",color="red")`  which shows the block and mini-array structure of a slide. 
144
-
145
-
146
-
147
-#### Visualize the foreground MFI 
148
-
149
-Here we visualize the foreground MFI using the `r text_spec("visualize_slide",color="red")` function
150
-
151
-```{r slide_struct, fig.align='left'}
152
-visualize_slide(infile=system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"),
153
-                MFI_var ='F635 Median' )
154
-
155
-```
156
-
157
-#### Visualize the background MFI  
158
-
159
-Here we visualize  the background MFI using the `visualize_slide_2d` function
160
-
161
-```{r slide_struct_2d, fig.align='left'}
162
-visualize_slide_2d(infile =system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"), 
163
-                   MFI_var ='B635 Median' )
164
-```
165
-
166
-
167
-### Import .gpr/txt data
168
-
169
-Microarray data is imported using the `r text_spec("read_array_files()",color="red")` function. The function accepts the following mandatory arguments; 
170
-
171
- - `filename` - the name of the file which the data are to be read from. In this example a list of multiple files from a folder is used and passed to `r text_spec("read_array_files()","red")` using `r text_spec("purrr",color="red")`. 
172
- - `data_path` - The path where the file with the data  is located 
173
- - `genepix_vars` -  A list of specific definitions of the experiment design. See  `r text_spec("array_vars()",color="red")`  
174
-
175
-For this example I use the sub-folder 1 specified using `genepix_vars$paths[[1]]` which is this path under vignette folder `r genepix_vars$paths[[1]]`. 
176
- 
177
-```{r array_files}
178
-#### read in all the datasets
179
-### list all the file names under data folder
180
-filenames <- list.files(file.path(genepix_vars$paths[[1]]), 
181
-                        pattern="*.txt$|*.gpr$", full.names=FALSE)
182
-#' @___________________read_in_the_files_with_the_text_data_from_the_chip_____________________________
183
-### read all the data files and save them in a list 
184
-data_path <- paste0(genepix_vars$paths[[1]],"/") 
185
-data_files <- purrr::map(.x = filenames, 
186
-                         .f = read_array_files,
187
-                         data_path=data_path ,
188
-                         genepix_vars=genepix_vars)
189
-data_files <- set_names(data_files, purrr::map(filenames, name_of_files))
190
-```
191
-
192
-
193
-## Background Correction
194
-
195
-Background noise is caused by non-specific fluorescence such as auto-fluorescence of the glass slide or non-specific binding of parameters and reference substances. To cut down the effect of background noise we have included different approaches for background correction. First, we extract the background values, visualize and select the best background approach. We have implemented five different approaches;
196
-  
197
-  1) Local background subtraction
198
-  2) Global background subtraction
199
-  3) Moving minimum background subtraction
200
-  4) Normal and exponential model (normexp) 
201
-  5) Log-linear background correction (Edwards)
202
-
203
-In  '`.gpr`' files the Background column starts with a '`B`' followed by the wavelength or channel. In order to perform background correction, we extract the background mean fluorescent intensities (MFI's) using the function `r text_spec("extract_bg()",color="red")` . The function accepts the arguments `iden` which is the file identifier,  `data_files` a list of data objects with names utilised by `iden` and `genepix_vars` defined using  `r text_spec("array_vars()",color="red")` function. We utilise the `purr::map` function to extract the background MFI of multiple data files. 
204
-
205
-```{r bg_data}
206
-## utilising the map package we process a number of files  under data_files list
207
-dfs <- names(data_files) 
208
-allData_bg <- purrr::map(.x=dfs, .f=extract_bg,data_files=data_files,genepix_vars)
209
-allData_bg <- set_names(allData_bg, purrr::map(filenames, name_of_files))
210
-allData_bg <- plyr::ldply(allData_bg)
211
-```
212
-
213
-### Foreground vs Background
214
-
215
-Before selecting the best background correction approach the MFI signals are be inspected visually. In protGear we first utilise the function `r text_spec("plot_FB()",color="red")` that graphs the background, _BG_Median_ and foreground values, _FBG_Median_. On the protGear Shiny platform the visuals are interactive and you can identify the features or blocks with strong bias. 
216
-
217
-```{r bg_vs_fg}
218
-p1 <- plot_FB(allData_bg,antigen_name="antigen",
219
-              bg_MFI="BG_Median",FG_MFI="FBG_Median",log=FALSE)
220
-
221
-
222
-p1
223
-
224
-```
225
-
226
-### Background MFI by blocks
227
-
228
-```{r bg_plot}
229
-p2 <- plot_bg(df=allData_bg, x_axis="Block",bg_MFI="BG_Median",
230
-        log_mfi=TRUE) 
231
-p2
232
-```
233
-
234
-### Background correction
235
-
236
-After background visualization and selecting the best approach the array data are merged with their specific sample identifier files. 
237
-
238
-_`r text_spec("Note:",color="red")`_ Each array file must have its own corresponding sample identifier `.csv` file stored in `r text_spec("array_vars()",color="red")` function under `sampleID_path`. Check General information section.  
239
-
240
-
241
-The method of background subtraction selected is specified under `r text_spec("method",color="red")` below. The background correction is performed by  `r text_spec("bg_correct()",color="red")` function.
242
-
243
-
244
-
245
-```{r bg_correct}
246
-sample_ID_merged_dfs <- purrr::map(.x=dfs, .f=merge_sampleID ,data_files=data_files , 
247
-                             genepix_vars, method="subtract_local")
248
-sample_ID_merged_dfs <- set_names(sample_ID_merged_dfs, purrr::map(filenames, name_of_files))
249
-```
250
-
251
-
252
-## Buffer spots
253
-
254
-Buffer spots are specific to the experiment design and are not always included. Buffer spots are used to check for unexpected scanning artefacts. The buffer spots should have similar values in  different slides. Some outliers are expected for buffer spots close sample spots or landmark. However you can specify the name of your control antigens here `r text_spec('buffer_spot="buffer"',color="blue")` if you do not have buffer spots.
255
-
256
-```{r buffer_spots_data}
257
-buffer_transp <- purrr::map(.x=sample_ID_merged_dfs, .f=buffer_spots ,  buffer_spot="buffer")
258
-
259
-buffer_transp <- set_names(buffer_transp, purrr::map(filenames, name_of_files))
260
-
261
-buffers <- plyr::ldply(buffer_transp)
262
-plot_buffer(buffers,buffer_names="antigen",buffer_mfi="FMedianBG_correct",slide_id=".id")
263
-
264
-```
265
-
266
-## Coefficient of Variation (CV)
267
-
268
-To calculate the CV's we utilise the  `r text_spec("cv_estimation()",color="red")` function with a `cv_cut_off` specified , sample identifier variable and antigen specified under `sampleID_var` and `antigen` respectively. The `replicate_var` and `mfi_var` identifies the variable with the replicate rank generated and MFI's values. 
269
-
270
-
271
-```{r replicates_plot}
272
-#' @________________________________calculated_cv_for_each_data_file_______________________________________
273
-#' data without the selected mean for the best 2 CVs 
274
-dataCV <- purrr::map(.x=sample_ID_merged_dfs, .f=cv_estimation ,lab_replicates=3  ,
275
-                     cv_cut_off=20,
276
-                     sampleID_var='sampleID', antigen_var='antigen' ,replicate_var='replicate' ,
277
-                     mfi_var='FMedianBG_correct')
278
-
279
-lab_replicates=3
280
-dataCV <- set_names(dataCV, purrr::map(filenames, name_of_files))
281
-
282
-aa <- plyr::ldply(dataCV)
283
-GGally::ggpairs(aa,aes(color=cvCat_all) ,
284
-        columns = paste(seq_len(lab_replicates)), title = "",  axisLabels = "show") +
285
-  theme_light()
286
-   
287
-```
288
-
289
-### Summary of CV values
290
-
291
-Here we create a summary of the CV values for each sample/slide utilising the `r text_spec("cv_by_sample_estimation()",color="red")` function.  This helps to identify samples with high values of CV. On the protGear dashboard an interactive table is created to show the summary for each sample. 
292
-
293
-```{r cv_summary}
294
-
295
-
296
-#' @________________________________summary_of_cv_for_each_sample________________________________________ 
297
-#' creates summaries by cv's greater than 20 and less than 20
298
-
299
-cv_cut_off <- 20
300
-dataCV_sample <- purrr::map(.x=dataCV, .f=protGear::cv_by_sample_estimation , cv_variable="cvCat_all",
301
-                            lab_replicates=3)
302
-dataCV_sample <- set_names(dataCV_sample, purrr::map(filenames, name_of_files))
303
-all_cv_sample <- plyr::ldply(dataCV_sample)
304
-
305
-```
306
-
307
-
308
-```{r cv_by_sample}
309
-less_20 <- rlang::sym(paste0("CV <= ",cv_cut_off, "_perc"))
310
-gt_20 <- rlang::sym(paste0("CV > ",cv_cut_off, "_perc"))
311
-
312
-less_20_per <-  rlang::sym(paste0("% CV <=",cv_cut_off))
313
-gt_20_per <-  rlang::sym(paste0("% CV >",cv_cut_off))
314
-ggplot(all_cv_sample)+
315
-  geom_violin(aes(.id,`CV <= 20_perc`, color="% CV =<20")) +
316
-  geom_violin(aes(.id,`CV > 20_perc`, color="% CV >20")) +
317
-  geom_violin(aes(.id,Others_perc,color="Others")) +
318
-  ylab("% of CV") +
319
-  theme_minimal() +
320
-  ggtitle("% of CV >20 or <=20 for each slide all repeats considered") 
321
-```
322
-
323
-
324
-### Best replicates
325
-
326
-We have implemented a function `r text_spec("best_CV_estimation()",color="red")` to select the best replicates if an experiment has more than 2 technical replicates. This helps to select the less variable replicates.
327
-
328
-```{r best_reps}
329
-#' @________________________________data_with_selected_best_2_CV_______________________________________ 
330
-#' data with the selected mean for the best 2 CVs
331
-dataCV_best2 <- purrr::map(.x=dataCV, .f=best_CV_estimation , slide_id="iden",lab_replicates=3,
332
-                           cv_cut_off=20)
333
-
334
-## give the names to the returned list
335
-dataCV_best2 <- set_names(dataCV_best2, purrr::map(filenames, name_of_files))
336
-
337
-
338
-dataCV_sample_best2 <- purrr::map(.x=dataCV_best2, .f=cv_by_sample_estimation , 
339
-                                  cv_variable="best_CV_cat",lab_replicates=3)
340
-dataCV_sample_best2 <- set_names(dataCV_sample_best2, purrr::map(filenames, name_of_files))
341
-all_cv_sample_best2 <- plyr::ldply(dataCV_sample_best2)
342
-```
343
-
344
-
345
-On the violin below we observe that with selecting the less variable replicates , the percentage of the "good CV" values increases.
346
-
347
-```{r best_cv_plot}
348
-## plot only the CV perccentages
349
-ggplot(all_cv_sample_best2)+
350
-  geom_violin(aes(.id,`CV <= 20_perc`, color="% CV =<20")) +
351
-  geom_violin(aes(.id,`CV > 20_perc`, color="% CV >20")) +
352
-  geom_violin(aes(.id,Others_perc,color="Others")) +
353
-  ylab("% of CV") +
354
-  theme_minimal() +
355
-  ggtitle("% of CV >20 or <=20 for each slide") 
356
-```
357
-
358
-
359
-## Tag subtraction
360
-
361
-Tag subtraction is applied for antigens containing purification tags. A file with the specific TAG name for each antigen is loaded. The file has the `antigen,TAG and TAG_name` and the TAG_name must be part of the antigens listed.
362
-
363
-```{r tag_file}
364
-tag_file <- read.csv(system.file("extdata/TAG_antigens.csv", package="protGear"))
365
-tag_antigens <- c("CD4TAG" , "GST", "MBP")
366
-batch_vars <- list(machine="m1", day="0520")
367
-```
368
-
369
-
370
-### Overview of the TAG file
371
-
372
-```{r tag_glimpse}
373
-tb1 <- data.frame(head(tag_file, n=10))
374
-tb1 %>% 
375
-  kable() %>%
376
-  kable_styling()
377
-```
378
-
379
-
380
-### Subtracting the TAG values
381
-
382
-The function `r text_spec("tag_subtract()",color="red")` implements the TAG subtration by matching the TAG names with the corresponding TAG values. 
383
-
384
-```{r tag_subtract}
385
-#' @________________________________subtract_the_tag_values_______________________________________ 
386
-#'
387
-## tag subtract 
388
-## read in the KILCHip TAG file to substract GST-1, MBP -2 and  CD4TAG - 0 file 
389
-dataCV_tag <- purrr::map(.x=dataCV_best2, .f=tag_subtract , 
390
-                         tag_antigens=tag_antigens, mean_best_CV_var="mean_best_CV",tag_file=tag_file,
391
-                         antigen_var='antigen',
392
-                         batch_vars=batch_vars)
393
-dataCV_tag <- set_names(dataCV_tag, purrr::map(filenames, name_of_files))
394
-dataCV_tag <- plyr::ldply(dataCV_tag)
395
-```
396
-
397
-
398
-In this example here we plot a graph of antigens Tagged with GST before and after TAG subtraction.
399
-
400
-```{r before_after_tag}
401
-aaa <- dataCV_tag %>% 
402
-  filter(TAG_name=="GST") 
403
-
404
-aaa <- aaa %>% 
405
-dplyr::select(.id,sampleID,antigen,mean_best_CV,mean_best_CV_tag)
406
-
407
-aaa <- aaa %>% 
408
-  gather(measure,mfi,-c(.id:antigen))
409
-
410
-ggplot(aaa,aes(as.factor(antigen),mfi,color=measure))  +
411
-  geom_boxplot(aes(fill=measure),alpha=0.5)+
412
-  theme_light() +
413
-  xlab("antigen name")+
414
-  ggtitle("Before and after TAG subtraction") +
415
-  theme(axis.text.x = element_text(angle = 45, hjust = 1))
416
-```
417
-
418
-
419
-## Normalisation
420
-
421
-To normalise the data, we create a matrix `matrix_antigen` with all the data combined. We ensure the slide identifier is included as row names of the matrix or we have a way to join them after normalisation. The `array_matrix` matrix is used to hold these parameters. In place of `AMA1` in the example below you use one of your features or antigen. We have implemented  four different normalisation techniques;
422
-
423
-1) $Log_2$ normalisation
424
-2) Loess normalisation
425
-3) RLM normalisation
426
-4) VSN normalisation. 
427
-
428
-
429
-```{r normalisation}
430
-df_to_normalise <-  dataCV_tag  %>%  ungroup() %>%  
431
-  dplyr::select(slide=.id,sampleID,sample_array_ID,antigen,mean_best_CV) %>%  
432
-  group_by(sampleID, slide)
433
-df_to_normalise$sample_index <- group_indices(.data =df_to_normalise )
434
-
435
-### 
436
-to_normalise <- df_to_normalise %>%
437
- ungroup() %>% dplyr::select(-slide,-sampleID,-sample_array_ID) %>% 
438
-  dplyr::select(antigen, sample_index, everything()) %>%  
439
-  gather(variable, value, -(antigen:sample_index)) %>%
440
-  unite(temp, antigen ) %>%  dplyr::select(-variable) %>%
441
-  spread(temp, value) %>% 
442
-  as.data.frame(.)
443
-
444
-### get the row names of the machine data
445
-row.names(to_normalise) <- to_normalise$sample_index
446
-#batch_all <- as.factor(paste0(to_normalise$machine,"/",to_normalise$day))
447
-#machines <- as.factor(to_normalise$machine)
448
-#day_batches <- as.factor(to_normalise$day)
449
-
450
-## create the matrix to normalise
451
-matrix_antigen <-  to_normalise %>% 
452
-   dplyr::select(-sample_index) %>% 
453
-  as.matrix(.)
454
-
455
-
456
-## create the matrix to hold the important parameters 
457
-## in place of AMA1 you use one of your features or antigen
458
-array_matrix <- df_to_normalise %>% 
459
-  filter(antigen=="AMA1") %>% 
460
-  ungroup() %>% 
461
-   dplyr::select(sample_array_ID,sample_index,slide)
462
-
463
-control_antigens <- c("CommercialHumanIgG","CD4TAG")
464
-
465
-```
466
-
467
-
468
-The `r text_spec("matrix_normalise()",color="red")`  function is used to normalise the data and returns a list of plots and a matrix of normalised values. A plot is returned if ` return_plot = TRUE` otherwise only a matrix of normalised values will be returned. 
469
-
470
-```{r normalised_df}
471
-normlise_df <- matrix_normalise(matrix_antigen, method = "vsn", array_matrix=array_matrix,
472
-                       return_plot = TRUE,control_antigens=control_antigens)
473
-
474
-normlise_df$plot_normalisation
475
-
476
-```
477
-
478
-
479
-### Compare normalisation methods
480
-
481
-On the dashboard we compare the different normalisation techniques using the `r text_spec("mutiple_plot()",color="red")` function after loading the data.
482
-
483
-
484
-```{r compare_methods}
485
-control_antigens <- c("CommercialHumanIgG","CD4TAG")
486
-## no normalisation
487
-normalise_list_none <- matrix_normalise(matrix_antigen=matrix_antigen, 
488
-                                         method = "none", 
489
-                                         array_matrix=array_matrix,
490
-                                         return_plot = TRUE,
491
-                                         control_antigens=control_antigens)
492
-  names(normalise_list_none) <- c("matrix_antigen_none" ,"plot_none")
493
-## log2 normalisation
494
-  normalise_list_log <- matrix_normalise(matrix_antigen=matrix_antigen, 
495
-                                           method = "log2", 
496
-                                           array_matrix=array_matrix,
497
-                                           return_plot = TRUE,
498
-                                           control_antigens=control_antigens)
499
-  names(normalise_list_log) <- c("matrix_antigen_log" ,"plot_log")
500
-## vsn normalisation
501
-   normalise_list_vsn <- matrix_normalise(matrix_antigen=matrix_antigen, 
502
-                                           method = "vsn", 
503
-                                           array_matrix=array_matrix,
504
-                                           return_plot = TRUE,
505
-                                           control_antigens=control_antigens)
506
-    names(normalise_list_vsn) <- c("matrix_antigen_vsn" ,"plot_vsn")
507
-  ## cyclic loess with log
508
-     normalise_list_cyclic_loess_log <- matrix_normalise(matrix_antigen=matrix_antigen, 
509
-                                                        method = "cyclic_loess_log", 
510
-                                                        array_matrix=array_matrix,
511
-                                                        return_plot = TRUE,
512
-                                                        control_antigens=control_antigens)
513
-    names(normalise_list_cyclic_loess_log) <- c("matrix_antigen_cyclic_loess_log" ,
514
-                                                "plot_cyclic_loess_log")
515
-    
516
-
517
-     normalise_list_rlm <- matrix_normalise(matrix_antigen=matrix_antigen, 
518
-                                                  method = "rlm", 
519
-                                                  array_matrix=array_matrix,
520
-                                                  return_plot = TRUE,
521
-                                                  control_antigens=control_antigens)
522
-    names(normalise_list_rlm) <- c("matrix_antigen_rlm" ,"plot_rlm")
523
-    
524
-  
525
-    ## create a list after normalisation
526
- normalised_list <- c(normalise_list_none , 
527
-                      normalise_list_log,
528
-                      normalise_list_vsn,
529
-                      normalise_list_cyclic_loess_log,
530
-                      normalise_list_rlm)
531
-  ##
532
- normalised_list_plot <- normalised_list[grepl("plot",names(normalised_list))]
533
-
534
-  
535
-```
536
-
537
-```{r plot_comparison, fig.align='center', fig.width=12, fig.height=15}
538
-p <- do.call("grid.arrange", c(normalised_list_plot, ncol=2))
539
-```
540
-
541
-
542
-## Heatmaps
543
-
544
-```{r heat_norm}
545
-norm_df <- normlise_df$matrix_antigen_normalised
546
-norm_df <- norm_df %>% 
547
-   dplyr::select(-control_antigens)
548
-p3 <- pheatmap::pheatmap(norm_df ,scale = "none", cluster_rows = FALSE,
549
-                            main=paste('VSN',"Normalised Data"),
550
-                            silent = TRUE)
551
-#-------
552
-## if you want to save the file
553
-# p3 <- ggplotify::as.ggplot(p3)
554
-# p <- p3 +  theme_void()
555
-# ggsave(p ,
556
-#          filename ="heatmap.PNG" ,
557
-#          width = 16 , height = 12 , 
558
-#          limitsize = FALSE,
559
-#          dpi=200 )
560
-#-------
561
-p3
562
-```
563
-
564
-
565
-
566
-## PCA analysis
567
-
568
-```{r pca, fig.align='center',fig.width=16,fig.height=12}
569
-norm_df <- normlise_df$matrix_antigen_normalised
570
-res_pca <- prcomp( norm_df, scale = TRUE)
571
-var <- get_pca_var(res_pca)
572
-vars_visualise=20
573
-#Visualize the PCA
574
-## individuals contributing to the PCA
575
-p1 <- fviz_pca_ind(res_pca,
576
-                    col.var = "contrib", # Color by contributions to the PC
577
-                    gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
578
-                    repel = TRUE     # Avoid text overlapping
579
-  )+    theme_minimal()
580
-  
581
-# Select the top vars_visualise contributing variables
582
-p2 <-fviz_pca_biplot(res_pca, label="var",
583
-                    select.var = list(contrib = vars_visualise)) +
584
-    theme_minimal()
585
-  
586
-  
587
-# Total cos2 of variables on Dim.1 and Dim.2
588
-p3 <-     fviz_cos2(res_pca, choice = "var", axes = 1:2 , top = vars_visualise)
589
-
590
-  
591
-# Color by cos2 values: quality on the factor map
592
-p4 <-  fviz_pca_var(res_pca, col.var = "cos2",
593
-               gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
594
-               select.var = list(contrib = vars_visualise),
595
-               repel = TRUE # Avoid text overlapping
596
-  )
597
-
598
- 
599
-## combine the plots into one grid
600
-## combine the plots into one grid
601
-## combine the plots into one grid
602
-p_pca <- gridExtra::grid.arrange(p1,p2,p3,p4, ncol=2 )
603
-## If you want to save the file
604
-# ggsave(p_pca ,
605
-#          filename ="p_pca.PNG" ,
606
-#          width = 16 , height = 12 , 
607
-#           units = "in",
608
-#          limitsize = FALSE,
609
-#          dpi=300)
610
-p_pca 
611
-```
612
-
613
-
614
-
615
-
616
-## Shiny application
617
-Shiny is an R package that makes it easy to build interactive web apps straight from R. protGear has a built in user-friendly Shiny dashboard to assist in real-time processing and visualization. It provides five sequential steps for handling a data table of fluorescent intensities. 
618
-The dashboard is launched as below from R or RStudio. To Install R: Open an internet browser and go to www.r-project.org.
619
-
620
-```{r shiny_launch, eval=FALSE}
621
-protGear::launch_protGear_interactive()
622
-```
623
-
624
-```{r}
625
-sessionInfo()
626
-```
627
-
628 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "6E10D617",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/read_files_functions.R",
4
-    "project_path": "R/read_files_functions.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": true,
9
-    "created": 1647521201664.0,
10
-    "source_on_save": false,
11
-    "relative_order": 10,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "377,26",
16
-        "scrollLine": "363"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647580069,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647624618732,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,1021 +0,0 @@
1
-#' Read multiple array files
2
-#' @title Read array files
3
-#' @param i  The name of the file which the data are to be read from.
4
-#' @param data_path  The path where the file with the data  is located
5
-#' @param genepix_vars  A list of specific definitions of the experiment design.
6
-#' See \code{\link{array_vars}}.
7
-#' @description This helps to read the chip file(s).
8
-#' @return a number of data frames in the global environment
9
-#' @importFrom  data.table fread %like%
10
-#' @importFrom rlang parse_expr :=
11
-#' @importFrom purrr set_names
12
-#' @importFrom magrittr %>%
13
-#' @export
14
-#' @examples
15
-#' ## Not run:
16
-#' genepix_vars <- array_vars(
17
-#' channel = "635",
18
-#' chip_path = system.file("extdata", "array_data/machine1/", package="protGear"),
19
-#' totsamples = 21,
20
-#' blockspersample = 2,
21
-#' mig_prefix = "_first",
22
-#' machine = 1,
23
-#' date_process = "0520"
24
-#' )
25
-#' file_read <- "KK2-06.txt"
26
-#' read_array_files(i=file_read,
27
-#' data_path=system.file("extdata", "array_data/machine1/",
28
-#' package="protGear"), genepix_vars=genepix_vars)
29
-#' ## End(Not run)
30
-read_array_files <- function(i, data_path, genepix_vars) {
31
-  ###loop through all the data to read
32
-  # skip- the number of lines of the data file to skip before beginning to read data ---
33
-  if (length(grep("Block.*Column|Column.*Block", readLines(file.path(data_path, i))) -
34
-             1) == 1) {
35
-    x <-
36
-      grep("Block.*Column|Column.*Block", readLines(file.path(data_path, i))) -
37
-      1
38
-    #print(paste0(x,"_",i))
39
-    d_f <-
40
-      data.table::fread(file.path(data_path, i),
41
-                        skip = x,
42
-                        header = TRUE)
43
-    ## arrange block to ensure the order is maintains
44
-    d_f <- d_f %>% arrange(Block)
45
-    ## calculate the different background  methods
46
-    if (paste0(genepix_vars$BG) %ni% names(d_f)) {
47
-      genepix_vars$BG <-
48
-        names(d_f)[names(d_f) %like% paste0(genepix_vars$BG, '$')]
49
-    }
50
-
51
-    if (paste0(genepix_vars$FG) %ni% names(d_f)) {
52
-      genepix_vars$FG <-
53
-        names(d_f)[names(d_f) %like% paste0(genepix_vars$FG, '$')]
54
-    }
55
-
56
-    expression_med <-
57
-      paste0("median(`", genepix_vars$BG, "`, na.rm = TRUE)")
58
-    exp_minpos <-  paste0("minpositive(`", genepix_vars$BG, "`)")
59
-    d_f <- d_f %>%
60
-
61
-      mutate(global_BGMedian := !!parse_expr(expression_med)) %>%
62
-      ## minimum BG per block and >1 -- Moving minimum approach per block
63
-      group_by(Block) %>%
64
-
65
-      mutate(minimum_BGMedian := !!parse_expr(exp_minpos)) %>%
66
-      ungroup()
67
-
68
-
69
-  }  else
70
-    stop("File " , i, " does not have Block Column Specification")
71
-
72
-}
73
-
74
-
75
-
76
-
77
-#' Extract the background values
78
-#' @title  extract bg
79
-#' @param iden A character indicating the name of the object to be used under data_files.
80
-#' @param data_files A list of data objects with names utilised by iden.
81
-#' @param genepix_vars A list of specific definitions of the experiment design. See \code{\link{array_vars}}.
82
-#' @description A generic function to extract the background data for micro array data.
83
-#' @return A data frame of background values
84
-#' @importFrom dplyr select arrange
85
-#' @importFrom data.table %like%
86
-#' @importFrom purrr set_names
87
-#' @export
88
-#'
89
-#' @examples
90
-#' ## Not run:
91
-#' genepix_vars <- array_vars(
92
-#' channel = "635",
93
-#' chip_path = system.file("extdata", "array_data/machine1/", package="protGear"),
94
-#' totsamples = 21,
95
-#' blockspersample = 2,
96
-#' mig_prefix = "_first",
97
-#' machine = 1,
98
-#' ## optional
99
-#' date_process = "0520"
100
-#' )
101
-#' #Define the data path
102
-#' data_path <- paste0(genepix_vars$chip_path)
103
-#' # List the file names to use
104
-#' filenames <- list.files(genepix_vars$chip_path,
105
-#'                        pattern = '*.txt$|*.gpr$', full.names = FALSE
106
-#' )
107
-#' data_files <- purrr::map(
108
-#'  .x = filenames,
109
-#'   .f = read_array_files,
110
-#'   data_path = data_path,
111
-#'   genepix_vars = genepix_vars
112
-#' )
113
-#' data_files <- purrr::set_names(data_files, purrr::map(filenames, name_of_files))
114
-#' names(data_files)
115
-#' extract_bg(iden ="KK2-06" , data_files=data_files,genepix_vars=genepix_vars)
116
-#' ## End(Not run)
117
-extract_bg <- function(iden, data_files , genepix_vars = genepix_vars)
118
-{
119
-  ## read in the sample ID files
120
-  ## this can be pulled from a mysql table
121
-  ## if the sample ID is not available, we create an automated sampleID
122
-  if (file.exists(file.path(genepix_vars$sampleID_path, paste0(iden , ".csv")))) {
123
-    arraynames <-
124
-      read.csv(
125
-        file.path(genepix_vars$sampleID_path, paste0(iden , ".csv")) ,
126
-        header = TRUE ,
127
-        stringsAsFactors = FALSE ,
128
-        colClasses = "character"
129
-      )
130
-  } else{
131
-    warning(iden,
132
-            " Not found in the sampleID files here",
133
-            genepix_vars$sampleID_path)
134
-    arraynames <- data.frame(
135
-      v1 = (1:genepix_vars$totsamples) ,
136
-      v2 = paste0("SID_gen", 1:genepix_vars$totsamples),
137
-      barcode = iden
138
-    )
139
-
140
-  }
141
-
142
-
143
-  ## replicate to the number of blocks
144
-  ## make sure the block is arranged before merging with the data file
145
-  arraynames <- arraynames %>%
146
-    dplyr::select(v1, v2) %>%
147
-    arrange(as.numeric(v1))
148
-
149
-  ## capture errors for same sample ID in a slide
150
-  if (length(unique(arraynames$v2)) < genepix_vars$totsamples) {
151
-    sink("log_replicates.txt" , append = TRUE)
152
-    warning("Most likely there is a repeated sample name for " , iden)
153
-    sink()
154
-    arraynames <- arraynames %>%
155
-      group_by(v2) %>%
156
-      mutate(index = 1:n()) %>%
157
-      mutate(v2 = ifelse(index > 1, paste0(v2, "_", index), v2)) %>%
158
-      select(-index)
159
-  }
160
-
161
-
162
-  ## get the data from the loop
163
-  ## extract the specific data from the data files
164
-  data <- data_files[[iden]]
165
-
166
-  ## pick the spots per block from the data file
167
-  spotsperblock <- table(data$Block)[[1]]
168
-  sampleID <-
169
-    rep(arraynames$v2, each = spotsperblock * genepix_vars$blockspersample)
170
-
171
-
172
-
173
-  Data1 <- data %>%
174
-    # assign respective sample number to each row
175
-    mutate(
176
-      sample = rep(
177
-        seq_len(genepix_vars$totsamples),
178
-        each = spotsperblock * genepix_vars$blockspersample
179
-      ),
180
-      # Bring in the sampleIDs..192 each sample
181
-      sampleID = sampleID,
182
-      # Abit of formating the Antigen names and concentration
183
-      Name = gsub(':', '', Name),
184
-      Name = gsub('\n', '', Name),
185
-      Name = gsub(' ', '', Name)
186
-    )  %>%
187
-    ##remove uneccessary concs
188
-    ## filter(!grepl('Landmark|Buffer|IgG', Name))   %>%
189
-    # group by iden and antigen name
190
-    group_by(sampleID, Name)
191
-
192
-  if (paste0(genepix_vars$BG) %ni% names(Data1)) {
193
-    genepix_vars$BG <-
194
-      names(Data1)[names(Data1) %like% paste0(genepix_vars$BG, '$')]
195
-  }
196
-
197
-  if (paste0(genepix_vars$FG) %ni% names(Data1)) {
198
-    genepix_vars$FG <-
199
-      names(Data1)[names(Data1) %like% paste0(genepix_vars$FG, '$')]
200
-  }
201
-
202
-  #----------------------------------------------------------------------------------------------------
203
-  ##save the MFI values of the Background
204
-  data1_bg <-
205
-    Data1 %>% dplyr::select(
206
-      sampleID,
207
-      antigen = Name,
208
-      Block,
209
-      FBG_Median = !!genepix_vars$FG ,
210
-      BG_Median = !!genepix_vars$BG
211
-    ) %>%
212
-    mutate(replicate = 1:n()) %>%
213
-    filter(
214
-      !grepl(
215
-        '^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[bB][Uu][Ff][Ff][Ee][Rr]',
216
-        antigen
217
-      )
218
-    )
219
-  # combine Name and replicate
220
-  #----------------------------------------------------------------------------------------------------
221
-  return(data1_bg)
222
-}
223
-
224
-
225
-
226
-
227
-
228
-#' @title Plot background
229
-#'
230
-#' @param df A 	default dataset to use for plot.
231
-#' @param bg_MFI A numeric \code{variable} describing which is the background MFI
232
-#' @param x_axis The variable on the x axis
233
-#' @param log_mfi 	a logical value indicating whether the MFI values should be
234
-#' log transformed or not.
235
-#'
236
-#' @description  A generic function for plotting of R objects.
237
-#' @return A ggplot of background values
238
-#' @export
239
-#' @import ggpubr
240
-#' @examples
241
-#' ## Not run:
242
-#' #After extracting the background using \code{\link{extract_bg}} we plot the data using
243
-#' allData_bg <- readr::read_csv(system.file("extdata", "bg_example.csv", package="protGear"))
244
-#' plot_bg(allData_bg,
245
-#' x_axis = "antigen",
246
-#' bg_MFI = "BG_Median",  log_mfi = TRUE
247
-#' )
248
-#' ## End(Not run)
249
-
250
-plot_bg <- function(df,
251
-                    x_axis = "antigen",
252
-                    bg_MFI = "BG_Median",
253
-                    log_mfi = TRUE) {
254
-  ## create an id to help in having a numeric sample ID to sort your data
255
-  ## this is because all sampleIDs from the samples were not unique
256
-  ## rename the original sampleID sampleID2
257
-  bg_MFI_sys <- rlang::sym(bg_MFI)
258
-  ### check if the .id exists and renane it to slide
259
-  if ('.id' %in% names(df)) {
260
-    df <- df %>%  dplyr::rename(slide = .id)
261
-  } else
262
-    df <- df %>%  dplyr::mutate(slide = "slide")
263
-
264
-  bg_plot <- df %>%
265
-    dplyr::mutate(log_bg =    log2(!!bg_MFI_sys))
266
-
267
-  # bg_plot$sampleID <- group_indices(.data =bg_plot )
268
-
269
-  if (log_mfi == TRUE) {
270
-    p_pubr <- ggboxplot(
271
-      data = bg_plot ,
272
-      x = x_axis ,
273
-      y = "log_bg",
274
-      facet.by = "replicate",
275
-      ncol = 1
276
-    )
277
-    p_pubr <- ggpar(
278
-      p_pubr,
279
-      font.tickslab = c(6, "#993333"),
280
-      xtickslab.rt = 45 ,
281
-      ylab = "Background of the replicates (log2)"
282
-    )
283
-    p_bg <-
284
-      ggplot(data = bg_plot , aes_string(x = x_axis , y = "log_bg")) +
285
-      geom_boxplot() +
286
-      facet_wrap( ~ replicate, ncol = 1) +
287
-      theme_light() +
288
-      theme(axis.text.x = element_text(angle = 45 , hjust = 1))
289
-
290
-
291
-
292
-  } else if (log_mfi == FALSE) {
293
-    p_pubr <- ggboxplot(
294
-      data = bg_plot ,
295
-      x = x_axis ,
296
-      y = bg_MFI,
297
-      facet.by = "replicate",
298
-      ncol = 1
299
-    )
300
-    p_pubr <- ggpar(
301
-      p_pubr,
302
-      font.tickslab = c(6, "#993333"),
303
-      xtickslab.rt = 45 ,
304
-      ylab = "Background of the replicates raw"
305
-    )
306
-
307
-    p_bg <-
308
-      ggplot(data = bg_plot , aes_string(x = x_axis , y = bg_MFI)) +
309
-      geom_boxplot() +
310
-      facet_wrap( ~ replicate, ncol = 1) +
311
-      theme_light() +
312
-      theme(axis.text.x = element_text(angle = 45 , hjust = 1))
313
-  } else {
314
-    # p_pubr <- ggboxplot(data = bg_plot ,
315
-    #                     x="antigen" , y=bg_MFI,
316
-    #                     facet.by = "replicate",ncol=1)
317
-    # p_pubr <- ggpar(p_pubr,font.tickslab = c(8,"#993333"),
318
-    #                 xtickslab.rt = 45 , ylab = "Background of the replicates raw")
319
-
320
-    p_bg <-
321
-      ggplot(data = bg_plot , aes_string(x = x_axis , y = bg_MFI)) +
322
-      geom_boxplot() +
323
-      facet_wrap( ~ replicate, ncol = 1) +
324
-      theme_light() +
325
-      theme(axis.text.x = element_text(angle = 45 , hjust = 1))
326
-  }
327
-  return(p_pubr)
328
-}
329
-
330
-#___________________________________________________
331
-
332
-
333
-#' Plot foreground and background values
334
-#' @title  plot_FB
335
-#' @param df An object containing the data to which the plot is done.
336
-#' @param antigen_name The \code{variable} describing which features/proteins/
337
-#' antibodies in the data should be used to plot
338
-#' @param bg_MFI A numeric \code{variable} describing which is the background MFI
339
-#' @param FG_MFI A numeric \code{variable} describing which is the foreground MFI
340
-#' @param log_mfi 	a logical value indicating whether the MFI values should be
341
-#' log transformed or not.
342
-#' @description A generic function for plotting the background and foreground values.
343
-#' @return a ggplot of foreground vs background MFI values
344
-#' @export
345
-#' @import ggplot2 dplyr
346
-#' @examples
347
-#' ## Not run:
348
-#' #After extracting the background using \code{\link{extract_bg}} we plot the data using
349
-#' allData_bg <- readr::read_csv(system.file("extdata", "bg_example.csv", package="protGear"))
350
-#' plot_FB(allData_bg,
351
-#' antigen_name = "antigen",
352
-#' bg_MFI = "BG_Median", FG_MFI = "FBG_Median", log = FALSE
353
-#' )
354
-#' ## End(Not run)
355
-plot_FB <-
356
-  function(df,
357
-           antigen_name = "antigen",
358
-           bg_MFI = "BG_Median",
359
-           FG_MFI = "FBG_Median",
360
-           log_mfi = FALSE) {
361
-    ## create an id to help in having a numeric sample ID to sort your data
362
-    ## this is because all sampleIDs from the samples were not unique
363
-    ## rename the original sampleID sampleID2
364
-    bg_MFI_sys <- rlang::sym(bg_MFI)
365
-    FB_MFI_sys <- rlang::sym(FG_MFI)
366
-    ### check if the .id exists and renane it to slide
367
-    if ('.id' %in% names(df)) {
368
-      df <- df %>%  dplyr::rename(slide = .id)
369
-    } else
370
-      df <- df %>%  dplyr::mutate(slide = "slide")
371
-
372
-    bg_plot <- df %>%
373
-      mutate(log_bg = log2(!!bg_MFI_sys),
374
-             log_fb = log2(!!FB_MFI_sys))
375
-
376
-
377
-
378
-    if (log_mfi == TRUE) {
379
-      p <- ggplot(bg_plot , aes(
380
-        log_fb,
381
-        log_bg,
382
-        text = paste(
383
-          "Antigen: ",
384
-          antigen,
385
-          "<br>FG: $",
386
-          FBG_Median,
387
-          "<br>B: $",
388
-          BG_Median
389
-        )
390
-      )) +
391
-        xlab("Foreground MFI") + ylab("Background MFI") +
392
-        geom_jitter() +
393
-        theme_light()
394
-
395
-
396
-
397
-    } else if (log_mfi == FALSE) {
398
-      p <- ggplot(bg_plot , aes(
399
-        FBG_Median,
400
-        BG_Median,
401
-        text = paste(
402
-          "Antigen: ",
403
-          antigen,
404
-          "<br>FG: $",
405
-          FBG_Median,
406
-          "<br>B: $",
407
-          BG_Median
408
-        )
409
-      )) +
410
-        xlab("Foreground MFI") + ylab("Background MFI") +
411
-        geom_jitter() +
412
-        theme_light()
413
-    } else {
414
-      p <- ggplot(bg_plot , aes(
415
-        FBG_Median,
416
-        BG_Median,
417
-        text = paste(
418
-          "Antigen: ",
419
-          antigen,
420
-          "<br>FG: $",
421
-          FBG_Median,
422
-          "<br>B: $",
423
-          BG_Median
424
-        )
425
-      )) +
426
-        xlab("Foreground MFI") + ylab("Background MFI") +
427
-        geom_jitter() +
428
-        theme_light()
429
-    }
430
-    return(p)
431
-  }
432
-
433
-
434
-#' Background correction
435
-#' @title bg_correct
436
-#' @param iden A character indicating the name of the object to be used under Data1
437
-#' @param Data1 A data frame with sample identifiers merged with micro array data.
438
-#' @param genepix_vars A list of specific definitions of the experiment design. See \code{\link{array_vars}}.
439
-#' @param method 	a description of the background correction to be used.  Possible values are \code{"none","subtract_local",
440
-#' "subtract_global","movingmin_bg","minimum_half","edwards" or "normexp"}. The default is \code{"subtract_local"}.
441
-#' @details  The function implements background correction methods developed by \code{\link[limma]{backgroundCorrect}}. But the
442
-#' \code{minimum_half and movingmin_bg} uses the block of the protein array as the grid. If method="movingmin_bg" the minimum
443
-#' background value within a  block is subtracted.
444
-#' If method="minimum_half" then any intensity which is negative after background subtraction is reset to be equal to half the minimum positive value in
445
-#' a block.  If method="movingmin_value" then any intensity which is negative after background subtraction is reset to the minimum positive value
446
-#' in a block. For \code{edwards} we implement a similar algorithm with \code{limma::backgroundCorrect(method="edwards")} and for \code{'normexp'}
447
-#' we use  the saddle-point approximation to maximum likelihood, \code{\link[limma]{backgroundCorrect}} for more details.
448
-#' @description  A generic function to perform background correction.
449
-#' @return A data frame with background corrected data
450
-#' @export
451
-#' @import  dplyr limma
452
-#' @importFrom rlang sym
453
-#' @examples
454
-#' ## Not run:
455
-#' genepix_vars <- array_vars(
456
-#'   channel = "635",
457
-#'   chip_path = system.file("extdata", "array_data/machine1/", package="protGear"),
458
-#'   totsamples = 21,
459
-#'   blockspersample = 2,
460
-#'   mig_prefix = "_first",
461
-#'   machine = 1,
462
-#'   ## optional
463
-#'   date_process = "0520"
464
-#' )
465
-#' raw_df <- readr::read_csv(system.file("extdata", "Data1_bg_sample.csv", package="protGear"))
466
-#' bg_correct(iden="iden",
467
-#' Data1 = raw_df,
468
-#' genepix_vars = genepix_vars, method="subtract_local"
469
-#' )
470
-#' ## End(Not run)
471
-bg_correct <-
472
-  function(iden, Data1, genepix_vars, method = "subtract_local") {
473
-    #----------------------------------------------------------------------------------------------------
474
-    if (paste0(genepix_vars$BG) %ni% names(Data1)) {
475
-      genepix_vars$BG <-
476
-        names(Data1)[names(Data1) %like% paste0(genepix_vars$BG, '$')]
477
-      genepix_vars$BG <- rlang::sym(genepix_vars$BG)
478
-    }
479
-
480
-    if (paste0(genepix_vars$FG) %ni% names(Data1)) {
481
-      genepix_vars$FG <-
482
-        names(Data1)[names(Data1) %like% paste0(genepix_vars$FG, '$')]
483
-      genepix_vars$FG <- rlang::sym(genepix_vars$FG)
484
-    }
485
-
486
-
487
-    ##save the MFI values without formatting the background
488
-    data1_full_bg <- Data1 %>%
489
-      dplyr::select(sampleID,
490
-                    antigen = Name,
491
-                    FMedian = !!genepix_vars$FG) %>%
492
-      mutate(replicate = 1:n()) %>%
493
-      ## removing Land mark and Buffer
494
-      filter(
495
-        !grepl(
496
-          '^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[bB][Uu][Ff][Ff][Ee][Rr]',
497
-          antigen
498
-        )
499
-      )
500
-    # %>%     spread(antigen, F635Median)
501
-    # combine Name and replicate
502
-    ## save in the background data folder files with higher background values
503
-    file_ident <- paste0(iden, "_raw_MFI_BG", ".csv")
504
-
505
-    #create_dir(path = system.file("processe_data/raw_MFI_BG/"))
506
-    #write_csv(data1_full_bg ,system.file("processed_data/raw_MFI_BG/", 'file_ident', package="protGear"))
507
-    #----------------------------------------------------------------------------------------------------
508
-    if (method == "none" | method == "") {
509
-      #----------------------------------------------------------------------------------------------------
510
-      ##MFI values without subtracting the background
511
-      Data1 <- Data1 %>%
512
-        dplyr::select(
513
-          sampleID,
514
-          sample_array_ID,
515
-          antigen = Name,
516
-          FMedian = !!genepix_vars$FG ,
517
-          BGMedian = !!genepix_vars$BG,
518
-          Block,
519
-          Column,
520
-          Row
521
-        ) %>%
522
-        mutate(FMedianBG_correct = FMedian) %>%
523
-        mutate(replicate = 1:n()) #%>%
524
-      #filter(!grepl('^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[bB][Uu][Ff][Ff][Ee][Rr]', antigen))
525
-      #----------------------------------------------------------------------------------------------------
526
-
527
-    } else if (method == "subtract_local") {
528
-      ## this approach subtracts the local background estimated by the Array Jet Machine
529
-      #----------------------------------------------------------------------------------------------------
530
-      ##save the MFI values with subtracting the background
531
-      Data1 <- Data1 %>%
532
-        dplyr::mutate(FMedianBG_correct = (!!genepix_vars$FG) - (!!genepix_vars$BG)) %>%
533
-        dplyr::select(
534
-          sampleID,
535
-          sample_array_ID,
536
-          antigen = Name,
537
-          FMedian = !!genepix_vars$FG,
538
-          BGMedian = !!genepix_vars$BG,
539
-          FMedianBG_correct,
540
-          Block,
541
-          Column,
542
-          Row
543
-        ) %>%
544
-        dplyr::mutate(replicate = 1:n())
545
-      #%>%
546
-      # filter(!grepl('^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[bB][Uu][Ff][Ff][Ee][Rr]', antigen))
547
-      #----------------------------------------------------------------------------------------------------
548
-
549
-    } else if (method == "subtract_global") {
550
-      ## this approach subracts the median of the backgrounds in a slide
551
-      #----------------------------------------------------------------------------------------------------
552
-      ##save the MFI values with subtracting the background
553
-      Data1 <- Data1 %>%
554
-        mutate(FMedianBG_correct = !!genepix_vars$FG - global_BGMedian) %>%
555
-        dplyr::select(
556
-          sampleID,
557
-          sample_array_ID,
558
-          antigen = Name,
559
-          FMedian = !!genepix_vars$FG,
560
-          BGMedian = !!genepix_vars$BG,
561
-          FMedianBG_correct,
562
-          Block,
563
-          Column,
564
-          Row
565
-        ) %>%
566
-        mutate(replicate = 1:n())
567
-      #%>%
568
-      # filter(!grepl('^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[bB][Uu][Ff][Ff][Ee][Rr]', antigen))
569
-      #----------------------------------------------------------------------------------------------------
570
-    } else if (method == "movingmin_bg") {
571
-      ## this is subtracted
572
-      Data1 <- Data1 %>%
573
-        mutate(FMedianBG_correct = !!genepix_vars$FG-!!genepix_vars$BG) %>%
574
-        ## this is generated while reading the array files using read_array_files function
575
-        mutate(FMedianBG_correct = !!genepix_vars$FG - minimum_BGMedian) %>%
576
-        dplyr::select(
577
-          sampleID,
578
-          sample_array_ID,
579
-          antigen = Name,
580
-          FMedian = !!genepix_vars$FG,
581
-          BGMedian = !!genepix_vars$BG,
582
-          FMedianBG_correct,
583
-          Block,
584
-          Column,
585
-          Row
586
-        ) %>%
587
-        mutate(replicate = 1:n())
588
-      # %>%
589
-      # filter(!grepl('^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[bB][Uu][Ff][Ff][Ee][Rr]', antigen))
590
-
591
-    } else if (method == "minimum_half") {
592
-      ## this approach ensures all the MFI values are positive
593
-      ## if the MFI <0 after subtraction the MFI is set to the half of the minimum corrected intensities
594
-      #----------------------------------------------------------------------------------------------------
595
-      ##save the MFI values with subtracting the background
596
-      Data1 <- Data1 %>%
597
-        mutate(FMedianBG_correct = !!genepix_vars$FG-!!genepix_vars$BG) %>%
598
-        dplyr::select(
599
-          sampleID,
600
-          sample_array_ID,
601
-          antigen = Name,
602
-          FMedian = !!genepix_vars$FG,
603
-          BGMedian = !!genepix_vars$BG,
604
-          FMedianBG_correct,
605
-          Block,
606
-          Column,
607
-          Row
608
-        )  %>%
609
-        group_by(Block) %>%
610
-        mutate(FMedianBG_correct = ifelse(
611
-          FMedianBG_correct < 0.1,
612
-          (minpositive(FMedianBG_correct) /
613
-             2),
614
-          FMedianBG_correct
615
-        )) %>%
616
-        group_by(sampleID, antigen) %>%
617
-        mutate(replicate = 1:n())
618
-      #----------------------------------------------------------------------------------------------------
619
-    } else if (method == "minimum_value") {
620
-      ## this approach ensures all the MFI values are positive
621
-      ## if the MFI <0 after subtraction the MFI is set to the minimum of the corrected intensities
622
-      #----------------------------------------------------------------------------------------------------
623
-      Data1 <- Data1 %>%
624
-        mutate(FMedianBG_correct = !!genepix_vars$FG-!!genepix_vars$BG) %>%
625
-        dplyr::select(
626
-          sampleID,
627
-          sample_array_ID,
628
-          antigen = Name,
629
-          FMedian = !!genepix_vars$FG,
630
-          BGMedian = !!genepix_vars$BG,
631
-          FMedianBG_correct,
632
-          Block,
633
-          Column,
634
-          Row
635
-        )  %>%
636
-        group_by(Block) %>%
637
-        mutate(FMedianBG_correct = ifelse(FMedianBG_correct < 0.1,
638
-                                          (minpositive(
639
-                                            FMedianBG_correct
640
-                                          )), FMedianBG_correct)) %>%
641
-        group_by(sampleID, antigen) %>%
642
-        mutate(replicate = 1:n())
643
-
644
-      #----------------------------------------------------------------------------------------------------
645
-    } else if (method == "edwards") {
646
-      #a log-linear interpolation method is used to adjust lower intensities as in Edwards (2003).
647
-      Data1 <- Data1 %>%
648
-        mutate(FMedianBG_correct = !!genepix_vars$FG-!!genepix_vars$BG)
649
-      one <- matrix(1, nrow(Data1), 1)
650
-      delta.vec <- function(d, f = 0.1) {
651
-        ##  mean(d < 1e-16, na.rm = TRUE) % of values that are negative
652
-        ## mean(d < 1e-16, na.rm = TRUE) * (1 + f) the % of values just above the negative values
653
-        ## gives the quartile cut off value of the threshold
654
-        quantile(d,
655
-                 probs = mean(d < 1e-16, na.rm = TRUE) * (1 + f),
656
-                 na.rm = TRUE)
657
-      }
658
-      #delta <- one %*% apply(as.matrix(Data1[['FMedianBG_correct']]), 2, delta.vec)
659
-      ## no need to multiply with 1 since its returning the same value and we want to implement in a data frame
660
-      delta <-
661
-        apply(as.matrix(Data1[['FMedianBG_correct']]), 2, delta.vec)
662
-
663
-      ## each value its given its own value accordingly
664
-      ## this helps maintain the variation
665
-      Data1 <- Data1 %>%
666
-        dplyr::select(
667
-          sampleID,
668
-          sample_array_ID,
669
-          antigen = Name,
670
-          FMedian = !!genepix_vars$FG,
671
-          FMedianBG_correct,
672
-          BGMedian = !!genepix_vars$BG,
673
-          Block,
674
-          Column,
675
-          Row
676
-        ) %>%
677
-        group_by(Block) %>%
678
-        mutate(FMedianBG_correct = ifelse(FMedianBG_correct < delta,
679
-                                          (delta * exp(
680
-                                            1 - (BGMedian + delta) / FMedian
681
-                                          )), FMedianBG_correct)) %>%
682
-        group_by(sampleID, antigen) %>%
683
-        dplyr::mutate(replicate = 1:n())
684
-    } else if (method == "normexp") {
685
-      ##a convolution of normal and exponential distributions is fitted to the foreground intensities using
686
-      #the background intensities as a covariate, and the expected signal given the observed foreground becomes
687
-      #the corrected intensity. This results in a smooth monotonic transformation of the background subtracted
688
-      #intensities such that all the corrected intensities are positive.
689
-      ##Both norm exp are implemented in Limma for DNA micro array data
690
-      Data1 <- Data1 %>%
691
-        mutate(FMedianBG_correct = !!genepix_vars$FG-!!genepix_vars$BG)
692
-      E <- as.matrix(Data1[['FMedianBG_correct']])
693
-      rownames(E) <- rownames(Data1)
694
-
695
-      ## here we use Here "saddle" gives the saddle-point approximation to maximum likelihood from
696
-      # Ritchie et al (2007) and improved by Silver et al (2009) --> check limma for details
697
-      ## can we use offset--> updates
698
-      bg_correct <-
699
-        limma::backgroundCorrect.matrix(
700
-          E = E,
701
-          method = "auto",
702
-          offset = 0,
703
-          printer = NULL,
704
-          normexp.method = "saddle",
705
-          verbose = TRUE
706
-        )
707
-      bg_correct <- data.frame(FMedianBG_correct = bg_correct)
708
-
709
-      ## select the important variables and join with the background corrected data
710
-      Data1 <- Data1 %>%
711
-        dplyr::select(
712
-          sampleID,
713
-          sample_array_ID,
714
-          antigen = Name,
715
-          FMedian = !!genepix_vars$FG,
716
-          BGMedian = !!genepix_vars$BG,
717
-          Block,
718
-          Column,
719
-          Row
720
-        ) %>%
721
-        bind_cols(bg_correct) %>%
722
-        group_by(sampleID, antigen) %>%
723
-        dplyr::mutate(replicate = 1:n())
724
-    }
725
-    #Data1 <- Data1 %>% rename(F635MedianB635=F635.Median...B635)
726
-
727
-    return(Data1)
728
-  }
729
-
730
-
731
-
732
-
733
-#' Merge sample ID with the array data
734
-#'
735
-#' @param iden A character indicating the name of the object to be used under data_files.
736
-#' @param data_files A list of data objects with names utilised by iden.
737
-#' @param genepix_vars A list of specific definitions of the experiment design. See \code{\link{array_vars}}.
738
-#' @param method A description of the background correction to be used. See \code{\link{bg_correct}}.
739
-#' @description  A generic function that merges the protein data with the sample identifiers or sample names. If the file
740
-#' does not have sample identifiers the function generates it automatically.
741
-#' @return a data frame merged with corresponding sample ID's. The sample ID are specified in the sample ID files
742
-#' @export
743
-#' @import dplyr
744
-#' @importFrom purrr set_names
745
-#' @examples
746
-#' ## Not run:
747
-#' ### Define the genepix_vars
748
-#' genepix_vars <- array_vars(
749
-#'   channel = "635",
750
-#'   chip_path = system.file("extdata", "array_data/machine1/", package="protGear"),
751
-#'   totsamples = 21,
752
-#'   blockspersample = 2,
753
-#'   mig_prefix = "_first",
754
-#'   machine = 1,
755
-#'   ## optional
756
-#'   date_process = "0520"
757
-#' )
758
-#'
759
-#' ## the path where the micro-array data is located
760
-#' data_path <- paste0(genepix_vars$chip_path)
761
-#' filenames <- list.files(genepix_vars$chip_path,
762
-#'                         pattern = "*.txt$|*.gpr$", full.names = FALSE
763
-#' )
764
-#' ## create a list of all the files
765
-#' data_files <- purrr::map(
766
-#'  .x = filenames,
767
-#'   .f = read_array_files,
768
-#'   data_path = data_path,
769
-#'   genepix_vars = genepix_vars
770
-#' )
771
-#' data_files <- purrr::set_names(data_files, purrr::map(filenames, name_of_files))
772
-#' ## merge the lab data with samples and perform bg correction
773
-#' merge_sampleID(iden = "KK2-06", data_files = data_files,
774
-#'                genepix_vars =genepix_vars,method = "subtract_global" )
775
-#' ## End(Not run)
776
-merge_sampleID <- function(iden, data_files, genepix_vars, method)
777
-{
778
-  ## read in the sample ID files
779
-  ## this can be pulled from a mysql table
780
-  if (file.exists(file.path(genepix_vars$sampleID_path, paste0(iden , ".csv")))) {
781
-    arraynames <-
782
-      read.csv(
783
-        file.path(genepix_vars$sampleID_path, paste0(iden , ".csv")) ,
784
-        header = TRUE ,
785
-        stringsAsFactors = FALSE ,
786
-        colClasses = "character"
787
-      )
788
-  } else{
789
-    warning(iden,
790
-            ".csv Not found in the sampleID files",
791
-            genepix_vars$sampleID_path)
792
-    arraynames <-
793
-      data.frame(
794
-        v1 = (1:genepix_vars$totsamples) ,
795
-        v2 = paste0("SID_gen", 1:genepix_vars$totsamples),
796
-        barcode = iden
797
-      )
798
-  }
799
-
800
-  ## replicate to the number of blocks
801
-  ## make sure the block is arranged before merging with the data file
802
-  arraynames <- arraynames %>%
803
-    dplyr::select(v1, v2) %>%
804
-    arrange(as.numeric(v1))
805
-
806
-  ## capture errors for same sample ID in a slide
807
-  if (length(unique(arraynames$v2)) < genepix_vars$totsamples) {
808
-    sink("log_replicates.txt" , append = TRUE)
809
-    warning("Most likely there is a repeated sample name for " , iden)
810
-    sink()
811
-    arraynames <- arraynames %>%
812
-      group_by(v2) %>%
813
-      mutate(index = 1:n()) %>%
814
-      mutate(v2 = ifelse(index > 1, paste0(v2, "_", index), v2)) %>%
815
-      select(-index)
816
-  }
817
-
818
-
819
-  ## get the data from the loop
820
-  ## extract the specific data from the data files
821
-  data <- data_files[[iden]]
822
-
823
-  ## pick the spots per block from the data file
824
-  spotsperblock <- table(data$Block)[[1]]
825
-  sampleID <-
826
-    rep(arraynames$v2, each = spotsperblock * genepix_vars$blockspersample)
827
-  sample_array_ID <-
828
-    rep(arraynames$v1, each = spotsperblock * genepix_vars$blockspersample)
829
-
830
-
831
-  Data1 <- data %>%
832
-    # assign respective sample number to each row
833
-    mutate(
834
-      sample = rep(
835
-        1:genepix_vars$totsamples,
836
-        each = spotsperblock * genepix_vars$blockspersample
837
-      ),
838
-      # Bring in the sampleIDs..192 each sample
839
-      sampleID = sampleID,
840
-      sample_array_ID = sample_array_ID,
841
-      # Abit of formating the Antigen names and concentration
842
-      Name = gsub(':', '', Name),
843
-      Name = gsub('\n', '', Name),
844
-      Name = gsub(' ', '', Name)
845
-    )  %>%
846
-    ##remove uneccessary concs
847
-    ## filter(!grepl('Landmark|Buffer|IgG', Name))   %>%
848
-    # group by iden and antigen name
849
-    group_by(sampleID, Name)
850
-
851
-  ## DO the background correction
852
-  ## specify the
853
-  Data1 <- bg_correct(iden, Data1 , genepix_vars, method = method)
854
-  Data1 <- Data1 %>%  mutate(iden = iden)
855
-  return(Data1)
856
-}
857
-
858
-
859
-
860
-#' Read a gpr file to visualize
861
-#'
862
-#' @param infile a .gpr file to be used to visualize the expression intensities of the slide spots
863
-#'
864
-#' @return a data frame to visualize the background or foreground values
865
-#' @export
866
-#' @importFrom  data.table fread
867
-#' @examples
868
-#' ## Not run:
869
-#' read_array_visualize(infile = system.file("extdata",
870
-#' "/array_data/machine1/KK2-06.txt", package="protGear"))
871
-#' ## End(Not run)
872
-read_array_visualize <- function(infile) {
873
-  x <- grep('Block.*Column|Column.*Block', readLines(infile))
874
-  # d_f <- read.table(inFile$datapath,skip=x-1, header = T)
875
-  d_f <- data.table::fread(infile, skip = x - 1, header = TRUE)
876
-  return(d_f)
877
-}
878
-
879
-#' Visualize the slide mimicking the original scan image.
880
-#'
881
-#' @param infile a .gpr file to be used to visualize the expression intensities of the slide spots
882
-#' @param MFI_var the MFI variable to plot, can be either the background or foreground value
883
-#' @param d_f  a data frame with array data
884
-#' @param interactive a logical to specify whether an interactive graph is returned or not
885
-#'
886
-#' @import htmltools ggplot2
887
-#' @importFrom plotly ggplotly
888
-#' @return A ggplot of slide foreground values
889
-#' @export
890
-#'
891
-#' @examples
892
-#' ## Not run:
893
-#' visualize_slide(
894
-#' infile = system.file("extdata", "/array_data/machine1/KK2-06.txt", package="protGear"),
895
-#' MFI_var = "B635 Median"
896
-#' )
897
-#' ## End(Not run)
898
-visualize_slide <-
899
-  function(infile,
900
-           MFI_var,
901
-           interactive = FALSE,
902
-           d_f = NA) {
903
-    ## d_f only used for the shiny app
904
-    if (is.na(d_f)) {
905
-      d_f <- read_array_visualize(infile)
906
-
907
-      d_f <- d_f %>%
908
-        group_by(Block) %>%
909
-        mutate(
910
-          meanX = mean(X),
911
-          meanY = mean(Y),
912
-          maxY = max(Y),
913
-          maxX = max(X),
914
-          minY = min(Y),
915
-          minX = min(X)
916
-        )
917
-    }
918
-    ## define mid points to put the block labels
919
-    MFI_var_sys <- rlang::sym(MFI_var)
920
-    mid <- median(log(d_f[[MFI_var]]))
921
-    labels <- sprintf(
922
-      "<b>%s</b><br> MFI= %s ",
923
-      d_f$Name,
924
-      formatC(d_f$`F635 Median`, format = "d", big.mark = ",")
925
-    ) %>%
926
-      lapply(htmltools::HTML)
927
-
928
-    point_size <- 0.5
929
-    if (interactive == FALSE)
930
-      point_size <- 1
931
-    ## plot the visual slide
932
-    p <- ggplot(d_f, aes(x = X, y = -Y, text = labels)) +
933
-      #geom_rect(aes(xmin = minX, xmax = maxX, ymin = -minY, ymax = -maxY),color = "black",alpha=0.0001,fill="blue") +
934
-      geom_point(size = point_size, aes_string(colour = sprintf("log(`%s`)", MFI_var))) +
935
-      theme_void() +
936
-      theme(legend.position = "none")  +
937
-      scale_color_gradient2(
938
-        midpoint = mid,
939
-        low = "blue",
940
-        mid = "white",
941
-        high = "red",
942
-        space = "Lab"
943
-      ) +
944
-      geom_text(aes(
945
-        x = meanX,
946
-        y = -meanY,
947
-        label = paste("Block", Block)
948
-      ),
949
-      color = "black",
950
-      size = 2)
951
-
952
-    if (interactive == FALSE) {
953
-      return(p)
954
-    } else if (interactive == TRUE) {
955
-      p <- ggplotly(p, tooltip = 'text')
956
-      return(p)
957
-    }
958
-  }
959
-
960
-
961
-#' Visualize the slide mimicking the original scan image using a 2d plot.
962
-#'
963
-#' @param infile - a .gpr file to be used to visualize the expression intensities of the slide spots
964
-#' @param MFI_var the MFI variable to plot, can be either the background or foreground value
965
-#' @param d_f a data frame with array data
966
-#'
967
-#' @return A 2d plot of either the background or foreground values
968
-#' @export
969
-#' @import ggplot2
970
-#' @examples
971
-#' ## Not run:
972
-#' visualize_slide_2d(
973
-#' infile = system.file("extdata", "/array_data/machine1/KK2-06.txt", package="protGear"),
974
-#' MFI_var = "B635 Median"
975
-#' )
976
-#' ## End(Not run)
977
-visualize_slide_2d <- function(infile, MFI_var , d_f = NA) {
978
-  if (is.na(d_f)) {
979
-    d_f <- read_array_visualize(infile)
980
-
981
-    d_f <- d_f %>%
982
-      group_by(Block) %>%
983
-      mutate(
984
-        meanX = mean(X),
985
-        meanY = mean(Y),
986
-        maxY = max(Y),
987
-        maxX = max(X),
988
-        minY = min(Y),
989
-        minX = min(X)
990
-      )
991
-  }
992
-
993
-
994
-  mid <- median(log(d_f[[MFI_var]]))
995
-  ggplot(data = d_f,
996
-         aes_string(
997
-           x = 'X',
998
-           y = sprintf("-%s", 'Y'),
999
-           #-Y,
1000
-           z =  sprintf("log(`%s`)", MFI_var)
1001
-         )) + #log(`F635 Median`)
1002
-    ## we have 24 vs 8 per block
1003
-    #stat_summary_2d(fun = median ,binwidth = c(40,120)) +
1004
-    stat_summary_2d(fun = median) +
1005
-    scale_fill_gradient2(
1006
-      midpoint = mid,
1007
-      low = "blue",
1008
-      mid = "white",
1009
-      high = "red",
1010
-      space = "Lab"
1011
-    ) +
1012
-    theme_void() +
1013
-    theme(legend.position = "none") +
1014
-    geom_text(aes(
1015
-      x = meanX,
1016
-      y = -meanY,
1017
-      label = paste("Block", Block)
1018
-    ),
1019
-    color = "black",
1020
-    size = 4)
1021
-}
1022 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "7E08C132",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/cv_estimation_tag_functions.R",
4
-    "project_path": "R/cv_estimation_tag_functions.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647521308629.0,
10
-    "source_on_save": false,
11
-    "relative_order": 12,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "326,0",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647521319,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647521319203,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,326 +0,0 @@
1
-#' Coefficient of Variation
2
-#' @title  cv_estimation
3
-#'
4
-#' @param dataC A dataset a data frame with feature variables to be used
5
-#' @param lab_replicates A numeric value indicating the number of lab replicates.
6
-#' @param sampleID_var A character string containing the name of the sample identifier variable. Default set to 'sampleID'
7
-#' @param antigen_var A character string containing the name of the features/protein variable. Default to 'antigen'
8
-#' @param replicate_var A character string containing the name of the replicate variable. Default to 'replicate'
9
-#' @param mfi_var A character string containing the name of the variable with MFI value.Assuming background correction is done already.
10
-#' Default to 'FMedianBG_correct'
11
-#' @param cv_cut_off Optional value indicating the cut off of flagging CV's. Default set at 20.
12
-#' @description A function to calculate the CV for the technical lab replicates. The default values are set as per the object names generated by machine
13
-#' @import dplyr tidyr
14
-#' @importFrom  rlang sym
15
-#' @importFrom tidyr gather spread
16
-#' @return A data frame where CV's of the replicates have been calculated
17
-#' @export
18
-#'
19
-#' @examples
20
-#' dataC <- readr::read_csv(system.file("extdata", "dataC.csv", package="protGear"))
21
-#' ## this file has 3 lab replicates and the default names
22
-#' cv_estimation(dataC  ,lab_replicates=3)
23
-cv_estimation <-
24
-  function(dataC  ,
25
-           lab_replicates ,
26
-           sampleID_var = 'sampleID',
27
-           antigen_var = 'antigen' ,
28
-           replicate_var = 'replicate' ,
29
-           mfi_var = 'FMedianBG_correct' ,
30
-           cv_cut_off = 20) {
31
-    if (lab_replicates == 1) {
32
-      dataC <- dataC %>%
33
-        dplyr::filter(
34
-          !grepl(
35
-            '^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]|^[Bb][Uu][Ff][Ff][Ee][Rr]',
36
-            !!antigen_var
37
-          )
38
-        )   %>%
39
-        filter(replicate <= lab_replicates)
40
-
41
-      # combine Name and replicate
42
-      #tidyr::unite(col=antigen,antigen,replicate)
43
-      # dataC <- dataC %>%
44
-      #   dplyr::select(sampleID,sample_array_ID, antigen,iden,FMedianBG_correct)
45
-      ## get the name of the file
46
-      iden <- unique(dataC$iden)
47
-      # pick the values after the last underscore
48
-      ## to get the replicate
49
-      #dataC$replicate <- sub(".*_(.*)", "\\1", dataC$antigen)
50
-
51
-      ## create a wide data to
52
-      if (length(unique(dataC$replicate)) > lab_replicates)  {
53
-        try(stop("Some antigens seems to be repeated in a mini array for", iden) , outFile = stdout())
54
-        error_replicates(iden)
55
-      }
56
-
57
-    } else if (lab_replicates > 1) {
58
-      ## Exclude the land mark and Buffer
59
-      dataC <- dataC %>%
60
-        dplyr::filter(
61
-          !grepl(
62
-            '^[Ll][Aa][Nn][Dd][Mm][Aa][Rr][Kk]| ^[Bb][Uu][Ff][Ff][Ee][Rr]',
63
-            !!antigen_var
64
-          )
65
-        )   %>%
66
-        dplyr::filter(!!sym(replicate_var) <= lab_replicates)
67
-
68
-      #combine Name and replicate
69
-      #tidyr::unite(col=antigen,antigen,replicate)
70
-
71
-      ## get the name of the file
72
-      iden <- unique(dataC$iden)
73
-      # pick the values after the last underscore
74
-      ## to get the replicate
75
-      # dataC$replicate <- sub(".*_(.*)", "\\1", dataC$antigen)
76
-      ### pick the nname of the antigen
77
-      ## values until the last underscore
78
-      #dataC$antigen <- sub("\\_[^\\_]*$" , "", dataC$antigen)
79
-
80
-      ## create a wide data to
81
-
82
-      if (length(unique(dataC$replicate)) > lab_replicates)  {
83
-        try(stop("The replicates per antigen per sample are more than expected for ",
84
-                 iden) , outFile = stdout())
85
-        error_replicates(iden)
86
-      } else
87
-        (warning("The replicates are as expected per sample per antigen"))
88
-
89
-
90
-      ## reshaping the data
91
-      Data3 <- dataC %>%
92
-        dplyr::select(contains('iden'),contains('sample_array_ID'),!!sampleID_var, !!mfi_var, !!replicate_var,!!antigen_var) %>%
93
-        spread(replicate, FMedianBG_correct)
94
-
95
-
96
-      ### group the antigens and calculate their CV values
97
-      ## the CV values > 20, get the minimum mean of each of the two mfi values and use it to calculate CV
98
-      ## select the one with lowest CV
99
-      dataC <- dataC %>%
100
-        dplyr::group_by_at(c(antigen_var,sampleID_var)) %>%
101
-        ###mean and then for each grouping of 2
102
-        summarize(meanX=mean(get(mfi_var) , na.rm=TRUE),
103
-                  meanX2_X3=mean(get(mfi_var)[-1] , na.rm=TRUE),
104
-                  meanX1_X3=mean(get(mfi_var)[-2] , na.rm=TRUE),
105
-                  meanX1_X2=mean(get(mfi_var)[-3] , na.rm=TRUE),
106
-                  ###standard deviation and then for each grouping of 2
107
-                  sdX=round(sd(get(mfi_var) , na.rm=TRUE),2),
108
-                  sdX2_X3=round(sd(get(mfi_var)[-1] , na.rm=TRUE),2),
109
-                  sdX1_X3=round(sd(get(mfi_var)[-2] , na.rm=TRUE),2),
110
-                  sdX1_X2=round(sd(get(mfi_var)[-3] , na.rm=TRUE),2),
111
-                  ### cv
112
-                  CVX=(round(sdX/meanX,4))*100 ,
113
-                  CVX2_X3=(round(sdX2_X3/meanX2_X3,4))*100 ,
114
-                  CVX1_X3=(round(sdX1_X3/meanX1_X3,4))*100 ,
115
-                  CVX1_X2=(round(sdX1_X2/meanX1_X2,4))*100) %>%
116
-        mutate(cvCat_all = ifelse(CVX>=0 & CVX<=cv_cut_off , paste0("CV <= ",cv_cut_off),
117
-                                  ifelse(CVX >cv_cut_off & CVX <101, paste0("CV > ",cv_cut_off),"Others")))   %>%
118
-        mutate(cvSelected_all = ifelse(CVX>=0 & CVX<=cv_cut_off , CVX,
119
-                                       ifelse(CVX >cv_cut_off | CVX<=0 ,pmin(CVX2_X3, CVX1_X3,CVX1_X2 , na.rm = TRUE),NA)))
120
-      dataC <- dataC %>%
121
-        left_join(Data3, by=c(antigen_var,sampleID_var)) %>%
122
-        select(antigen, sampleID, sample_array_ID,everything())
123
-    }
124
-    return(dataC)
125
-  }
126
-
127
-
128
-#' Select set of replicates with the best CV
129
-#' @title best CV estimation
130
-#'
131
-#' @param dataCV A data frame
132
-#' @param slide_id A character string containing the identifier of the data frame variable.
133
-#' @param cv_cut_off a numeric value for the CV cut off. Should be between 0-100
134
-#' @param lab_replicates A numeric value indicating the number of lab replicates.
135
-#'
136
-#' @description A function to select the best CV by combining the replicates in duplicates. The function has been build for up to to 3 replicates so far
137
-#' @import dplyr tidyr
138
-#' @importFrom tidyr gather spread
139
-#' @importFrom dplyr mutate select
140
-#' @importFrom readr read_csv
141
-#' @importFrom  plyr .
142
-#' @return A data frame with the best CV's estimated
143
-#' @export
144
-#'
145
-#' @examples
146
-#' dataC <- readr::read_csv(system.file("extdata", "dataC.csv", package="protGear"))
147
-#' ## this file has 3 lab replicates and the default names
148
-#' dataCV <- cv_estimation(dataC  ,lab_replicates=3)
149
-#' best_CV_estimation(dataCV,slide_id = "iden", lab_replicates = 3, cv_cut_off = 20)
150
-best_CV_estimation <-
151
-  function(dataCV,
152
-           slide_id,
153
-           lab_replicates ,
154
-           cv_cut_off) {
155
-    if (lab_replicates > 1) {
156
-      ### Get the mean that corresponds to the lowest CV
157
-      iden <- unique(dataCV[[slide_id]])
158
-      data_best_CV <- as.data.frame(dataCV)
159
-
160
-      ## changing NaN values to 0 to facilitate computation
161
-      is.nan.data.frame <-
162
-        function(x)
163
-          do.call(cbind, lapply(x, is.nan))
164
-      data_best_CV[is.nan.data.frame(data_best_CV)] <- 0
165
-
166
-      ## get the minumum cv and put the value of the as a string variable
167
-      ## ie CVX3 selected meanX3 value will be created
168
-      ## might bring issues here if the subtraction has a NA or its missing for the prescan
169
-      data_best_CV <- data_best_CV %>%
170
-        mutate(x = colnames(.[, c("CVX2_X3", "CVX1_X3", "CVX1_X2")])
171
-               [apply(.[, c("CVX2_X3", "CVX1_X3", "CVX1_X2")], 1, which.min)])  %>%
172
-        dplyr::mutate(xbar = paste0("meanX", gsub("CVX", "", x)))
173
-
174
-      ## get the actual value of the mean that corresponds to that
175
-      #http://stackoverflow.com/questions/43762869/get-row-value-corresponding-to-a-column-name
176
-      data_best_CV <- data_best_CV %>%
177
-        mutate(row = 1:n()) %>%
178
-        # select(-c(`1`,`2`,`3`,iden)) %>%
179
-        gather(prop, val, meanX1_X2:meanX2_X3) %>%
180
-        group_by(row) %>%
181
-        mutate(selected = val[xbar == prop]) %>%
182
-        spread(prop, val) %>% dplyr::select(-row)
183
-
184
-
185
-      ##create the final selected mean in the data set
186
-      data_best_CV <-  data_best_CV %>%
187
-        mutate(
188
-          meanSelected = ifelse(
189
-            CVX >= 0 & CVX <= cv_cut_off ,
190
-            meanX,
191
-            ifelse((CVX > cv_cut_off &
192
-                      CVX < 101)  | CVX <= 0 , selected, NA)
193
-          ),
194
-          mean_best_CV = selected
195
-        ) %>%
196
-        dplyr::select(-xbar, -selected)
197
-      data_best_CV <-  data_best_CV %>%
198
-        mutate(
199
-          best_CV = pmin(CVX2_X3, CVX1_X3, CVX1_X2, na.rm = TRUE) ,
200
-          best_CV_cat  = ifelse(
201
-            best_CV >= 0 & best_CV <= cv_cut_off ,
202
-            paste0("CV <= ", cv_cut_off),
203
-            ifelse(
204
-              best_CV > cv_cut_off &
205
-                best_CV < 101,
206
-              paste0("CV > ", cv_cut_off),
207
-              "Others"
208
-            )
209
-          )
210
-        )
211
-
212
-    } else if (lab_replicates == 1) {
213
-      ### Get the mean that corresponds to the lowest CV
214
-      iden <- unique(dataCV[[slide_id]])
215
-      data_best_CV <- as.data.frame(dataCV)
216
-
217
-      ## changing NaN values to 0 to facilitate computation
218
-      is.nan.data.frame <-
219
-        function(x)
220
-          do.call(cbind, lapply(x, is.nan))
221
-      data_best_CV[is.nan.data.frame(data_best_CV)] <- 0
222
-
223
-      ### generate the variabrl
224
-      data_best_CV <- data_best_CV %>%
225
-        mutate(mean_best_CV = FMedianBG_correct)
226
-    }
227
-
228
-    ## return the dataset of interest
229
-    return(data_best_CV)
230
-  }
231
-#'
232
-#'
233
-#'         \\\_End_Function_\\\         #
234
-#'
235
-#'
236
-
237
-
238
-
239
-
240
-#' Subtract the purification TAG data
241
-#' @title tag_subtract
242
-#'
243
-#' @param tag_antigens A character vector with the names of proteins or antigens used as TAG.
244
-#' @param mean_best_CV_var A character string containing the identifier of the variable with the MFI values.
245
-#' @param dataC_mfi A dataframe
246
-#' @param sampleID_var A character string containing the name of the sample identifier variable. Default set to 'sampleID'
247
-#' @param antigen_var A character string containing the name of the features/protein variable. Default to 'antigen'
248
-#' @param batch_vars A list of characters identifying variables in dataC_mfi for indicating  batch.
249
-#' @param tag_file A data frame with variables \code{antigen, TAG, TAG_name } to show the TAG for the different antigens or proteins in dataC_mfi
250
-#' @import dplyr tidyr
251
-#' @importFrom tidyr gather spread
252
-#' @importFrom rlang := !! UQ
253
-#' @return A data frame of TAG values subtracted
254
-#' @export
255
-#'
256
-#' @examples
257
-#' tag_file <- readr::read_csv(system.file("extdata", "TAG_antigens.csv", package="protGear"))
258
-#' tag_antigens <- c("CD4TAG", "GST", "MBP")
259
-#' batch_vars <- list(machine = "m1", day = "0520")
260
-#' dataC <- readr::read_csv(system.file("extdata", "dataC.csv", package="protGear"))
261
-#' ## this file has 3 lab replicates and the default names
262
-#' dataCV <- cv_estimation(dataC  ,lab_replicates=3)
263
-#' dataCV_best2 <- best_CV_estimation(dataCV,slide_id = "iden", lab_replicates = 3, cv_cut_off = 20)
264
-#' tag_subtract(dataCV_best2,tag_antigens=tag_antigens, mean_best_CV_var="mean_best_CV",
265
-#'  tag_file = tag_file,antigen_var = "antigen", batch_vars = batch_vars)
266
-tag_subtract <-
267
-  function(dataC_mfi,
268
-           tag_antigens,
269
-           mean_best_CV_var,
270
-           tag_file,
271
-           batch_vars,
272
-           sampleID_var = 'sampleID',
273
-           antigen_var = 'antigen') {
274
-    mean_best_CV_var <-  rlang::sym(paste0(mean_best_CV_var))
275
-    sampleID_var <-  rlang::sym(paste0(sampleID_var))
276
-    antigen_var <-  rlang::sym(paste0(antigen_var))
277
-    ## remove the tag for the data
278
-
279
-    dataC_mfi_tags <- dataC_mfi %>%
280
-      dplyr::filter(UQ(antigen_var) %in% tag_antigens) %>%
281
-      ungroup() %>%
282
-      dplyr::select(!!sampleID_var,!!antigen_var ,!!mean_best_CV_var) %>%
283
-      ##change all the negative TAG values to zero
284
-      mutate(!!mean_best_CV_var := ifelse(!!mean_best_CV_var < 0, 0, !!mean_best_CV_var)) %>%
285
-      spread(!!antigen_var ,!!mean_best_CV_var)
286
-
287
-    ## join the data with the fused antigen name for subtraction
288
-    dataC_mfi <-
289
-      left_join(dataC_mfi , tag_file, by = paste(antigen_var))
290
-
291
-    dataC_mfi <- dataC_mfi %>% ungroup() %>%
292
-      dplyr::select(
293
-        !!sampleID_var ,
294
-        contains('sample_array_ID'),
295
-        !!sampleID_var ,
296
-        TAG,
297
-        everything() ,
298
-        -row
299
-      )
300
-    dataC_mfi <- dataC_mfi %>% ungroup() %>%
301
-      left_join(x = dataC_mfi ,
302
-                y = dataC_mfi_tags ,
303
-                by = paste(sampleID_var)) %>%
304
-      mutate(TAG_mfi = NA)
305
-    vars_in <- names(dataC_mfi)
306
-
307
-
308
-    ## mutate the tag_mfi variable
309
-    for (i in tag_antigens) {
310
-      tag_var <- rlang::sym(paste0(i))
311
-      dataC_mfi <- dataC_mfi %>%
312
-        mutate(TAG_mfi = ifelse(TAG_name == i &
313
-                                  is.na(TAG_mfi), !!tag_var, TAG_mfi))
314
-    }
315
-
316
-    ## subtract the TAG values depending on the TAG of
317
-    mean_best_CV_tag_var <-
318
-      rlang::sym(paste0(mean_best_CV_var, "_tag"))
319
-    dataC_mfi <- dataC_mfi %>%
320
-      mutate(TAG_mfi = ifelse(is.na(TAG_mfi), 0, TAG_mfi)) %>%
321
-      mutate(!!mean_best_CV_tag_var := !!mean_best_CV_var - TAG_mfi)  %>%
322
-      dplyr::select(c(vars_in, paste(mean_best_CV_tag_var))) %>%
323
-      ## add the batch variables to the data
324
-      mutate(machine = batch_vars[["machine"]] , day = batch_vars[["day"]])
325
-    return(dataC_mfi)
326
-  }
327 0
deleted file mode 100644
... ...
@@ -1,24 +0,0 @@
1
-{
2
-    "id": "9120B816",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/git_bioc_link.txt",
4
-    "project_path": null,
5
-    "type": "text",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647626123539.0,
10
-    "source_on_save": false,
11
-    "relative_order": 14,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source"
15
-    },
16
-    "folds": "",
17
-    "lastKnownWriteTime": 1643058003,
18
-    "encoding": "UTF-8",
19
-    "collab_server": "",
20
-    "source_window": "",
21
-    "last_content_update": 1643058003,
22
-    "read_only": false,
23
-    "read_only_alternatives": []
24
-}
25 0
\ No newline at end of file
26 1
deleted file mode 100644
... ...
@@ -1,3 +0,0 @@
1
-ssh-keygen -t ed25519 -C "keniajin@gmail.com"
2
-
3
-https://bioconductor.org/developers/how-to/git/new-package-workflow/
4 0
\ No newline at end of file
5 1
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "991A0EA2",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/normalisation_functions.R",
4
-    "project_path": "R/normalisation_functions.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1644591162172.0,
10
-    "source_on_save": false,
11
-    "relative_order": 12,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "3,27",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647580101,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647580101,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,627 +0,0 @@
1
-#' Normalize  Arrays
2
-#'
3
-#' @param matrix_antigen An object of class matrix with  features/proteins as columns and samples as the rows
4
-#' @param method character string specifying the normalization method. Choices are \code{"none","log2","vsn","cyclic_loess"}
5
-#' \code{"cyclic_loess_log" ,"rlm"}
6
-#' @param batch_correct A logical value indicating whether batch correction should be done or not
7
-#' @param batch_var1 A character or factor vector of size similar to rows of \code{matrix_antigen} indicating the first batch.
8
-#' @param batch_var2 A character or factor vector of size similar to rows of \code{matrix_antigen} indicating the second batch.
9
-#' @param return_plot A logical value indicating whether a plot is returned to show the results of normalisation.
10
-#' @param control_antigens  logical vector specifying the subset of spots which are non-differentially-expressed control spots,
11
-#' for use with \code{method="rlm"}
12
-#' @param array_matrix  An object of class dataframe or matrix used with \code{method='rlm'} indicating the sample index and
13
-#' @param plot_by_antigen Logical to indicate whether to plot by antigen or not
14
-#' slide name for the matrix_antigen object.
15
-#' @import limma tibble vsn
16
-#' @return A data frame of normalised values
17
-#' @export
18
-#'
19
-#' @examples
20
-#' matrix_antigen <- readr::read_csv(system.file("extdata", "matrix_antigen.csv", package="protGear"))
21
-#' #VSN
22
-#' normlise_vsn <- matrix_normalise(as.matrix(matrix_antigen),
23
-#' method = "vsn",
24
-#' return_plot = TRUE
25
-#' )
26
-#' ## log
27
-#' normlise_log <- matrix_normalise(as.matrix(matrix_antigen),
28
-#' method = "log2",
29
-#' return_plot = TRUE
30
-#' )
31
-#' ## cyclic_loess_log
32
-#' normlise_cylic_log <- matrix_normalise(as.matrix(matrix_antigen),
33
-#' method = "cyclic_loess_log",
34
-#' return_plot = TRUE
35
-#' )
36
-matrix_normalise <-
37
-  function(matrix_antigen,
38
-           method = "log2",
39
-           batch_correct = FALSE,
40
-           batch_var1,
41
-           batch_var2 = day_batches,
42
-           return_plot = FALSE,
43
-           plot_by_antigen = TRUE,
44
-           control_antigens = NULL,
45
-           array_matrix = NULL) {
46
-    if (method == "log2") {
47
-      exprs_log <- matrix_antigen
48
-      ## since log can handle negative values , we convert all the negative values to a constant value
49
-      # exprs_log[exprs_log<0] <- 1
50
-
51
-      ## any value that is negative is allocated the smallest value in the batch
52
-      min_pos <- minpositive(exprs_log)
53
-      exprs_log <- replace(exprs_log, exprs_log < 1, min_pos)
54
-      exprs_normalised <- log2(exprs_log)
55
-
56
-
57
-    } else if (method == "none") {
58
-      exprs_normalised <- matrix_antigen
59
-
60
-    } else if (method == "vsn") {
61
-      ## this approach utilises the VSN package
62
-      ## structure the data to be normalised
63
-      ## convert it to an Expression Set
64
-      exprs_antigen <-
65
-        Biobase::ExpressionSet(assayData = matrix_antigen)
66
-
67
-      if (batch_correct == FALSE) {
68
-        ## normalise without adjusting for the strata on VSN
69
-        data_points <- dim(matrix_antigen)[[1]]
70
-        exprs_normalised <-  vsn::justvsn(x = exprs_antigen ,
71
-                                          minDataPointsPerStratum = data_points)
72
-      } else if (batch_correct == TRUE) {
73
-        exprs_normalised <-
74
-          vsn::justvsn(
75
-            x = exprs_antigen ,
76
-            strata = machines ,
77
-            minDataPointsPerStratum = data_points
78
-          )
79
-      }
80
-
81
-
82
-    } else if (method == "cyclic_loess") {
83
-      exprs_normalised <-  limma::normalizeCyclicLoess(
84
-        matrix_antigen,
85
-        weights = NULL,
86
-        span = 0.7,
87
-        iterations = 3,
88
-        method = "fast"
89
-      )
90
-    } else if (method == "cyclic_loess_log") {
91
-      exprs_normalised <-   limma::normalizeCyclicLoess(
92
-        matrix_antigen,
93
-        weights = NULL,
94
-        span = 0.7,
95
-        iterations = 3,
96
-        method = "fast"
97
-      )
98
-      ## any value that is negative is allocated the smallest value in the batch
99
-      min_pos <- minpositive(exprs_normalised)
100
-      exprs_normalised <-
101
-        replace(exprs_normalised, exprs_normalised < 1, min_pos)
102
-      exprs_normalised <- log2(exprs_normalised)
103
-    } else if (method == "rlm") {
104
-      if (!is.null(control_antigens)) {
105
-        exprs_df <-
106
-          rlm_normalise_matrix(
107
-            matrix_antigen = matrix_antigen,
108
-            array_matrix = array_matrix,
109
-            control_antigens = control_antigens
110
-          )
111
-        exprs_normalised <-   rlm_normalise(exprs_df)
112
-        exprs_normalised <-  exprs_normalised  %>%
113
-          gather(variable, value,-(antigen:sampleID2)) %>%
114
-          unite(temp, antigen) %>% select(-variable) %>%
115
-          spread(temp, value) %>%
116
-          column_to_rownames(var = "sampleID2")
117
-      } else if (is.null(control_antigens) | is.null(array_matrix)) {
118
-        stop("Specify the control antigens or array_matrix to use RLM")
119
-      }
120
-    }
121
-
122
-    cv_val <-
123
-      round(sd(as.matrix(exprs_normalised), na.rm = TRUE) / mean(as.matrix(exprs_normalised), na.rm = TRUE), 4) *
124
-      100
125
-    cv_val <- paste(cv_val, "%", sep = "")
126
-    #exprs_normalised_df <- as.data.frame(exprs_normalised)
127
-
128
-    ## return the data in the structure as it was supplied
129
-    if (method == "vsn") {
130
-      ## this kept if a different structure is considered
131
-      matrix_antigen_normalised <- t(as.data.frame(exprs_normalised))
132
-      row.names(matrix_antigen_normalised) <-
133
-        as.numeric(gsub("X", "", row.names(matrix_antigen_normalised)))
134
-      matrix_antigen_normalised <-
135
-        as.data.frame(matrix_antigen_normalised)
136
-    } else{
137
-      matrix_antigen_normalised <-  as.data.frame(exprs_normalised)
138
-      #matrix_antigen_normalised <- exprs_normalised_df
139
-    }
140
-
141
-    ## Whether to plot by sample or antigen
142
-    if (plot_by_antigen == TRUE) {
143
-      plot_normalisation <-
144
-        plot_normalised_antigen(
145
-          exprs_normalised_df = matrix_antigen_normalised,
146
-          method = method,
147
-          batch_correct = batch_correct
148
-        )
149
-    } else{
150
-      plot_normalisation <-
151
-        plot_normalised(
152
-          exprs_normalised_df = matrix_antigen_normalised,
153
-          method = method,
154
-          batch_correct = batch_correct
155
-        )
156
-    }
157
-
158
-
159
-
160
-    if (return_plot == TRUE) {
161
-      return(
162
-        list(
163
-          matrix_antigen_normalised = matrix_antigen_normalised,
164
-          plot_normalisation = plot_normalisation
165
-        )
166
-      )
167
-    } else {
168
-      return(matrix_antigen_normalised)
169
-    }
170
-
171
-  }
172
-
173
-
174
-
175
-
176
-
177
-
178
-
179
-
180
-
181
-#' Nomrmalise using RLM
182
-#' @param matrix_antigen A matrix with antigen data
183
-#' @param array_matrix A matrix with control antigen data
184
-#' @param control_antigens the control antigens for RLM normalisation
185
-#' @description  A function for \code{method='rlm'} from \code{\link{matrix_normalise}}.
186
-#' @return A RLM normalised data frame
187
-#' @import dplyr
188
-#' @export
189
-#'
190
-#' @examples
191
-#' matrix_antigen <- readr::read_csv(system.file("extdata", "matrix_antigen.csv", package="protGear"))
192
-#' # rlm_normalise_matrix(matrix_antigen=matrix_antigen, array_matrix=array_matrix,
193
-#' # control_antigens=control_antigens)
194
-rlm_normalise_matrix <-
195
-  function(matrix_antigen,
196
-           array_matrix,
197
-           control_antigens) {
198
-    array_matrix$sample_index <-
199
-      as.character(array_matrix$sample_index)
200
-
201
-    array_matrix <- array_matrix %>%
202
-      group_by(slide) %>%
203
-      mutate(Array = group_indices())
204
-
205
-    rlm_normalise_df <- as.data.frame.matrix(matrix_antigen) %>%
206
-      #dplyr::mutate(sample_index=row.names(matrix_antigen))
207
-      rownames_to_column(var = "sample_index")
208
-
209
-
210
-    rlm_normalise_df <- rlm_normalise_df %>%
211
-      select(sample_index, everything()) %>%
212
-      gather(antigen, MFI_val, -sample_index) %>%
213
-      left_join(array_matrix, by = "sample_index") %>%
214
-      rename(Block = sample_array_ID) %>%
215
-      mutate(Block = as.numeric(Block))
216
-
217
-
218
-    ## create a variable to indicate the control antigens
219
-    rlm_normalise_df <-   rlm_normalise_df %>%
220
-      mutate(Description = ifelse(antigen %in% all_of(control_antigens) , "Control", "Sample"))
221
-    return(rlm_normalise_df)
222
-  }
223
-
224
-
225
-#' RLM normalisation
226
-#'
227
-#' @param rlm_normalise_df rlm normalised data frame
228
-#' @description  A function for \code{method='rlm'} from \code{\link{matrix_normalise}}.
229
-#' @return an elist of RLM normalisation to be utilised by \code{\link{rlm_normalise_matrix}}
230
-#' @export
231
-#' @keywords internal
232
-#' @examples
233
-#' matrix_antigen <- readr::read_csv(system.file("extdata", "matrix_antigen.csv", package="protGear"))
234
-#' #rlm_normalise_df <- rlm_normalise_matrix(matrix_antigen=matrix_antigen, array_matrix=array_matrix,
235
-#' # control_antigens=control_antigens)
236
-#' # rlm_normalise(rlm_normalise_df)
237
-rlm_normalise <- function(rlm_normalise_df) {
238
-  rlm_normalise_C <- rlm_normalise_df %>%
239
-    filter(grepl("Control", Description))
240
-  rlm_normalise_E <- rlm_normalise_df %>%
241
-    filter(grepl("Sample", Description))
242
-
243
-
244
-  ## create the matrix of the control antigens
245
-  rlm_normalise_C <- rlm_normalise_C  %>%
246
-    select(Array, Block, antigen, MFI_val) %>%
247
-    spread(Array, MFI_val)
248
-
249
-  ## create the matrix of the sample antigens
250
-  rlm_normalise_E <- rlm_normalise_E  %>%
251
-    select(Array, Block, antigen, MFI_val) %>%
252
-    spread(Array, MFI_val)
253
-
254
-
255
-  ## create Elist for the controls and samples
256
-  sample_elist <- list(E = rlm_normalise_E %>%
257
-                         select(-c(Block, antigen)) ,
258
-                       genes =  rlm_normalise_E %>% select(Block:antigen))
259
-
260
-
261
-  controls_elist <- list(E = rlm_normalise_C %>%
262
-                           select(-c(Block, antigen)) ,
263
-                         genes =  rlm_normalise_C %>%
264
-                           select(Block:antigen))
265
-
266
-
267
-  ## changing less values to the lowest positivie MFI value
268
-  min_pos <- minpositive(controls_elist$E)
269
-  controls_elist$E <-
270
-    replace(controls_elist$E, controls_elist$E < 1, min_pos / 2)
271
-
272
-  ## change the control EList to log2 -->
273
-  ## RLM reccommends
274
-  controls_elist$E  <- as.matrix(controls_elist$E)
275
-  controls_elist$E <- log2(controls_elist$E)
276
-  rownames(controls_elist$E) <- controls_elist$genes$antigen
277
-
278
-  contr_names <- unique(rownames(controls_elist$E))
279
-  contr_names_len <- length(contr_names)
280
-  contr_mapping <- matrix(nrow = contr_names_len, ncol = 1)
281
-  rownames(contr_mapping) <- contr_names
282
-  contr_mapping[, 1] <- seq_len(contr_names_len)
283
-
284
-
285
-  ##the number of not control antigens
286
-  p_features <- nrow(sample_elist$E)
287
-  p_controls <- nrow(controls_elist$E)
288
-  n_arrays <- ncol(sample_elist$E)
289
-  n_blocks <- max(controls_elist$genes$Block , na.rm = TRUE)
290
-  y <- c(controls_elist$E)
291
-
292
-
293
-  ## should be contr_names_len - 3 ... check
294
-  ## should it be changed when the controls are more
295
-  dummies <- matrix(0, ncol = {
296
-    n_arrays + n_blocks + contr_names_len - 3
297
-  }, nrow = length(y))
298
-
299
-
300
-  rnames <- vector(length = length(y))
301
-  a_cols <- paste("a", 1:{
302
-    n_arrays - 1
303
-  }, sep = "")
304
-
305
-
306
-  b_cols <- paste("b", 1:{
307
-    n_blocks - 1
308
-  }, sep = "")
309
-
310
-
311
-
312
-  ####changed
313
-  t_cols <- paste("t", 1:{
314
-    contr_names_len - 1
315
-  }, sep = "")
316
-
317
-
318
-
319
-  colnames(dummies) <- c(a_cols, b_cols, t_cols)
320
-
321
-  ## matrix to hold the parameters per array for each of the control antigen
322
-  a_params <- matrix(nrow = n_arrays, ncol = 2)
323
-  rownames(a_params) <- colnames(controls_elist$E)
324
-
325
-  ## matrix to hold the parameters for the Blocks
326
-  b_params <- matrix(nrow = n_blocks, ncol = 2)
327
-  idx <- 1
328
-
329
-  ## putting values to the defined matrix
330
-  # fill the matrix with 1's in such manner that for each row
331
-  # the combination of 1's and 0's is unique
332
-  for (i in seq_len(n_arrays)) {
333
-    a_tmp <- paste("a", i, sep = "")
334
-    a_params[i, 1] <- a_tmp
335
-    for (j in seq_len(p_controls)) {
336
-      rnames[idx] <- rownames(controls_elist$E)[j]
337
-      b_idx <- controls_elist$genes$Block[j]
338
-      b_tmp <- paste("b", b_idx, sep = "")
339
-      b_params[b_idx, 1] <- b_tmp
340
-      t_idx <- contr_mapping[rownames(controls_elist$E)[j],
341
-                             1]
342
-      t_tmp <- paste("t", t_idx, sep = "")
343
-      if (i == n_arrays) {
344
-        dummies[idx, a_cols] <- -1
345
-      }
346
-      else {
347
-        dummies[idx, a_tmp] <- 1
348
-      }
349
-      if (b_idx == n_blocks) {
350
-        dummies[idx, b_cols] <- -1
351
-      }
352
-      else {
353
-        dummies[idx, b_tmp] <- 1
354
-      }
355
-      if (t_idx == contr_names_len) {
356
-        dummies[idx, t_cols] <- -1
357
-      }
358
-      else {
359
-        dummies[idx, t_tmp] <- 1
360
-      }
361
-      idx <- idx + 1
362
-    }
363
-  }
364
-
365
-  ## bind with the control spots
366
-  dummies <- cbind(y, dummies)
367
-  rownames(dummies) <- rnames
368
-
369
-  ## add the error term
370
-  e <- rnorm(length(y), 0, 1)
371
-
372
-  ## run the model of controls and a matrix created out of controls
373
-  rlm_result <- MASS::rlm(y ~ . + e, data = data.frame(dummies))
374
-
375
-  ### PAA approach
376
-  a_params[-n_arrays, 2] <-
377
-    rlm_result$coefficients[a_params[-n_arrays, 1]]
378
-  a_params_sum <- sum(as.numeric(a_params[-n_arrays, 2]))
379
-  a_params[n_arrays, 2] <- -a_params_sum
380
-
381
-
382
-  b_params[-n_blocks, 2] <-
383
-    rlm_result$coefficients[b_params[-n_blocks, 1]]
384
-  b_params_sum <- sum(as.numeric(b_params[-n_blocks, 2]))
385
-  b_params[n_blocks, 2] <- -b_params_sum
386
-
387
-
388
-  a <- t(matrix(rep(as.numeric(a_params[, 2]), p_features),
389
-                ncol = p_features))
390
-
391
-  b <- matrix(mapply(function(i) {
392
-    as.numeric(b_params[sample_elist$genes$Block[i], 2])
393
-  }, rep(seq_len(p_features), n_arrays)), ncol = n_arrays)
394
-
395
-
396
-
397
-  ### added by ken - to convert all the negatives in the sample antigens to make them to the half of
398
-  ## the lowest positive
399
-
400
-  min_pos <- minpositive(sample_elist$E)
401
-  sample_elist$E <-
402
-    replace(sample_elist$E, sample_elist$E < 1, min_pos / 2)
403
-
404
-  ###
405
-  ## sample antigens matrix log 2
406
-  sample_elist_log <- as.matrix(log2(sample_elist$E))
407
-
408
-  sample_elist_normalised <- sample_elist_log - a - b
409
-  row.names(sample_elist_normalised) <-
410
-    paste0(sample_elist$genes$Block, "_", sample_elist$genes$antigen)
411
-
412
-
413
-  sample_elist_normalised <-
414
-    as.data.frame(sample_elist_normalised) %>%
415
-    rownames_to_column("antigen_name") %>%
416
-    mutate(
417
-      Block = as.integer(sub('_.*$', '', antigen_name)) ,
418
-      antigen = sub("[^_]*(.*)", "\\1", antigen_name),
419
-      antigen = sub('.', '', antigen)
420
-    ) %>%
421
-    select(Block, antigen_name, antigen, everything())
422
-
423
-  elist_normalised_df <- sample_elist_normalised %>%
424
-    gather(Array, meanBest2_RLM, -c("Block", "antigen_name", "antigen")) %>%
425
-    mutate(Array = as.numeric(Array)) %>%
426
-    right_join(rlm_normalise_df, by = c("antigen", "Block", "Array"))
427
-
428
-
429
-  elist_normalised_df <-
430
-    elist_normalised_df %>% group_by(sample_index, slide)
431
-
432
-  ## add the sample ID variable
433
-  elist_normalised_df$sampleID2 <-
434
-    group_indices(.data = elist_normalised_df)
435
-
436
-  ##
437
-  elist_normalised_df <- elist_normalised_df %>%
438
-    ungroup() %>%
439
-    select(antigen, sampleID2, meanBest2_RLM)
440
-
441
-  return(elist_normalised_df)
442
-
443
-}
444
-
445
-
446
-#' Trend test using Cox–Stuart (C–S) and Mann–Kendall (M–K) trend tests
447
-#'
448
-#' @param name Name of the test
449
-#' @param p_val p value from the test
450
-#' @param z_val the Z value of the test
451
-#'
452
-#' @return A statistics of mean standard deviation trend
453
-#' @export
454
-#'
455
-#' @examples
456
-#' output_trend_stats(name="t.test",p_val=0.001, z_val=5)
457
-output_trend_stats <- function(name, p_val, z_val) {
458
-  if (p_val < 0.00001) {
459
-    prop <- "<0.00001"
460
-    z <- round(z_val, 4)
461
-  } else{
462
-    prop <- as.character(round(p_val, 4))
463
-    z <- round(z_val, 4)
464
-  }
465
-  output <- paste0(name, ' Z-val = ', z, ' P-val = ', prop)
466
-}
467
-
468
-
469
-
470
-#' Comparison of normalised data by sample
471
-#'
472
-#' @param exprs_normalised_df a normalised data frame
473
-#' @param method the method of normalisation used
474
-#' @param batch_correct the batch correction
475
-#'
476
-#' @import dplyr   ggplot2
477
-#' @importFrom  readr  read_csv
478
-#' @importFrom Kendall MannKendall
479
-#' @importFrom genefilter rowSds
480
-#' @importFrom plyr .
481
-#' @return A ggplot of normalised data
482
-#' @export
483
-#'
484
-#' @examples
485
-#' matrix_antigen <- readr::read_csv(system.file("extdata", "matrix_antigen.csv", package="protGear"))
486
-#' normlise_vsn <- matrix_normalise(as.matrix(matrix_antigen),
487
-#' method = "vsn",
488
-#' return_plot = FALSE
489
-#' )
490
-#' plot_normalised(normlise_vsn,method="vsn",batch_correct=FALSE)
491
-plot_normalised <-
492
-  function(exprs_normalised_df,
493
-           method,
494
-           batch_correct) {
495
-    exprs_normalised_df_plot <-  exprs_normalised_df %>%
496
-      dplyr::mutate(
497
-        mean_all_anti = rowMeans(., na.rm = TRUE),
498
-        stdev_all_anti = genefilter::rowSds(as.matrix(.), na.rm = TRUE)
499
-      ) %>%
500
-      dplyr::mutate(
501
-        rank_mean_all_anti = rank(mean_all_anti) ,
502
-        method = method,
503
-        batch_correct = batch_correct
504
-      ) %>%
505
-      arrange(rank_mean_all_anti)
506
-
507
-    # perform the trend test using the Cox–Stuart (C–S) and Mann–Kendall (M–K) trend tests for
508
-    #for the null hypothesis of no trend in the transformed standard deviations under several transformation
509
-    #cs_trend <- trend::cs.test(exprs_normalised_df_plot$stdev_all_anti)
510
-    # mk_trend <- trend::mk.test(exprs_normalised_df_plot$stdev_all_anti)
511
-
512
-    mk_trend <-
513
-      Kendall::MannKendall(exprs_normalised_df_plot$stdev_all_anti)
514
-
515
-
516
-
517
-    cs_stuart <- "Cox-Stuart"
518
-    m_kendall <-
519
-      output_trend_stats('Mann-Kendall (tau stats)', mk_trend$sl[[1]], mk_trend$tau[[1]])
520
-
521
-
522
-
523
-    normalisation_approaches <- c(
524
-      "Log2" = "log2",
525
-      "VSN" = "vsn",
526
-      "Cyclic Loess" = "cyclic_loess",
527
-      "Cyclic Loess (log)" = "cyclic_loess_log",
528
-      "RLM" = "rlm"
529
-    )
530
-    norm_method <- names(which(normalisation_approaches == method))
531
-
532
-    p_norm <-
533
-      ggplot(exprs_normalised_df_plot ,
534
-             aes(x = rank_mean_all_anti, y = stdev_all_anti)) +
535
-      geom_jitter(color = "red") + theme_classic() +
536
-      expand_limits(y = c(0, 10)) +  stat_cor() + geom_smooth(color = 'blue', se = FALSE, size =
537
-                                                                0.5) +
538
-      ggtitle(paste(norm_method, "Normalisation")) + xlab("pooled mean rank (mean of features by sample)") +
539
-      ylab("pooled SD") +
540
-      labs(caption = paste0(m_kendall , "\n", cs_stuart)) +
541
-      theme(plot.caption = element_text(
542
-        hjust = 0,
543
-        color = "black",
544
-        face = "italic"
545
-      ))
546
-    return(p_norm)
547
-  }
548
-
549
-
550
-
551
-#' Comparison of normalised data by feature
552
-#'
553
-#' @param exprs_normalised_df a normalised data frame
554
-#' @param method the method of normalisation used
555
-#' @param batch_correct the batch correction
556
-#'
557
-#' @import dplyr
558
-#' @importFrom Kendall MannKendall
559
-#' @return A ggplot of various normalisation approaches
560
-#' @export
561
-#'
562
-#' @examples
563
-#' matrix_antigen <- readr::read_csv(system.file("extdata", "matrix_antigen.csv", package="protGear"))
564
-#' normlise_vsn <- matrix_normalise(as.matrix(matrix_antigen),
565
-#' method = "vsn",
566
-#' return_plot = FALSE
567
-#' )
568
-#' plot_normalised_antigen(normlise_vsn,method="vsn",batch_correct=FALSE)
569
-plot_normalised_antigen <-
570
-  function(exprs_normalised_df,
571
-           method,
572
-           batch_correct) {
573
-    ## how can we combine these two functions and make them the same
574
-    antigen_summ <- exprs_normalised_df %>%
575
-      gather(antigen, MFI) %>%
576
-      group_by(antigen) %>%
577
-      dplyr::summarise(mean_mfi = mean(MFI, na.rm = TRUE),
578
-                       sd_mfi = sd(MFI, na.rm = TRUE)) %>%
579
-      dplyr::mutate(
580
-        rank_mean_all_anti = rank(mean_mfi) ,
581
-        method = method,
582
-        batch_correct = batch_correct
583
-      ) %>%
584
-      arrange(rank_mean_all_anti)
585
-
586
-    # perform the trend test using the Cox–Stuart (C–S) and Mann–Kendall (M–K) trend tests for
587
-    #for the null hypothesis of no trend in the transformed standard deviations under several transformation
588
-    #cs_trend2 <- trend::cs.test(antigen_summ$sd_mfi)
589
-    #mk_trend2 <- trend::mk.test(antigen_summ$sd_mfi[!is.na(antigen_summ$sd_mfi)])
590
-
591
-    #cs_stuart2 <- output_trend_stats("Cox-Stuart",cs_trend2$p.value, cs_trend2$statistic)
592
-    #m_kendall2 <- output_trend_stats('Mann-Kendall',mk_trend2$p.value,mk_trend2$statistic )
593
-
594
-    ## Changed here to use Kendall packages
595
-    mk_trend2 <-
596
-      Kendall::MannKendall(antigen_summ$sd_mfi[!is.na(antigen_summ$sd_mfi)])
597
-    cs_stuart2 <-
598
-      "Cox-Stuart" #output_trend_stats("Cox-Stuart",cs_trend2$p.value, cs_trend2$statistic[[1]])
599
-    m_kendall2 <-
600
-      output_trend_stats('Mann-Kendall (tau stats)', mk_trend2$sl[[1]], mk_trend2$tau[[1]])
601
-
602
-
603
-    normalisation_approaches <- c(
604
-      "Log2" = "log2",
605
-      "VSN" = "vsn",
606
-      "Cyclic Loess" = "cyclic_loess",
607
-      "Cyclic Loess (log)" = "cyclic_loess_log",
608
-      "RLM" = "rlm"
609
-    )
610
-    norm_method <- names(which(normalisation_approaches == method))
611
-
612
-    p_norm <-
613
-      ggplot(antigen_summ ,  aes(x = rank_mean_all_anti, y = sd_mfi)) +
614
-      geom_jitter(color = "red") + theme_classic() +
615
-      expand_limits(y = c(0, 10)) +  stat_cor() + geom_smooth(color = 'blue', se = FALSE, size =
616
-                                                                0.5) +
617
-      ggtitle(paste(norm_method, "Normalisation")) + xlab("Pooled mean rank (mean of features)") +
618
-      ylab("pooled SD") +
619
-      labs(caption = paste0(m_kendall2 , "\n", cs_stuart2)) +
620
-      theme(plot.caption = element_text(
621
-        hjust = 0,
622
-        color = "black",
623
-        face = "italic"
624
-      ))
625
-
626
-    return(p_norm)
627
-  }
628 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "A69B539A",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/global_functions.R",
4
-    "project_path": "R/global_functions.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647521299135.0,
10
-    "source_on_save": false,
11
-    "relative_order": 11,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "7,16",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647580133,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647580133,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,239 +0,0 @@
1
-##________________________________________________________________________###
2
-###creating a pipe operator to negate the %in% operator
3
-#https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html
4
-# http://stackoverflow.com/questions/24941080/meaning-of-symbol-in-r
5
-"%ni%" <- Negate("%in%")
6
-
7
-
8
-#' Get the minimum positive value
9
-#'
10
-#' @param x  A numeric vector or variable
11
-#'
12
-#' @return Returns the minimum positive value in an object
13
-#' @export
14
-#'
15
-#' @examples
16
-#' minpositive(c(-1,-2,3,5,6,7,8,9,10))
17
-minpositive <- function(x) {
18
-  min(x[x > 0], na.rm = TRUE)
19
-}
20
-
21
-
22
-
23
-#' List the array structure variables
24
-#'
25
-#' @param channel A character indicating the channel that the data was scanned at. It is mostly included in the MFI variable names.
26
-#' @param totsamples A numeric value indicating teh number of samples on a slide.
27
-#' @param blockspersample A numeric value indicating the numer of blocks in a mini-array. The \code{".gal"} file can help in getting this
28
-#' @param chip_path A character indicating the path of the folder  location with the array data.
29
-#' @param sampleID_path A character indicating the path of the folder location with the sample identifiers matching the array structure.
30
-#' @param mig_prefix Optional: A character indicating the identifier of an MIG dilution file
31
-#' @param machine Optional:A character indicating the machine used to process the data in the folder
32
-#' @param FG  Optional:A character indicating the name of the foreground variable name. if not specified its created as \code{paste0("F",channel,".Median")}
33
-#' @param BG Optional:A character indicating the name of the background variable name.  if not specified its created as \code{paste0("B",channel,".Median")}
34
-#' @param FBG Optional:A character indicating the name of the foreground - background variable name.  if not specified its created as \code{paste0("F",channel,".Median...B",channel)}
35
-#' @param date_process Optional:A character indicating the date when the samples were processed.
36
-#'
37
-#' @description A generic function returning a list with the data structure.
38
-#' @importFrom rlang sym
39
-#' @return a list of parameters required to process the data
40
-#' @export
41
-#'
42
-#' @examples
43
-#' ## specify the the parameters to process the data
44
-#' genepix_vars <- array_vars(
45
-#' ## the channel the data was processed in
46
-#'   channel = "635",
47
-#'   ## folder where the array data is stored
48
-#'   chip_path = "data/array_data",
49
-#'   ## the number of samples per slide or in as single run
50
-#'   totsamples = 21,
51
-#'   ## How many blocks each sample occupies
52
-#'   blockspersample = 2,
53
-#'   ## folder where the array data samples id files are stored
54
-#'   sampleID_path = "data/array_sampleID/",
55
-#'   ## optional
56
-#'   mig_prefix = "_first",
57
-#'   machine = 1,
58
-#'   date_process = "0520"
59
-#' )
60
-#' genepix_vars
61
-#' @return genepix_vars
62
-#'
63
-array_vars <- function(channel = "635",
64
-                       totsamples ,
65
-                       FG = "",
66
-                       BG = "",
67
-                       FBG = "",
68
-                       blockspersample,
69
-                       chip_path = "data/array_data",
70
-                       sampleID_path = "data/array_sampleID/",
71
-                       mig_prefix = "_first",
72
-                       machine = "",
73
-                       date_process = "") {
74
-  ####List the directories with the CHIP data###############
75
-  paths <- list.dirs(path = chip_path, recursive =  TRUE)
76
-  ## remove the parent directory
77
-  ## the folders with the chip data with the different batches is left
78
-  paths <- paths[!grepl(paste0(chip_path, "$") , paths)]
79
-  if (FG == "") {
80
-    FG <- rlang::sym(paste0("F", channel, ".Median"))
81
-  } else{
82
-    FG <- rlang::sym(FG)
83
-  }
84
-  if (BG == "") {
85
-    BG <- rlang::sym(paste0("B", channel, ".Median"))
86
-  } else{
87
-    BG <- rlang::sym(BG)
88
-  }
89
-  if (FBG == "") {
90
-    FBG <- rlang::sym(paste0("F", channel, ".Median...B", channel))
91
-  } else{
92
-    FBG <- rlang::sym(FBG)
93
-  }
94
-
95
-  genepix_vars <-
96
-    list(
97
-      FG = FG,
98
-      #rlang::sym(paste0("F",channel,".Median")),
99
-      BG = BG,
100
-      #rlang::sym(paste0("B",channel,".Median")) ,
101
-      FBG = FBG,
102
-      #rlang::sym(paste0("F",channel,".Median...B",channel)),
103
-      paths = paths,
104
-      chip_path = chip_path,
105
-      sampleID_path = sampleID_path,
106
-      mig_prefix = mig_prefix,
107
-      machine = machine,
108
-      date_process = date_process,
109
-      totsamples = totsamples,
110
-      blockspersample = blockspersample,
111
-      mp = machine,
112
-      dp = date_process
113
-    )
114
-  return(genepix_vars)
115
-}
116
-
117
-
118
-
119
-
120
-
121
-#' Title Create directory function
122
-#'
123
-#'
124
-#' @param path folder location to create a directory
125
-#' @description creating a directory
126
-#' @return created directory
127
-#' @export
128
-#'
129
-#' @examples
130
-#' create_dir("data/sample_folder")
131
-create_dir <- function(path) {
132
-  if (!file.exists(paste0(path))) {
133
-    dir.create(path)
134
-  } else
135
-    warning("The folder", path, " already exists")
136
-}
137
-
138
-
139
-###___________________________________________________
140
-
141
-#' Object names of a list
142
-#'
143
-#' @param i - a list filenames with .txt or .gpr extension
144
-#'
145
-#' @return a list of file names
146
-#' @export
147
-#' @description A generic function returning a vector with the names of files in the same directory. Removes the file extension
148
-#' @examples
149
-#' name_of_files("KK2-06.txt")
150
-#' @return name
151
-
152
-name_of_files <- function(i) {
153
-  name <- gsub("\\.txt*|\\.gpr*", "", i, perl = TRUE)
154
-  name <- gsub(" repeat", "", name, perl = TRUE)
155
-  name <- gsub(" ", "_", name, perl = TRUE)
156
-  return(name)
157
-}
158
-
159
-
160
-#___________________________________________________
161
-# Function to be called in case of replicated error
162
-
163
-#'         \\\_Start_Function_For Error\\\         #
164
-#'
165
-#' @description A generic function to write into the log file with a replicate check error
166
-#' @param iden An id for the file with replicates error
167
-#' @return  a log file showing the replicate errors
168
-#' @keywords internal
169
-#'
170
-error_replicates <- function(iden) {
171
-  sink("log_replicates.txt" , append = TRUE)
172
-  warning("The replicates per antigen per sample are more than expected for ",
173
-          iden)
174
-  sink()
175
-}
176
-#'         \\\_End_Function_\\\         #
177
-#___________________________________________________
178
-
179
-
180
-
181
-
182
-###
183
-#' Check existing sample ID names
184
-#' @param genepix_vars A list of specific definitions of the experiment design. See \code{\link{array_vars}}.
185
-#' @description  A generic function to check if the file(s) witht the MFI values have a corresponding sample ID file. Sample ID file is
186
-#' a file with the identifiers for the samples in array file.
187
-#' @return A file with missing corresponding sample ID files
188
-#' @importFrom stats median quantile rnorm sd
189
-#' @importFrom utils read.csv write.table
190
-#' @export
191
-#'
192
-#' @examples
193
-#' genepix_vars <- array_vars(
194
-#' channel = "635",
195
-#' chip_path = system.file("extdata", "array_data/machine1/", package="protGear"),
196
-#' totsamples = 21,
197
-#' blockspersample = 2,
198
-#' mig_prefix = "_first",
199
-#' machine = 1,
200
-#' date_process = "0520"
201
-#' )
202
-#' check_sampleID_files(genepix_vars)
203
-check_sampleID_files <- function(genepix_vars) {
204
-  ## copy all sample ID with missing CSV file
205
-  ##
206
-  sid_files <-
207
-    gsub(".csv", "", list.files(genepix_vars$sampleID_path))
208
-  ## check if all the chip files have an existing sampleID file
209
-  sid_check <-
210
-    gsub(
211
-      ".txt|.gpr",
212
-      "",
213
-      list.files(
214
-        genepix_vars$chip_path ,
215
-        recursive = TRUE,
216
-        pattern = "*.txt|*.gpr",
217
-        full.names = FALSE
218
-      )
219
-    )
220
-  ## convert all the file names to caps to avoid merge errrors due to case
221
-  sid_check <- toupper(sub(".*/(.*)", "\\1", sid_check))
222
-
223
-  ## missing sampleID for any given file
224
-  miss_id_file <- sid_check[sid_check  %ni%  toupper(sid_files)]
225
-
226
-
227
-
228
-  if (length(miss_id_file) > 0) {
229
-    write.table(miss_id_file, "missing_IDfile.txt")
230
-  }
231
-
232
-  if (length(miss_id_file) == 0) {
233
-    warning("All array files have a corresponding sampleID file")
234
-    return(0)
235
-  } else{
236
-    return(miss_id_file)
237
-  }
238
-
239
-}
240 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "BC189306",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/DESCRIPTION",
4
-    "project_path": "DESCRIPTION",
5
-    "type": "dcf",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647626669159.0,
10
-    "source_on_save": false,
11
-    "relative_order": 17,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "3,9",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647580196,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647580196,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,31 +0,0 @@
1
-Package: protGear
2
-Type: Package
3
-Title: Protein Micro Array Data Management and Interactive Visualization
4
-Version: 0.99.544
5
-Authors@R: c(person("Kennedy", "Mwai", role = c("cre", "aut"), email = "keniajin@gmail.com"),
6
-             person(c("James", "Mburu"), role = "aut" , email = "mburuwanja@gmail.com") ,
7
-             person(c("Jacqueline", "Waeni"), role = "ctb" , email = "jacqwaeni@gmail.com"))
8
-Description: A generic three-step pre-processing package for protein microarray data. This package contains different data pre-processing procedures to allow comparison of their performance.These steps are background correction, the coefficient of variation (CV) based filtering,  batch correction and normalization.
9
-License: GPL-3
10
-URL: https://github.com/Keniajin/protGear
11
-BugReports: https://github.com/Keniajin/protGear/issues
12
-Depends:
13
-    R (>= 4.2), dplyr (>= 0.8.0) , limma (>= 3.40.2) ,vsn (>= 3.54.0)
14
-Imports:
15
-     magrittr (>= 1.5) , stats (>= 3.6) , ggplot2 (>= 3.3.0)  , tidyr (>= 1.1.3) , data.table (>= 1.14.0),
16
-    ggpubr (>= 0.4.0), gtools (>= 3.8.2) , tibble (>= 3.1.0) ,  rmarkdown (>= 2.9) , knitr (>= 1.33),
17
-    utils (>= 3.6), genefilter (>= 1.74.0), readr (>= 2.0.1) , Biobase (>= 2.52.0), 
18
-    plyr (>= 1.8.6) , Kendall (>= 2.2)  , shiny (>= 1.0.0) ,  purrr (>= 0.3.4),  plotly (>= 4.9.0) , MASS (>= 7.3) , 
19
-    htmltools (>= 0.4.0) , flexdashboard (>= 0.5.2) , shinydashboard (>= 0.7.1) , kableExtra (>= 1.3.4), 
20
-    GGally (>= 2.1.2) , pheatmap (>= 1.0.12) ,  grid(>= 4.1.1),
21
-     styler (>= 1.6.1) , factoextra (>= 1.0.7) ,FactoMineR (>= 2.4) , rlang (>= 0.4.11),
22
-     remotes (>= 2.4.0) 
23
-Suggests:
24
-    gridExtra (>= 2.3),   png (>= 0.1-7) , magick (>= 2.7.3) , ggplotify (>= 0.1.0) , scales (>= 1.1.1) ,
25
-    shinythemes (>= 1.2.0) , shinyjs (>= 2.0.0) , shinyWidgets (>= 0.6.2) ,
26
-     shinycssloaders (>= 1.0.0) , shinyalert (>= 3.0.0) , shinyFiles (>= 0.9.1)
27
-biocViews: Microarray, OneChannel, Preprocessing , BiomedicalInformatics , Proteomics , BatchEffect, Normalization , Bayesian, Clustering, Regression,SystemsBiology, ImmunoOncology
28
-Encoding: UTF-8
29
-LazyData: false
30
-RoxygenNote: 7.1.2
31
-VignetteBuilder: knitr
32 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "DC0B415C",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/githubio_link.R",
4
-    "project_path": null,
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647626162454.0,
10
-    "source_on_save": false,
11
-    "relative_order": 16,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "8,0",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1625213270,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1625213270,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,14 +0,0 @@
1
-##https://www.r-bloggers.com/2017/08/building-a-website-with-pkgdown-a-short-guide/
2
-
3
-require(devtools)
4
-#use_readme_rmd()
5
-#use_news_md()
6
-#use_vignette("protGear")
7
-use_github_links(overwrite = TRUE)
8
-
9
-library(pkgdown)
10
-## build own vignette automatically
11
-#devtools::build_vignettes()
12
-build_site()
13
-
14
-
15 0
deleted file mode 100644
... ...
@@ -1,26 +0,0 @@
1
-{
2
-    "id": "EE75FEDD",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/protGear/R/cv_by_sample_estimation.R",
4
-    "project_path": "R/cv_by_sample_estimation.R",
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647521320986.0,
10
-    "source_on_save": false,
11
-    "relative_order": 13,
12
-    "properties": {
13
-        "source_window_id": "",
14
-        "Source": "Source",
15
-        "cursorPosition": "16,46",
16
-        "scrollLine": "0"
17
-    },
18
-    "folds": "",
19
-    "lastKnownWriteTime": 1647521330,
20
-    "encoding": "UTF-8",
21
-    "collab_server": "",
22
-    "source_window": "",
23
-    "last_content_update": 1647521330130,
24
-    "read_only": false,
25
-    "read_only_alternatives": []
26
-}
27 0
\ No newline at end of file
28 1
deleted file mode 100644
... ...
@@ -1,44 +0,0 @@
1
-#' Summarise CV by samples
2
-#' @title cv by sample
3
-#'
4
-#' @param cv_variable A character string containing the identifier of the variable with CV values.
5
-#' @param lab_replicates A numeric value indicating the number of lab replicates.
6
-#' @param  sampleID_var A character string containing the name of the sample identifier variable. Default set to 'sampleID'
7
-#' @param dataCV A dataframe
8
-#' @import dplyr tidyr
9
-#' @importFrom tidyr gather
10
-#' @importFrom dplyr lag filter
11
-#' @description A function to give the summary of the CV's by the sampleID
12
-#' @return A data frame of CV calculated by sample
13
-#' @export
14
-#'
15
-#' @examples
16
-#' dataC <- readr::read_csv(system.file("extdata", "dataC.csv", package="protGear"))
17
-#' ## this file has 3 lab replicates and the default names
18
-#' dataCV <- cv_estimation(dataC  ,lab_replicates=3)
19
-#' cv_by_sample_estimation(dataCV, cv_variable = "cvCat_all", lab_replicates = 3)
20
-cv_by_sample_estimation <-
21
-  function(dataCV,
22
-           cv_variable,
23
-           lab_replicates,
24
-           sampleID_var = 'sampleID') {
25
-    ## creating a summary of the CV's by sampleID for each file
26
-    ## helps in identifying samples with a high CV value
27
-    if (lab_replicates > 1) {
28
-      iden <- unique(dataCV$iden)
29
-      dataC_cvSample <-  dataCV %>%
30
-        group_by_at(c(sampleID_var, cv_variable)) %>%
31
-        summarise(n = n()) %>%
32
-        mutate(perc = round((n / sum(n)) * 100, 2)) %>%
33
-        #rename(cvCat='get(cv_variable)') %>%
34
-        ungroup() %>%
35
-        gather(variable, value,-c(sampleID_var, cv_variable)) %>%
36
-        unite(temp,!!(cv_variable), variable) %>%
37
-        tidyr::spread(temp, value, fill = 0)
38
-    } else{
39
-      dataC_cvSample <- NULL
40
-      warning("The experiment is specified not to have lab replicates")
41
-    }
42
-
43
-    return(dataC_cvSample)
44
-  }
45 0
deleted file mode 100644
... ...
@@ -1,27 +0,0 @@
1
-{
2
-    "id": "F4C665F5",
3
-    "path": "C:/Users/kmwai/OneDrive - Kemri Wellcome Trust/H_Drive/PhD_work/projects/protGear_data/examples_functions.R",
4
-    "project_path": null,
5
-    "type": "r_source",
6
-    "hash": "0",
7
-    "contents": "",
8
-    "dirty": false,
9
-    "created": 1647515325438.0,
10
-    "source_on_save": false,
11
-    "relative_order": 7,
12
-    "properties": {
13
-        "tempName": "Untitled4",
14
-        "source_window_id": "",
15
-        "Source": "Source",
16
-        "cursorPosition": "8,45",
17
-        "scrollLine": "0"
18
-    },
19
-    "folds": "",
20
-    "lastKnownWriteTime": 1639483627,
21
-    "encoding": "UTF-8",
22
-    "collab_server": "",
23
-    "source_window": "",
24
-    "last_content_update": 1639483627,
25
-    "read_only": false,
26
-    "read_only_alternatives": []
27
-}
28 0
\ No newline at end of file
29 1
deleted file mode 100644
... ...
@@ -1,97 +0,0 @@
1
-library(protGear)
2
-system.file("extdata", "/array_data/machine1/KK2-06.txt", package="protGear")
3
-visualize_slide(
4
-  infile = system.file("extdata", "/array_data/machine1/KK2-06.txt", package="protGear"),
5
-  MFI_var = "B635 Median"
6
-)
7
-
8
-
9
-## specify the the parameters to process the data
10
-genepix_vars <- array_vars(
11
-  channel = "635",
12
-  chip_path = file.path(system.file("extdata", "array_data/machine1/", package="protGear")),
13
-  totsamples = 21,
14
-  blockspersample = 2,
15
-  mig_prefix = "_first",
16
-  machine = 1,
17
-  ## optional
18
-  date_process = "0520"
19
-)
20
-
21
-
22
-
23
-
24
-genepix_vars <- array_vars(
25
-  channel = "635",
26
-  chip_path = file.path(system.file("extdata", "array_data/machine1/", package="protGear")),
27
-  totsamples = 21,
28
-  blockspersample = 2,
29
-  mig_prefix = "_first",
30
-  machine = 1,
31
-  ## optional
32
-  date_process = "0520"
33
-)
34
-
35
-data_path <- paste0(genepix_vars$chip_path)
36
-filenames <- list.files(file.path(genepix_vars$chip_path),
37
-                        pattern = "*.txt$|*.gpr$", full.names = F
38
-)
39
-data_files <- purrr::map(
40
-  .x = filenames,
41
-  .f = read_array_files,
42
-  data_path = data_path,
43
-  genepix_vars = genepix_vars
44
-)
45
-data_files <- set_names(data_files, purrr::map(filenames, name_of_files))
46
-names(data_files)
47
-bg <- extract_bg(iden ="KK2-06" , data_files=data_files,genepix_vars=genepix_vars)
48
-
49
-readr::write_csv(bg %>% select(-slide),"inst/extdata/bg_example.csv")
50
-plot_bg(bg,
51
-        antigen_name = "antigen",
52
-        bg_MFI = "BG_Median", FG_MFI = "FBG_Median", log = F
53
-)
54
-
55
-
56
-
57
-### Define the genepix_vars
58
-genepix_vars <- array_vars(
59
-  channel = "635",
60
-  chip_path = file.path(system.file("extdata", "array_data/machine1/", package="protGear")),
61
-  totsamples = 21,
62
-  blockspersample = 2,
63
-  mig_prefix = "_first",
64
-  machine = 1,
65
-  ## optional
66
-  date_process = "0520"
67
-)
68
-
69
-## the path where the micro-array data is located
70
-data_path <- paste0(genepix_vars$chip_path)
71
-filenames <- list.files(file.path(genepix_vars$chip_path),
72
-                        pattern = "*.txt$|*.gpr$", full.names = F
73
-)
74
-## create a list of all the files
75
-data_files <- purrr::map(
76
-  .x = filenames,
77
-  .f = read_array_files,
78
-  data_path = data_path,
79
-  genepix_vars = genepix_vars
80
-)
81
-data_files <- set_names(data_files, purrr::map(filenames, name_of_files))
82
-
83
-merge_sampleID(iden = "KK2-06", data_files = data_files,
84
-               genepix_vars =genepix_vars,method = "subtract_global" )
85
-readr::write_csv(df_1,"inst/extdata/Data1_sample.csv")
86
-
87
-readr::write_csv(Data1,"inst/extdata/Data1_bg_sample.csv")
88
-
89
-
90
-
91
-## buffers
92
- bg_correct_df <- readr::read_csv(system.file("extdata", "Data1_sample.csv", package="protGear"))
93
-aa <- buffer_spots(Data1 = bg_correct_df)
94
-readr::write_csv(aa %>% mutate(.id="slideid"),"inst/extdata/buffers_sample2.csv")
95
-
96
-buffers <- readr::read_csv(system.file("extdata", "buffers_sample2.csv", package="protGear"))
97
-plot_buffer(df=buffers,buffer_names = "sampleID")
98 0
deleted file mode 100644
99 1
deleted file mode 100644
100 2
deleted file mode 100644
... ...
@@ -1,13 +0,0 @@
1
-.Rproj.user
2
-.Rhistory
3
-.RData
4
-.Ruserdata
5
-.git
6
-data/
7
-processed_data/
8
-to_update.md
9
-*.Rproj
10
-log_file.txt
11
-.gitignore
12
-.Rbuildignore
13
-.gitignores/
14 0
deleted file mode 100644
... ...
@@ -1,3 +0,0 @@
1
-docs/
2
-www/
3
-