Browse code

Merge branch 'master' into devel

Vlad Petyuk authored on 30/11/2018 18:22:36
Showing 16 changed files

... ...
@@ -2,7 +2,7 @@ Package: MSnID
2 2
 Type: Package
3 3
 Title: Utilities for Exploration and Assessment of Confidence of
4 4
     LC-MSn Proteomics Identifications
5
-Version: 1.7.3
5
+Version: 1.15.2
6 6
 Author: Vlad Petyuk with contributions from Laurent Gatto
7 7
 Maintainer: Vlad Petyuk <petyuk@gmail.com>
8 8
 Description: Extracts MS/MS ID data from mzIdentML (leveraging mzID package) or
... ...
@@ -19,4 +19,4 @@ Imports: MSnbase (>= 1.12.1), mzID (>= 1.3.5), R.cache,
19 19
     data.table, Biobase, ProtGenerics, reshape2, dplyr, mzR
20 20
 Suggests: BiocStyle, msmsTests, ggplot2, RUnit, BiocGenerics
21 21
 LazyData: yes
22
-biocViews: Proteomics, MassSpectrometry
22
+biocViews: Proteomics, MassSpectrometry, ImmunoOncology
... ...
@@ -13,7 +13,8 @@ importFrom("ProtGenerics", "proteins")
13 13
 importFrom("mzID", "mzID")
14 14
 importFrom("mzR","openIDfile","score","modifications","fileName")
15 15
 importMethodsFrom("mzID", "flatten")
16
-importFrom("parallel", "mclapply", "detectCores","makeCluster","stopCluster")
16
+importFrom("parallel", "mclapply", "detectCores","makeCluster",
17
+           "stopCluster","parLapply","clusterExport","clusterEvalQ")
17 18
 importClassesFrom("MSnbase", "MSnSet")
18 19
 importClassesFrom("Biobase", "AnnotatedDataFrame")
19 20
 #-----------------------------------------------------------------------------
... ...
@@ -1,3 +1,21 @@
1
+Version 1.15.2
2
+-------------
3
+ o Broadened the scope of the tool to ImmunoOncology - that is finding rare
4
+   events that reflected in protein sequence.
5
+   
6
+Version 1.15.1
7
+-------------
8
+ o mzR now added scan number(s) into the table representation of
9
+   mzIdentML object. As a results it caused an error in my unit test
10
+   checking if the file reads properly. Fixed this check with updated hash.
11
+
12
+Version 1.11.1
13
+-------------
14
+ o Switch from mclapply to parLapply in filter optimizaition.
15
+   This fixes a bug 
16
+   "Assertion failure at kmp_runtime.cpp(6480): __kmp_thread_pool == __null."
17
+   Also allows to run parallel on Window OS
18
+
1 19
 Version 1.7.3
2 20
 -------------
3 21
  o Logical error fix in PSM FDR calculation. Prior it ignored redundancy
... ...
@@ -1,5 +1,4 @@
1 1
 
2
-
3 2
 .PROTON_MASS <- 1.007276466812
4 3
 
5 4
 setClass(Class="MSnID",
... ...
@@ -14,4 +13,3 @@ setClass(Class="MSnIDFilter",
14 13
                 filterList="list",
15 14
                 validParNames="character")
16 15
 )
17
-
... ...
@@ -155,13 +155,26 @@ setMethod(
155 155
 
156 156
 
157 157
 #----Filter---------------------------------------------------------------------
158
+# # Old implementation. Just as a backup.
159
+# # See https://github.com/vladpetyuk/MSnID/issues/5 for the issue.
160
+# setMethod(
161
+#     "apply_filter",
162
+#     signature(msnidObj="MSnID", filterObj="character"),
163
+#     definition=function(msnidObj, filterObj)
164
+#     {
165
+#         exprssn <- parse(text=filterObj, srcfile=NULL, keep.source=FALSE)
166
+#         msnidObj@psms <- msnidObj@psms[eval(exprssn),]
167
+#         return(msnidObj)
168
+#     }
169
+# )
158 170
 setMethod(
159 171
     "apply_filter",
160 172
     signature(msnidObj="MSnID", filterObj="character"),
161 173
     definition=function(msnidObj, filterObj)
162 174
     {
163 175
         exprssn <- parse(text=filterObj, srcfile=NULL, keep.source=FALSE)
164
-        msnidObj@psms <- msnidObj@psms[eval(exprssn),]
176
+        idx <- eval(exprssn, envir = msnidObj@psms, enclos = parent.frame())
177
+        msnidObj@psms <- msnidObj@psms[idx,]
165 178
         return(msnidObj)
166 179
     }
167 180
 )
... ...
@@ -567,15 +580,29 @@ setAs("MSnID", "MSnSet",
567 580
 setMethod("infer_parsimonious_accessions", "MSnID",
568 581
           definition=function(object)
569 582
           {
583
+              # # Old code for inferring accessions.
584
+              # # It is too slow.
585
+              # infer_acc <- function(x){
586
+              #     res <- list()
587
+              #     while(nrow(x) > 0){
588
+              #         top_prot <- names(which.max(table(x[['accession']])))
589
+              #         top_peps <- subset(x, accession == top_prot)
590
+              #         res <- c(res, list(top_peps))
591
+              #         x <- subset(x, !(pepSeq %in% top_peps[,"pepSeq"]))
592
+              #     }
593
+              #     return(Reduce(rbind,res))
594
+              # }
595
+
570 596
               infer_acc <- function(x){
571 597
                   res <- list()
598
+                  setDT(x)
572 599
                   while(nrow(x) > 0){
573
-                      top_prot <- names(which.max(table(x[['accession']])))
600
+                      top_prot <- x[, .N, by=accession][which.max(N),,]$accession
574 601
                       top_peps <- subset(x, accession == top_prot)
575 602
                       res <- c(res, list(top_peps))
576
-                      x <- subset(x, !(pepSeq %in% top_peps[,"pepSeq"]))
603
+                      x <- subset(x, !(pepSeq %in% top_peps[[1]]))
577 604
                   }
578
-                  return(Reduce(rbind,res))
605
+                  return(rbindlist(res, use.names=F, fill=FALSE, idcol=NULL))
579 606
               }
580 607
               
581 608
               x <- unique(psms(object)[,c("pepSeq","accession")])
... ...
@@ -1,6 +1,4 @@
1 1
 
2
-
3
-
4 2
 .is_filterList_valid <- function(filterList)
5 3
 {
6 4
     entries=c("comparison","threshold")
... ...
@@ -141,4 +139,3 @@ setMethod("$<-", "MSnIDFilter",
141 139
             }
142 140
 )
143 141
 
144
-
... ...
@@ -88,6 +88,34 @@
88 88
 }
89 89
 
90 90
 
91
+.optimize_filter.grid.parLapply <- function(filterObj, msnidObj,
92
+                                           fdr.max, level, n.iter, mc.cores)
93
+{
94
+    par.grid <- .construct_optimization_grid(filterObj, msnidObj, n.iter)
95
+    cl <- makeCluster(mc.cores)
96
+    # clusterExport(cl, 
97
+    #                         list(".get_num_pep_for_fdr"),
98
+    #                         envir = "package:MSnID")
99
+    clusterEvalQ(cl, library("MSnID"))
100
+    evaluations <- parLapply(cl,
101
+                             seq_len(nrow(par.grid)),
102
+                             function(i){
103
+                                .get_num_pep_for_fdr(par.grid[i,],
104
+                                                     msnidObj,
105
+                                                     filterObj,
106
+                                                     fdr.max,
107
+                                                     level)})
108
+    stopCluster(cl)
109
+    evaluations <- round(unlist(evaluations))
110
+    if(all(evaluations == 0)){
111
+        warning(.msg.invalid.optimization.results.grid)
112
+        return(filterObj)
113
+    }
114
+    optim.pars <- par.grid[which.max(evaluations),]
115
+    newFilter <- update(filterObj, as.numeric(optim.pars))
116
+    return(newFilter)
117
+}
118
+
91 119
 setMethod("optimize_filter",
92 120
             signature(filterObj="MSnIDFilter", msnidObj="MSnID"),
93 121
             definition=function(filterObj, msnidObj, fdr.max,
... ...
@@ -120,19 +148,24 @@ setMethod("optimize_filter",
120 148
     }
121 149
     #
122 150
     if(method == "Grid"){
123
-        if(.Platform$OS.type == "unix"){
124
-            if(is.null(mc.cores))
125
-                mc.cores <- getOption("mc.cores", 2L)
126
-            optimFilter <-
127
-                .optimize_filter.grid.mclapply(filterObj, msnidObj,
128
-                                               fdr.max, level, 
129
-                                               n.iter, mc.cores)
130
-        }else{
131
-            # yet to implement effective parallelization on Windows
132
-            optimFilter <-
133
-                .optimize_filter.grid(filterObj, msnidObj,
134
-                                        fdr.max, level, n.iter)
135
-        }
151
+        # if(.Platform$OS.type == "unix"){
152
+        #     if(is.null(mc.cores))
153
+        #         mc.cores <- getOption("mc.cores", 2L)
154
+        #     optimFilter <-
155
+        #         .optimize_filter.grid.mclapply(filterObj, msnidObj,
156
+        #                                        fdr.max, level, 
157
+        #                                        n.iter, mc.cores)
158
+        # }else{
159
+        #     # yet to implement effective parallelization on Windows
160
+        #     optimFilter <-
161
+        #         .optimize_filter.grid(filterObj, msnidObj,
162
+        #                                 fdr.max, level, n.iter)
163
+        # }
164
+        mc.cores <- getOption("mc.cores", 2L)
165
+        optimFilter <-
166
+            .optimize_filter.grid.parLapply(filterObj, msnidObj,
167
+                                            fdr.max, level,
168
+                                            n.iter, mc.cores)
136 169
     }
137 170
     if(method %in% c("Nelder-Mead", "SANN")){
138 171
         optim.out <- optim(par=as.numeric(filterObj),
... ...
@@ -1,3 +1,5 @@
1 1
 MSnID
2 2
 ====
3
+[![Rdoc](http://www.rdocumentation.org/badges/version/MSnID)](http://www.rdocumentation.org/packages/MSnID)
4
+===
3 5
 `MSnID` is an R/Bioconductor package that provides convience tools for handling and filtering of MS/MS identification results. The official page is the Bioconductor landing page ([release](http://www.bioconductor.org/packages/release/bioc/html/MSnID.html) or [devel](http://www.bioconductor.org/packages/devel/bioc/html/MSnID.html) versions). The [github page](https://github.com/vladpetyuk/MSnID) page is for sharing, testing, issue tracking and forking/pulling purposes.
... ...
@@ -30,5 +30,6 @@ test_column_names <- function() {
30 30
 
31 31
 test_data_load_mzR <- function() {
32 32
     # now, check if it is what it supposed to be
33
-    checkIdentical(digest(psms(msnid3)),'e5c572c07878673f1165822969f81869')
33
+    # checkIdentical(digest(psms(msnid3)),'e5c572c07878673f1165822969f81869')
34
+    checkIdentical(digest(psms(msnid3)),'0b4e3b61e3fe007ed11651632fa3f1fb')
34 35
 }
... ...
@@ -1,9 +1,9 @@
1 1
 library("MSnID")
2 2
 data(c_elegans)
3 3
 
4
-test_infer_parsimonious_accessions <- function(){
5 4
 
6
-        # explicitely adding parameters that will be used for data filtering
5
+test_infer_parsimonious_accessions_old <- function(){
6
+    # explicitely adding parameters that will be used for data filtering
7 7
     msnidObj$msmsScore <- -log10(msnidObj$`MS-GF:SpecEValue`)
8 8
     msnidObj$absParentMassErrorPPM <- abs(mass_measurement_error(msnidObj))
9 9
     
... ...
@@ -16,3 +16,33 @@ test_infer_parsimonious_accessions <- function(){
16 16
 }
17 17
 
18 18
 
19
+
20
+# Above is the old function for testing protein inference.  I'll leave it for
21
+# now.  Below is the new way, where first all the inference will be done
22
+# outside of the test functions.
23
+
24
+
25
+# explicitely adding parameters that will be used for data filtering
26
+msnidObj$msmsScore <- -log10(msnidObj$`MS-GF:SpecEValue`)
27
+msnidObj$absParentMassErrorPPM <- abs(mass_measurement_error(msnidObj))
28
+# quick-and-dirty filter. The filter is too strong for the sake of saving time
29
+# at the minimal set of proteins inference step.
30
+msnidObj <- apply_filter(msnidObj, 'msmsScore > 12 & absParentMassErrorPPM < 2')
31
+msnidObj2 <- infer_parsimonious_accessions(msnidObj)
32
+
33
+test_infer_parsimonious_accessions_number <- function(){
34
+   checkEqualsNumeric(length(proteins(msnidObj2)), 551)
35
+}
36
+
37
+# test_infer_parsimonious_accessions_hash <- function(){
38
+#    checkIdentical(digest(psms(msnidObj2)),'42c967304603b17ef667fae5b8d5657f')
39
+# }
40
+
41
+test_infer_parsimonious_accessions_hash <- function(){
42
+    checkIdentical(digest(psms(msnidObj2)$accession),
43
+                   '6a3566a95b2e49a0f966d22ed897a752')
44
+}
45
+
46
+# Future challenges is to come up with tests that check inference that is
47
+# done outside of MSnID object
48
+
... ...
@@ -63,5 +63,3 @@ table(msnidObj$numMissCleavages)
63 63
 }
64 64
 
65 65
 
66
-
67
-
... ...
@@ -1,5 +1,3 @@
1
-% validCleavagePattern="[RK]\\.[^P]"
2
-
3 1
 \name{assess_termini}
4 2
 \alias{assess_termini}
5 3
 
... ...
@@ -60,5 +60,3 @@ massErr <- (msnidObj$experimentalMassToCharge -
60 60
 hist(massErr,xlim=c(-1,+1), breaks=seq(-1.5,+1.5,0.01))
61 61
 }
62 62
 
63
-
64
-
... ...
@@ -1,5 +1,3 @@
1
-% this is document of generic function
2
-
3 1
 \name{psms}
4 2
 \alias{psms}
5 3
 \alias{psms<-}
... ...
@@ -69,6 +69,3 @@ ppm <- mass_measurement_error(msnidObj)
69 69
 hist(ppm, 200, xlim=c(-20,+20))
70 70
 }
71 71
 
72
-
73
-
74
-
... ...
@@ -496,6 +496,3 @@ unlink(".Rcache", recursive=TRUE)
496 496
 \bibliographystyle{plainnat}
497 497
 \bibliography{msnid}
498 498
 \end{document}
499
-
500
-
501
-