Browse code

refactor: patially rewrite code to exctract header info

- Create index list in R instead of C++.
- Use `size_t` instead of `int` where suitable.
- Ensure that correct data types are passed between R and C++.
- Use new (own) implementation to extract the acquisition number from the
spectrum ID instead of `id::translateNativeIDToScanNumber` from proteowizard
to avoid segfaults on macOS (https://github.com/sneumann/xcms/issues/422).

jorainer authored on 25/10/2021 08:58:52 • Steffen Neumann committed on 25/10/2021 15:08:38
Showing5 changed files

... ...
@@ -2,7 +2,7 @@ Package: mzR
2 2
 Type: Package
3 3
 Title: parser for netCDF, mzXML, mzData and mzML and mzIdentML files
4 4
        (mass spectrometry data)
5
-Version: 2.27.1
5
+Version: 2.27.3
6 6
 Author: Bernd Fischer, Steffen Neumann, Laurent Gatto, Qiang Kou, Johannes Rainer
7 7
 Maintainer: Steffen Neumann <sneumann@ipb-halle.de>,
8 8
             Laurent Gatto <laurent.gatto@uclouvain.be>,
... ...
@@ -1,3 +1,12 @@
1
+CHANGES IN VERSION 2.27.3
2
+-------------------------
3
+ o Pwiz backend partially re-written to avoid segfault on macOS
4
+   (https://github.com/sneumann/xcms/issues/422).
5
+
6
+CHANGES IN VERSION 2.27.2
7
+-------------------------
8
+ o Remove support for the ramp backend.
9
+
1 10
 CHANGES IN VERSION 2.27.1
2 11
 -------------------------
3 12
  o Add missing atomic_count_sync.hpp BH file (see PR #248 by vjcitn)
... ...
@@ -50,10 +50,12 @@ setMethod("detector", "mzRpwiz",
50 50
 
51 51
 setMethod("header", c("mzRpwiz", "missing"),
52 52
           function(object) {
53
-              res <- object@backend$getAllScanHeaderInfo()
54
-              res$filterString <- as.character(res$filterString)
55
-              res$spectrumId <- as.character(res$spectrumId)
56
-              res
53
+              scans <- seq_len(object@backend$getLastScan())
54
+              ## res <- object@backend$getAllScanHeaderInfo()
55
+              ## res$filterString <- as.character(res$filterString)
56
+              ## res$spectrumId <- as.character(res$spectrumId)
57
+              ## res
58
+              header(object, scans)
57 59
           })
58 60
 
59 61
 setMethod("header", c("mzRpwiz", "numeric"),
... ...
@@ -3,6 +3,7 @@
3 3
 RcppPwiz::RcppPwiz()
4 4
 {
5 5
   msd = NULL;
6
+  nativeIdFormat = CVID_Unknown;
6 7
   instrumentInfo = Rcpp::List::create();
7 8
   chromatogramsInfo = Rcpp::DataFrame::create();
8 9
   isInCacheInstrumentInfo = FALSE;
... ...
@@ -15,12 +16,15 @@ RcppPwiz::~RcppPwiz()
15 16
   RcppPwiz::close();
16 17
 }
17 18
 
18
-void RcppPwiz::open(const string& fileName)
19
+// void RcppPwiz::open(const string& fileName)
20
+void RcppPwiz::open(Rcpp::StringVector fileName)
19 21
 {
20 22
 
21
-  filename = fileName;
22
-  msd = new MSDataFile(fileName);
23
-
23
+  filename = Rcpp::as<std::string>(fileName(0));
24
+  msd = new MSDataFile(filename);
25
+  // Better not to guess the native ID format. For mzML/mzXML all should be fine
26
+  // with the default one.
27
+  // nativeIdFormat = id::getDefaultNativeIDFormat(*msd);
24 28
 }
25 29
 
26 30
 /* Release all memory on close. */
... ...
@@ -30,6 +34,7 @@ void RcppPwiz::close()
30 34
     {
31 35
       delete msd;
32 36
       msd = NULL;
37
+      nativeIdFormat = CVID_Unknown;
33 38
       instrumentInfo = Rcpp::List::create();
34 39
       chromatogramsInfo = Rcpp::DataFrame::create();
35 40
       isInCacheInstrumentInfo = FALSE;
... ...
@@ -127,12 +132,47 @@ Rcpp::List RcppPwiz::getInstrumentInfo ( )
127 132
   return instrumentInfo;
128 133
 }
129 134
 
135
+// int RcppPwiz::getAcquisitionNumber(size_t index) const
136
+// {
137
+//   const SpectrumIdentity& si = msd->run.spectrumListPtr->spectrumIdentity(index);
138
+//   string scanNumber = id::translateNativeIDToScanNumber(nativeIdFormat, si.id);
139
+//   if (scanNumber.empty()) {
140
+//     return static_cast<int>(index) + 1;
141
+//   }
142
+//   else
143
+//     return lexical_cast<int>(scanNumber);
144
+//   // return static_cast<int>(index) + 1;
145
+// }
146
+
147
+// Using this function instead of pwiz translateNativeIDToScanNumber because
148
+// it randomly causes segfaults on macOS.
149
+int RcppPwiz::getAcquisitionNumber(string id, size_t index) const
150
+{
151
+  if (id.find("controllerType") != std::string::npos) {
152
+    if (id.find("controllerType=0 controllerNumber=1") == std::string::npos)
153
+      return static_cast<int>(index) + 1;
154
+  }
155
+  string e;
156
+  std::smatch match;
157
+  if (id.find("scan=") != std::string::npos)
158
+    e ="scan=(\\d+)";
159
+  else if (id.find("index=") != std::string::npos)
160
+    e = "index=(\\d+)";
161
+  else if (id.find("spectrum=") != std::string::npos)
162
+    e = "spectrum=(\\d+)";
163
+  else if (id.find("scanId=") != std::string::npos)
164
+    e = "scanId=(\\d+)";
165
+  else return static_cast<int>(index) + 1;
166
+  if (std::regex_search(id, match, std::regex(e)))
167
+    return lexical_cast<int>(match[1]);
168
+  else return static_cast<int>(index) + 1;
169
+}
170
+
130 171
 Rcpp::DataFrame RcppPwiz::getScanHeaderInfo (Rcpp::IntegerVector whichScan) {
131 172
   if (msd != NULL) {
132 173
     SpectrumListPtr slp = msd->run.spectrumListPtr;
133
-    int N = slp->size();
134
-    int N_scans = whichScan.size();
135
-    CVID nativeIdFormat = id::getDefaultNativeIDFormat(*msd);
174
+    size_t N = slp->size();
175
+    size_t N_scans = whichScan.size();
136 176
     Rcpp::IntegerVector seqNum(N_scans); // number in sequence observed file (1-based)
137 177
     Rcpp::IntegerVector acquisitionNum(N_scans); // scan number as declared in File (may be gaps)
138 178
     Rcpp::IntegerVector msLevel(N_scans);
... ...
@@ -165,23 +205,20 @@ Rcpp::DataFrame RcppPwiz::getScanHeaderInfo (Rcpp::IntegerVector whichScan) {
165 205
     Rcpp::NumericVector scanWindowLowerLimit(N_scans);
166 206
     Rcpp::NumericVector scanWindowUpperLimit(N_scans);
167 207
     
168
-    for (int i = 0; i < N_scans; i++) {
208
+    for (size_t i = 0; i < N_scans; i++) {
169 209
       int current_scan = whichScan[i];
170
-      SpectrumPtr sp = slp->spectrum(current_scan - 1, false);
210
+      size_t current_index = static_cast<size_t>(current_scan - 1);
211
+      // SpectrumPtr sp = slp->spectrum(current_index, false);
212
+      SpectrumPtr sp = slp->spectrum(current_index, DetailLevel_FullMetadata);
171 213
       Scan dummy;
172 214
       Scan& scan = sp->scanList.scans.empty() ? dummy : sp->scanList.scans[0];
215
+      if (scan.empty())
216
+	Rprintf("Scan with index %d empty.\n", current_scan);
173 217
       // seqNum
174 218
       seqNum[i] = current_scan;
175
-      // acquisitionNum
176
-      string id = sp->id;
177
-      string scanNumber = id::translateNativeIDToScanNumber(nativeIdFormat, id);
178
-      if (scanNumber.empty()) {
179
-	acquisitionNum[i] = current_scan;
180
-      } else {
181
-	acquisitionNum[i] = lexical_cast<int>(scanNumber);
182
-      }
219
+      acquisitionNum[i] = getAcquisitionNumber(sp->id, current_index);
183 220
       // spectrumId
184
-      spectrumId[i] = sp->id;
221
+      spectrumId[i] = Rcpp::String(sp->id);
185 222
       // msLevel
186 223
       msLevel[i] = sp->cvParam(MS_ms_level).valueAs<int>();
187 224
       // peaksCount
... ...
@@ -231,14 +268,8 @@ Rcpp::DataFrame RcppPwiz::getScanHeaderInfo (Rcpp::IntegerVector whichScan) {
231 268
 	collisionEnergy[i] = precursor.activation.cvParam(MS_collision_energy).valueAs<double>();
232 269
 	// precursorScanNum
233 270
 	size_t precursorIndex = slp->find(precursor.spectrumID);
234
-	if (precursorIndex < slp->size()) {
235
-	  const SpectrumIdentity& precursorSpectrum = slp->spectrumIdentity(precursorIndex);
236
-	  string precursorScanNumber = id::translateNativeIDToScanNumber(nativeIdFormat, precursorSpectrum.id);
237
-	  if (precursorScanNumber.empty()) {
238
-	    precursorScanNum[i] = precursorIndex + 1;
239
-	  } else {
240
-	    precursorScanNum[i] = lexical_cast<int>(precursorScanNumber);
241
-	  }
271
+	if (precursorIndex < N) {
272
+	  precursorScanNum[i] = getAcquisitionNumber(precursor.spectrumID, precursorIndex);
242 273
 	} else {
243 274
 	  precursorScanNum[i] = NA_INTEGER;
244 275
 	}
... ...
@@ -277,7 +308,7 @@ Rcpp::DataFrame RcppPwiz::getScanHeaderInfo (Rcpp::IntegerVector whichScan) {
277 308
     
278 309
     Rcpp::List header(31);
279 310
     std::vector<std::string> names;
280
-    int i = 0;
311
+    size_t i = 0;
281 312
     names.push_back("seqNum");
282 313
     header[i++] = Rcpp::wrap(seqNum);
283 314
     names.push_back("acquisitionNum");
... ...
@@ -352,7 +383,7 @@ Rcpp::DataFrame RcppPwiz::getAllScanHeaderInfo ( ) {
352 383
   if (msd != NULL) {
353 384
     if (!isInCacheAllScanHeaderInfo) {
354 385
       SpectrumListPtr slp = msd->run.spectrumListPtr;
355
-      int N = slp->size();
386
+      size_t N = slp->size();
356 387
       
357 388
       allScanHeaderInfo = getScanHeaderInfo(Rcpp::seq(1, N));
358 389
       isInCacheAllScanHeaderInfo = TRUE;	    
... ...
@@ -366,21 +397,23 @@ Rcpp::DataFrame RcppPwiz::getAllScanHeaderInfo ( ) {
366 397
 Rcpp::List RcppPwiz::getPeakList(Rcpp::IntegerVector whichScan) {
367 398
   if (msd != NULL) {
368 399
     SpectrumListPtr slp = msd->run.spectrumListPtr;
369
-    int n_scans = slp->size();
370
-    int n_want = whichScan.size();
400
+    size_t n_scans = slp->size();
401
+    size_t n_want = whichScan.size();
371 402
     int current_scan;
372 403
     SpectrumPtr sp;
373 404
     BinaryDataArrayPtr mzs,ints;
374 405
     std::vector<double> data;
375 406
     Rcpp::NumericVector data_matrix;
376 407
     Rcpp::List res(n_want);
377
-    for (int i = 0; i < n_want; i++) {
408
+    for (size_t i = 0; i < n_want; i++) {
378 409
       current_scan = whichScan[i];
379 410
       if (current_scan < 1 || current_scan > n_scans) {
380 411
 	Rprintf("Index whichScan out of bounds [1 ... %d].\n", n_scans);
381 412
 	return Rcpp::List::create( );
382 413
       }
383
-      sp = slp->spectrum(current_scan - 1, true);
414
+      size_t current_index = static_cast<size_t>(current_scan - 1);
415
+      // sp = slp->spectrum(current_index, true);
416
+      sp = slp->spectrum(current_index, DetailLevel_FullData);
384 417
       mzs = sp->getMZArray();
385 418
       ints = sp->getIntensityArray();
386 419
       if (!mzs.get() || !ints.get()) {
... ...
@@ -847,7 +880,7 @@ Rcpp::DataFrame RcppPwiz::getChromatogramsInfo( int whichChrom )
847 880
 Rcpp::DataFrame RcppPwiz::getChromatogramHeaderInfo (Rcpp::IntegerVector whichChrom)
848 881
 {
849 882
   if (msd != NULL) {
850
-    CVID nativeIdFormat_ = id::getDefaultNativeIDFormat(*msd);
883
+    // CVID nativeIdFormat_ = id::getDefaultNativeIDFormat(*msd);
851 884
     ChromatogramListPtr clp = msd->run.chromatogramListPtr;
852 885
     if (clp.get() == 0) {
853 886
       Rf_warningcall(R_NilValue, "The direct support for chromatogram info is only available in mzML format.");
... ...
@@ -980,7 +1013,7 @@ Rcpp::NumericMatrix RcppPwiz::get3DMap ( std::vector<int> scanNumbers, double wh
980 1013
       //Rprintf("%d\n",1);
981 1014
       for (int i = 0; i < scanNumbers.size(); i++)
982 1015
         {
983
-	  SpectrumPtr s = slp->spectrum(scanNumbers[i] - 1, true);
1016
+	  SpectrumPtr s = slp->spectrum(scanNumbers[i] - 1, DetailLevel_FullMetadata);
984 1017
 	  vector<MZIntensityPair> pairs;
985 1018
 	  s->getMZIntensityPairs(pairs);
986 1019
 
... ...
@@ -28,6 +28,7 @@
28 28
 #include <fstream>
29 29
 #include <string>
30 30
 #include <iostream>
31
+#include <regex>
31 32
 
32 33
 #if defined(__MINGW32__)
33 34
 #include <windows.h>
... ...
@@ -49,18 +50,21 @@ private:
49 50
     Rcpp::DataFrame allScanHeaderInfo;
50 51
     bool isInCacheAllScanHeaderInfo;
51 52
     string filename;
53
+    CVID nativeIdFormat;
52 54
     void addSpectrumList(MSData& msd,
53 55
 			 Rcpp::DataFrame& spctr_header,
54 56
 			 Rcpp::List& spctr_data,
55 57
 			 bool rtime_seconds);
56 58
     void addDataProcessing(MSData& msd, Rcpp::StringVector soft_proc);
57
-
59
+    int getAcquisitionNumber(string id, size_t index) const;
60
+  
58 61
 public:
59 62
 
60 63
     RcppPwiz();
61 64
     virtual ~RcppPwiz();
62 65
 
63
-    void open(const string& fileNames);
66
+    // void open(const string& fileNames);
67
+    void open(Rcpp::StringVector fileNames);
64 68
     void close();
65 69
     /* void writeMSfile(const string& filenames, const string& format); */
66 70
     void writeSpectrumList(const string& file, const string& format,