Browse code

Updating pwiz to 3_0_21263

Steffen Neumann authored on 23/09/2021 12:34:25
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,2521 @@
1
+//
2
+// $Id$
3
+//
4
+//
5
+// Original author: Bryson Gibbons <bryson.gibbons@pnnl.gov>
6
+//
7
+// Copyright 2014 Pacific Northwest National Laboratory
8
+//                Richland, WA 99352
9
+//
10
+// Licensed under the Apache License, Version 2.0 (the "License"); 
11
+// you may not use this file except in compliance with the License. 
12
+// You may obtain a copy of the License at 
13
+//
14
+// http://www.apache.org/licenses/LICENSE-2.0
15
+//
16
+// Unless required by applicable law or agreed to in writing, software 
17
+// distributed under the License is distributed on an "AS IS" BASIS, 
18
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
19
+// See the License for the specific language governing permissions and 
20
+// limitations under the License.
21
+//
22
+
23
+
24
+#define PWIZ_SOURCE
25
+
26
+#include "SpectrumList_MZRefiner.hpp"
27
+#include "pwiz/data/vendor_readers/Agilent/SpectrumList_Agilent.hpp"
28
+#include "pwiz/data/identdata/Serializer_pepXML.hpp"
29
+#include "pwiz/data/common/CVTranslator.hpp"
30
+#include "pwiz/data/identdata/IdentDataFile.hpp"
31
+#include "pwiz/data/msdata/MSData.hpp"
32
+#include "pwiz/data/proteome/Peptide.hpp"
33
+#include "pwiz/utility/misc/optimized_lexical_cast.hpp"
34
+#include "pwiz/utility/misc/Std.hpp"
35
+#include "pwiz/utility/misc/Filesystem.hpp"
36
+#include "pwiz/utility/misc/Singleton.hpp"
37
+#include <iomanip>
38
+#include <numeric>
39
+
40
+namespace pwiz {
41
+namespace analysis {
42
+
43
+using namespace pwiz::cv;
44
+using namespace pwiz::msdata;
45
+using namespace pwiz::identdata;
46
+
47
+// Constants specific to mzRefiner
48
+const unsigned int MINIMUM_RESULTS_FOR_GLOBAL_SHIFT = 100;
49
+// 500 is kind of arbitrary, but it seemed like a good number to use as a requirement for dependent data shifts
50
+// If there are only 500 data points, there will be very limited shift smoothing.
51
+const unsigned int MINIMUM_RESULTS_FOR_DEPENDENT_SHIFT = 500;
52
+
53
+// Forward declarations to allow having the comparison functions contained in the classes.
54
+class ShiftData;
55
+class ScanData;
56
+typedef boost::shared_ptr<ShiftData> ShiftDataPtr;
57
+typedef boost::shared_ptr<ScanData> ScanDataPtr;
58
+// Can't use boost::dynamic_pointer_cast<ScanData>(ShiftDataPtr) because there are no virtual members.
59
+// dynamic_pointer_cast also has higher overhead
60
+
61
+// Data necessary to calculate the multiple types of shifts
62
+class ShiftData
63
+{
64
+public:
65
+    double scanTime; // In Seconds
66
+    double massError;
67
+    double ppmError;
68
+    double calcMz;
69
+    double experMz;
70
+
71
+    // Possible virtual member of msLevel or something similar?
72
+
73
+    static bool byScanTime(const ShiftData& i, const ShiftData& j) { return i.scanTime < j.scanTime; }
74
+    static bool byPpmError(const ShiftData& i, const ShiftData& j) { return i.ppmError < j.ppmError; }
75
+    static bool byCalcMz(const ShiftData& i, const ShiftData& j) { return i.calcMz < j.calcMz; }
76
+    static bool byExperMz(const ShiftData& i, const ShiftData& j) { return i.experMz < j.experMz; }
77
+    static bool byScanTimePtr(ShiftDataPtr i, ShiftDataPtr j) { return i->scanTime < j->scanTime; }
78
+    static bool byPpmErrorPtr(ShiftDataPtr i, ShiftDataPtr j) { return i->ppmError < j->ppmError; }
79
+    static bool byCalcMzPtr(ShiftDataPtr i, ShiftDataPtr j) { return i->calcMz < j->calcMz; }
80
+    static bool byExperMzPtr(ShiftDataPtr i, ShiftDataPtr j) { return i->experMz < j->experMz; }
81
+};
82
+
83
+// Extra data needed for preparing the shifts, but not needed for calculating them
84
+class ScanData : public ShiftData
85
+{
86
+public:
87
+    size_t scanId;
88
+    string nativeID;
89
+    int rank;
90
+    string peptideSeq;
91
+    int peptideSeqLength;
92
+    int msLevel;
93
+    double scoreValue;
94
+    int charge;
95
+
96
+    // Sorting functions for vector of ScanData
97
+    static bool byScanId(const ScanData& i, const ScanData& j) { return i.scanId < j.scanId; }
98
+    static bool byScanIdPtr(ScanDataPtr i, ScanDataPtr j) { return i->scanId < j->scanId; }
99
+    // TODO: potentially unsafe casting, should fix (how?)
100
+    static bool byScanIdPtr2(ShiftDataPtr i, ShiftDataPtr j) { return boost::static_pointer_cast<ScanData>(i)->scanId < boost::static_pointer_cast<ScanData>(j)->scanId; }
101
+    // Not particularly useful, I think - depends on score used, and on other functions that would need to be added to CVConditionalFilter
102
+    static bool byScore(const ScanData& i, const ScanData& j) { return i.scoreValue < j.scoreValue; }
103
+    static bool byScorePtr(ScanDataPtr i, ScanDataPtr j) { return i->scoreValue < j->scoreValue; }
104
+};
105
+
106
+class CVConditionalFilter
107
+{
108
+    public:
109
+    class CVConditionalFilterConfigData
110
+    {
111
+    public:
112
+        CVConditionalFilterConfigData() : software(CVID_Unknown), step(0.0), maxSteps(0) {}
113
+        CVConditionalFilterConfigData(const string& p_cvTerm, const string& p_rangeSet, double p_step, int p_maxSteps) : software(CVID_Unknown), cvTerm(p_cvTerm), rangeSet(p_rangeSet), step(p_step), maxSteps(p_maxSteps) {}
114
+        CVID software;
115
+        string cvTerm;
116
+        string rangeSet;
117
+        double step;
118
+        int maxSteps;
119
+    };
120
+
121
+    CVConditionalFilter(CVConditionalFilterConfigData configData);
122
+    CVConditionalFilter(CVID software, const string& cvTerm, const string& rangeSet, double step = 0.0, int maxStep = 0);
123
+    CVConditionalFilter(CVID software, const string& cvTerm, double maxValue, double minValue, bool useMax, bool useMin, double step = 0, int maxStep = 0);
124
+    void updateFilter(CVID software, const string& cvTerm, double maxValue, double minValue, bool useMax, bool useMin, double step = 0, int maxStep = 0);
125
+    void updateFilter(CVID software, const string& cvTerm, const string& rangeSet, double step = 0.0, int maxStep = 0);
126
+    bool passesFilter(identdata::SpectrumIdentificationItemPtr& sii, double& scoreVal) const;
127
+    bool isBetter(double lScoreVal, double rScoreVal) const;
128
+    bool isBetter(const ScanDataPtr& lData, const ScanDataPtr& rData) const;
129
+    bool adjustFilterByStep();
130
+    cv::CVID getCVID() const { return cvid; }
131
+    double getMax() const { return max; }
132
+    double getMin() const { return min; }
133
+    bool getAnd() const { return isAnd; }
134
+    const string& getScoreName() { return scoreName; }
135
+    const string& getThreshold() { return threshold; }
136
+
137
+    private:
138
+    CVConditionalFilter();
139
+    string scoreName;
140
+    string threshold;
141
+    cv::CVID cvid;
142
+    bool useName;
143
+    double max;
144
+    double min;
145
+    double step;
146
+    double maxSteps;
147
+    int stepCount;
148
+    bool isAnd;
149
+    double center;
150
+    bool isMin;
151
+    bool isMax;
152
+};
153
+typedef boost::shared_ptr<CVConditionalFilter> CVConditionalFilterPtr;
154
+ostream& operator<<(ostream& out, CVConditionalFilter filter);
155
+
156
+// A functor for using STL sorts with an object instance
157
+struct sortFilter
158
+{
159
+    CVConditionalFilterPtr filter;
160
+    sortFilter(CVConditionalFilterPtr& f) : filter(f) {};
161
+    bool operator() (const ScanDataPtr& lData, const ScanDataPtr& rData) const
162
+    {
163
+        return filter->isBetter(lData, rData);
164
+    }
165
+};
166
+
167
+// Abstract class for a shift: common functionality required for all shifts
168
+class AdjustmentObject
169
+{
170
+    public:
171
+    AdjustmentObject(double gShift = 0.0, double gStDev = 0.0, double gMAD = 0.0) : globalShift(gShift), globalStDev(gStDev), globalMAD(gMAD) {}
172
+    virtual ~AdjustmentObject() {}
173
+
174
+    string getAdjustmentType() const    { return adjustmentType; }
175
+    string getPrettyAdjustment() const  { return prettyAdjustment; }
176
+    double getStDev() const             { return stdev; }
177
+    double getMAD() const               { return mad; }
178
+    double getPctImp() const            { return percentImprovement; }
179
+    double getPctImpMAD() const         { return percentImprovementMAD; }
180
+    double getGlobalShift() const       { return globalShift; }
181
+    double getGlobalStDev() const       { return globalStDev; }
182
+    double getGlobalMAD() const         { return globalMAD; }
183
+    virtual double shift(double scanTime, double mass) const = 0;
184
+    virtual void calculate(vector<ShiftDataPtr>& data) = 0;
185
+    virtual string getShiftRange() const = 0;
186
+    virtual string getShiftOutString() const = 0;
187
+    
188
+    protected:
189
+    string adjustmentType;
190
+    string prettyAdjustment;
191
+    string adjTypeShort;
192
+    double stdev;
193
+    double mad;
194
+    double percentImprovement;
195
+    double percentImprovementMAD;
196
+    double globalShift;
197
+    double globalStDev;
198
+    double globalMAD;
199
+};
200
+typedef boost::shared_ptr<AdjustmentObject> AdjustmentObjectPtr;
201
+
202
+// A basic shift - everything is being shifted by the same approximate value
203
+class AdjustSimpleGlobal : public AdjustmentObject
204
+{
205
+    public:
206
+    double getShift() const       { return shiftError; }
207
+    double getModeError() const   { return modeError; }
208
+    double getMedianError() const { return medianError; }
209
+    double getAvgError() const    { return avgError; }
210
+    double getAvgStDev() const    { return avgStDev; }
211
+    double getModeStDev() const   { return modeStDev; }
212
+    double getMedianStDev() const { return medianStDev; }
213
+    AdjustSimpleGlobal()    { adjustmentType = "SimpleGlobal"; adjTypeShort = "SG"; prettyAdjustment = "Global Shift"; }
214
+    void setZeroShift()     { shiftError = 0.0;  }
215
+    virtual double shift(double scanTime, double mass) const;
216
+    virtual void calculate(vector<ShiftDataPtr>& data);
217
+    virtual string getShiftRange() const     { return lexical_cast<string>(shiftError); }
218
+    virtual string getShiftOutString() const { return "Global PPM Shift"; }
219
+    bool checkForPeak() const;
220
+    
221
+    private:
222
+    vector<int> freqHistBins;
223
+    double freqHistBinSize;
224
+    int freqHistBinCount;
225
+    double shiftError;
226
+    double avgError;
227
+    double modeError;
228
+    double medianError;
229
+    double avgStDev;
230
+    double modeStDev;
231
+    double medianStDev;
232
+};
233
+typedef boost::shared_ptr<AdjustSimpleGlobal> AdjustSimpleGlobalPtr;
234
+
235
+// Support common functionality for a "binned" dependency shift
236
+class BinnedAdjustmentObject : public AdjustmentObject
237
+{
238
+    public:
239
+    BinnedAdjustmentObject(double gShift, double gStDev, double gMAD) : AdjustmentObject(gShift, gStDev, gMAD) {}
240
+    virtual double shift(double scanTime, double mass) const = 0;
241
+    virtual void calculate(vector<ShiftDataPtr>& data) = 0;
242
+    virtual string getShiftRange() const;
243
+    virtual string getShiftOutString() const = 0;
244
+    double getRoughStDev() const           { return roughStDev; }
245
+    double getRoughPctImp() const          { return percentImprovementRough; }
246
+    double getSmoothedStDev() const        { return smoothedStDev; }
247
+    double getSmoothedPctImp() const       { return percentImprovementSmoothed; }
248
+    double getRoughMAD() const             { return roughMAD; }
249
+    double getRoughPctImpMAD() const       { return percentImprovementRoughMAD; }
250
+    double getSmoothedMAD() const          { return smoothedMAD; }
251
+    double getSmoothedPctImpMAD() const    { return percentImprovementSmoothedMAD; }
252
+    
253
+    protected:
254
+    virtual double binShift(double dependency, double mass) const;
255
+    virtual void smoothBins();
256
+    virtual void cleanNoise();
257
+    virtual void cleanNoiseHelper(size_t start, int step);
258
+    virtual void processBins();
259
+    virtual void getStats(double& stDev, double& mad);
260
+    vector<double> shifts;
261
+    vector<bool> isValidBin;
262
+    vector<unsigned int> counts;
263
+    double binSize;
264
+    vector< vector<double> > sortBins;
265
+    size_t bins;
266
+    size_t highestCountBin;
267
+    size_t lowestValidBin;
268
+    size_t highestValidBin;
269
+    double roughStDev;
270
+    double percentImprovementRough;
271
+    double smoothedStDev;
272
+    double percentImprovementSmoothed;
273
+    double roughMAD;
274
+    double percentImprovementRoughMAD;
275
+    double smoothedMAD;
276
+    double percentImprovementSmoothedMAD;
277
+};
278
+typedef boost::shared_ptr<BinnedAdjustmentObject> BinnedAdjustmentObjectPtr;
279
+
280
+/*******************************************************************************************************
281
+ * Sorts a list by the default sorting function, and returns the median
282
+ * The parameter would be const, but we can't sort a const list.
283
+ * Using pass-by-reference to avoid vector copy costs.
284
+ * Returns: median, list is sorted in ascending order (by default comparison)
285
+ ******************************************************************************************************/
286
+double median(vector<double>& list)
287
+{
288
+    double median = 0.0;
289
+    std::sort(list.begin(), list.end());
290
+
291
+    // Pseudo-median: use integer division to always get center (odd count) or next lower value (even count)
292
+    //median = list[list.size() / 2];
293
+    // True median: average the two center values when the count is even.
294
+    // Possibility of replacing this with Boost's statistical accumulators
295
+    if (list.size() == 0)
296
+    {
297
+        return 0.0;
298
+    }
299
+    if (list.size() % 2) // Get "false" (0) if even, "true" (1) if odd
300
+    {
301
+        median = list[list.size() / 2];
302
+    }
303
+    else
304
+    {
305
+        // list size / 2 gives the size divided by 2 - will always be the high index for the center (e.g., 2 / 2 = 1, the highest valid index...)
306
+        // Subtract 1 to get the low index for the center
307
+        double lShift = list[(list.size() / 2) - 1]; // next lower value
308
+        double rShift = list[list.size() / 2]; // next higher value
309
+        median = (lShift + rShift) / 2.0;
310
+    }
311
+    return median;
312
+}
313
+
314
+class AdjustByScanTime : public BinnedAdjustmentObject
315
+{
316
+public:
317
+    // Bin size of 1.25 (min) captures 20 +/- 5 ms1 scans on Thermo Orbitrap
318
+    // Changed to 75 seconds (make all references change to that as well)
319
+    // Captures at least 14 scans, and up to 50, on Agilent QTOF
320
+    // Should probably be made variable, but determining what it should be in the end is the challenge.
321
+    AdjustByScanTime(double gShift, double gStDev, double gMAD) : BinnedAdjustmentObject(gShift, gStDev, gMAD) { adjustmentType = "ByScanTime"; adjTypeShort = "Time"; binSize = 75; prettyAdjustment = "Using scan time dependency"; }
322
+    virtual double shift(double scanTime, double mass) const;
323
+    virtual void calculate(vector<ShiftDataPtr>& data);
324
+    virtual string getShiftOutString() const { return "scan time dependent shift"; }
325
+
326
+private:
327
+};
328
+typedef boost::shared_ptr<AdjustByScanTime> AdjustByScanTimePtr;
329
+
330
+class AdjustByMassToCharge : public BinnedAdjustmentObject
331
+{
332
+    public:
333
+    AdjustByMassToCharge(double gShift, double gStDev, double gMAD) : BinnedAdjustmentObject(gShift, gStDev, gMAD) { adjustmentType = "ByMassToCharge"; adjTypeShort = "MZ"; binSize = 25;  prettyAdjustment = "Using mass to charge dependency"; }
334
+    virtual double shift(double scanTime, double mass) const;
335
+    virtual void calculate(vector<ShiftDataPtr>& data);
336
+    virtual string getShiftOutString() const { return "m/z dependent shift"; }
337
+    
338
+    private:
339
+};
340
+typedef boost::shared_ptr<AdjustByMassToCharge> AdjustByMassToChargePtr;
341
+
342
+/*********************************************************************************************
343
+ * Determine the mode within a specified accuracy for a global shift
344
+ ********************************************************************************************/
345
+void AdjustSimpleGlobal::calculate(vector<ShiftDataPtr>& data)
346
+{
347
+    std::sort(data.begin(), data.end(), ShiftData::byPpmErrorPtr);
348
+    medianError = data[data.size() / 2]->ppmError;
349
+    double sumPpmError = 0.0;
350
+    freqHistBinSize = 0.5;
351
+    /****************************************************************************************
352
+     * Decent ideas for bin size
353
+     * 0.1 = 1001 bins
354
+     * 0.2 = 501 bins
355
+     * 0.4 = 251 bins
356
+     * 0.5 = 201 bins
357
+     * 0.8 = 126 bins
358
+     * 1   = 101 bins
359
+     * 2   = 51 bins
360
+     ***************************************************************************************/
361
+    // -50 to 50 is 100 values, if we don't count zero
362
+    freqHistBinCount = 100.0 / freqHistBinSize + 1.0;
363
+    freqHistBins.resize(freqHistBinCount, 0); // -50 to 50, with extra for zero
364
+    BOOST_FOREACH(const ShiftDataPtr &i, data)
365
+    {
366
+        if (-50 <= i->ppmError && i->ppmError <= 50)
367
+        {
368
+            // Add 50 to enter valid range, add .5 for rounding purposes, then truncate for valid rounding
369
+            // -50.5+50.5 = 0, -0.5+50.5=50, -0.2+50.5 = 50.7=>50, 50.499+50.5=100.999=>100.
370
+            // 0.5 for rounding is a constant - do not change.
371
+            freqHistBins[(int)((i->ppmError + 50.0) * (1.0 / freqHistBinSize) + 0.5)]++;
372
+            sumPpmError += i->ppmError;
373
+        }
374
+    }
375
+    
376
+    avgError = sumPpmError / (double)data.size();
377
+    double sumVariance = 0;
378
+    double modeVariance = 0;
379
+    double medianVariance = 0;
380
+    vector<double> madList;
381
+    
382
+    /// Output ppmError Histogram...
383
+    int high = 0;
384
+    for (int i = 0; i < freqHistBinCount; ++i)
385
+    {
386
+        if (freqHistBins[i] > high)
387
+        {
388
+            high = freqHistBins[i];
389
+            modeError = (i * freqHistBinSize) - 50.0;
390
+        }
391
+    }
392
+    BOOST_FOREACH(const ShiftDataPtr &i, data)
393
+    {
394
+        sumVariance += pow(i->ppmError - avgError, 2);
395
+        modeVariance += pow(i->ppmError - modeError, 2);
396
+        medianVariance += pow(i->ppmError - medianError, 2);
397
+        madList.push_back(abs(i->ppmError - medianError));
398
+    }
399
+    // variance, average of squared differences from mean
400
+    //double avgVariance = sumVariance / (double)data.size();
401
+    avgStDev = sqrt(sumVariance / (double)data.size());
402
+    modeStDev = sqrt(modeVariance / (double)data.size());
403
+    medianStDev = sqrt(medianVariance / (double)data.size());
404
+    mad = median(madList);
405
+
406
+    // This is the global stdev, it's what we are trying to improve
407
+    percentImprovement = 0.0;
408
+    
409
+    // Set the type of shift to use here.
410
+    shiftError = medianError;
411
+    stdev = medianStDev;
412
+
413
+    ostringstream oss2;
414
+    oss2 << ": " << shiftError << " ppm";
415
+    prettyAdjustment += oss2.str();
416
+    
417
+    // Set the global shift/stdev values
418
+    globalShift = shiftError;
419
+    globalStDev = stdev;
420
+    globalMAD = mad;
421
+}
422
+
423
+/*********************************************************************************************
424
+* Check for a peak where the median is found - if no significant peak, return false
425
+********************************************************************************************/
426
+bool AdjustSimpleGlobal::checkForPeak() const
427
+{
428
+    if (freqHistBins.empty())
429
+    {
430
+        return false;
431
+    }
432
+    // Checking 10 ppm to either side of the shift value
433
+    size_t medianBin = (int)((shiftError + 50.0) * (1.0 / freqHistBinSize) + 0.5);
434
+    size_t tenPpmBins = (1.0 / freqHistBinSize) * 10;
435
+    size_t medianLessTenBin = medianBin >= tenPpmBins ? medianBin - tenPpmBins : 0;
436
+    size_t medianPlusTenBin = medianBin < freqHistBinCount - tenPpmBins ? medianBin + tenPpmBins : freqHistBinCount - 1;
437
+    // Find max
438
+    // Use the mode; why re-find it?
439
+    // We need the bin index
440
+    size_t maxBin = (int)((modeError + 50.0) * (1.0 / freqHistBinSize) + 0.5);
441
+    if (maxBin < medianLessTenBin || medianPlusTenBin < maxBin)
442
+    {
443
+        // The max is outside of +/- 10 ppm from the median; obvious problems
444
+        ////////////// Or, severe bi-modal dataset; might be a nice quick check, but might exclude resonable data. ////////////////////////////////////////////////////////////////////////////////////////////////////////
445
+        return false;
446
+    }
447
+    // Find average of all values not in the +/- 10 ppm range
448
+    int sum = 0;
449
+    int count = 0;
450
+    for (size_t i = 0; i < freqHistBins.size(); i++)
451
+    {
452
+        if (medianLessTenBin <= i && i <= medianPlusTenBin)
453
+        {
454
+            continue;
455
+        }
456
+        // outside of +/- 10 ppm range - get average
457
+        // Exclude zero counts - prevent bias
458
+        if (freqHistBins[i] > 0)
459
+        {
460
+            sum += freqHistBins[i];
461
+            count++;
462
+        }
463
+    }
464
+
465
+    if (count == 0)
466
+    {
467
+        // All data points were within 10 ppm of the median.
468
+        return true; ////////////////////////////////////////////////// May warrant further evaluation? ///////////////////////////////////////////////////////////////////////////////////////////////////////
469
+    }
470
+    double average = (double)sum / (double)count;
471
+    // Is max > average * 5?
472
+    return freqHistBins[maxBin] >= average * 5;
473
+}
474
+
475
+/*********************************************************************************************
476
+ * Return a value shifted by the global ppm mode
477
+ ********************************************************************************************/
478
+double AdjustSimpleGlobal::shift(double scanTime, double mass) const
479
+{
480
+    return mass * (1 - shiftError / 1.0e6);
481
+}
482
+
483
+/*********************************************************************************************
484
+ * Perform some basic smoothing to balance erratic or missing data
485
+ ********************************************************************************************/
486
+void BinnedAdjustmentObject::smoothBins()
487
+{
488
+    // Do not store shifts on original data - We don't want bias based on the shifts
489
+    vector<double> newShifts(shifts.size(), globalShift);
490
+
491
+    for (int i = lowestValidBin; (size_t)i <= highestValidBin; ++i)
492
+    {
493
+        int count = counts[i];
494
+        double sum = shifts[i] * (double)counts[i];
495
+        // Run at least once - we want to include next and previous values, as weights
496
+        // We also want at least 100 scans to increase reliability
497
+        for (int j = 1; (j < 2 || count < 100) && (size_t)j < shifts.size(); ++j)
498
+        {
499
+            if ((size_t)(i + j) <= highestValidBin)
500
+            {
501
+                count += counts[i + j];
502
+                sum += shifts[i + j] * (double)counts[i + j];
503
+            }
504
+            // Prevent negatives, compared as signed values...
505
+            if (i - j > 0 && (size_t)(i - j) >= lowestValidBin)
506
+            {
507
+                count += counts[i - j];
508
+                sum += shifts[i - j] * (double)counts[i - j];
509
+            }
510
+        }
511
+        // Stored the smoothed data
512
+        isValidBin[i] = true;
513
+        newShifts[i] = sum / (double)count;
514
+    }
515
+    
516
+    // Overwrite the original shifts with the smoothed ones.
517
+    shifts.swap(newShifts);
518
+}
519
+
520
+/*********************************************************************************************
521
+ * Clean out the noise at the tail ends of the run
522
+ * This can have some undesired effects, should probably be made optional
523
+ * It may be useful for scores/datasets that are noisier, by limiting the dependency shifts
524
+ *    where data is sparse
525
+ ********************************************************************************************/
526
+void BinnedAdjustmentObject::cleanNoiseHelper(size_t start, int step)
527
+{
528
+    // Enforce stepping one item at a time, in the set direction
529
+    if (step < 0)
530
+        step = -1;
531
+    else
532
+        step = 1;
533
+    double stDevToUse = globalStDev;
534
+    if (stDevToUse < 3.0)
535
+    {
536
+        // We have a pretty good dataset, we can probably just exit this function;
537
+        //return;
538
+        stDevToUse *= 2.0; // Set it to something not overly sensitive; 2 StDev
539
+    }
540
+    bool wipeoutRest = false;
541
+    size_t lastNonZeroIndex = start; // Assume we are not starting at zero
542
+    for (int i = start; i < (int)bins && i >= 0; i += step)
543
+    {
544
+        if (wipeoutRest)
545
+        {
546
+            // We've hit two zero counts in a row, clear out the rest.
547
+            shifts[i] = globalShift;
548
+            counts[i] = 0;
549
+            continue;
550
+        }
551
+        else if (counts[i] > 20)
552
+        {
553
+            // Figure that it is probably good data
554
+            lastNonZeroIndex = i;
555
+            continue;
556
+        }
557
+        else if (counts[i] > 0)
558
+        {
559
+            double thisShift = shifts[i];
560
+            // Use lastNonZeroIndex to get the last one we considered valid.
561
+            double lastShift = shifts[lastNonZeroIndex];
562
+            // Check to see if the difference from the last bin to this one is more than a reasonable amount (usually 1 StDev)...
563
+            if ((thisShift + stDevToUse) < lastShift || (thisShift - stDevToUse) > lastShift)
564
+            {
565
+                // Reset to the global; This will later be wiped out during smoothing, if in range.
566
+                shifts[i] = globalShift;
567
+                counts[i] = 0;
568
+                if (abs((int)i - (int)lastNonZeroIndex) > 1)
569
+                {
570
+                    // We got two pretty bad sets in a row. Ignore whatever is left.
571
+                    wipeoutRest = true;
572
+                }
573
+            }
574
+            else
575
+            {
576
+                lastNonZeroIndex = i;
577
+            }
578
+        }
579
+        else
580
+        {
581
+            if (abs((int)i - (int)lastNonZeroIndex) > 1)
582
+            {
583
+                // Found two zero counts in a row. Assume no good data in rest of vector.
584
+                wipeoutRest = true;
585
+            }
586
+            // Otherwise zero, don't do anything special.
587
+        }
588
+    }
589
+    // Reset the lowest and highest valid bins to the new limits.
590
+    if (step < 0)
591
+    {
592
+        lowestValidBin = lastNonZeroIndex;
593
+    }
594
+    else
595
+    {
596
+        highestValidBin = lastNonZeroIndex;
597
+    }
598
+}
599
+
600
+/*********************************************************************************************
601
+ * Clear out the noisy results on either end - this is risky, but the results if we don't do this
602
+ *    can be worthless.
603
+ * This is more useful for identification tools that don't provide a score that easily lends
604
+ *    itself to this shift - MS-GF+'s SpecEValue seems to provide a pretty clean plot, but
605
+ *    MyriMatch's mvh and xcorr tend to provide a noisy plot in my experience.
606
+ ********************************************************************************************/
607
+void BinnedAdjustmentObject::cleanNoise()
608
+{
609
+    cleanNoiseHelper(highestCountBin, -1);
610
+    cleanNoiseHelper(highestCountBin, 1);
611
+}
612
+
613
+/*********************************************************************************************
614
+ * Calculate bin medians and statistics
615
+ ********************************************************************************************/
616
+void BinnedAdjustmentObject::processBins()
617
+{
618
+    // Need to eventually add some way of cleaning up noisy ends - where there is no apparent "quality data"
619
+    // Ideas: Look for data density, significant changes in stdev, etc.
620
+    shifts.resize(bins, globalShift);
621
+    isValidBin.resize(bins, false);
622
+    size_t maxCount = 0;
623
+    for (size_t i = 0; i < bins; ++i)
624
+    {
625
+        if (sortBins[i].size() > 0)
626
+        {
627
+            // Get the median
628
+            shifts[i] = median(sortBins[i]);
629
+            // Find the bin with the highest number of scans
630
+            if (maxCount < sortBins[i].size())
631
+            {
632
+                highestCountBin = i;
633
+                maxCount = sortBins[i].size();
634
+            }
635
+        }
636
+    }
637
+
638
+    // Not sure how to work with this, or if it should just be removed.
639
+    //cleanNoise();
640
+
641
+    getStats(roughStDev, roughMAD);
642
+    
643
+    // Run the smoothing algorithm
644
+    smoothBins();
645
+    getStats(smoothedStDev, smoothedMAD);
646
+
647
+    // Need all absolute to be accurate - we don't want worse values to come back as "better"
648
+    percentImprovementRough = 100.0 * ((abs(globalStDev) - abs(roughStDev)) / abs(globalStDev));
649
+    percentImprovementSmoothed = 100.0 * ((abs(globalStDev) - abs(smoothedStDev)) / abs(globalStDev));
650
+    percentImprovementRoughMAD = 100.0 * ((abs(globalMAD) - abs(roughMAD)) / abs(globalMAD));
651
+    percentImprovementSmoothedMAD = 100.0 * ((abs(globalMAD) - abs(smoothedMAD)) / abs(globalMAD));
652
+
653
+    stdev = smoothedStDev;
654
+    mad = smoothedMAD;
655
+    percentImprovement = percentImprovementSmoothedMAD;
656
+}
657
+
658
+/*********************************************************************************************
659
+ * Calculate the bin standard deviations and average, and bin median absolute deviations and median
660
+ ********************************************************************************************/
661
+void BinnedAdjustmentObject::getStats(double& stDev, double& mad)
662
+{
663
+    int validBins = 0;
664
+    vector<double> binStDev(bins, 0.0);
665
+    vector<double> binMad;
666
+    vector<double> madWorker;
667
+    double binStDevSum = 0.0;
668
+    // This is kinda rough for a standard deviation calculation - it doesn't reflect what the final result would really be.
669
+    // Should change this to use basic smoothing between bins, similar to the binShift() function. Would require using the scan time or calculated mass to charger, depending on the shift type... //////////////////////////////////////////////////////////////////////////
670
+    for (size_t i = lowestValidBin; i <= highestValidBin; ++i)
671
+    {
672
+        if (sortBins[i].size() > 0)
673
+        {
674
+            double varianceSumBin = 0.0;
675
+            ++validBins;
676
+            BOOST_FOREACH(double &j, sortBins[i])
677
+            {
678
+                varianceSumBin += pow(j - shifts[i], 2);
679
+                // For bin Median Absolute Deviation
680
+                madWorker.push_back(abs(j - shifts[i]));
681
+            }
682
+            binStDev[i] = sqrt(varianceSumBin / (double)sortBins[i].size());
683
+            binStDevSum += binStDev[i];
684
+            // Bin Median Absolute Deviation: median of absolute deviations from the median
685
+            binMad.push_back(median(madWorker));
686
+            // Clear out the helper
687
+            madWorker.clear();
688
+        }
689
+    }
690
+    // Get final Median Absolute deviation: median of bin Median Absolute Deviation
691
+    mad = median(binMad);
692
+    stDev = binStDevSum / (double)validBins;
693
+}
694
+
695
+/*********************************************************************************************
696
+ * Perform a shift on a mass, given a dependency
697
+ ********************************************************************************************/
698
+double BinnedAdjustmentObject::binShift(double dependency, double mass) const
699
+{
700
+    size_t useBin = dependency / binSize;
701
+    if (useBin >= lowestValidBin && useBin <= highestValidBin)
702
+    {
703
+        // Handle limits: no data points either before or after to slope to.
704
+        if ((dependency <= ((double)lowestValidBin * binSize) + binSize / 2.0) || (dependency >= ((double)highestValidBin * binSize) + binSize / 2.0) )
705
+        {
706
+            //return mass * (1 - globalShift / 1.0e6);
707
+            return mass * (1 - shifts[useBin] / 1.0e6);
708
+        }
709
+        // Handle +/- 0.5 of center - no need to slope?
710
+        double binCenter = ((double)useBin * binSize) + binSize / 2.0;
711
+        int lowBin = lowestValidBin;
712
+        int highBin = highestValidBin;
713
+        // Set the low bin and high bin for smooth shifting between two points
714
+        if (dependency < binCenter)
715
+        {
716
+            // since levelling is being done and the previous bin is at minimum the
717
+            //    lowestValidBin, we can just use:
718
+            //  lowBin = useBin - 1;
719
+            lowBin = useBin - 1;
720
+            highBin = useBin;
721
+        }
722
+        else if (dependency > binCenter)
723
+        {
724
+            // since levelling is being done and the next bin is at maximum the
725
+            //    highestValidBin, we can just use:
726
+            //  highBin = useBin + 1;
727
+            highBin = useBin + 1;
728
+            lowBin = useBin;
729
+        }
730
+        else
731
+        {
732
+            // The float comparisons thought it wasn't greater than or less than the bin center. Assume equal.
733
+            return mass * (1 - shifts[useBin] / 1.0e6);
734
+        }
735
+        // Center index of lower bin
736
+        double lowMid = ((double)lowBin * binSize) + binSize / 2.0;
737
+        // Center index of higher bin
738
+        double highMid = ((double)highBin * binSize) + binSize / 2.0;
739
+        // Index difference between low and high bins //////////////////////////// This is probably only ever one, with smoothing in use.
740
+        double rangeBin = highMid - lowMid;
741
+        // Position in range of window - min 0, max rangeBin
742
+        double inRange = dependency - lowMid;
743
+        // Value difference from the low bin to the high bin
744
+        double ctcDiff = shifts[highBin] - shifts[lowBin];
745
+        // Percentage position of dependency in range from lowMid to highMid
746
+        double pctInRange = inRange / rangeBin;
747
+        // The shifting value that we can now calculate
748
+        double newShift = shifts[lowBin] + pctInRange * ctcDiff;
749
+        return mass * (1 - newShift / 1.0e6);
750
+    }
751
+    // When dependency value is less than the lowest bin with valid data, use the lowestValidBin for the shift
752
+    if (useBin < lowestValidBin)
753
+    {
754
+        return mass * (1 - shifts[lowestValidBin] / 1.0e6);
755
+    }
756
+    // When dependency value is greater than the highest bin with valid data, use the highestValidBin for the shift
757
+    if (useBin > highestValidBin)
758
+    {
759
+        return mass * (1 - shifts[highestValidBin] / 1.0e6);
760
+    }
761
+    // If all else fails, default to whatever bin it falls into. Should never be used, the above should catch everything this one does
762
+    if (useBin < counts.size() && counts[useBin] > 0)
763
+    {
764
+        return mass * (1 - shifts[useBin] / 1.0e6);
765
+    }
766
+    // If the data we got is out of the shift range, just use the global shift.
767
+    return mass * (1 - globalShift / 1.0e6);
768
+}
769
+
770
+/*********************************************************************************************
771
+* Output the range of shifts being performed
772
+********************************************************************************************/
773
+string BinnedAdjustmentObject::getShiftRange() const
774
+{
775
+    double min = shifts[lowestValidBin];
776
+    double max = shifts[lowestValidBin];
777
+    for (size_t i = lowestValidBin; i <= highestValidBin; i++)
778
+    {
779
+        if (shifts[i] < min)
780
+        {
781
+            min = shifts[i];
782
+        }
783
+        if (shifts[i] > max)
784
+        {
785
+            max = shifts[i];
786
+        }
787
+    }
788
+    // May want to use an ostringstream to set a specific precision
789
+    return lexical_cast<string>(min) + " to " + lexical_cast<string>(max);
790
+}
791
+
792
+/*********************************************************************************************
793
+* Configure the shifting object, populate data bins, and run calculations
794
+********************************************************************************************/
795
+void AdjustByScanTime::calculate(vector<ShiftDataPtr>& data)
796
+{
797
+    // Sort the data by scan time, and populate the bins
798
+    std::sort(data.begin(), data.end(), ShiftData::byScanTimePtr);
799
+    // Set the store the highest and lowest valid bins
800
+    lowestValidBin = data.front()->scanTime / binSize;
801
+    highestValidBin = data.back()->scanTime / binSize;
802
+    // Calculate the number of bins needed; add 3-4 extra bins to avoid going out of bounds
803
+    bins = (data.back()->scanTime + (binSize * 4.0)) / binSize;
804
+    // Resize the data vectors to the appropriate size
805
+    sortBins.resize(bins);
806
+    counts.resize(bins, 0);
807
+    // Populate the data vectors
808
+    BOOST_FOREACH(const ShiftDataPtr &i, data)
809
+    {
810
+        int useBin = i->scanTime / binSize;
811
+        sortBins[useBin].push_back(i->ppmError);
812
+        counts[useBin]++;
813
+    }
814
+
815
+    // The generic bin-based shift calculation can take care of the rest.
816
+    processBins();
817
+}
818
+
819
+/*********************************************************************************************
820
+* Perform a shift on a mass, given a dependency
821
+********************************************************************************************/
822
+double AdjustByScanTime::shift(double scanTime, double mass) const
823
+{
824
+    return binShift(scanTime, mass);
825
+}
826
+
827
+/*********************************************************************************************
828
+ * Configure the shifting object, populate data bins, and run calculations
829
+ ********************************************************************************************/ 
830
+void AdjustByMassToCharge::calculate(vector<ShiftDataPtr>& data)
831
+{
832
+    // Sort the data by mass, and populate the bins
833
+    std::sort(data.begin(), data.end(), ShiftData::byExperMzPtr);
834
+    // Set the store the highest and lowest valid bins
835
+    lowestValidBin = data.front()->experMz / binSize;
836
+    highestValidBin = data.back()->experMz / binSize;
837
+    // Calculate the number of bins needed; add 3-4 extra bins to avoid going out of bounds
838
+    bins = (data.back()->experMz + (binSize * 4)) / binSize;
839
+    // Resize the data vectors to the appropriate size
840
+    sortBins.resize(bins);
841
+    counts.resize(bins, 0);
842
+    // Populate the data vectors
843
+    BOOST_FOREACH(const ShiftDataPtr &i, data)
844
+    {
845
+        int useBin = i->experMz / binSize;
846
+        sortBins[useBin].push_back(i->ppmError);
847
+        counts[useBin]++;
848
+    }
849
+
850
+    // The generic bin-based shift calculation can take care of the rest.
851
+    processBins();
852
+}
853
+
854
+/*********************************************************************************************
855
+ * Perform a shift on a mass, given a dependency
856
+ ********************************************************************************************/
857
+double AdjustByMassToCharge::shift(double scanTime, double mass) const
858
+{
859
+    return binShift(mass, mass);
860
+}
861
+
862
+
863
+//
864
+// CVConditionalFilter
865
+//
866
+
867
+CVConditionalFilter::CVConditionalFilter(CVConditionalFilterConfigData configData)
868
+{
869
+    updateFilter(configData.software, configData.cvTerm, configData.rangeSet, configData.step, configData.maxSteps);
870
+}
871
+
872
+CVConditionalFilter::CVConditionalFilter(CVID software, const string& cvTerm, const string& rangeSet, double step, int maxStep)
873
+{
874
+    updateFilter(software, cvTerm, rangeSet, step, maxStep);
875
+}
876
+
877
+CVConditionalFilter::CVConditionalFilter(CVID software, const string& cvTerm, double maxValue, double minValue, bool useMax, bool useMin, double step, int maxStep)
878
+{
879
+    updateFilter(software, cvTerm, maxValue, minValue, useMax, useMin, step, maxStep);
880
+}
881
+
882
+void CVConditionalFilter::updateFilter(CVID software, const string& cvTerm, double maxValue, double minValue, bool useMax, bool useMin, double step, int maxStep)
883
+{
884
+    scoreName = cvTerm;
885
+    useName = false;
886
+
887
+    // This score translator is cheap - will run pretty fast - small subset of full CV
888
+    cvid = pwiz::identdata::pepXMLScoreNameToCVID(software, cvTerm);
889
+    if (cvid == CVID_Unknown)
890
+    {
891
+        // Then run the more expensive one.
892
+        CVTranslator findCV;
893
+        cvid = findCV.translate(cvTerm);
894
+        if (cvid == CVID_Unknown)
895
+        {
896
+            // TODO: log output at a high detail level.
897
+            //cout << "Warning: cvTerm for \"" << cvTerm << "\" not found. Will search scores for the provided term." << endl;
898
+            useName = true;
899
+        }
900
+    }
901
+    if (!useName)
902
+    {
903
+        scoreName = cvTermInfo(cvid).name;
904
+    }
905
+
906
+    string threshold = "";
907
+    isMin = false;
908
+    isMax = false;
909
+    if (!useMax)
910
+    {
911
+        maxValue = numeric_limits<double>::max();
912
+        threshold = ">= " + lexical_cast<string>(minValue);
913
+        isMin = true;
914
+    }
915
+    else if (!useMin)
916
+    {
917
+        minValue = -maxValue;
918
+        threshold = "<= " + lexical_cast<string>(maxValue);
919
+        isMax = true;
920
+    }
921
+    else
922
+    {
923
+        threshold = lexical_cast<string>(minValue) + " <= MME <= " + lexical_cast<string>(maxValue);
924
+    }
925
+    this->threshold = threshold;
926
+    max = maxValue;
927
+    min = minValue;
928
+    this->step = step;
929
+    maxSteps = maxStep;
930
+    stepCount = 0;
931
+    center = (min + max) / 2.0;
932
+    // Allow for tests where the value must be outside of a certain range
933
+    isAnd = minValue < maxValue;
934
+}
935
+
936
+/*************************************************************************************
937
+* Implementation of a DoubleSet, like an IntSet
938
+* There is probably a better way, but I couldn't find one.
939
+* The return value is a vector of two doubles, with minValue at 0 and maxValue at 1
940
+* If the range was something like "-1" or "1-", the minValue/maxValue (whichever was not specified)
941
+*     is set to lowest/highest (respectively) possible value for double
942
+**************************************************************************************/
943
+vector<double> parseDoubleSet(const string& rangeSet)
944
+{
945
+    double maxValue = numeric_limits<double>::max();
946
+    double minValue = -maxValue;
947
+
948
+    // [200,] [,-200] [-5,5] [5,-5] [1,5]
949
+    if (rangeSet[0] == '[')
950
+    {
951
+        string inner = rangeSet.substr(1, rangeSet.length() - 2);
952
+        string lower = inner.substr(0, inner.rfind(','));
953
+        if (lower.length() > 0)
954
+        {
955
+            minValue = boost::lexical_cast<double>(lower);
956
+        }
957
+        if (inner.rfind(',') != string::npos)
958
+        {
959
+            string upper = inner.substr(inner.rfind(',') + 1);
960
+            if (upper.length() > 0)
961
+            {
962
+                maxValue = boost::lexical_cast<double>(upper);
963
+            }
964
+        }
965
+    }
966
+    else
967
+    {
968
+        // for less than negative value, use "--(number)"
969
+        // "-(number)" will interpret as less than (positive number)
970
+        // allowed formats: "-(num)" "(num)-(num)" "(num)-"
971
+        // -1.0e-10, --1.0e-10, 5.0-, 1-5 
972
+        vector<string> numbers;
973
+        string numBuilder = "";
974
+        string numTester = "1234567890.eE";
975
+        bool startMinus = false;
976
+        bool endMinus = false;
977
+        bool separated = false;
978
+        for (size_t i = 0; i < rangeSet.length(); i++)
979
+        {
980
+            if (numTester.find(rangeSet[i]) != string::npos)
981
+            {
982
+                // all digits, plus '.' and 'e'/'E'
983
+                numBuilder += rangeSet[i];
984
+            }
985
+            else if (rangeSet[i] == '-' && i != 0 && rangeSet[i - 1] == 'e')
986
+            {
987
+                // A '-' that modifies an exponent
988
+                numBuilder += rangeSet[i];
989
+            }
990
+            else if (rangeSet[i] == '-')
991
+            {
992
+                //handle other '-'
993
+                if (i == 0)
994
+                {
995
+                    startMinus = true;
996
+                }
997
+                else if (i == 1)
998
+                {
999
+                    numBuilder += rangeSet[i];
1000
+                }
1001
+                else if (i == rangeSet.length() - 1)
1002
+                {
1003
+                    if (!separated)
1004
+                    {
1005
+                        endMinus = true;
1006
+                        if (startMinus)
1007
+                        {
1008
+                            startMinus = false;
1009
+                            numBuilder = "-" + numBuilder;
1010
+                        }
1011
+                    }
1012
+                    else
1013
+                    {
1014
+                        //report error
1015
+                        throw pwiz::util::user_error("[mzRefiner::parseRange] syntax error (start and end specified, with an infinite end also)");
1016
+                    }
1017
+                }
1018
+                else
1019
+                {
1020
+                    if (!separated)
1021
+                    {
1022
+                        separated = true;
1023
+                        if (startMinus && numBuilder[0] != '-')
1024
+                        {
1025
+                            numBuilder = "-" + numBuilder;
1026
+                            startMinus = false;
1027
+                        }
1028
+                        numbers.push_back(numBuilder);
1029
+                        numBuilder = "";
1030
+                    }
1031
+                    else
1032
+                    {
1033
+                        numBuilder += rangeSet[i];
1034
+                    }
1035
+                }
1036
+            }
1037
+            else
1038
+            {
1039
+                // Error: not a valid digit/character
1040
+                throw pwiz::util::user_error("[mzRefiner::parseRange] invalid characters");
1041
+            }
1042
+        }
1043
+        numbers.push_back(numBuilder);
1044
+        if ((startMinus && endMinus) || ((startMinus || endMinus) && separated))
1045
+        {
1046
+            //report error - searching both directions from a value, or other invalid syntax
1047
+            throw pwiz::util::user_error("[mzRefiner::parseRange] invalid syntax");
1048
+        }
1049
+        vector<double> numbers2(numbers.size(), 0.0);
1050
+        for (size_t i = 0; i < numbers.size(); i++)
1051
+        {
1052
+            numbers2[i] = lexical_cast<double>(numbers[i]);
1053
+        }
1054
+        if (startMinus)
1055
+        {
1056
+            maxValue = numbers2[0];
1057
+        }
1058
+        else if (endMinus)
1059
+        {
1060
+            minValue = numbers2[0];
1061
+        }
1062
+        else
1063
+        {
1064
+            minValue = numbers2[0];
1065
+            maxValue = numbers2[1];
1066
+        }
1067
+    }
1068
+    vector<double> set;
1069
+    set.push_back(minValue);
1070
+    set.push_back(maxValue);
1071
+    return set;
1072
+}
1073
+
1074
+/*************************************************************************************
1075
+* Step in the creation of a filter when using a DoubleSet for the threshold.
1076
+**************************************************************************************/
1077
+void CVConditionalFilter::updateFilter(CVID software, const string& cvTerm, const string& rangeSet, double step, int maxStep)
1078
+{
1079
+    double maxValue;
1080
+    double minValue;
1081
+    bool useMax = false;
1082
+    bool useMin = false;
1083
+
1084
+    vector<double> threshold = parseDoubleSet(rangeSet);
1085
+    useMin = true;
1086
+    useMax = true;
1087
+    minValue = threshold[0];
1088
+    maxValue = threshold[1];
1089
+
1090
+    updateFilter(software, cvTerm, maxValue, minValue, useMax, useMin, step, maxStep);
1091
+}
1092
+
1093
+/*****************************************************************************************************************
1094
+* Evaluate if the spectrum identification item passes the filter
1095
+*****************************************************************************************************************/
1096
+bool CVConditionalFilter::passesFilter(SpectrumIdentificationItemPtr& sii, double& scoreVal) const
1097
+{
1098
+    bool found = false;
1099
+    double value = 0;
1100
+    if (!useName)
1101
+    {
1102
+        vector<CVParam>::iterator it = find_if(sii->cvParams.begin(), sii->cvParams.end(), CVParamIs(cvid));
1103
+        if (it == sii->cvParams.end())
1104
+        {
1105
+            return false;
1106
+        }
1107
+        found = true;
1108
+        value = it->valueAs<double>();
1109
+    }
1110
+    else
1111
+    {
1112
+        // Handle search userParams and cvParams for the score name
1113
+        // Search the userParams first, under the assumption that we would have matched a CVID if it was a cvParam.
1114
+        BOOST_FOREACH(UserParam &up, sii->userParams)
1115
+        {
1116
+            if (bal::iends_with(up.name, scoreName))
1117
+            {
1118
+                found = true;
1119
+                value = up.valueAs<double>();
1120
+                break;
1121
+            }
1122
+        }
1123
+        if (!found)
1124
+        {
1125
+            BOOST_FOREACH(CVParam &cvp, sii->cvParams)
1126
+            {
1127
+                if (bal::iends_with(cvp.name(), scoreName))
1128
+                {
1129
+                    found = true;
1130
+                    value = cvp.valueAs<double>();
1131
+                    break;
1132
+                }
1133
+            }
1134
+        }
1135
+    }
1136
+    if (found)
1137
+    {
1138
+        scoreVal = value;
1139
+        if (!isAnd)
1140
+        {
1141
+            return (value <= max) || (value >= min);
1142
+        }
1143
+        return (value <= max) && (value >= min);
1144
+    }
1145
+    return false;
1146
+}
1147
+
1148
+/**********************************************************************************
1149
+* A function to allow us to sort a set of values from best to worst,
1150
+*   depending on the threshold values definition of "better"
1151
+***********************************************************************************/
1152
+bool CVConditionalFilter::isBetter(double lScoreVal, double rScoreVal) const
1153
+{
1154
+    if (!isAnd && isMin)
1155
+    {
1156
+        // Return true if the left value is larger
1157
+        return lScoreVal > rScoreVal;
1158
+    }
1159
+    if (!isAnd && isMax)
1160
+    {
1161
+        // Return true if the left value is smaller
1162
+        return lScoreVal < rScoreVal;
1163
+    }
1164
+
1165
+    double left = abs(lScoreVal - center);
1166
+    double right = abs(rScoreVal - center);
1167
+    // Assumptions: If a score is based in a range (like -5 to 5), closer to the center is better.
1168
+    //              If a score excludes a range (value < -5 or 5 < value), further from the center is better.
1169
+    if (isAnd)
1170
+    {
1171
+        // Return true if the left value is closer to "center"
1172
+        return left < right;
1173
+    }
1174
+    else
1175
+    {
1176
+        // Return true if the left value is further from "center"
1177
+        return left > right;
1178
+    }
1179
+}
1180
+
1181
+/***********************************************************************************
1182
+* Comparison function for sorting by score.
1183
+* Should probably change to prefer rank first, and the sort identical rank by score.
1184
+***********************************************************************************/
1185
+bool CVConditionalFilter::isBetter(const ScanDataPtr& lData, const ScanDataPtr& rData) const
1186
+{
1187
+    // If rank == 0, probably PMF data, rely on the score used; otherwise, sort by rank unless it is equal
1188
+    if (lData->rank != 0 && rData->rank != 0 && lData->rank != rData->rank)
1189
+    {
1190
+        // A lower rank is better
1191
+        if (lData->rank < rData->rank)
1192
+        {
1193
+            return true;
1194
+        }
1195
+        return false;
1196
+    }
1197
+    return isBetter(lData->scoreValue, rData->scoreValue);
1198
+}
1199
+
1200
+/********************************************************************************************
1201
+* For the edge case where you just have to have an adjustment and therefore you want values from
1202
+*   poorer scoring identifications gradually included until you have the required number of data points.
1203
+* (For the record, Sam Payne is against this, and Matt Monroe wants it.)
1204
+********************************************************************************************/
1205
+bool CVConditionalFilter::adjustFilterByStep()
1206
+{
1207
+    bool hasStep = false;
1208
+    ostringstream oss;
1209
+    oss << "Adjusted filters: " << endl;
1210
+    oss << "\tOld: " << *this << endl;
1211
+    if (step != 0.0 && stepCount < maxSteps)
1212
+    {
1213
+        hasStep = true;
1214
+        if (fabs(max) != numeric_limits<double>::max())
1215
+        {
1216
+            max = max * step;
1217
+        }
1218
+        if (fabs(min) != numeric_limits<double>::max())
1219
+        {
1220
+            min = min * step;
1221
+        }
1222
+        stepCount++;
1223
+    }
1224
+    if (hasStep)
1225
+    {
1226
+        oss << "\tNew: " << *this << endl;
1227
+        cout << oss.str() << endl;
1228
+    }
1229
+    return hasStep;
1230
+}
1231
+
1232
+/***************************************************************************************************************
1233
+* Overloaded output to easily output filter state
1234
+***************************************************************************************************************/
1235
+ostream& operator<<(ostream& out, CVConditionalFilter filter)
1236
+{
1237
+    CVID temp = filter.getCVID();
1238
+    double max = filter.getMax();
1239
+    double min = filter.getMin();
1240
+    bool isAnd = filter.getAnd();
1241
+    CVTermInfo cv = cvTermInfo(temp);
1242
+
1243
+    out << filter.getScoreName() << "; " << min << " <= value " << (isAnd ? "&&" : "||") << " value <= " << max;
1244
+
1245
+    return out;
1246
+}
1247
+
1248
+
1249
+/**********************************************************************************************
1250
+* Check an instrument configuration for a final high-res analyzer
1251
+***********************************************************************************************/
1252
+bool configurationIsHighRes(const InstrumentConfigurationPtr& ic)
1253
+{
1254
+    Component* la = &(ic->componentList.analyzer(0));
1255
+    // Get the last analyzer
1256
+    BOOST_FOREACH(Component &c, ic->componentList)
1257
+    {
1258
+        if (c.type == ComponentType_Analyzer)
1259
+        {
1260
+            la = &c;
1261
+        }
1262
+    }
1263
+    // Look for Orbitrap, FT, or TOF to test for high-res
1264
+    if (la->hasCVParam(MS_orbitrap)
1265
+        || la->hasCVParam(MS_time_of_flight)
1266
+        || la->hasCVParam(MS_fourier_transform_ion_cyclotron_resonance_mass_spectrometer)
1267
+        || la->hasCVParam(MS_stored_waveform_inverse_fourier_transform))
1268
+    {
1269
+        return true;
1270
+    }
1271
+    return false;
1272
+}
1273
+
1274
+/*********************************************************************************************
1275
+* Check a spectrum to determine if it is high res; also, return the first scan start time found.
1276
+* NB: only checks the first scan.
1277
+* Returns true if spectrum is high-resolution, and changes the start time if the value is available
1278
+*********************************************************************************************/
1279
+bool getSpectrumHighResAndStartTime(const vector<Scan>& scans, double& startTime, bool allHighRes)
1280
+{
1281
+    bool isHighRes = false;
1282
+    CVParam scanStartTime;
1283
+    if (!scans.empty())
1284
+    {
1285
+        // Set isHighRes to true, don't allow it to be set back to false after it has been set to true.
1286
+        if (allHighRes || configurationIsHighRes(scans[0].instrumentConfigurationPtr))
1287
+        {
1288
+            isHighRes = true;
1289
+        }
1290
+
1291
+        if (scanStartTime.empty())
1292
+        {
1293
+            scanStartTime = scans[0].cvParam(MS_scan_start_time);
1294
+        }
1295
+
1296
+        // Only attempt to change the start time if we have a valid CVParam.
1297
+        if (!scanStartTime.empty())
1298
+        {
1299
+            startTime = scanStartTime.timeInSeconds(); // Use seconds
1300
+        }
1301
+    }
1302
+    return isHighRes;
1303
+}
1304
+
1305
+
1306
+//
1307
+// SpectrumList_MZRefiner::Impl
1308
+//
1309
+
1310
+class SpectrumList_MZRefiner::Impl
1311
+{
1312
+public:
1313
+    Impl(util::IntegerSet p_msLevelsToRefine, bool assumeHighRes, CVConditionalFilter::CVConditionalFilterConfigData filterConfigData) : msLevelsToRefine(p_msLevelsToRefine), filterConfigData_(filterConfigData), bad_(0), badByScore_(0), badByMassError_(0), allHighRes_(assumeHighRes) {}
1314
+    AdjustmentObjectPtr adjust;
1315
+    AdjustmentObjectPtr ms2Adjust;
1316
+    string identFilePath;
1317
+    vector<ShiftDataPtr> data; // Will hold ScanDataPtr, but must be ShiftDataPtr because polymorphism on collections doesn't work (can't pass vector<ScanDataPtr> to vector<ShiftDataPtr> parameter)
1318
+    vector<ShiftDataPtr> ms2Data;
1319
+    SpectrumListPtr spectrumList; // store a reference to the spectrum list for checking precursor information.
1320
+    void configureShift(const MSData& msd, const string& identFile, pwiz::util::IterationListenerRegistry* ilr);
1321
+    bool getPrecursorHighResAndStartTime(const Precursor& p, double& scanStartTime) const;
1322
+    bool containsHighResData(const MSData& msd);
1323
+    bool isAllHighRes() { return allHighRes_; }
1324
+    util::IntegerSet msLevelsToRefine;
1325
+    string getFilterScoreName() { return filterScoreName_; }
1326
+    string getFilterThreshold() { return filterThreshold_; }
1327
+
1328
+private:
1329
+    shared_ptr<ofstream> statStream_;
1330
+    int bad_, badByScore_, badByMassError_;
1331
+    bool allHighRes_;
1332
+    CVConditionalFilter::CVConditionalFilterConfigData filterConfigData_;
1333
+    void processIdentData(const MSData& msd, pwiz::util::IterationListenerRegistry* ilr);
1334
+    void getMSDataData(const MSData& msd, pwiz::util::IterationListenerRegistry* ilr);
1335
+    void shiftCalculator(pwiz::util::IterationListenerRegistry* ilr);
1336
+    void shiftCalculator(pwiz::util::IterationListenerRegistry* ilr, vector<ShiftDataPtr>& shiftData,
1337
+                         AdjustSimpleGlobalPtr& globalShift, AdjustByScanTimePtr& scanTimeShift, AdjustByMassToChargePtr& mzShift,
1338
+                         AdjustmentObjectPtr& adjustment, string shiftDataType) const;
1339
+    void fragmentationIonPpmErrors(const string& peptideSeq, const int& peptideSeqLength, const SpectrumPtr& s);
1340
+    bool cleanIsotopes(ScanDataPtr& sd) const; // Utility function. Doesn't really need to be a member function, but it isn't used anywhere else than in processIdentData
1341
+    string filterScoreName_;
1342
+    string filterThreshold_;
1343
+
1344
+    // Identification precision filters; Stored here to facilitate future modification via user input
1345
+    double isotopeScreenAdj_; // Filtering threshold for measurement accuracy/precision: Maximum measurement error where isotopic peak correction will be attempted
1346
+    double isotopeFilter_;    // Filtering threshold for measurement accuracy/precision: Maximum measurement error
1347
+    double ppmErrorLimit_;    // Filtering threshold for measurement accuracy/precision: Maximum PPM error
1348
+};
1349
+
1350
+/*********************************************************************************
1351
+* Checks the instrument configuration data to determine if there are high-resolution spectra in the file.
1352
+* Also checks and sets the allHighRes_ member variable based on if all spectra will be high-resolution.
1353
+* This is a bad, dual-purpose function: the return value and the value stored to the member variable are related, but not the same.
1354
+*********************************************************************************/
1355
+bool SpectrumList_MZRefiner::Impl::containsHighResData(const MSData& msd)
1356
+{
1357
+    if (allHighRes_)
1358
+        return true;
1359
+
1360
+    bool hasHighRes = false;
1361
+    allHighRes_ = true;
1362
+    BOOST_FOREACH(const InstrumentConfigurationPtr& ic, msd.instrumentConfigurationPtrs)
1363
+    {
1364
+        // Set hasHighRes to true, don't allow it to be set back to false after it has been set to true.
1365
+        if (configurationIsHighRes(ic))
1366
+        {
1367
+            hasHighRes = true;
1368
+        }
1369
+        else
1370
+        {
1371
+            allHighRes_ = false;
1372
+        }
1373
+    }
1374
+    return hasHighRes;
1375
+}
1376
+
1377
+shared_ptr<ofstream> openFilestreamIfWritable(const bfs::path& filepath, bool& fileExists)
1378
+{
1379
+    fileExists = bfs::exists(filepath);
1380
+    shared_ptr<ofstream> tmp = boost::make_shared<ofstream>(filepath.string().c_str(), std::ios::app);
1381
+    if (!*tmp) return shared_ptr<ofstream>(); // return null ptr if open failed (probably because file or path is read-only)
1382
+    return tmp;
1383
+}
1384
+