Browse code

Removed unused 'Swap' method (that was triggering warnings about memcpy/memset)

Initialize NAAN = 0 in Options...which hopefully is OK? Passes tests anyways..

Andrew McDavid authored on 08/09/2020 20:38:46
Showing1 changed files
... ...
@@ -316,6 +316,7 @@ struct Options
316 316
 		isEST = false;
317 317
 		is454 = false;
318 318
 		NAA = 5;
319
+		NAAN = 0;
319 320
 		NAA_top_limit = 5;
320 321
 		cluster_thd = 0.9;
321 322
 		distance_thd = 0.0;
... ...
@@ -414,7 +415,7 @@ struct Sequence
414 415
 	void Resize( int n );
415 416
 	void Reserve( int n );
416 417
 
417
-	void Swap( Sequence & other );
418
+	//void Swap( Sequence & other );
418 419
 	int Format();
419 420
 
420 421
 	void ConvertBases();
Browse code

Reinitialize (known) global variables each cdhit/cdhit_est run

Eventually these globals should be turned local, but their use is complicated in cdhit-common.cpp

Closes #1.

Andrew McDavid authored on 17/06/2019 16:59:59
Showing1 changed files
... ...
@@ -29,7 +29,7 @@
29 29
 //                    Center for Biological Sequencing (CBS), DTU
30 30
 //                    2300 Kongens Lyngby, Denmark
31 31
 //                    Email: thomasp85@gmail.com
32
-//                    
32
+//
33 33
 // =============================================================================
34 34
 
35 35
 #include<iostream>
... ...
@@ -119,12 +119,12 @@ class NVector
119 119
 		int     capacity;
120 120
 
121 121
 		NVector(){ size = capacity = 0; items = NULL; }
122
-		NVector( int n, const TYPE & v=TYPE() ){ 
123
-			size = capacity = 0; items = NULL; 
122
+		NVector( int n, const TYPE & v=TYPE() ){
123
+			size = capacity = 0; items = NULL;
124 124
 			Resize( n, v );
125 125
 		}
126 126
 		NVector( const NVector & other ){
127
-			size = capacity = 0; items = NULL; 
127
+			size = capacity = 0; items = NULL;
128 128
 			if( other.items ){
129 129
 				Resize( other.size );
130 130
 				memcpy( items, other.items, other.size * sizeof(TYPE) );
... ...
@@ -136,7 +136,7 @@ class NVector
136 136
 		int  Size()const{ return size; }
137 137
 		void Clear(){
138 138
 			if( items ) free( items );
139
-			size = capacity = 0; items = NULL; 
139
+			size = capacity = 0; items = NULL;
140 140
 		}
141 141
 
142 142
 		void Resize( int n, const TYPE & value=TYPE() ){
... ...
@@ -213,6 +213,9 @@ extern int NAAN_array[13];
213 213
 
214 214
 void InitNAA( int max );
215 215
 
216
+void setaa_to_na();
217
+void resetaa();
218
+
216 219
 
217 220
 extern int naa_stat_start_percent;
218 221
 extern int naa_stat[5][61][4];
... ...
@@ -247,11 +250,11 @@ class WordTable
247 250
 		int  AddWordCounts( NVector<IndexCount> & counts, Sequence *seq, bool skipN=false);
248 251
 		int  AddWordCountsFrag( NVector<IndexCount> & counts, int frag, int frag_size, int repfrag );
249 252
 
250
-		int  AddWordCounts(int aan_no, Vector<int> & word_encodes, 
253
+		int  AddWordCounts(int aan_no, Vector<int> & word_encodes,
251 254
 				Vector<INTs> & word_encodes_no, int idx, bool skipN=false);
252
-		int AddWordCountsFrag( int aan_no, Vector<int> & word_encodes, 
255
+		int AddWordCountsFrag( int aan_no, Vector<int> & word_encodes,
253 256
 				Vector<INTs> & word_encodes_no, int frag, int frag_size );
254
-		int CountWords(int aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, 
257
+		int CountWords(int aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no,
255 258
 				NVector<IndexCount> & lookCounts, NVector<uint32_t> & indexMapping,
256 259
 				bool est=false, int min=0);
257 260
 		void PrintAll();
... ...
@@ -358,6 +361,8 @@ struct Options
358 361
 	void Print();
359 362
 };
360 363
 
364
+extern Options options;
365
+
361 366
 void bomb_error(const char *message);
362 367
 void clear_temps();
363 368
 
... ...
@@ -572,7 +577,7 @@ class SequenceDB
572 577
 		void ClusterOne( Sequence *seq, int id, WordTable & table,
573 578
 				WorkingParam & param, WorkingBuffer & buf, const Options & options );
574 579
 
575
-		//void SelfComparing( int start, int end, WordTable & table, 
580
+		//void SelfComparing( int start, int end, WordTable & table,
576 581
 		//    WorkingParam & param, WorkingBuffer & buf, const Options & options );
577 582
 
578 583
 		void ComputeDistance( const Options & options );
... ...
@@ -591,13 +596,13 @@ void bomb_error(const char *message, const char *message2);
591 596
 void bomb_warning(const char *message);
592 597
 void bomb_warning(const char *message, const char *message2);
593 598
 void format_seq(char *seq);
594
-int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2, 
599
+int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2,
595 600
 		WorkingBuffer & buffer, int &best_sum,
596 601
 		int band_width, int &band_left, int &band_center, int &band_right, int required_aa1);
597
-int diag_test_aapn_est(int NAA1, char iseq2[], int len1, int len2, 
602
+int diag_test_aapn_est(int NAA1, char iseq2[], int len1, int len2,
598 603
 		WorkingBuffer & buffer, int &best_sum,
599 604
 		int band_width, int &band_left, int &band_center, int &band_right, int required_aa1);
600
-int local_band_align( char query[], char ref[], int qlen, int rlen, ScoreMatrix &mat, 
605
+int local_band_align( char query[], char ref[], int qlen, int rlen, ScoreMatrix &mat,
601 606
 		int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
602 607
 		int band_left, int band_center, int band_right, WorkingBuffer & buffer);
603 608
 
Browse code

Initial commit. Extra files from FindMyFriends.

Andrew McDavid authored on 08/10/2018 02:40:14
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,619 @@
1
+// =============================================================================
2
+// CD-HI/CD-HIT
3
+//
4
+// Cluster Database at High Identity Threshold
5
+//
6
+// CD-HIT clusters protein sequence database at high sequence identity threshold.
7
+// This program can remove the high sequence redundance efficiently.
8
+//
9
+// program written by
10
+//                    Weizhong Li
11
+//                    UCSD, San Diego Supercomputer Center
12
+//                    La Jolla, CA, 92093
13
+//                    Email liwz@sdsc.edu
14
+//
15
+//                 at
16
+//                    Adam Godzik's lab
17
+//                    The Burnham Institute
18
+//                    La Jolla, CA, 92037
19
+//                    Email adam@burnham-inst.org
20
+//
21
+// modified by:
22
+//                    Limin Fu
23
+//                    Center for Research in Biological Systems (CRBS), UCSD
24
+//                    La Jolla, CA, 92093
25
+//                    Email: l2fu@ucsd.edu, fu@daovm.net
26
+//
27
+// modified by:
28
+//                    Thomas Lin Pedersen
29
+//                    Center for Biological Sequencing (CBS), DTU
30
+//                    2300 Kongens Lyngby, Denmark
31
+//                    Email: thomasp85@gmail.com
32
+//                    
33
+// =============================================================================
34
+
35
+#include<iostream>
36
+#include<fstream>
37
+#include<iomanip>
38
+#include<cstdlib>
39
+#include<stdio.h>
40
+#include<string.h>
41
+#include<ctype.h>
42
+#include<stdint.h>
43
+#include<time.h>
44
+
45
+#include<valarray>
46
+#include<vector>
47
+#include<map>
48
+
49
+#define CDHIT_VERSION  "4.6"
50
+
51
+#ifndef MAX_SEQ
52
+#define MAX_SEQ 655360
53
+#endif
54
+
55
+#define MAX_AA 23
56
+#define MAX_NA 6
57
+#define MAX_UAA 21
58
+#define MAX_DIAG (MAX_SEQ<<1)              // MAX_DIAG be twice of MAX_SEQ
59
+#define MAX_GAP MAX_SEQ                    // MAX_GAP <= MAX_SEQ
60
+#define MAX_DES 300000
61
+#define MAX_LINE_SIZE 300000
62
+#define MAX_FILE_NAME 1280
63
+#define MAX_SEG 50
64
+#define MAX_BIN_SWAP 2E9
65
+#define MAX_TABLE_SIZE 50000000
66
+#define CLOCK_TICKS 100
67
+#define FAILED_FUNC 1
68
+#define OK_FUNC 0
69
+
70
+#define IS_REP 1
71
+#define IS_REDUNDANT 2
72
+#define IS_PROCESSED 16
73
+#define IS_MINUS_STRAND 32
74
+
75
+/*
76
+#define max(a,b) (((a)>(b))?(a):(b))
77
+#define min(a,b) (((a)<(b))?(a):(b))
78
+*/
79
+
80
+typedef unsigned int UINT4;
81
+typedef unsigned short UINT2;
82
+
83
+#define LONG_SEQ
84
+
85
+//if the longset sequence is longer than 65535, I use INT4
86
+#ifdef LONG_SEQ
87
+#define INTs UINT4
88
+#else
89
+#define INTs UINT2
90
+#endif
91
+
92
+using namespace std;
93
+
94
+// the parent containter must guarantee continuous memory allocation.
95
+// std::valarray could be used instead of std::vector.
96
+template<class TYPE>
97
+class Vector : public vector<TYPE>
98
+{
99
+	public:
100
+		Vector() : vector<TYPE>(){}
101
+		Vector( size_t size ) : vector<TYPE>( size ){}
102
+		Vector( size_t size, const TYPE & deft ) : vector<TYPE>( size, deft ){}
103
+
104
+		void Append( const TYPE & item ){
105
+			size_t n = this->size();
106
+			if( n + 1 >= this->capacity() ) this->reserve( n + n/5 + 1 );
107
+			this->push_back( item );
108
+		}
109
+		int size()const{ return (int)vector<TYPE>::size(); }
110
+};
111
+
112
+// for primitive types only
113
+template<class TYPE>
114
+class NVector
115
+{
116
+	public:
117
+		TYPE   *items;
118
+		int     size;
119
+		int     capacity;
120
+
121
+		NVector(){ size = capacity = 0; items = NULL; }
122
+		NVector( int n, const TYPE & v=TYPE() ){ 
123
+			size = capacity = 0; items = NULL; 
124
+			Resize( n, v );
125
+		}
126
+		NVector( const NVector & other ){
127
+			size = capacity = 0; items = NULL; 
128
+			if( other.items ){
129
+				Resize( other.size );
130
+				memcpy( items, other.items, other.size * sizeof(TYPE) );
131
+			}
132
+		}
133
+
134
+		~NVector(){ if( items ) free( items ); }
135
+
136
+		int  Size()const{ return size; }
137
+		void Clear(){
138
+			if( items ) free( items );
139
+			size = capacity = 0; items = NULL; 
140
+		}
141
+
142
+		void Resize( int n, const TYPE & value=TYPE() ){
143
+			if( n == size && capacity > 0 ) return;
144
+			int i;
145
+			// When resize() is called, probably this is the intended size,
146
+			// and will not be changed frequently.
147
+			if( n != capacity ){
148
+				capacity = n;
149
+				items = (TYPE*)realloc( items, capacity*sizeof(TYPE) );
150
+			}
151
+			for(i=size; i<n; i++ ) items[i] = value;
152
+			size = n;
153
+		}
154
+		void Append( const TYPE & item ){
155
+			if( size + 1 >= capacity ){
156
+				capacity = size + size/5 + 1;
157
+				items = (TYPE*)realloc( items, capacity*sizeof(TYPE) );
158
+			}
159
+			items[size] = item;
160
+			size ++;
161
+		}
162
+
163
+		TYPE& operator[]( const int i ){
164
+			//if( i <0 or i >= size ) printf( "out of range\n" );
165
+			return items[i];
166
+		}
167
+		TYPE& operator[]( const int i )const{
168
+			//if( i <0 or i >= size ) printf( "out of range\n" );
169
+			return items[i];
170
+		}
171
+};
172
+typedef NVector<int>      VectorInt;
173
+typedef Vector<VectorInt> MatrixInt;
174
+
175
+typedef NVector<int64_t>   VectorInt64;
176
+typedef Vector<VectorInt64> MatrixInt64;
177
+
178
+////////// Class definition //////////
179
+class ScoreMatrix { //Matrix
180
+	private:
181
+
182
+	public:
183
+		int matrix[MAX_AA][MAX_AA];
184
+		int gap, ext_gap;
185
+
186
+		ScoreMatrix();
187
+		void init();
188
+		void set_gap(int gap1, int ext_gap1);
189
+		void set_matrix(int *mat1);
190
+		void set_to_na();
191
+		void set_match( int score );
192
+		void set_mismatch( int score );
193
+}; // END class ScoreMatrix
194
+
195
+
196
+typedef NVector<INTs>      VectorIntX;
197
+typedef Vector<VectorIntX> MatrixIntX;
198
+
199
+
200
+extern int NAA1 ;
201
+extern int NAA2 ;
202
+extern int NAA3 ;
203
+extern int NAA4 ;
204
+extern int NAA5 ;
205
+extern int NAA6 ;
206
+extern int NAA7 ;
207
+extern int NAA8 ;
208
+extern int NAA9 ;
209
+extern int NAA10;
210
+extern int NAA11;
211
+extern int NAA12;
212
+extern int NAAN_array[13];
213
+
214
+void InitNAA( int max );
215
+
216
+
217
+extern int naa_stat_start_percent;
218
+extern int naa_stat[5][61][4];
219
+
220
+struct IndexCount
221
+{
222
+	int index;
223
+	int count;
224
+
225
+	IndexCount( int i=0, int c=0 ){ index = i, count = c; }
226
+};
227
+
228
+struct Sequence;
229
+
230
+class WordTable
231
+{
232
+	private:
233
+	public:
234
+		Vector<NVector<IndexCount> > indexCounts; // hold index and word counts of seqs
235
+		Vector<Sequence*>            sequences;
236
+		int     NAA;                // length of word
237
+		int     NAAN;               // rows of table
238
+		char    is_aa;              // aa is for prot
239
+		size_t  size;
240
+		int     frag_count;
241
+
242
+	public:
243
+		WordTable( int naa=0, int naan=0 );
244
+		void Init(int, int);
245
+		void Clear();
246
+		void SetDNA();
247
+		int  AddWordCounts( NVector<IndexCount> & counts, Sequence *seq, bool skipN=false);
248
+		int  AddWordCountsFrag( NVector<IndexCount> & counts, int frag, int frag_size, int repfrag );
249
+
250
+		int  AddWordCounts(int aan_no, Vector<int> & word_encodes, 
251
+				Vector<INTs> & word_encodes_no, int idx, bool skipN=false);
252
+		int AddWordCountsFrag( int aan_no, Vector<int> & word_encodes, 
253
+				Vector<INTs> & word_encodes_no, int frag, int frag_size );
254
+		int CountWords(int aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, 
255
+				NVector<IndexCount> & lookCounts, NVector<uint32_t> & indexMapping,
256
+				bool est=false, int min=0);
257
+		void PrintAll();
258
+}; // END class INDEX_TBL
259
+struct Options
260
+{
261
+	int     NAA;
262
+	int     NAAN;
263
+	int     NAA_top_limit;
264
+
265
+	size_t  max_memory; // -M: 400,000,000 in bytes
266
+	int     min_length; // -l: 10 bases
267
+	bool    cluster_best;  // -g: 0, the first; 1, the best
268
+	bool    global_identity; // -G:
269
+	bool    store_disk; // -B:
270
+	int     band_width; // -b: 20
271
+	double  cluster_thd; // -c
272
+	double  distance_thd; // -D
273
+	double  diff_cutoff; // -s: 0.0
274
+	double  diff_cutoff2; // -s2: 1.0
275
+	int     diff_cutoff_aa; // -S: 999999
276
+	int     diff_cutoff_aa2; // -S2: 0
277
+	int     tolerance; // -t: 2
278
+	double  long_coverage; // -aL:
279
+	int     long_control; // -AL:
280
+	double  short_coverage; // -aS:
281
+	int     short_control; // -AS:
282
+	int     min_control; // -A:
283
+	double  long_unmatch_per; // -uL
284
+	double  short_unmatch_per; // -uS
285
+	int     unmatch_len; // -U
286
+	int     max_indel; // -D
287
+	int     print;
288
+	int     des_len;
289
+	int     frag_size;
290
+	int     option_r;
291
+	int     threads;
292
+
293
+	size_t  max_entries;
294
+	size_t  max_sequences;
295
+	size_t  mem_limit;
296
+
297
+	bool    has2D;
298
+	bool    isEST;
299
+	bool    is454;
300
+	bool    useIdentity;
301
+	bool    useDistance;
302
+	bool    backupFile;
303
+
304
+	string  input;
305
+	string  input2;
306
+	string  output;
307
+
308
+	Options(){
309
+		backupFile = false;
310
+		useIdentity = false;
311
+		useDistance = false;
312
+		has2D = false;
313
+		isEST = false;
314
+		is454 = false;
315
+		NAA = 5;
316
+		NAA_top_limit = 5;
317
+		cluster_thd = 0.9;
318
+		distance_thd = 0.0;
319
+		max_memory = 800000000;
320
+		min_length = 10;
321
+		cluster_best = false;
322
+		global_identity = true;
323
+		store_disk = false;
324
+		band_width = 20;
325
+		diff_cutoff = 0.0;
326
+		diff_cutoff2 = 1.0;
327
+		diff_cutoff_aa = 99999999;
328
+		diff_cutoff_aa2 = 0;
329
+		tolerance = 2;
330
+		long_coverage = 0.0;
331
+		long_control = 99999999;
332
+		short_coverage = 0.0;
333
+		short_control = 99999999;
334
+		long_unmatch_per = 1.0;
335
+		short_unmatch_per = 1.0;
336
+		unmatch_len = 99999999;
337
+		min_control = 0;
338
+		max_indel = 1;
339
+		print = 0;
340
+		option_r  = 1;
341
+		frag_size = 0;
342
+		des_len = 20;
343
+		threads = 1;
344
+		max_entries = 0;
345
+		max_sequences = 1<<20;
346
+		mem_limit = 100000000;
347
+	};
348
+
349
+	bool SetOptionCommon( const char *flag, const char *value );
350
+	bool SetOption( const char *flag, const char *value );
351
+	bool SetOption2D( const char *flag, const char *value );
352
+	bool SetOptionEST( const char *flag, const char *value );
353
+	bool SetOptions( int argc, char *argv[], bool twodata=false, bool est=false );
354
+
355
+	void Validate();
356
+	void ComputeTableLimits( int min_len, int max_len, int typical_len, size_t mem_need );
357
+
358
+	void Print();
359
+};
360
+
361
+void bomb_error(const char *message);
362
+void clear_temps();
363
+
364
+struct Sequence
365
+{
366
+	// real sequence, if it is not stored swap file:
367
+	char *data;
368
+	// length of the sequence:
369
+	int   size;
370
+	int   bufsize;
371
+
372
+	//uint32_t stats;
373
+
374
+	// if swap != NULL, the sequence is stored in file.
375
+	// swap is opened as temporary file, which will be deleted automatically
376
+	// after the program is finished:
377
+	FILE *swap;
378
+	// stream offset of the sequence:
379
+	int   offset;
380
+
381
+	// stream offset of the description string in the database:
382
+	size_t   des_begin;
383
+	// length of the description:
384
+	int   des_length;
385
+	// length of the description in quality score part:
386
+	int   des_length2;
387
+	// length of data in fasta file, including line wrapping:
388
+	int   dat_length;
389
+
390
+	char *identifier;
391
+
392
+	// index of the sequence in the original database:
393
+	int   index;
394
+	short state;
395
+	int   cluster_id;
396
+	float identity;
397
+	float distance;
398
+	int   coverage[4];
399
+
400
+	Sequence();
401
+	Sequence( const Sequence & other );
402
+	~Sequence();
403
+
404
+	void Clear();
405
+
406
+	void operator=( const char *s );
407
+	void operator+=( const char *s );
408
+
409
+	void Resize( int n );
410
+	void Reserve( int n );
411
+
412
+	void Swap( Sequence & other );
413
+	int Format();
414
+
415
+	void ConvertBases();
416
+
417
+	void SwapIn();
418
+	void SwapOut();
419
+	void PrintInfo( int id, FILE *fout, const Options & options, char *buf );
420
+};
421
+
422
+struct WorkingParam
423
+{
424
+	double aa1_cutoff;
425
+	double aas_cutoff; /* or aa2 */
426
+	double aan_cutoff;
427
+	int    len_upper_bound;
428
+	int    len_lower_bound;
429
+
430
+	WorkingParam( double a1=0, double a2=0, double an=0 ){
431
+		Set( a1, a2, an );
432
+	}
433
+	void Set( double a1=0, double a2=0, double an=0 ){
434
+		aa1_cutoff = a1;
435
+		aas_cutoff = a2;
436
+		aan_cutoff = an;
437
+		len_upper_bound = 0;
438
+		len_lower_bound = 0;
439
+	}
440
+
441
+	int len_eff;
442
+	int aln_cover_flag;
443
+	int min_aln_lenS;
444
+	int min_aln_lenL;
445
+	int required_aa1;
446
+	int required_aas; /* or aa2 */
447
+	int required_aan;
448
+
449
+	void ControlShortCoverage( int len, const Options & option );
450
+	void ControlLongCoverage( int len, const Options & option );
451
+	void ComputeRequiredBases( int NAA, int ss, const Options & option );
452
+};
453
+
454
+//#define MAX_TABLE_SEQ (1<<22)
455
+#define MAX_TABLE_SEQ 4000000
456
+
457
+enum { DP_BACK_NONE=0, DP_BACK_LEFT_TOP=1, DP_BACK_LEFT=2, DP_BACK_TOP=3 };
458
+
459
+struct WorkingBuffer
460
+{
461
+	Vector<int>  taap;
462
+	Vector<int>  word_encodes;
463
+	Vector<int>  word_encodes_backup;
464
+	Vector<INTs> word_encodes_no;
465
+	Vector<INTs> aap_list;
466
+	Vector<INTs> aap_begin;
467
+	//Vector<IndexCount>  indexCounts;
468
+	NVector<IndexCount>  lookCounts;
469
+	NVector<uint32_t>    indexMapping;
470
+	MatrixInt64  score_mat;
471
+	MatrixInt    back_mat;
472
+	Vector<int>  diag_score;
473
+	Vector<int>  diag_score2;
474
+	Vector<int> aan_list_comp;
475
+	Vector<char> seqi_comp;
476
+	int total_bytes;
477
+
478
+	WorkingBuffer( size_t frag=0, size_t maxlen=0, const Options & options=Options() ){
479
+		Set( frag, maxlen, options );
480
+		seqi_comp.resize( MAX_SEQ );
481
+	}
482
+	void Set( size_t frag, size_t maxlen, const Options & options ){
483
+		bool est = options.isEST;
484
+		size_t m = MAX_UAA*MAX_UAA;
485
+		size_t max_len = maxlen;
486
+		size_t band = max_len*max_len;
487
+		if( est ) m = m * m;
488
+		if( band > options.band_width ) band = options.band_width;
489
+		taap.resize( m );
490
+		aap_list.resize( max_len );
491
+		aap_begin.resize( m );
492
+		//indexCounts.resize( max_len );
493
+		word_encodes.resize( max_len );
494
+		word_encodes_no.resize( max_len );
495
+		word_encodes_backup.resize( max_len );
496
+		/* each table can not contain more than MAX_TABLE_SEQ representatives or fragments! */
497
+		if( frag > MAX_TABLE_SEQ ) frag = MAX_TABLE_SEQ;
498
+		lookCounts.Resize( frag + 2 );
499
+		indexMapping.Resize( frag + 2 );
500
+		diag_score.resize( MAX_DIAG );
501
+		diag_score2.resize( MAX_DIAG );
502
+		aan_list_comp.resize( max_len );
503
+		total_bytes = max_len;
504
+		total_bytes += taap.size()*sizeof(int);
505
+		total_bytes += word_encodes.size()*sizeof(int);
506
+		total_bytes += word_encodes_backup.size()*sizeof(int);
507
+		total_bytes += diag_score.size()*sizeof(int);
508
+		total_bytes += diag_score2.size()*sizeof(int);
509
+		total_bytes += aan_list_comp.size()*sizeof(int);
510
+		total_bytes += word_encodes_no.size()*sizeof(INTs);
511
+		total_bytes += aap_list.size()*sizeof(INTs);
512
+		total_bytes += aap_begin.size()*sizeof(INTs);
513
+		total_bytes += indexMapping.Size()*sizeof(uint32_t);
514
+		//total_bytes += indexCounts.size()*sizeof(IndexCount);
515
+		total_bytes += lookCounts.Size()*sizeof(IndexCount);
516
+		total_bytes += max_len*(band*sizeof(int)+sizeof(VectorInt));
517
+		total_bytes += max_len*(band*sizeof(int)+sizeof(VectorInt64));
518
+	}
519
+
520
+	int EncodeWords( Sequence *seq, int NA, bool est = false );
521
+	void ComputeAAP( const char *seqi, int size );
522
+	void ComputeAAP2( const char *seqi, int size );
523
+};
524
+extern Vector<int>  Comp_AAN_idx;
525
+extern ScoreMatrix  mat;
526
+
527
+
528
+class SequenceDB
529
+{
530
+	public:
531
+
532
+		int NAAN;
533
+		Vector<Sequence*>  sequences;
534
+		Vector<int>        rep_seqs;
535
+
536
+		long long total_letter;
537
+		long long total_desc;
538
+		size_t max_len;
539
+		size_t min_len;
540
+		size_t len_n50;
541
+
542
+		void Clear(){
543
+			for(int i=0; i<sequences.size(); i++) delete sequences[i];
544
+			sequences.clear(); rep_seqs.clear();
545
+		}
546
+
547
+		SequenceDB(){
548
+			total_letter = 0;
549
+			total_desc = 0;
550
+			min_len = 0;
551
+			max_len = 0;
552
+			len_n50 = 0;
553
+		}
554
+		~SequenceDB(){ Clear(); }
555
+
556
+		void Read( const char *file, const Options & options );
557
+		void WriteClusters( const char *db, const char *newdb, const Options & options );
558
+		void WriteExtra1D( const Options & options );
559
+		vector<int> GetClusters( const Options & options );
560
+		void WriteExtra2D( SequenceDB & other, const Options & options );
561
+		void DivideSave( const char *db, const char *newdb, int n, const Options & options );
562
+
563
+		void SwapIn( int seg, bool reponly=false );
564
+		void SwapOut( int seg );
565
+
566
+		//void Sort( int first, int last );
567
+		void SortDivide( Options & options, bool sort=true );
568
+		void MakeWordTable( const Options & optioins );
569
+
570
+		size_t MinimalMemory( int frag_no, int bsize, int T, const Options & options, size_t extra=0 );
571
+
572
+		void ClusterOne( Sequence *seq, int id, WordTable & table,
573
+				WorkingParam & param, WorkingBuffer & buf, const Options & options );
574
+
575
+		//void SelfComparing( int start, int end, WordTable & table, 
576
+		//    WorkingParam & param, WorkingBuffer & buf, const Options & options );
577
+
578
+		void ComputeDistance( const Options & options );
579
+		void DoClustering( const Options & options, std::string name, bool showProgress );
580
+		void DoClustering( int T, const Options & options );
581
+		void ClusterTo( SequenceDB & other, const Options & optioins );
582
+		int  CheckOne( Sequence *seq, WordTable & tab, WorkingParam & par, WorkingBuffer & buf, const Options & opt );
583
+		int  CheckOneEST( Sequence *seq, WordTable & tab, WorkingParam & par, WorkingBuffer & buf, const Options & opt );
584
+		int  CheckOneAA( Sequence *seq, WordTable & tab, WorkingParam & par, WorkingBuffer & buf, const Options & opt );
585
+};
586
+
587
+
588
+int print_usage (char *arg);
589
+void bomb_error(const char *message);
590
+void bomb_error(const char *message, const char *message2);
591
+void bomb_warning(const char *message);
592
+void bomb_warning(const char *message, const char *message2);
593
+void format_seq(char *seq);
594
+int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2, 
595
+		WorkingBuffer & buffer, int &best_sum,
596
+		int band_width, int &band_left, int &band_center, int &band_right, int required_aa1);
597
+int diag_test_aapn_est(int NAA1, char iseq2[], int len1, int len2, 
598
+		WorkingBuffer & buffer, int &best_sum,
599
+		int band_width, int &band_left, int &band_center, int &band_right, int required_aa1);
600
+int local_band_align( char query[], char ref[], int qlen, int rlen, ScoreMatrix &mat, 
601
+		int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
602
+		int band_left, int band_center, int band_right, WorkingBuffer & buffer);
603
+
604
+int print_usage_2d (char *arg);
605
+int print_usage_est (char *arg);
606
+int print_usage_div (char *arg);
607
+int print_usage_est_2d (char *arg);
608
+
609
+int upper_bound_length_rep(int len, const Options & options );
610
+void cal_aax_cutoff (double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
611
+		double NR_clstr, int tolerance, int naa_stat_start_percent,
612
+		int naa_stat[5][61][4], int NAA);
613
+void update_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
614
+		int tolerance, int naa_stat_start_percent,
615
+		int naa_stat[5][61][4], int NAA, int iden);
616
+
617
+int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est=false);
618
+
619
+float current_time();