src/cdhit-common.h
a1725dd6
 // =============================================================================
 // CD-HI/CD-HIT
 //
 // Cluster Database at High Identity Threshold
 //
 // CD-HIT clusters protein sequence database at high sequence identity threshold.
 // This program can remove the high sequence redundance efficiently.
 //
 // program written by
 //                    Weizhong Li
 //                    UCSD, San Diego Supercomputer Center
 //                    La Jolla, CA, 92093
 //                    Email liwz@sdsc.edu
 //
 //                 at
 //                    Adam Godzik's lab
 //                    The Burnham Institute
 //                    La Jolla, CA, 92037
 //                    Email adam@burnham-inst.org
 //
 // modified by:
 //                    Limin Fu
 //                    Center for Research in Biological Systems (CRBS), UCSD
 //                    La Jolla, CA, 92093
 //                    Email: l2fu@ucsd.edu, fu@daovm.net
 //
 // modified by:
 //                    Thomas Lin Pedersen
 //                    Center for Biological Sequencing (CBS), DTU
 //                    2300 Kongens Lyngby, Denmark
 //                    Email: thomasp85@gmail.com
34c488c2
 //
a1725dd6
 // =============================================================================
 
 #include<iostream>
 #include<fstream>
 #include<iomanip>
 #include<cstdlib>
 #include<stdio.h>
 #include<string.h>
 #include<ctype.h>
 #include<stdint.h>
 #include<time.h>
 
 #include<valarray>
 #include<vector>
 #include<map>
 
 #define CDHIT_VERSION  "4.6"
 
 #ifndef MAX_SEQ
 #define MAX_SEQ 655360
 #endif
 
 #define MAX_AA 23
 #define MAX_NA 6
 #define MAX_UAA 21
 #define MAX_DIAG (MAX_SEQ<<1)              // MAX_DIAG be twice of MAX_SEQ
 #define MAX_GAP MAX_SEQ                    // MAX_GAP <= MAX_SEQ
 #define MAX_DES 300000
 #define MAX_LINE_SIZE 300000
 #define MAX_FILE_NAME 1280
 #define MAX_SEG 50
 #define MAX_BIN_SWAP 2E9
 #define MAX_TABLE_SIZE 50000000
 #define CLOCK_TICKS 100
 #define FAILED_FUNC 1
 #define OK_FUNC 0
 
 #define IS_REP 1
 #define IS_REDUNDANT 2
 #define IS_PROCESSED 16
 #define IS_MINUS_STRAND 32
 
 /*
 #define max(a,b) (((a)>(b))?(a):(b))
 #define min(a,b) (((a)<(b))?(a):(b))
 */
 
 typedef unsigned int UINT4;
 typedef unsigned short UINT2;
 
 #define LONG_SEQ
 
 //if the longset sequence is longer than 65535, I use INT4
 #ifdef LONG_SEQ
 #define INTs UINT4
 #else
 #define INTs UINT2
 #endif
 
 using namespace std;
 
 // the parent containter must guarantee continuous memory allocation.
 // std::valarray could be used instead of std::vector.
 template<class TYPE>
 class Vector : public vector<TYPE>
 {
 	public:
 		Vector() : vector<TYPE>(){}
 		Vector( size_t size ) : vector<TYPE>( size ){}
 		Vector( size_t size, const TYPE & deft ) : vector<TYPE>( size, deft ){}
 
 		void Append( const TYPE & item ){
 			size_t n = this->size();
 			if( n + 1 >= this->capacity() ) this->reserve( n + n/5 + 1 );
 			this->push_back( item );
 		}
 		int size()const{ return (int)vector<TYPE>::size(); }
 };
 
 // for primitive types only
 template<class TYPE>
 class NVector
 {
 	public:
 		TYPE   *items;
 		int     size;
 		int     capacity;
 
 		NVector(){ size = capacity = 0; items = NULL; }
34c488c2
 		NVector( int n, const TYPE & v=TYPE() ){
 			size = capacity = 0; items = NULL;
a1725dd6
 			Resize( n, v );
 		}
 		NVector( const NVector & other ){
34c488c2
 			size = capacity = 0; items = NULL;
a1725dd6
 			if( other.items ){
 				Resize( other.size );
 				memcpy( items, other.items, other.size * sizeof(TYPE) );
 			}
 		}
 
 		~NVector(){ if( items ) free( items ); }
 
 		int  Size()const{ return size; }
 		void Clear(){
 			if( items ) free( items );
34c488c2
 			size = capacity = 0; items = NULL;
a1725dd6
 		}
 
 		void Resize( int n, const TYPE & value=TYPE() ){
 			if( n == size && capacity > 0 ) return;
 			int i;
 			// When resize() is called, probably this is the intended size,
 			// and will not be changed frequently.
 			if( n != capacity ){
 				capacity = n;
 				items = (TYPE*)realloc( items, capacity*sizeof(TYPE) );
 			}
 			for(i=size; i<n; i++ ) items[i] = value;
 			size = n;
 		}
 		void Append( const TYPE & item ){
 			if( size + 1 >= capacity ){
 				capacity = size + size/5 + 1;
 				items = (TYPE*)realloc( items, capacity*sizeof(TYPE) );
 			}
 			items[size] = item;
 			size ++;
 		}
 
 		TYPE& operator[]( const int i ){
 			//if( i <0 or i >= size ) printf( "out of range\n" );
 			return items[i];
 		}
 		TYPE& operator[]( const int i )const{
 			//if( i <0 or i >= size ) printf( "out of range\n" );
 			return items[i];
 		}
 };
 typedef NVector<int>      VectorInt;
 typedef Vector<VectorInt> MatrixInt;
 
 typedef NVector<int64_t>   VectorInt64;
 typedef Vector<VectorInt64> MatrixInt64;
 
 ////////// Class definition //////////
 class ScoreMatrix { //Matrix
 	private:
 
 	public:
 		int matrix[MAX_AA][MAX_AA];
 		int gap, ext_gap;
 
 		ScoreMatrix();
 		void init();
 		void set_gap(int gap1, int ext_gap1);
 		void set_matrix(int *mat1);
 		void set_to_na();
 		void set_match( int score );
 		void set_mismatch( int score );
 }; // END class ScoreMatrix
 
 
 typedef NVector<INTs>      VectorIntX;
 typedef Vector<VectorIntX> MatrixIntX;
 
 
 extern int NAA1 ;
 extern int NAA2 ;
 extern int NAA3 ;
 extern int NAA4 ;
 extern int NAA5 ;
 extern int NAA6 ;
 extern int NAA7 ;
 extern int NAA8 ;
 extern int NAA9 ;
 extern int NAA10;
 extern int NAA11;
 extern int NAA12;
 extern int NAAN_array[13];
 
 void InitNAA( int max );
 
34c488c2
 void setaa_to_na();
 void resetaa();
 
a1725dd6
 
 extern int naa_stat_start_percent;
 extern int naa_stat[5][61][4];
 
 struct IndexCount
 {
 	int index;
 	int count;
 
 	IndexCount( int i=0, int c=0 ){ index = i, count = c; }
 };
 
 struct Sequence;
 
 class WordTable
 {
 	private:
 	public:
 		Vector<NVector<IndexCount> > indexCounts; // hold index and word counts of seqs
 		Vector<Sequence*>            sequences;
 		int     NAA;                // length of word
 		int     NAAN;               // rows of table
 		char    is_aa;              // aa is for prot
 		size_t  size;
 		int     frag_count;
 
 	public:
 		WordTable( int naa=0, int naan=0 );
 		void Init(int, int);
 		void Clear();
 		void SetDNA();
 		int  AddWordCounts( NVector<IndexCount> & counts, Sequence *seq, bool skipN=false);
 		int  AddWordCountsFrag( NVector<IndexCount> & counts, int frag, int frag_size, int repfrag );
 
34c488c2
 		int  AddWordCounts(int aan_no, Vector<int> & word_encodes,
a1725dd6
 				Vector<INTs> & word_encodes_no, int idx, bool skipN=false);
34c488c2
 		int AddWordCountsFrag( int aan_no, Vector<int> & word_encodes,
a1725dd6
 				Vector<INTs> & word_encodes_no, int frag, int frag_size );
34c488c2
 		int CountWords(int aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no,
a1725dd6
 				NVector<IndexCount> & lookCounts, NVector<uint32_t> & indexMapping,
 				bool est=false, int min=0);
 		void PrintAll();
 }; // END class INDEX_TBL
 struct Options
 {
 	int     NAA;
 	int     NAAN;
 	int     NAA_top_limit;
 
 	size_t  max_memory; // -M: 400,000,000 in bytes
 	int     min_length; // -l: 10 bases
 	bool    cluster_best;  // -g: 0, the first; 1, the best
 	bool    global_identity; // -G:
 	bool    store_disk; // -B:
 	int     band_width; // -b: 20
 	double  cluster_thd; // -c
 	double  distance_thd; // -D
 	double  diff_cutoff; // -s: 0.0
 	double  diff_cutoff2; // -s2: 1.0
 	int     diff_cutoff_aa; // -S: 999999
 	int     diff_cutoff_aa2; // -S2: 0
 	int     tolerance; // -t: 2
 	double  long_coverage; // -aL:
 	int     long_control; // -AL:
 	double  short_coverage; // -aS:
 	int     short_control; // -AS:
 	int     min_control; // -A:
 	double  long_unmatch_per; // -uL
 	double  short_unmatch_per; // -uS
 	int     unmatch_len; // -U
 	int     max_indel; // -D
 	int     print;
 	int     des_len;
 	int     frag_size;
 	int     option_r;
 	int     threads;
 
 	size_t  max_entries;
 	size_t  max_sequences;
 	size_t  mem_limit;
 
 	bool    has2D;
 	bool    isEST;
 	bool    is454;
 	bool    useIdentity;
 	bool    useDistance;
 	bool    backupFile;
 
 	string  input;
 	string  input2;
 	string  output;
 
 	Options(){
 		backupFile = false;
 		useIdentity = false;
 		useDistance = false;
 		has2D = false;
 		isEST = false;
 		is454 = false;
 		NAA = 5;
bbbee14b
 		NAAN = 0;
a1725dd6
 		NAA_top_limit = 5;
 		cluster_thd = 0.9;
 		distance_thd = 0.0;
 		max_memory = 800000000;
 		min_length = 10;
 		cluster_best = false;
 		global_identity = true;
 		store_disk = false;
 		band_width = 20;
 		diff_cutoff = 0.0;
 		diff_cutoff2 = 1.0;
 		diff_cutoff_aa = 99999999;
 		diff_cutoff_aa2 = 0;
 		tolerance = 2;
 		long_coverage = 0.0;
 		long_control = 99999999;
 		short_coverage = 0.0;
 		short_control = 99999999;
 		long_unmatch_per = 1.0;
 		short_unmatch_per = 1.0;
 		unmatch_len = 99999999;
 		min_control = 0;
 		max_indel = 1;
 		print = 0;
 		option_r  = 1;
 		frag_size = 0;
 		des_len = 20;
 		threads = 1;
 		max_entries = 0;
 		max_sequences = 1<<20;
 		mem_limit = 100000000;
 	};
 
 	bool SetOptionCommon( const char *flag, const char *value );
 	bool SetOption( const char *flag, const char *value );
 	bool SetOption2D( const char *flag, const char *value );
 	bool SetOptionEST( const char *flag, const char *value );
 	bool SetOptions( int argc, char *argv[], bool twodata=false, bool est=false );
 
 	void Validate();
 	void ComputeTableLimits( int min_len, int max_len, int typical_len, size_t mem_need );
 
 	void Print();
 };
 
34c488c2
 extern Options options;
 
a1725dd6
 void bomb_error(const char *message);
 void clear_temps();
 
 struct Sequence
 {
 	// real sequence, if it is not stored swap file:
 	char *data;
 	// length of the sequence:
 	int   size;
 	int   bufsize;
 
 	//uint32_t stats;
 
 	// if swap != NULL, the sequence is stored in file.
 	// swap is opened as temporary file, which will be deleted automatically
 	// after the program is finished:
 	FILE *swap;
 	// stream offset of the sequence:
 	int   offset;
 
 	// stream offset of the description string in the database:
 	size_t   des_begin;
 	// length of the description:
 	int   des_length;
 	// length of the description in quality score part:
 	int   des_length2;
 	// length of data in fasta file, including line wrapping:
 	int   dat_length;
 
 	char *identifier;
 
 	// index of the sequence in the original database:
 	int   index;
 	short state;
 	int   cluster_id;
 	float identity;
 	float distance;
 	int   coverage[4];
 
 	Sequence();
 	Sequence( const Sequence & other );
 	~Sequence();
 
 	void Clear();
 
 	void operator=( const char *s );
 	void operator+=( const char *s );
 
 	void Resize( int n );
 	void Reserve( int n );
 
bbbee14b
 	//void Swap( Sequence & other );
a1725dd6
 	int Format();
 
 	void ConvertBases();
 
 	void SwapIn();
 	void SwapOut();
 	void PrintInfo( int id, FILE *fout, const Options & options, char *buf );
 };
 
 struct WorkingParam
 {
 	double aa1_cutoff;
 	double aas_cutoff; /* or aa2 */
 	double aan_cutoff;
 	int    len_upper_bound;
 	int    len_lower_bound;
 
 	WorkingParam( double a1=0, double a2=0, double an=0 ){
 		Set( a1, a2, an );
 	}
 	void Set( double a1=0, double a2=0, double an=0 ){
 		aa1_cutoff = a1;
 		aas_cutoff = a2;
 		aan_cutoff = an;
 		len_upper_bound = 0;
 		len_lower_bound = 0;
 	}
 
 	int len_eff;
 	int aln_cover_flag;
 	int min_aln_lenS;
 	int min_aln_lenL;
 	int required_aa1;
 	int required_aas; /* or aa2 */
 	int required_aan;
 
 	void ControlShortCoverage( int len, const Options & option );
 	void ControlLongCoverage( int len, const Options & option );
 	void ComputeRequiredBases( int NAA, int ss, const Options & option );
 };
 
 //#define MAX_TABLE_SEQ (1<<22)
 #define MAX_TABLE_SEQ 4000000
 
 enum { DP_BACK_NONE=0, DP_BACK_LEFT_TOP=1, DP_BACK_LEFT=2, DP_BACK_TOP=3 };
 
 struct WorkingBuffer
 {
 	Vector<int>  taap;
 	Vector<int>  word_encodes;
 	Vector<int>  word_encodes_backup;
 	Vector<INTs> word_encodes_no;
 	Vector<INTs> aap_list;
 	Vector<INTs> aap_begin;
 	//Vector<IndexCount>  indexCounts;
 	NVector<IndexCount>  lookCounts;
 	NVector<uint32_t>    indexMapping;
 	MatrixInt64  score_mat;
 	MatrixInt    back_mat;
 	Vector<int>  diag_score;
 	Vector<int>  diag_score2;
 	Vector<int> aan_list_comp;
 	Vector<char> seqi_comp;
 	int total_bytes;
 
 	WorkingBuffer( size_t frag=0, size_t maxlen=0, const Options & options=Options() ){
 		Set( frag, maxlen, options );
 		seqi_comp.resize( MAX_SEQ );
 	}
 	void Set( size_t frag, size_t maxlen, const Options & options ){
 		bool est = options.isEST;
 		size_t m = MAX_UAA*MAX_UAA;
 		size_t max_len = maxlen;
 		size_t band = max_len*max_len;
 		if( est ) m = m * m;
 		if( band > options.band_width ) band = options.band_width;
 		taap.resize( m );
 		aap_list.resize( max_len );
 		aap_begin.resize( m );
 		//indexCounts.resize( max_len );
 		word_encodes.resize( max_len );
 		word_encodes_no.resize( max_len );
 		word_encodes_backup.resize( max_len );
 		/* each table can not contain more than MAX_TABLE_SEQ representatives or fragments! */
 		if( frag > MAX_TABLE_SEQ ) frag = MAX_TABLE_SEQ;
 		lookCounts.Resize( frag + 2 );
 		indexMapping.Resize( frag + 2 );
 		diag_score.resize( MAX_DIAG );
 		diag_score2.resize( MAX_DIAG );
 		aan_list_comp.resize( max_len );
 		total_bytes = max_len;
 		total_bytes += taap.size()*sizeof(int);
 		total_bytes += word_encodes.size()*sizeof(int);
 		total_bytes += word_encodes_backup.size()*sizeof(int);
 		total_bytes += diag_score.size()*sizeof(int);
 		total_bytes += diag_score2.size()*sizeof(int);
 		total_bytes += aan_list_comp.size()*sizeof(int);
 		total_bytes += word_encodes_no.size()*sizeof(INTs);
 		total_bytes += aap_list.size()*sizeof(INTs);
 		total_bytes += aap_begin.size()*sizeof(INTs);
 		total_bytes += indexMapping.Size()*sizeof(uint32_t);
 		//total_bytes += indexCounts.size()*sizeof(IndexCount);
 		total_bytes += lookCounts.Size()*sizeof(IndexCount);
 		total_bytes += max_len*(band*sizeof(int)+sizeof(VectorInt));
 		total_bytes += max_len*(band*sizeof(int)+sizeof(VectorInt64));
 	}
 
 	int EncodeWords( Sequence *seq, int NA, bool est = false );
 	void ComputeAAP( const char *seqi, int size );
 	void ComputeAAP2( const char *seqi, int size );
 };
 extern Vector<int>  Comp_AAN_idx;
 extern ScoreMatrix  mat;
 
 
 class SequenceDB
 {
 	public:
 
 		int NAAN;
 		Vector<Sequence*>  sequences;
 		Vector<int>        rep_seqs;
 
 		long long total_letter;
 		long long total_desc;
 		size_t max_len;
 		size_t min_len;
 		size_t len_n50;
 
 		void Clear(){
 			for(int i=0; i<sequences.size(); i++) delete sequences[i];
 			sequences.clear(); rep_seqs.clear();
 		}
 
 		SequenceDB(){
 			total_letter = 0;
 			total_desc = 0;
 			min_len = 0;
 			max_len = 0;
 			len_n50 = 0;
 		}
 		~SequenceDB(){ Clear(); }
 
 		void Read( const char *file, const Options & options );
 		void WriteClusters( const char *db, const char *newdb, const Options & options );
 		void WriteExtra1D( const Options & options );
 		vector<int> GetClusters( const Options & options );
 		void WriteExtra2D( SequenceDB & other, const Options & options );
 		void DivideSave( const char *db, const char *newdb, int n, const Options & options );
 
 		void SwapIn( int seg, bool reponly=false );
 		void SwapOut( int seg );
 
 		//void Sort( int first, int last );
 		void SortDivide( Options & options, bool sort=true );
 		void MakeWordTable( const Options & optioins );
 
 		size_t MinimalMemory( int frag_no, int bsize, int T, const Options & options, size_t extra=0 );
 
 		void ClusterOne( Sequence *seq, int id, WordTable & table,
 				WorkingParam & param, WorkingBuffer & buf, const Options & options );
 
34c488c2
 		//void SelfComparing( int start, int end, WordTable & table,
a1725dd6
 		//    WorkingParam & param, WorkingBuffer & buf, const Options & options );
 
 		void ComputeDistance( const Options & options );
 		void DoClustering( const Options & options, std::string name, bool showProgress );
 		void DoClustering( int T, const Options & options );
 		void ClusterTo( SequenceDB & other, const Options & optioins );
 		int  CheckOne( Sequence *seq, WordTable & tab, WorkingParam & par, WorkingBuffer & buf, const Options & opt );
 		int  CheckOneEST( Sequence *seq, WordTable & tab, WorkingParam & par, WorkingBuffer & buf, const Options & opt );
 		int  CheckOneAA( Sequence *seq, WordTable & tab, WorkingParam & par, WorkingBuffer & buf, const Options & opt );
 };
 
 
 int print_usage (char *arg);
 void bomb_error(const char *message);
 void bomb_error(const char *message, const char *message2);
 void bomb_warning(const char *message);
 void bomb_warning(const char *message, const char *message2);
 void format_seq(char *seq);
34c488c2
 int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2,
a1725dd6
 		WorkingBuffer & buffer, int &best_sum,
 		int band_width, int &band_left, int &band_center, int &band_right, int required_aa1);
34c488c2
 int diag_test_aapn_est(int NAA1, char iseq2[], int len1, int len2,
a1725dd6
 		WorkingBuffer & buffer, int &best_sum,
 		int band_width, int &band_left, int &band_center, int &band_right, int required_aa1);
34c488c2
 int local_band_align( char query[], char ref[], int qlen, int rlen, ScoreMatrix &mat,
a1725dd6
 		int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
 		int band_left, int band_center, int band_right, WorkingBuffer & buffer);
 
 int print_usage_2d (char *arg);
 int print_usage_est (char *arg);
 int print_usage_div (char *arg);
 int print_usage_est_2d (char *arg);
 
 int upper_bound_length_rep(int len, const Options & options );
 void cal_aax_cutoff (double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
 		double NR_clstr, int tolerance, int naa_stat_start_percent,
 		int naa_stat[5][61][4], int NAA);
 void update_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
 		int tolerance, int naa_stat_start_percent,
 		int naa_stat[5][61][4], int NAA, int iden);
 
 int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est=false);
 
 float current_time();