src/R_interface.cpp
326f362a
 
 
 
6c315cf8
 #include "R_interface.h"
8a70266d
 
 static ScaleHMM* hmm; // declare as static outside the function because we only need one and this enables memory-cleanup on R_CheckUserInterrupt()
 static double** multiD;
6ad4e3b8
 
a0b1f4ae
 // ===================================================================================================================================================
fa363ffb
 // This function takes parameters from R, creates a univariate HMM object, creates the distributions, runs the EM and returns the result to R.
a0b1f4ae
 // ===================================================================================================================================================
b1ab269a
 void univariate_hmm(int* O, int* T, int* N, int* state_labels, double* size, double* prob, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* weights, int* distr_type, double* initial_size, double* initial_prob, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* read_cutoff, int* algorithm, int* verbosity)
a0b1f4ae
 {
6ad4e3b8
 
 	// Define logging level
a0b1f4ae
 // 	FILE* pFile = fopen("chromStar.log", "w");
6ad4e3b8
 // 	Output2FILE::Stream() = pFile;
8a70266d
 //  	FILELog::ReportingLevel() = FILELog::FromString("NONE");
41c75ecc
 //  	FILELog::ReportingLevel() = FILELog::FromString("DEBUG2");
6ad4e3b8
 
7d8b241d
 // 	// Parallelization settings
a9469ef4
 // 	omp_set_num_threads(*num_threads);
6ad4e3b8
 
 	// Print some information
8a70266d
 	//FILE_LOG(logINFO) << "number of states = " << *N;
b1ab269a
 	if (*verbosity>=1) Rprintf("number of states = %d\n", *N);
8a70266d
 	//FILE_LOG(logINFO) << "number of bins = " << *T;
b1ab269a
 	if (*verbosity>=1) Rprintf("number of bins = %d\n", *T);
6ad4e3b8
 	if (*maxiter < 0)
 	{
8a70266d
 		//FILE_LOG(logINFO) << "maximum number of iterations = none";
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum number of iterations = none\n");
6ad4e3b8
 	} else {
8a70266d
 		//FILE_LOG(logINFO) << "maximum number of iterations = " << *maxiter;
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum number of iterations = %d\n", *maxiter);
6ad4e3b8
 	}
 	if (*maxtime < 0)
 	{
8a70266d
 		//FILE_LOG(logINFO) << "maximum running time = none";
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum running time = none\n");
6ad4e3b8
 	} else {
8a70266d
 		//FILE_LOG(logINFO) << "maximum running time = " << *maxtime << " sec";
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum running time = %d sec\n", *maxtime);
6ad4e3b8
 	}
8a70266d
 	//FILE_LOG(logINFO) << "epsilon = " << *eps;
b1ab269a
 	if (*verbosity>=1) Rprintf("epsilon = %g\n", *eps);
a0b1f4ae
 
8a70266d
 	//FILE_LOG(logDEBUG3) << "observation vector";
6ad4e3b8
 	for (int t=0; t<50; t++) {
8a70266d
 		//FILE_LOG(logDEBUG3) << "O["<<t<<"] = " << O[t];
6ad4e3b8
 	}
 
b1ab269a
 	// Flush if (*verbosity>=1) Rprintf statements to console
a0b1f4ae
 	R_FlushConsole();
 
6ad4e3b8
 	// Create the HMM
8a70266d
 	//FILE_LOG(logDEBUG1) << "Creating a univariate HMM";
 	hmm = new ScaleHMM(*T, *N);
1fe9061a
 // 	LogHMM* hmm = new LogHMM(*T, *N);
a0b1f4ae
 	hmm->set_cutoff(*read_cutoff);
6ad4e3b8
 	// Initialize the transition probabilities and proba
079638e5
 	hmm->initialize_transition_probs(initial_A, *use_initial_params);
 	hmm->initialize_proba(initial_proba, *use_initial_params);
6ad4e3b8
     
 	// Calculate mean and variance of data
 	double mean = 0, variance = 0;
 	for(int t=0; t<*T; t++)
 	{
 		mean+= O[t];
 	}
 	mean = mean / *T;
 	for(int t=0; t<*T; t++)
 	{
 		variance+= pow(O[t] - mean, 2);
 	}
 	variance = variance / *T;
8a70266d
 	//FILE_LOG(logINFO) << "data mean = " << mean << ", data variance = " << variance;		
b1ab269a
 	if (*verbosity>=1) Rprintf("data mean = %g, data variance = %g\n", mean, variance);		
6ad4e3b8
 	
7d8b241d
 	// Create the emission densities and initialize
f4cdf709
 	for (int i_state=0; i_state<*N; i_state++)
 	{
 		if (distr_type[i_state] == 1)
a0b1f4ae
 		{
8a70266d
 			//FILE_LOG(logDEBUG1) << "Using delta distribution for state " << i_state;
1fe9061a
 			ZeroInflation *d = new ZeroInflation(O, *T); // delete is done inside ~ScaleHMM()
a0b1f4ae
 			hmm->densityFunctions.push_back(d);
 		}
f4cdf709
 		else if (distr_type[i_state] == 2)
ecb41d4a
 		{
8a70266d
 			//FILE_LOG(logDEBUG1) << "Using geometric distribution for state " << i_state;
7d8b241d
 			Geometric *d = new Geometric(O, *T, initial_prob[i_state]); // delete is done inside ~ScaleHMM()
1fe9061a
 			hmm->densityFunctions.push_back(d);
 		}
f4cdf709
 		else if (distr_type[i_state] == 3)
1fe9061a
 		{
8a70266d
 			//FILE_LOG(logDEBUG1) << "Using negative binomial for state " << i_state;
1fe9061a
 			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM()
ecb41d4a
 			hmm->densityFunctions.push_back(d);
 		}
7d8b241d
 		else if (distr_type[i_state] == 4)
 		{
8a70266d
 			//FILE_LOG(logDEBUG1) << "Using binomial for state " << i_state;
7d8b241d
 			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]); // delete is done inside ~ScaleHMM()
 			hmm->densityFunctions.push_back(d);
 		}
ecb41d4a
 		else
 		{
8a70266d
 			//FILE_LOG(logWARNING) << "Density not specified, using default negative binomial for state " << i_state;
a0b1f4ae
 			NegativeBinomial *d = new NegativeBinomial(O, *T, initial_size[i_state], initial_prob[i_state]);
ecb41d4a
 			hmm->densityFunctions.push_back(d);
 		}
6ad4e3b8
 	}
 
b1ab269a
 	// Flush if (*verbosity>=1) Rprintf statements to console
a0b1f4ae
 	R_FlushConsole();
 
fa363ffb
 	// Do the EM to estimate the parameters
a0b1f4ae
 	try
 	{
fa363ffb
 		if (*algorithm == 1)
 		{
 			hmm->baumWelch();
 		}
 		else if (*algorithm == 3)
 		{
 			//FILE_LOG(logDEBUG1) << "Starting EM estimation";
 			hmm->EM(maxiter, maxtime, eps);
 			//FILE_LOG(logDEBUG1) << "Finished with EM estimation";
 		}
a0b1f4ae
 	}
 	catch (std::exception& e)
 	{
fa363ffb
 		//FILE_LOG(logERROR) << "Error in EM/baumWelch: " << e.what();
b1ab269a
 		if (*verbosity>=1) Rprintf("Error in EM/baumWelch: %s\n", e.what());
8a70266d
 		if (strcmp(e.what(),"nan detected")==0) { *error = 1; }
a0b1f4ae
 		else { *error = 2; }
 	}
631d6738
 
fba447b1
 	// // Compute the posteriors and save results directly to the R pointer
 	// //FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
 	// #pragma omp parallel for
 	// for (int iN=0; iN<*N; iN++)
 	// {
 	// 	for (int t=0; t<*T; t++)
 	// 	{
 	// 		posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
 	// 	}
 	// }
631d6738
 
 	// Compute the states from posteriors
8a70266d
 	//FILE_LOG(logDEBUG1) << "Computing states from posteriors";
 	int ind_max;
 	std::vector<double> posterior_per_t(*N);
631d6738
 	for (int t=0; t<*T; t++)
6ad4e3b8
 	{
631d6738
 		for (int iN=0; iN<*N; iN++)
6ad4e3b8
 		{
631d6738
 			posterior_per_t[iN] = hmm->get_posterior(iN, t);
6ad4e3b8
 		}
8a70266d
 		ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
 		states[t] = state_labels[ind_max];
fba447b1
 		maxPosterior[t] = posterior_per_t[ind_max];
6ad4e3b8
 	}
 
8a70266d
 	//FILE_LOG(logDEBUG1) << "Return parameters";
6ad4e3b8
 	// also return the estimated transition matrix and the initial probs
079638e5
 	for (int i=0; i<*N; i++)
6ad4e3b8
 	{
079638e5
 		proba[i] = hmm->get_proba(i);
 		for (int j=0; j<*N; j++)
6ad4e3b8
 		{
7d8b241d
 			A[i * (*N) + j] = hmm->get_A(j,i);
6ad4e3b8
 		}
 	}
 
 	// copy the estimated distribution params
079638e5
 	for (int i=0; i<*N; i++)
6ad4e3b8
 	{
a0b1f4ae
 		if (hmm->densityFunctions[i]->get_name() == NEGATIVE_BINOMIAL) 
 		{
 			NegativeBinomial* d = (NegativeBinomial*)(hmm->densityFunctions[i]);
 			size[i] = d->get_size();
 			prob[i] = d->get_prob();
 		}
1fe9061a
 		else if (hmm->densityFunctions[i]->get_name() == GEOMETRIC)
 		{
 			Geometric* d = (Geometric*)(hmm->densityFunctions[i]);
 			size[i] = 0;
 			prob[i] = d->get_prob();
 		}
a0b1f4ae
 		else if (hmm->densityFunctions[i]->get_name() == ZERO_INFLATION)
 		{
 			// These values for a Negative Binomial define a zero-inflation (delta distribution)
 			size[i] = 0;
 			prob[i] = 1;
 		}
7d8b241d
 		else if (hmm->densityFunctions[i]->get_name() == BINOMIAL) 
 		{
 			Binomial* d = (Binomial*)(hmm->densityFunctions[i]);
 			size[i] = d->get_size();
 			prob[i] = d->get_prob();
 		}
6ad4e3b8
 	}
079638e5
 	*loglik = hmm->get_logP();
 	hmm->calc_weights(weights);
6ad4e3b8
 	
8a70266d
 	//FILE_LOG(logDEBUG1) << "Deleting the hmm";
079638e5
 	delete hmm;
8a70266d
 	hmm = NULL; // assign NULL to defuse the additional delete in on.exit() call
6ad4e3b8
 }
 
95bcf480
 
 // =====================================================================================================================================================
fa363ffb
 // This function takes parameters from R, creates a multivariate HMM object, runs the EM and returns the result to R.
95bcf480
 // =====================================================================================================================================================
c683cb96
 void multivariate_hmm(double* D, int* T, int* N, int *Nmod, int* comb_states, int* maxiter, int* maxtime, double* eps, double* maxPosterior, int* states, double* A, double* proba, double* loglik, double* initial_A, double* initial_proba, bool* use_initial_params, int* num_threads, int* error, int* algorithm, int* verbosity)
95bcf480
 {
 
 	// Define logging level {"ERROR", "WARNING", "INFO", "ITERATION", "DEBUG", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4"}
 // 	FILE* pFile = fopen("chromStar.log", "w");
 // 	Output2FILE::Stream() = pFile;
 //  	FILELog::ReportingLevel() = FILELog::FromString("ITERATION");
 //  	FILELog::ReportingLevel() = FILELog::FromString("DEBUG2");
8a70266d
 //  	FILELog::ReportingLevel() = FILELog::FromString("ERROR");
95bcf480
 
 	// Parallelization settings
a9469ef4
 // 	omp_set_num_threads(*num_threads);
95bcf480
 
 	// Print some information
8a70266d
 	//FILE_LOG(logINFO) << "number of states = " << *N;
b1ab269a
 	if (*verbosity>=1) Rprintf("number of states = %d\n", *N);
8a70266d
 	//FILE_LOG(logINFO) << "number of bins = " << *T;
b1ab269a
 	if (*verbosity>=1) Rprintf("number of bins = %d\n", *T);
95bcf480
 	if (*maxiter < 0)
 	{
8a70266d
 		//FILE_LOG(logINFO) << "maximum number of iterations = none";
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum number of iterations = none\n");
95bcf480
 	} else {
8a70266d
 		//FILE_LOG(logINFO) << "maximum number of iterations = " << *maxiter;
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum number of iterations = %d\n", *maxiter);
95bcf480
 	}
 	if (*maxtime < 0)
 	{
8a70266d
 		//FILE_LOG(logINFO) << "maximum running time = none";
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum running time = none\n");
95bcf480
 	} else {
8a70266d
 		//FILE_LOG(logINFO) << "maximum running time = " << *maxtime << " sec";
b1ab269a
 		if (*verbosity>=1) Rprintf("maximum running time = %d sec\n", *maxtime);
95bcf480
 	}
8a70266d
 	//FILE_LOG(logINFO) << "epsilon = " << *eps;
b1ab269a
 	if (*verbosity>=1) Rprintf("epsilon = %g\n", *eps);
8a70266d
 	//FILE_LOG(logINFO) << "number of modifications = " << *Nmod;
b1ab269a
 	if (*verbosity>=1) Rprintf("number of modifications = %d\n", *Nmod);
95bcf480
 
b1ab269a
 	// Flush if (*verbosity>=1) Rprintf statements to console
95bcf480
 	R_FlushConsole();
 
 	// Recode the densities vector to matrix representation
 // 	clock_t clocktime = clock(), dtime;
8a70266d
 	multiD = CallocDoubleMatrix(*N, *T);
95bcf480
 	for (int iN=0; iN<*N; iN++)
 	{
 		for (int t=0; t<*T; t++)
 		{
 			multiD[iN][t] = D[iN*(*T)+t];
 		}
 	}
 // 	dtime = clock() - clocktime;
8a70266d
 // 	//FILE_LOG(logDEBUG1) << "recoding densities vector to matrix representation: " << dtime << " clicks";
95bcf480
 
 	// Create the HMM
8a70266d
 	//FILE_LOG(logDEBUG1) << "Creating the multivariate HMM";
 	hmm = new ScaleHMM(*T, *N, *Nmod, multiD);
95bcf480
 	// Initialize the transition probabilities and proba
 	hmm->initialize_transition_probs(initial_A, *use_initial_params);
 	hmm->initialize_proba(initial_proba, *use_initial_params);
 	
 	// Print logproba and A
 // 	for (int iN=0; iN<*N; iN++)
 // 	{
8a70266d
 // 		//FILE_LOG(logDEBUG) << "proba["<<iN<<"] = " <<exp(hmm->logproba[iN]);
95bcf480
 // 		for (int jN=0; jN<*N; jN++)
 // 		{
8a70266d
 // 			//FILE_LOG(logDEBUG) << "A["<<iN<<"]["<<jN<<"] = " << hmm->A[iN][jN];
95bcf480
 // 		}
 // 	}
 
fa363ffb
 	// Do the EM to estimate the parameters
95bcf480
 	try
 	{
fa363ffb
 		if (*algorithm == 1)
 		{
 			hmm->baumWelch();
 		}
 		else if (*algorithm == 3)
 		{
 			//FILE_LOG(logDEBUG1) << "Starting EM estimation";
 			hmm->EM(maxiter, maxtime, eps);
 			//FILE_LOG(logDEBUG1) << "Finished with EM estimation";
 		}
95bcf480
 	}
 	catch (std::exception& e)
 	{
fa363ffb
 		//FILE_LOG(logERROR) << "Error in EM/baumWelch: " << e.what();
b1ab269a
 		if (*verbosity>=1) Rprintf("Error in EM/baumWelch: %s\n", e.what());
8a70266d
 		if (strcmp(e.what(),"nan detected")==0) { *error = 1; }
95bcf480
 		else { *error = 2; }
 	}
 	
 // 	// Compute the posteriors and save results directly to the R pointer
8a70266d
 // 	//FILE_LOG(logDEBUG1) << "Recode posteriors into column representation";
95bcf480
 // 	for (int iN=0; iN<*N; iN++)
 // 	{
 // 		for (int t=0; t<*T; t++)
 // 		{
 // 			posteriors[t + iN * (*T)] = hmm->get_posterior(iN, t);
 // 		}
 // 	}
 
 	// Compute the states from posteriors
8a70266d
 	//FILE_LOG(logDEBUG1) << "Computing states from posteriors";
 	int ind_max;
 	std::vector<double> posterior_per_t(*N);
95bcf480
 	for (int t=0; t<*T; t++)
 	{
 		for (int iN=0; iN<*N; iN++)
 		{
 			posterior_per_t[iN] = hmm->get_posterior(iN, t);
 		}
8a70266d
 		ind_max = std::distance(posterior_per_t.begin(), std::max_element(posterior_per_t.begin(), posterior_per_t.end()));
 		states[t] = comb_states[ind_max];
c683cb96
 		maxPosterior[t] = posterior_per_t[ind_max];
95bcf480
 	}
 	
8a70266d
 	//FILE_LOG(logDEBUG1) << "Return parameters";
95bcf480
 	// also return the estimated transition matrix and the initial probs
 	for (int i=0; i<*N; i++)
 	{
 		proba[i] = hmm->get_proba(i);
 		for (int j=0; j<*N; j++)
 		{
 				A[i * (*N) + j] = hmm->get_A(j,i);
 		}
 	}
 	*loglik = hmm->get_logP();
 
8a70266d
 	//FILE_LOG(logDEBUG1) << "Deleting the hmm";
95bcf480
 	delete hmm;
8a70266d
 	hmm = NULL; // assign NULL to defuse the additional delete in on.exit() call
 // 	FreeDoubleMatrix(multiD, *N);
95bcf480
 }
 
8a70266d
 
 // =======================================================
 // This function make a cleanup if anything was left over
 // =======================================================
08327ec8
 void univariate_cleanup()
8a70266d
 {
 // 	//FILE_LOG(logDEBUG2) << __PRETTY_FUNCTION__; // This message will be shown if interrupt happens before start of C-code
 	delete hmm;
 }
 
08327ec8
 void multivariate_cleanup(int* N)
8a70266d
 {
 	delete hmm;
 	FreeDoubleMatrix(multiD, *N);
 }
 
fba447b1
 
 // ====================================================================
 // C version of apply(array2D, 1, which.max) and apply(array2D, 1, max)
 // ====================================================================
 void array2D_which_max(double* array2D, int* dim, int* ind_max, double* value_max)
 {
   // array2D is actually a vector, but is intended to originate from a 2D array in R
 	std::vector<double> value_per_i0(dim[1]);
   for (int i0=0; i0<dim[0]; i0++)
   {
     for (int i1=0; i1<dim[1]; i1++)
     {
 			value_per_i0[i1] = array2D[i1 * dim[0] + i0];
b1ab269a
       // if (*verbosity>=1) Rprintf("i0=%d, i1=%d, value_per_i0[%d] = %g\n", i0, i1, i1, value_per_i0[i1]);
fba447b1
     }
 		ind_max[i0] = 1 + std::distance(value_per_i0.begin(), std::max_element(value_per_i0.begin(), value_per_i0.end()));
     value_max[i0] = *std::max_element(value_per_i0.begin(), value_per_i0.end());
   }
 	
b1ab269a
 }