/* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */

/* This the fast UPGMA algorithm (O(N^2)) as implemented in Bob Edgar's
 * Muscle (UPGMA2.cpp; version 3.7) ported to pure C.
 *
 * Muscle's code is public domain and so is this code here.
 *
 * From http://www.drive5.com/muscle/license.htm:
 * """
 * MUSCLE is public domain software
 *
 * The MUSCLE software, including object and source code and
 * documentation, is hereby donated to the public domain.
 *
 * Disclaimer of warranty
 * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * """
 *
 */

/*
 *  RCS $Id: muscle_upgma.c 230 2011-04-09 15:37:50Z andreas $
 *
 *
 * Notes:
 * ------
 * LINKAGE become linkage_t here
 *
 * Replaced the the following member functions for DistCalc DC:
 * DC.GetId = sequence id as int
 * DC.GetName = sequence name
 * DC.GetCount = matrix dim
 * DC.DistRange = vector / matrix row for object i with index j<i
 *
 * Log() has been replaced with Clustal's Info(), Quiet() with Log(&rLog, LOG_FATAL)
 *
 * Made TriangleSubscript() and g_ulTriangleSize ulong to prevent overflow for many sequences
 */

#ifndef ulint
/* limit use of unsigned vars (see coding_style_guideline.txt) */
typedef unsigned long int ulong;
#endif



#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include "util.h"
#include "log.h"
#include "symmatrix.h"

#include "muscle_tree.h"
#include "muscle_upgma.h"

/* from distcalc.h */
typedef float dist_t;
static const dist_t BIG_DIST = (dist_t) 1e29;
/* from muscle.h */
static const unsigned uInsane = 8888888;




/*static inline*/
ulong TriangleSubscript(uint uIndex1, uint uIndex2);




#define TRACE   0

#ifndef MIN
#define MIN(x, y)   ((x) < (y) ? (x) : (y))
#endif
#ifndef MIN
#define MAX(x, y)   ((x) > (y) ? (x) : (y))
#endif
#define AVG(x, y)   (((x) + (y))/2)

static uint g_uLeafCount;
static ulong g_ulTriangleSize;
static uint g_uInternalNodeCount;
static uint g_uInternalNodeIndex;

/* Triangular distance matrix is g_Dist, which is allocated
 * as a one-dimensional vector of length g_ulTriangleSize.
 * TriangleSubscript(i,j) maps row,column=i,j to the subscript
 * into this vector.
 * Row / column coordinates are a bit messy.
 * Initially they are leaf indexes 0..N-1.
 * But each time we create a new node (=new cluster, new subtree),
 * we re-use one of the two rows that become available (the children
 * of the new node). This saves memory.
 * We keep track of this through the g_uNodeIndex vector.
 */
static dist_t *g_Dist;

/* Distance to nearest neighbor in row i of distance matrix.
 * Subscript is distance matrix row.
 */
static dist_t *g_MinDist;

/* Nearest neighbor to row i of distance matrix.
 * Subscript is distance matrix row.
 */
static uint *g_uNearestNeighbor;

/* Node index of row i in distance matrix.
 * Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
 * Subscript is distance matrix row.
 */
static uint *g_uNodeIndex;

/* The following vectors are defined on internal nodes,
 * subscripts are internal node index 0..N-2.
 * For g_uLeft/Right, value is the node index 0 .. 2N-2
 * because a child can be internal or leaf.
 */
static uint *g_uLeft;
static uint *g_uRight;
static dist_t *g_Height;
static dist_t *g_LeftLength;
static dist_t *g_RightLength;


/***   CalcDistRange
 *
 * Imitation of DistCalc.DistRange
 *
 * Sets values of row (vector / matrix row) to distances for object i with index j<i
 *
 * row must be preallocated
 */
void CalcDistRange(symmatrix_t *distmat, uint i, dist_t *row)
{
    uint j;
    for (j = 0; j < i; ++j) {
        row[j] = SymMatrixGetValue(distmat, i, j);
    }
}
/* end of CalcDistRange */



/*static inline*/
ulong
TriangleSubscript(uint uIndex1, uint uIndex2)
{
    ulong v;
#ifndef NDEBUG
    if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
        Log(&rLog, LOG_FATAL, "TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
#endif
    if (uIndex1 >= uIndex2)
        v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
    else
        v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
    assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
    return v;
}

#ifdef UNUSED
static void ListState()
{
    uint i, j;
    Info("Dist matrix\n");
    Info("     ");
    for (i = 0; i < g_uLeafCount; ++i)
    {
        if (uInsane == g_uNodeIndex[i])
            continue;
        Info("  %5u", g_uNodeIndex[i]);
    }
    Info("\n");

    for (i = 0; i < g_uLeafCount; ++i)
    {
        if (uInsane == g_uNodeIndex[i])
            continue;
        Info("%5u  ", g_uNodeIndex[i]);
        for (j = 0; j < g_uLeafCount; ++j)
        {
            if (uInsane == g_uNodeIndex[j])
                continue;
            if (i == j)
                Info("       ");
            else
            {
                ulong v = TriangleSubscript(i, j);
                Info("%5.2g  ", g_Dist[v]);
            }
        }
        Info("\n");
    }

    Info("\n");
    Info("    i   Node   NrNb      Dist\n");
    Info("-----  -----  -----  --------\n");
    for (i = 0; i < g_uLeafCount; ++i)
    {
        if (uInsane == g_uNodeIndex[i])
            continue;
        Info("%5u  %5u  %5u  %8.3f\n",
             i,
             g_uNodeIndex[i],
             g_uNearestNeighbor[i],
             g_MinDist[i]);
    }

    Info("\n");
    Info(" Node      L      R  Height  LLength  RLength\n");
    Info("-----  -----  -----  ------  -------  -------\n");
    for (i = 0; i <= g_uInternalNodeIndex; ++i)
        Info("%5u  %5u  %5u  %6.2g  %6.2g  %6.2g\n",
             i,
             g_uLeft[i],
             g_uRight[i],
             g_Height[i],
             g_LeftLength[i],
             g_RightLength[i]);
}
#endif
/* ifdef UNUSED */

/**
 * @brief Creates a UPGMA in O(N^2) tree from given distmat
 *
 * @param[out] tree
 * newly created rooted UPGMA tree
 * @param[in] distmat
 * distance matrix to be clustered
 * @param[in] linkage
 * linkage type
 * @param[in] names
 * leaf names, will be copied
 *
 * @note called UPGMA2() in Muscle3.7.
 * caller has to free with FreeMuscleTree()
 *
 * @see FreeMuscleTree()
 */
void
MuscleUpgma2(tree_t *tree, symmatrix_t *distmat, linkage_t linkage, char **names)
{
    int i, j;
    uint *Ids;

    /* only works on full symmetric matrices */
    assert (distmat->nrows==distmat->ncols);
   
    g_uLeafCount = distmat->ncols;    
    g_ulTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
    g_uInternalNodeCount = g_uLeafCount - 1;

    g_Dist = (dist_t *) CKMALLOC(g_ulTriangleSize * sizeof(dist_t));

    g_uNodeIndex = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
    g_uNearestNeighbor = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
    g_MinDist = (dist_t *) CKMALLOC(sizeof(dist_t) * g_uLeafCount);
    Ids = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
    /* NOTE: we replaced Names with argument names */

    /**
     * left and right node indices, as well as left and right
     * branch-length and height for for internal nodes
     */
    g_uLeft =  (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
    g_uRight =  (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
    g_Height =  (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
    g_LeftLength =  (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
    g_RightLength =  (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
    
    for (i = 0; i < g_uLeafCount; ++i) {
        g_MinDist[i] = BIG_DIST;
        g_uNodeIndex[i] = i;
        g_uNearestNeighbor[i] = uInsane;
        Ids[i] = i;
    }
    
    for (i = 0; i < g_uInternalNodeCount; ++i) {
        g_uLeft[i] = uInsane;
        g_uRight[i] = uInsane;
        g_LeftLength[i] = BIG_DIST;
        g_RightLength[i] = BIG_DIST;
        g_Height[i] = BIG_DIST;
    }
    
/* Compute initial NxN triangular distance matrix.
 * Store minimum distance for each full (not triangular) row.
 * Loop from 1, not 0, because "row" is 0, 1 ... i-1,
 * so nothing to do when i=0.
 */
    for (i = 1; i < g_uLeafCount; ++i) {
        dist_t *Row = g_Dist + TriangleSubscript(i, 0);
        CalcDistRange(distmat, i, Row);
        for (j = 0; j < i; ++j) {
            const dist_t d = Row[j];
            if (d < g_MinDist[i]) {
                g_MinDist[i] = d;
                g_uNearestNeighbor[i] = j;
            }
            if (d < g_MinDist[j]) {
                g_MinDist[j] = d;
                g_uNearestNeighbor[j] = i;
            }
        }
    }

#if TRACE
    Info("Initial state:\n");
    ListState();
#endif

    for (g_uInternalNodeIndex = 0;
         g_uInternalNodeIndex < g_uLeafCount - 1;
         ++g_uInternalNodeIndex) {

        dist_t dtNewMinDist = BIG_DIST;
        uint uNewNearestNeighbor = uInsane;

#if TRACE
        Info("\n");
        Info("Internal node index %5u\n", g_uInternalNodeIndex);
        Info("-------------------------\n");
#endif

        /* Find nearest neighbors */
        uint Lmin = uInsane;
        uint Rmin = uInsane;
        dist_t dtMinDist = BIG_DIST;
        for (j = 0; j < g_uLeafCount; ++j) {
            dist_t d;
            if (uInsane == g_uNodeIndex[j])
                continue;

            d = g_MinDist[j];
            if (d < dtMinDist) {
                dtMinDist = d;
                Lmin = j;
                Rmin = g_uNearestNeighbor[j];
                assert(uInsane != Rmin);
                assert(uInsane != g_uNodeIndex[Rmin]);
            }
        }

        assert(Lmin != uInsane);
        assert(Rmin != uInsane);
        assert(dtMinDist != BIG_DIST);

#if TRACE
        Info("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
             Lmin,
             g_uNodeIndex[Lmin],
             Rmin,
             g_uNodeIndex[Rmin],
             dtMinDist);
#endif

        /* Compute distances to new node
         * New node overwrites row currently assigned to Lmin
         */
        for ( j = 0; j < g_uLeafCount; ++j) {
            ulong vL, vR;
            dist_t dL, dR;
            dist_t dtNewDist;
            
            if (j == Lmin || j == Rmin)
                continue;
            if (uInsane == g_uNodeIndex[j])
                continue;

            vL = TriangleSubscript(Lmin, j);
            vR = TriangleSubscript(Rmin, j);
            dL = g_Dist[vL];
            dR = g_Dist[vR];
            dtNewDist = 0.0;

            switch (linkage) {
            case LINKAGE_AVG:
                dtNewDist = AVG(dL, dR);
                break;

            case LINKAGE_MIN:
                dtNewDist = MIN(dL, dR);
                break;

            case LINKAGE_MAX:
                dtNewDist = MAX(dL, dR);
                break;
/* couldn't be arsed to figure out proper usage of g_dSUEFF */
#if 0
            case LINKAGE_BIASED:
                dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
                break;
#endif
            default:
                Log(&rLog, LOG_FATAL, "UPGMA2: Invalid LINKAGE_%u", linkage);
            }

            /* Nasty special case.
             * If nearest neighbor of j is Lmin or Rmin, then make the new
             * node (which overwrites the row currently occupied by Lmin)
             * the nearest neighbor. This situation can occur when there are
             * equal distances in the matrix. If we don't make this fix,
             * the nearest neighbor pointer for j would become invalid.
             * (We don't need to test for == Lmin, because in that case
             * the net change needed is zero due to the change in row
             * numbering).
             */
            if (g_uNearestNeighbor[j] == Rmin)
                g_uNearestNeighbor[j] = Lmin;

#if TRACE
            Info("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
                 j, Lmin, dL, Rmin, dR, dtNewDist);
#endif
            g_Dist[vL] = dtNewDist;
            if (dtNewDist < dtNewMinDist) {
                dtNewMinDist = dtNewDist;
                uNewNearestNeighbor = j;
            }
        }

        assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
        assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);

        const ulong v = TriangleSubscript(Lmin, Rmin);
        const dist_t dLR = g_Dist[v];
        const dist_t dHeightNew = dLR/2;
        const uint uLeft = g_uNodeIndex[Lmin];
        const uint uRight = g_uNodeIndex[Rmin];
        const dist_t HeightLeft =
            uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
        const dist_t HeightRight =
            uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];

        g_uLeft[g_uInternalNodeIndex] = uLeft;
        g_uRight[g_uInternalNodeIndex] = uRight;
        g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
        g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
        g_Height[g_uInternalNodeIndex] = dHeightNew;

        /* Row for left child overwritten by row for new node */
        g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
        g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
        g_MinDist[Lmin] = dtNewMinDist;

        /* Delete row for right child */
        g_uNodeIndex[Rmin] = uInsane;

#if TRACE
        Info("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
             g_uInternalNodeIndex, Lmin, Rmin);
        ListState();
#endif
    }

    uint uRoot = g_uLeafCount - 2;

#if TRACE
    Log(&rLog, LOG_FORCED_DEBUG, "uRoot=%d g_uLeafCount=%d g_uInternalNodeCount=%d", uRoot, g_uLeafCount, g_uInternalNodeCount);
    for (i=0; i<g_uInternalNodeCount; i++) {
        Log(&rLog, LOG_FORCED_DEBUG, "internal node=%d:  g_uLeft=%d g_uRight=%d g_LeftLength=%f g_RightLength=%f g_Height=%f",
                  i, g_uLeft[i], g_uRight[i],
                  g_LeftLength[i], g_RightLength[i],
                  g_Height[i]);
    }
    for (i=0; i<g_uLeafCount; i++) {
        Log(&rLog, LOG_FORCED_DEBUG, "leaf node=%d:  Ids=%d names=%s",
                  i, Ids[i], names[i]);
    }
#endif
    
    MuscleTreeCreate(tree, g_uLeafCount, uRoot,
                      g_uLeft, g_uRight,
                      g_LeftLength, g_RightLength,
                      Ids, names);
#if TRACE
    tree.LogMe();
#endif

    free(g_Dist);

    free(g_uNodeIndex);
    free(g_uNearestNeighbor);
    free(g_MinDist);
    free(g_Height);

    free(g_uLeft);
    free(g_uRight);
    free(g_LeftLength);
    free(g_RightLength);

    /* NOTE: Muscle's "Names" variable is here the argument "names" */
    free(Ids);
}
/***   end of UPGMA2   ***/