/*  
    This code is written by <albanese@fbk.it>.
    (C) 2010 Fondazione Bruno Kessler - Via Santa Croce 77, 38100 Trento, ITALY.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/


#include <Python.h>
#include <numpy/arrayobject.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>

#define MIN( A , B ) ((A) < (B) ? (A) : (B))

#define INIT_STD 0
#define INIT_PLUSPLUS 1

/*
 * Standard initialization: use kk randomly chosen data points as the
 * initial means.
 *
 * The indices 0..nn-1 are shuffled by a GSL generator (gsl_rng_default)
 * seeded with `seed`; the data points found at the first kk shuffled
 * positions are copied into `means`.  The caller must ensure kk <= nn.
 *
 * If the temporary index buffer cannot be allocated, the first kk data
 * points are used instead, so `means` is always filled in.
 */
void init_std(double *data,      /* data points (nn points x pp dimensions) */
	      double *means,     /* means (kk clusters x pp dimensions) */
	      int nn,            /* number of data points */
	      int pp,            /* number of dimensions */
	      int kk,            /* number of clusters */
	      unsigned long seed /* random seed for init */
	      )
{
  int n, p, k;
  int *ridx;
  gsl_rng *r;

  ridx = (int *) malloc ((size_t) nn * sizeof(int));

  if (ridx != NULL)
    {
      for (n=0; n<nn; n++)
	ridx[n] = n;

      r = gsl_rng_alloc (gsl_rng_default);
      gsl_rng_set (r, seed);
      gsl_ran_shuffle (r, ridx, nn, sizeof (int));
      /* the generator is only needed for the shuffle: release it here */
      gsl_rng_free (r);
    }

  for (k=0; k<kk; k++)
    {
      /* data row used as the k-th mean (identity order without ridx); */
      /* size_t keeps the row offset from overflowing an int */
      size_t src = (size_t) ((ridx != NULL) ? ridx[k] : k);

      for (p=0; p<pp; p++)
	means[p + ((size_t) k * pp)] = data[p + (src * pp)];
    }

  free(ridx);
}


/* Helper for init_plusplus: element-wise minimum of two arrays of nn */
/* values.  a[i] is kept when a[i] < b[i], otherwise it is overwritten */
/* with b[i]; b is only read. */
void
dist_min(double *a, double *b, int nn)
{
  int i;

  for (i=0; i<nn; i++)
    {
      if (!(a[i] < b[i]))
	a[i] = b[i];
    }
}


/* Helper for init_plusplus: index of the largest of the nn values in a. */
/* Ties resolve to the lowest index.  0 is returned when no element is */
/* greater than -DBL_MAX, which includes the case nn <= 0. */
int 
idx_max(double *a, int nn)
{
  double best = -DBL_MAX;
  int best_at = 0;
  int i;

  for (i=0; i<nn; i++)
    {
      if (!(a[i] > best))
	continue;

      best = a[i];
      best_at = i;
    }

  return best_at;
}


/*
 * "Plus plus" initialization of the means.
 *
 * The first mean is a data point drawn uniformly at random with a GSL
 * generator (gsl_rng_default) seeded with `seed`.  Every further mean is
 * chosen deterministically: it is the data point whose squared Euclidean
 * distance to its nearest already chosen mean is largest (lowest index on
 * ties, see idx_max).
 */
void
init_plusplus(double *data,      /* data points (nn points x pp dimensions) */
	      double *means,     /* means (kk clusters x pp dimensions) */
	      int nn,            /* number of data points */
	      int pp,            /* number of dimensions */
	      int kk,            /* number of clusters */
	      unsigned long seed /* random seed for init */
	      )
{
  int i, j, c;
  int chosen;
  double *nearest; /* per point: squared distance to the closest chosen mean */
  double *latest;  /* per point: squared distance to the newest mean */
  gsl_rng *rng;

  rng = gsl_rng_alloc (gsl_rng_default);
  gsl_rng_set (rng, seed);

  nearest = (double *) malloc (nn * sizeof(double));
  latest = (double *) malloc (nn * sizeof(double));

  /* the generator is used for the first mean only */
  chosen = (int) gsl_rng_uniform_int (rng, nn);
  gsl_rng_free (rng);

  for (j=0; j<pp; j++)
    means[j] = data[j + (chosen * pp)];

  /* no mean has been measured yet: start from the largest distance */
  for (i=0; i<nn; i++)
    nearest[i] = DBL_MAX;

  /* c is the row of means being filled; row c-1 is the newest mean */
  for (c=1; c<kk; c++)
    {
      const double *newest = means + ((c - 1) * pp);

      /* squared distance of every data point from the newest mean */
      for (i=0; i<nn; i++)
	{
	  const double *x = data + (i * pp);
	  double acc = 0.0;

	  for (j=0; j<pp; j++)
	    acc += pow(x[j] - newest[j], 2);

	  latest[i] = acc;
	}

      /* fold the newest mean into the nearest-mean distances */
      dist_min (nearest, latest, nn);

      /* the next mean is the point farthest from all chosen means */
      chosen = idx_max (nearest, nn);

      for (j=0; j<pp; j++)
	means[j + (c * pp)] = data[j + (chosen * pp)];
    }

  free(latest);
  free(nearest);
}


/* assignment step */
int
a_step(double *data,  /* data points (nn x pp) */
       double *means, /* means (kk x pp) */
       int *cls,      /* cluster assignement for each data point (nn) */
       int *nelems,   /* number of elements of each cluster (kk) */
       int nn,        /* number od data points */
       int pp,        /* number of dimensions */
       int kk         /* number of clusters */
       )
{
  int n, p, k, kn = 0;
  double dist, dmin;
  int changed = 0;
  
  for (k=0; k<kk; k++)
    nelems[k] = 0; 
   
  for (n=0; n<nn; n++) /* for each data point */
    {
      dmin = DBL_MAX;
      for (k=0; k<kk; k++) /* for each cluster */
	{
	  /* compute distance */
	  dist = 0.0;
	  for (p=0; p<pp; p++)
	    dist += pow(data[p + (n * pp)] - means[p + (k * pp)], 2);
	  
	  /* remember the cluster k if dist < dmin */
	  if (dist < dmin)
	    {
	      dmin = dist;
	      kn = k;
	    }    
	}
      
      
      /* if the cluster assignement change */
      if (kn != cls[n])
	changed++;
      
      /* update clusters and number of elements of each cluster */
      cls[n] = kn;
      nelems[kn]++;
    }
  
  return changed;
}


/*
 * Update step of k-means.
 *
 * Each mean is recomputed as the average of the data points currently
 * assigned to it (cls), using the cluster sizes in nelems.  All means are
 * zeroed first, so a cluster with nelems[k] <= 0 is left at zero.
 *
 * Always returns 1.
 */
int
u_step(double *data,  /* data points (nn x pp) */
       double *means, /* means (kk x pp) */
       int *cls,      /* cluster assignment for each data point (nn) */
       int *nelems,   /* number of elements of each cluster (kk) */
       int nn,        /* number of data points */
       int pp,        /* number of dimensions */
       int kk         /* number of clusters */
       )
{
  int i, j, c;
  double *row;

  /* clear the accumulators */
  for (c=0; c<kk; c++)
    {
      row = means + (c * pp);
      for (j=0; j<pp; j++)
	row[j] = 0.0;
    }

  /* add every data point to the mean of its cluster */
  for (i=0; i<nn; i++)
    {
      const double *x = data + (i * pp);

      row = means + (cls[i] * pp);
      for (j=0; j<pp; j++)
	row[j] += x[j];
    }

  /* turn the sums into averages, skipping empty clusters */
  for (c=0; c<kk; c++)
    {
      if (nelems[c] <= 0)
	continue;

      row = means + (c * pp);
      for (j=0; j<pp; j++)
	row[j] /= nelems[c];
    }

  return 1;
}


/*
 * k-means iterations.
 *
 * Starting from the initialized means, alternates the assignment step
 * (a_step) and the update step (u_step) until an assignment step changes
 * no cluster label.  On return cls holds the final assignments and means
 * the final means.
 *
 * Returns the number of iterations performed (at least 1), or -1 if the
 * per-cluster counter array cannot be allocated; in that case every entry
 * of cls is -1 and means is left untouched.
 */
int
kmeans(double *data,  /* data points (nn x pp) */
       double *means, /* initialized means (kk x pp) */
       int *cls,      /* cluster assignment for each data point (nn) */
       int nn,        /* number of data points */
       int pp,        /* number of dimensions */
       int kk         /* number of clusters */
       )
{
  int n;
  int steps = 0;
  int changed;
  int *nelems; /* number of elements of each cluster (kk) */

  /* init cls: -1 matches no cluster, so the first a_step counts every point */
  for (n=0; n<nn; n++)
    cls[n] = -1;

  nelems = (int *) malloc ((size_t) kk * sizeof(int));
  if (nelems == NULL)
    return -1;

  /* k-means algorithm  */
  do
    {
      changed = a_step (data, means, cls, nelems, nn, pp, kk);
      (void) u_step (data, means, cls, nelems, nn, pp, kk);
      steps++;
    }
  while (changed != 0);

  free(nelems);

  return steps;
}


/*
 * Python entry point: kmeans(x, k, init=0, seed=0) -> (cls, means, steps)
 *
 *   x     object convertible to a 2D array of doubles (samples x dimensions)
 *   k     number of clusters, 2 <= k <= number of samples
 *   init  INIT_STD (0) or INIT_PLUSPLUS (1)
 *   seed  seed handed to the initialization routine
 *
 * Returns a tuple with a new 1D NPY_INT array of cluster labels, a new
 * (k x dimensions) NPY_DOUBLE array of means and the number of iterations.
 * Returns NULL with a Python exception set on failure; every array created
 * here is released on all error paths.
 */
static PyObject *kmeanscore_kmeans(PyObject *self, PyObject *args, PyObject *keywds)
{
  PyObject *x = NULL;
  PyObject *xContiguous  = NULL;
  PyObject *meansContiguous  = NULL;
  PyObject *clsContiguous  = NULL;
  double *xC;
  double *meansC;
  int *clsC;
  int k;
  int init = INIT_STD;
  unsigned long seed = 0;

  npy_intp means_dims[2];
  npy_intp cls_dims[1];

  npy_intp n, p;
  int steps;

  /* Parse Tuple*/
  static char *kwlist[] = {"x", "k", "init", "seed", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, keywds, "Oi|ik", kwlist, &x, &k, &init, &seed))
    return NULL;

  xContiguous = PyArray_FROM_OTF(x, NPY_DOUBLE, NPY_IN_ARRAY);
  if (xContiguous == NULL) return NULL;

  if (PyArray_NDIM(xContiguous) != 2)
    {
      PyErr_SetString(PyExc_ValueError, "x must be 2D array");
      goto fail;
    }

  n = PyArray_DIM(xContiguous, 0);
  p = PyArray_DIM(xContiguous, 1);

  if ((k < 2) || (k > n))
    {
      PyErr_SetString(PyExc_ValueError, "k must be >= 2 and <= number of samples");
      goto fail;
    }

  /* the C routines index the data with int: refuse sizes they cannot address */
  if ((n > NPY_MAX_INT) || ((p > 0) && (n > NPY_MAX_INT / p)))
    {
      PyErr_SetString(PyExc_ValueError, "x is too large");
      goto fail;
    }

  /* validate init before allocating the outputs */
  if ((init != INIT_STD) && (init != INIT_PLUSPLUS))
    {
      PyErr_SetString(PyExc_ValueError, "init is not valid");
      goto fail;
    }

  xC = (double *) PyArray_DATA(xContiguous);

  means_dims[0] = k;
  means_dims[1] = p;
  meansContiguous = PyArray_SimpleNew (2, means_dims, NPY_DOUBLE);
  if (meansContiguous == NULL)
    goto fail;
  meansC = (double *) PyArray_DATA(meansContiguous);

  cls_dims[0] = n;
  clsContiguous = PyArray_SimpleNew (1, cls_dims, NPY_INT);
  if (clsContiguous == NULL)
    goto fail;
  clsC = (int *) PyArray_DATA (clsContiguous);

  /* initialization */
  if (init == INIT_STD)
    init_std(xC, meansC, (int) n, (int) p, k, seed);
  else
    init_plusplus(xC, meansC, (int) n, (int) p, k, seed);

  /* kmeans algorithm */
  steps = kmeans (xC, meansC, clsC, (int) n, (int) p, k);
  if (steps < 0)
    {
      /* a valid run performs at least one iteration */
      PyErr_SetString(PyExc_RuntimeError, "k-means computation failed");
      goto fail;
    }

  Py_DECREF(xContiguous);

  /* "N" hands the references to both arrays over to the result tuple */
  return Py_BuildValue("(N, N, i)", clsContiguous, meansContiguous, steps);

 fail:
  Py_XDECREF(xContiguous);
  Py_XDECREF(meansContiguous);
  Py_XDECREF(clsContiguous);
  return NULL;
}


/* Doc strings: */
static char module_doc[] =
  "k-means clustering core routines (C extension).";

static char kmeanscore_kmeans_doc[] =
  "kmeans(x, k, init=0, seed=0) -> (cls, means, steps)\n"
  "\n"
  "Run k-means clustering on the rows of x.\n"
  "\n"
  "x    : 2D array-like, converted to double (samples x dimensions)\n"
  "k    : int, number of clusters (2 <= k <= number of samples)\n"
  "init : int, 0 = k random samples as initial means,\n"
  "       1 = one random sample, then farthest-point selection\n"
  "seed : non-negative int, random seed used by the initialization\n"
  "\n"
  "Returns the cluster index of each sample (1D int array), the\n"
  "cluster means (k x dimensions array of double) and the number of\n"
  "iterations performed.\n"
  "\n"
  "Raises ValueError if x is not 2D, k is out of range or init is\n"
  "not valid.";

/* Method table: the functions exported by the module. */
/* METH_KEYWORDS is set because kmeans accepts keyword arguments. */
static PyMethodDef kmeanscore_methods[] = {
  {"kmeans",
   (PyCFunction)kmeanscore_kmeans,
   METH_VARARGS | METH_KEYWORDS,
   kmeanscore_kmeans_doc},
  {NULL, NULL, 0, NULL} /* sentinel */
};

/* Init */
void initkmeanscore()
{
  Py_InitModule3("kmeanscore", kmeanscore_methods, module_doc);
  import_array();
}
