/* File: gaussian_mixture.c
 * Author: David Squire (DMS), David.Squire@infotech.monash.edu.au
 * Date Created: 20050804
 * Date Last Modified: 20050813
 * - this version has all required functions in this single file,
 *   so that users do not need to have my libraries, headers, etc.
 *
 * Purpose: This program generates a specified number of samples from a
 * probability distribution specified by a mixture of Gaussians. The number
 * of samples desired, and the parameters of the mixture, are specified in
 * a file that is either given on the command line, or read from stdin. The
 * results are written to stdout.
 *
 * The format of the input file is as follows:
 *
 * - numSamples (n)
 * - numDataDimensions (d)
 * - numGaussians (k)
 * - weight_1, weight_2, ..., weight_k # relative weights of Gaussians in
 *   mixture
 * - k lines of means of each Gaussian, each specified as a line of d
 *   comma-separated values
 * - d x d covariance matrices of each of the k Gaussians, each specified
 *   as d lines of d comma-separated values.
 *
 * Note that lines beginning with '#' are ignored, as is anything after a
 * '#' on a line.
 *
 * Here is an example input file:
 *
100000 # number of samples
2 # number of dimensions
4 # number of Gaussians
0.4, 0.2, 0.2, 0.2 # weights of Gaussians
# Means of Gaussians
0.3, 0.7 # Gaussian 1
-5, 5 # Gaussian 2
5, 5 # Gaussian 3
2, -3 # Gaussian 4
# Covariance matrices of Gaussians
7, 0 # Gaussian 1
0, 7
#
1, 0.5 # Gaussian 2
0.5, 5
#
1, -0.5 # Gaussian 3
-0.5, 1
#
5, 0 # Gaussian 4
0, 0.2 
 *
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <malloc.h>

/*****************************************************************************/
/* From my file matrix_utils.c                                               */
/*****************************************************************************/
/***************************************/
/* handy output routines for debugging */
void printVector(double *Vector, int rows) {
	
	int i;

	printf("[ ");
	for (i = 0; i < rows; i++) {
		printf("%16.6f", Vector[i]);
	}
	printf(" ]\n");
}

void printMatrix(double **Matrix, int rows, int cols) {
	
	int i, j;

	for (i = 0; i < rows; i++) {
		printf("[ ");
		for (j = 0; j < cols; j++) {
			printf("%16.6f", Matrix[i][j]);
		}
		printf(" ]\n");
	}
}

/***************************************/
/* Below are routines for creating the matrices that numrec wants */

double *dVector(int n) {
	
	double *v;

	v = (double *)calloc(n, sizeof(double));
	return(v);
}

double **dMatrix(int nrows, int ncols) {

	double **m;
	int i;

	m = (double **)calloc(nrows, sizeof(double *));
	for (i = 0; i < nrows; i++)
		m[i] = (double *)calloc(ncols, sizeof(double));
	return(m);
}

void dFreeMatrix(double **m, int nrows, int ncols) {

	int i;

	for (i = 0; i < nrows; i++)
		free(m[i]);
	free(m);
}

/* multiplication of matrix by a vector */
double *dMultiplyVector(double **m, double *v, int nrows, int ncols) {

	double *product;
	int i, j;

	/* allocate space for the product */
	product = dVector(nrows);
	/* calculate the product. Since dVector uses calloc, we don't need to
	 * initialize  */
	for (i = 0; i < nrows; i++) {
		for (j = 0; j < ncols; j++) {
			product[i] += m[i][j]*v[j];
		}
	}
	return product;
}

/*****************************************************************************/
/* From my file sq_utils.c                                                   */
/*****************************************************************************/

int strsplit(char *sin, char ***sout, char c) {
	/* Takes a string sin, and splits it at each occurence of character c.
	 * The results of the split are placed in an array of strings, *sout,
	 * and the number of results n + 1 is returned. No space is
	 * allocated for sout if there are no occurences of c, in which case 0
	 * is returned.
     * Author: David Squire                                                   
     * Date: 20050804
	 */
	
	int i, j, n, m;
	char *p, *buff;

	/* count the number of occurences of c in sin */
	n = 0;
	p = sin;
	while (*p) {
		if (*p++ == c) {
			n++;
		}
	}
	if (n == 0) {
		return 0;
	}

	/* allocate space for the array of strings */
	*sout = (char **)malloc((n + 1)*sizeof(char *));

	/* extract each substring, store it in the array */
	buff = (char *)malloc((1 + strlen(sin))*sizeof(char)); /* buffer */
	p = sin;
	i = 0;
	while (*p) {
		m = 0;
		while (*p && (*p != c)) { /* copy characters into the buffer */
			buff[m++] = *p++;     /* until we see c, or end of string */
		}
		/* allocate space for this substring, and copy it from the buffer */
		(*sout)[i] = (char *)malloc((m + 1)*sizeof(char));
		for (j = 0; j < m; j++) {
			(*sout)[i][j] = buff[j];
		}
		(*sout)[i][j] = '\0'; /* terminate the string */
		i++;
		if (*p) {
			p++; /* skip the splitting character */
		}
	}
	free(buff);

	if (i != (n + 1)) { // the string must have had c as the final character
		(*sout)[n] = (char *)malloc(1*sizeof(char));
		(*sout)[n][0] = '\0';
	}

	return (n + 1);
}

int getline(FILE *f, char *buff, char skipchar, int MAXLINELENGTH) {
	/* Read a line from a file f, and store it in space buff, which must
	 * already have been allocated memory of at least MAXLINELENGTH
	 * characters. Lines beginning with skipchar are ignored, as is
	 * anything after an occurence of skipchar on a line. Returns false if
	 * nothing can be read.
     * Author: David Squire                                                   
     * Date: 20050805
     * Last Modified: 20050807
	 */

	char c;
	int i;
	
	buff[0] = skipchar;
	while ((buff[0] == skipchar) && !feof(f)) {
		i = 0;
		while (((c = fgetc(f)) != EOF) && (c != '\n') && (i < MAXLINELENGTH - 1)) {
			if (c != skipchar) {
				buff[i++] = c;	
			}
			else {
				/* suck up the remainder of the line and discard it */ 
				while (((c = fgetc(f)) != EOF) && (c != '\n')) {
					; /* do nothing */
				}
				ungetc(c, f); /* put the \n back for the outer loop */
			}
		}
	}
	buff[i] = '\0'; /* terminate the string */

	if (i == 0) { /* first read caused EOF, or couldn't find a line with
	                 content */
		return 0;
	}

	if ((i == MAXLINELENGTH - 1) && (c != '\n')) { /* line was too long */
		fprintf(stderr, "\007Warning: encountered line to long to read!\n");
		/* suck up the remainder of the line and discard it */ 
		while (((c = fgetc(f)) != EOF) && (c != '\n')) {
			; /* do nothing */
		}
	}
	return 1;
}

int line2vec(char *line, double *vec, int d) {
	/* parse a comma-separated line of d numbers into a vector of doubles.
	 * Returns false if there is a problem.
     * Author: David Squire                                                   
     * Date: 20050807
	 */

	char **s; /* array of strings, used for split results */
	int i;

	if (strsplit(line, &s, ',') != d) {
		fprintf(stderr, "Incorrect number of elements on line '%s'.\n", line);
		return 0;
	}
	for (i = 0; i < d; i++) {
		vec[i] = atof(s[i]);
		free(s[i]);
	}
	free(s);
	return 1;
}

int *pick_n_from_weights(int n, double *w, int d) {
	/* Pick n integers from [0, d-1] according to probabilities specified
	 * by weight vector w.
     * Author: David Squire                                                   
     * Date: 20050807
	 */

	double weight_sum, *cum_probs, p;
	int i, j, *results;

	weight_sum = 0;
	cum_probs = (double *)malloc(d*sizeof(double));
	for (i = 0; i < d; i++) {
	 	weight_sum += w[i];
	}
	cum_probs[0] = w[0]/weight_sum;
	for (i = 1; i < d; i++) {
		cum_probs[i] = cum_probs[i - 1] + w[i]/weight_sum;
	}
	results = (int *)malloc(n*sizeof(int));
	for (j = 0; j < n; j++) {
		p = drand48();
		for (i = 0; i < d; i++) {
			if (cum_probs[i] > p) {
				break;
			}
		}
		results[j] = i;
	}
	free(cum_probs);
	return results;
}

/*****************************************************************************/
/* From my file jacobi.c                                                     */
/*****************************************************************************/

/* this is just the NumRec routine, changed to use doubles instead of floats */
/* DMS 960318: I have hacked it to do without the numrec library, and
			   to use standard C matrix indices [0, n-1]. */

/* DMS: I really hate macros like this that depend on variable names
		that are not passed to them!! */
#define ROTATE(a,i,j,k,l) g=a[i][j];h=a[k][l];a[i][j]=g-s*(h+g*tau);\
	a[k][l]=h+s*(g-h*tau);


void jacobi(a,n,d,v,nrot)
/* Computes all eigenvalues and eigenvectors of a real symmetric matrix
   a[0..n-1][0..n-1]. On output, elements of a above the diagonal are 
   destroyed. d[0..n-1] returns the eigenvalues of a. v[0..n-1][0..n-1] is
   a matrix whose columns contain, on output, the normalized eigenvectors
   of a. nrot returns the number of Jacobi rotations that were required.
*/
double **a,d[],**v;
int n,*nrot;
{
	int j, iq, ip, i;
	double tresh, theta, tau, t, sm, s, h, g, c, *b, *z;

	b = dVector(n);
	z = dVector(n);

	/* initialise v to the identity matrix */
	for (ip = 0; ip < n; ip++) {
		for (iq = 0; iq < n; iq++)
			v[ip][iq] = 0.0;
		v[ip][ip] = 1.0;
	}

	/* initialise b and d to the diagonal of a */
	for (ip = 0; ip < n; ip++) {
		b[ip] = d[ip] = a[ip][ip];
		z[ip] = 0.0;
	}
	*nrot = 0;
	for (i = 1; i <= 50; i++) {
		/* sum off-diagonal elements */
		/* DMS: surely this is just the lower triangle? */
		sm = 0.0;
		for (ip = 0; ip < n-1; ip++) {
			for (iq = ip+1; iq < n; iq++)
				sm +=  fabs(a[ip][iq]);
		}
		if (sm == 0.0) {	/* The normal return, which relies on quadratic
			free(z);		   covergence to machine underflow. */
			free(b);
			return;
		}
		if (i < 4)
			tresh = 0.2*sm/(n*n);	/* On the first three sweeps... */
		else					
			tresh = 0.0;			/* ...thereafter */
		for (ip = 0; ip < n-1; ip++) {
			for (iq = ip+1; iq < n; iq++) {
				g = 100.0*fabs(a[ip][iq]);
				/* After four sweeps, skip the rotation if the off-diagonal
				   element is small */
				if (i > 4 && fabs(d[ip])+g == fabs(d[ip])
				  && fabs(d[iq])+g == fabs(d[iq]))
					a[ip][iq] = 0.0;
				else if (fabs(a[ip][iq]) > tresh) {
					h = d[iq]-d[ip];
					if (fabs(h)+g == fabs(h))
						t = (a[ip][iq])/h;
					else {
						theta = 0.5*h/(a[ip][iq]);
						t = 1.0/(fabs(theta)+sqrt(1.0+theta*theta));
						if (theta < 0.0)
							t = -t;
					}
					c = 1.0/sqrt(1+t*t);
					s = t*c;
					tau = s/(1.0+c);
					h = t*a[ip][iq];
					z[ip] -=  h;
					z[iq] +=  h;
					d[ip] -=  h;
					d[iq] +=  h;
					a[ip][iq] = 0.0;
					/* Case of rotations 0 <= j < p */
					for (j = 0; j <= ip-1; j++) {
						ROTATE(a, j, ip, j, iq)
					}
					/* Case of rotations p < j < q */
					for (j = ip+1; j <= iq-1; j++) {
						ROTATE(a, ip, j, j, iq)
					}
					/* Case of rotations q < j < n */
					for (j = iq+1; j < n; j++) {
						ROTATE(a, ip, j, iq, j)
					}
					for (j = 0; j < n; j++) {
						ROTATE(v, j, ip, j, iq)
					}
					++(*nrot);
				}
			}
		}
		for (ip = 0; ip < n; ip++) {
			b[ip] +=  z[ip];
			d[ip] = b[ip];
			z[ip] = 0.0;
		}
	}
	fprintf(stderr, "Too many iterations in routine JACOBI\n");
}

#undef ROTATE

/*****************************************************************************/
/* From my file gasdev.c                                                     */
/*****************************************************************************/

/* Adapted from Numerical Recipes in C (2nd Edition) code */
/* David Squire (DMS) 20050804 */

double gasdev(long *idum) {
/* Returns a normally distributed deviate with zero mean and unit variance.
 * */

	static int iset = 0;
	static double gset;
	double fac, rsq, v1, v2;

	if (*idum < 0)
		iset = 0; /* Reinitialize. */
	if (iset == 0) {
		/* We don't have an extra deviate handy, so pick two uniform
		 * numbers in the square extending from -1 to +1 in each direction,
		 * see if they are in the unit circle, and if they are not, try
		 * again. */
		do {
			v1 = 2.0*drand48() - 1.0;
			v2 = 2.0*drand48() - 1.0;
			rsq = v1*v1 + v2*v2;
		} while (rsq >= 1.0 || rsq == 0.0);
		fac = sqrt(-2.0*log(rsq)/rsq);
		/* Now make the Box-Muller transformation to get two normal
		 * deviates. Return one and save the other for next time. */
		gset = v1*fac;
		iset = 1; /* Set flag. */
		return v2*fac;
	}
	else { /* We have an extra deviate handy, */
		iset=0; /* so unset the flag, */
		return gset; /* and return it. */
	}
}

double drandgaussian(double mean, double stddev) {
/* Author: DMS
 * Date:   20050804
 * Description: Returns a random sample from a Gaussian distribution with
 * mean and standard deviation as specified by the parameters. Really just
 * a wrapper for the Numerical Recipes gasdev above.
 */

 static long idum = -1;

 return mean + stddev*gasdev(&idum);
}

/*****************************************************************************/
/* Beginning of my normal gaussian_mixture.c                                 */
/*****************************************************************************/

int n; /* number of samples desired */
int d; /* number of data dimensions */
int k; /* number of Gaussians in the mixture */
double *w; /* array of weights of Gaussian components */
double **m; /* array of k means, each of d dimensions */
double ***S; /* array of k covariance matrices, each d x d */
double ***E; /* array of k matrices, each d x d, containing the
              * eigenvectors of each covariance matrix */
double **lambda; /* array of k vectors, each of d dimensions, containing the
             * eigenvalues of each covariance matrix */
#define MAXLINELENGTH 256 /* maximum line length accepted in input file */

void parseInput(FILE *infile) {

	char line[MAXLINELENGTH];
	int i, j;
	char **s; /* array of strings, used for split results */

#ifdef DEBUG
	printf("Parsing input file...\n");
#endif

	if (!getline(infile, line, '#', MAXLINELENGTH)) {
		fprintf(stderr, "Error occurred when reading file.\n");
		exit(1);
	}
	n = atoi(line); /* number of samples */
	if (!getline(infile, line, '#', MAXLINELENGTH)) {
		fprintf(stderr, "Error occurred when reading file.\n");
		exit(1);
	}
	d = atoi(line); /* number of dimensions */
	if (!getline(infile, line, '#', MAXLINELENGTH)) {
		fprintf(stderr, "Error occurred when reading file.\n");
		exit(1);
	}
	k = atoi(line); /* number of Gaussians */
#ifdef DEBUG
	printf("Want %d samples from %d %d-dimensional Gaussians\n", n, k, d);
#endif

	/* allocate space for the weights of the Gaussians */
	w = dVector(k);
	/* read in the weights of the Gaussians */
	if (!getline(infile, line, '#', MAXLINELENGTH)) {
		fprintf(stderr, "Unexpected end of input file.\n");
		exit(1);
	}
	if (!line2vec(line, w, k)) {
		fprintf(stderr, "Error extracting %d numbers from '%s'\n", k, line);
		exit(1);
	}
#ifdef DEBUG
	printf("Weights: \n");
	printVector(w, k);
#endif

	/* allocate space for d-dimensional means of the Gaussians */
	m = (double **)malloc(k*sizeof(double *));
	for (i = 0; i < k; i++) {
		m[i] = dVector(d);
	}
	/* read in the means of the Gaussians */
	for (i = 0; i < k; i++) {
		if (!getline(infile, line, '#', MAXLINELENGTH)) {
			fprintf(stderr, "Unexpected end of input file.\n");
			exit(1);
		}
		if (!line2vec(line, m[i], d)) {
			fprintf(stderr, "Error extracting %d numbers from '%s'\n", d, line);
			exit(1);
		}
	}
#ifdef DEBUG
	printf("Means: \n");
	for (i = 0; i < k; i++) {
		printf("Gaussian %d:\n", i);
		printVector(m[i], d);
	}
#endif

	/* allocate space for the k covariance matrices of the Gaussians, each
	 * d x d */
	S = (double ***)malloc(k*sizeof(double **));
	for (i = 0; i < k; i++) {
		S[i] = dMatrix(d, d);
	}
	/* read in the covariance matrices */
	/* TODO In a perfect world, we would check that the covariance matrices
	 * are symmetrical, and thus valid - better yet, we would have an input
	 * file format that enforced it (e.g. just specify upper triangle of
	 * the matrix). There are also other constraints that should be checked,
	 * e.g. covariances (off-diagonal) that are too big for the variances
	 * (on-diagonal).
	 */
	for (i = 0; i < k; i++) { /* foreach Gaussian */
		for (j = 0; j < d; j++) { /* d lines of numbers each */
			if (!getline(infile, line, '#', MAXLINELENGTH)) {
				fprintf(stderr, "Unexpected end of input file.\n");
				exit(1);
			}
			if (!line2vec(line, S[i][j], d)) {
				fprintf(stderr, "Error extracting %d numbers from '%s'\n", d, line);
				exit(1);
			}
		}
	}
#ifdef DEBUG
	printf("Covariance matrices: \n");
	for (i = 0; i < k; i++) {
		printf("Gaussian %d:\n", i);
		printMatrix(S[i], d, d);
	}
	printf("\n");
#endif
}

int main(int argc, char *argv[]) {

	FILE *infile;
	int i, j, *g;
	int nrot; /* needed by eigsrt. Not used in this program */
	double *pa_sample, *sample; /* temp. storage for generated sample */

	srand48(getpid()); /* seed random number generator */
	
	/* check if an input file has been specified, if not use stdin */
	switch(argc) {
	case 1:
		infile = stdin;
		break;
	case 2:
		if (infile = fopen(argv[1], "r")) {
			break;
		}
		else {
			fprintf(stderr, "Can not open %s for reading. Bye.\n", argv[1]);
			exit(1);
		}
	default:
		fprintf(stderr, "Usage: %s [inputfile]\n", argv[0]);
		exit(1);
	}

	/* get the number of samples desired, and the parameters of the
	 * Gaussian mixture */
	parseInput(infile);

	/* Now, we need to find the eigenvectors and eigenvalues of the
	 * covariance matrix of each Gaussian. The orthonormal matrix of
	 * eigenvectors will give us the rotation transformation from the
	 * principal axes of the hyperellipsoid corresponding to the Gaussian
	 * back to the original coordinate system, and the eigenvalues will
	 * give us the variance along each principal axis. So, the procedure to
	 * generate the samples will be to generate them in the coordinate
	 * system defined by the principal axes, with appropriate variances,
	 * then rotate back to the original coordinate system, and finally
	 * shift by the means.
	 */

	/* allocate space for eigenvectors and eigenvalues */
	E = (double ***)malloc(k*sizeof(double **));
	lambda = (double **)malloc(k*sizeof(double *));
	for (i = 0; i < k; i++) {
		E[i] = dMatrix(d, d);
		lambda[i] = dVector(d);
	}

	/* compute the eigenvectors and eigenvalues */
	for (i = 0; i < k; i++) {
		/* note that jacobi DESTROYS its input matrix */
		jacobi(S[i], d, lambda[i], E[i], &nrot);
#ifdef DEBUG
		printf("Gaussian %d:\n", i);
		printf("Eigenvalues:\n");
		printVector(lambda[i], d);
		printf("Eigenvectors:\n");
		printMatrix(E[i], d, d);
		printf("\n");
#endif
		/* take square roots of eigenvalues, since we actually want standard
		 * deviations, not variances */
		for (j = 0; j < d; j++) {
			lambda[i][j] = sqrt(lambda[i][j]);
		}
	}


	/* generate the samples */
	pa_sample = dVector(d);
	/* get list of indices to select Gaussians, according to probabilities
	 * given by the weights (getting n at a time is faster than doing it in
	 * the loop). */
	g = pick_n_from_weights(n, w, k);
	for (i = 0; i < n; i++) {
		/* generate sample in principal axes space */
		for (j = 0; j < d; j++) {
			pa_sample[j] = drandgaussian(0, lambda[g[i]][j]);
		}
		/* rotate from principal axes space back to normal space */
		sample = dMultiplyVector(E[g[i]], pa_sample, d, d);
		/* shift to the means */
		for (j = 0; j < d; j++) {
			sample[j] += m[g[i]][j];
		}
		/* print out the sample */
		for (j = 0; j < d - 1; j++) {
			printf("%f, ", sample[j]);
		}	
		printf("%f\n", sample[j]);
		free(sample);
	}
}

