/*
   flame - cosmic recursive fractal flames
   Copyright (C) 1992  Scott Draves <spot@cs.cmu.edu>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

#include "rect.h"

#include <string.h>


/* for batch
 *   interpolate
 *   compute colormap
 *   for subbatch
 *     compute samples
 *     buckets += cmap[samples]
 *   accum += time_filter[batch] * log(buckets)
 * image = filter(accum)
 */


typedef short bucket[4];

/* if you use longs instead of shorts, you 
   get higher quality, and spend more memory */

#if 1
typedef short accum_t;
#define MAXBUCKET (1<<14)
#define SUB_BATCH_SIZE 10000
#else
typedef long accum_t;
#define MAXBUCKET (1<<30)
#define SUB_BATCH_SIZE 10000
#endif

typedef accum_t abucket[4];


/* allow this many iterations for settling into attractor */
#define FUSE 15

/* clamp spatial filter to zero at this std dev (2.5 ~= 0.0125) */
#define FILTER_CUTOFF 2.5

/* should be MAXBUCKET / (OVERSAMPLE^2) */
#define PREFILTER_WHITE (MAXBUCKET>>4)


#define bump_no_overflow(dest, delta, type) { \
   type tt_ = dest + delta;            \
   if (tt_ > dest) dest = tt_;                 \
}

/* sum of entries of vector to 1 */
void normalize_vector(v, n)
   double *v;
   int n;
{
   double t = 0.0;
   int i;
   for (i = 0; i < n; i++)
      t += v[i];
   t = 1.0 / t;
   for (i = 0; i < n; i++)
      v[i] *= t;
}


void render_rectangle(spec, out, out_width, field, nchan, progress)
   frame_spec *spec;
   unsigned char *out;
   int out_width;
   int field;
   int nchan;
   int progress(double);
{
   int i, j, k, nsamples, nbuckets, batch_size, batch_num, sub_batch;
   bucket  *buckets;
   abucket *accumulate;
   point *points;
   double *filter, *temporal_filter, *temporal_deltas;
   double bounds[4], size[2], ppux, ppuy;
   int image_width, image_height;    /* size of the image to produce */
   int width, height;               /* size of histogram */
   int filter_width;
   int oversample = spec->cps[0].spatial_oversample;
   int nbatches = spec->cps[0].nbatches;
   bucket cmap[CMAP_SIZE];
   int gutter_width;
   int sbc;

   image_width = spec->cps[0].width;
   if (field) {
      image_height = spec->cps[0].height / 2;
      if (field == field_odd)
	 out += nchan * out_width;
      out_width *= 2;
   } else
      image_height = spec->cps[0].height;

   if (1) {
      filter_width = (2.0 * FILTER_CUTOFF * oversample *
		      spec->cps[0].spatial_filter_radius);
      /* make sure it has same parity as oversample */
      if ((filter_width ^ oversample) & 1)
	 filter_width++;

      filter = (double *) malloc(sizeof(double) * filter_width * filter_width);
      /* fill in the coefs */
      for (i = 0; i < filter_width; i++)
	 for (j = 0; j < filter_width; j++) {
	    double ii = ((2.0 * i + 1.0) / filter_width - 1.0) * FILTER_CUTOFF;
	    double jj = ((2.0 * j + 1.0) / filter_width - 1.0) * FILTER_CUTOFF;
	    if (field)
	       jj *= 2.0;
	    filter[i + j * filter_width] =
	       exp(-2.0 * (ii * ii + jj * jj));
	 }

      normalize_vector(filter, filter_width * filter_width);
   }
   temporal_filter = (double *) malloc(sizeof(double) * nbatches);
   temporal_deltas = (double *) malloc(sizeof(double) * nbatches);
   if (nbatches > 1) {
      double t;
      /* fill in the coefs */
      for (i = 0; i < nbatches; i++) {
	 t = temporal_deltas[i] = (2.0 * ((double) i / (nbatches - 1)) - 1.0)
	    * spec->temporal_filter_radius;
	 temporal_filter[i] = exp(-2.0 * t * t);
      }

      normalize_vector(temporal_filter, nbatches);
   } else {
      temporal_filter[0] = 1.0;
      temporal_deltas[0] = 0.0;
   }

#if 0
   for (j = 0; j < nbatches; j++)
      fprintf(stderr, "%4f %4f\n", temporal_deltas[j], temporal_filter[j]);
   fprintf(stderr, "\n");
#endif
   
   /* the number of additional rows of buckets we put at the edge so
      that the filter doesn't go off the edge */

   gutter_width = (filter_width - oversample) / 2;
   height = oversample * image_height + 2 * gutter_width;
   width  = oversample * image_width  + 2 * gutter_width;

   nbuckets = width * height;
   if (1) {
     static char *last_block = NULL;
     static int last_block_size = 0;
     int memory_rqd = (sizeof(bucket) * nbuckets +
		       sizeof(abucket) * nbuckets +
		       sizeof(point) * SUB_BATCH_SIZE);
     if (memory_rqd > last_block_size) {
       if (last_block != NULL)
	  free(last_block);
       last_block = (char *) malloc(memory_rqd);
       if (NULL == last_block) {
	  fprintf(stderr, "render_rectangle: cannot malloc %d bytes.\n", memory_rqd);
	  exit(1);
       }
       /* else fprintf(stderr, "render_rectangle: mallocked %dMb.\n", Mb); */

       last_block_size = memory_rqd;
     }
     buckets = (bucket *) last_block;
     accumulate = (abucket *) (last_block + sizeof(bucket) * nbuckets);
     points = (point *)  (last_block + (sizeof(bucket) + sizeof(abucket)) * nbuckets);
   }

   memset((char *) accumulate, 0, sizeof(abucket) * nbuckets);
   for (batch_num = 0; batch_num < nbatches; batch_num++) {
      double batch_time;
      double sample_density;
      control_point cp;
      memset((char *) buckets, 0, sizeof(bucket) * nbuckets);
      batch_time = spec->time + temporal_deltas[batch_num];

      /* interpolate and get a control point */
      interpolate(spec->cps, spec->ncps, batch_time, &cp);

      /* compute the colormap entries.  the input colormap is 256 long with
	 entries from 0 to 1.0 */
      for (j = 0; j < CMAP_SIZE; j++) {
	 for (k = 0; k < 3; k++) {
#if 1
	    cmap[j][k] = (int) (cp.cmap[(j * 256) / CMAP_SIZE][k] *
				cp.white_level);
#else
	    /* monochrome if you don't have any cmaps */
	    cmap[j][k] = cp.white_level;
#endif	    
	 }
	 cmap[j][3] = cp.white_level;
      }

      /* compute camera */
      if (1) {
	double t0, t1, shift = 0.0, corner0, corner1;
	double scale;

	scale = pow(2.0, cp.zoom);
	sample_density = cp.sample_density * scale * scale;
	
	ppux = cp.pixels_per_unit * scale;
	ppuy = field ? (ppux / 2.0) : ppux;
	switch (field) {
	case field_both: shift =  0.0; break;
	case field_even: shift = -0.5; break;
	case field_odd:  shift =  0.5; break;
	}
	shift = shift / ppux;
	t0 = (double) gutter_width / (oversample * ppux);
	t1 = (double) gutter_width / (oversample * ppuy);
	corner0 = cp.center[0] - image_width / ppux / 2.0;
	corner1 = cp.center[1] - image_height / ppuy / 2.0;
	bounds[0] = corner0 - t0;
	bounds[1] = corner1 - t1 + shift;
	bounds[2] = corner0 + image_width  / ppux + t0;
	bounds[3] = corner1 + image_height / ppuy + t1 + shift;
	size[0] = 1.0 / (bounds[2] - bounds[0]);
	size[1] = 1.0 / (bounds[3] - bounds[1]);
      }

      nsamples = (int) (sample_density * nbuckets /
			(oversample * oversample));
   
      batch_size = nsamples / cp.nbatches;

      sbc = 0;
      for (sub_batch = 0;
	   sub_batch < batch_size;
	   sub_batch += SUB_BATCH_SIZE) {

	if (progress&&!(sbc++&7))
	  (*progress)(0.5*sub_batch/(double)batch_size);

	 /* generate a sub_batch_size worth of samples */
	 points[0][0] = random_uniform11();
	 points[0][1] = random_uniform11();
	 points[0][2] = random_uniform01();
	 iterate(&cp, SUB_BATCH_SIZE, FUSE, points);
	 
	 
	 /* merge them into buckets, looking up colors */
	 for (j = 0; j < SUB_BATCH_SIZE; j++) {
	    int k, color_index;
	    double *p = points[j];
	    bucket *b;
	    if (p[0] < bounds[0] ||
		p[1] < bounds[1] ||
		p[0] > bounds[2] ||
		p[1] > bounds[3])
	       continue;
	    color_index = (int) (p[2] * CMAP_SIZE);
	    if (color_index < 0) color_index = 0;
	    else if (color_index > (CMAP_SIZE-1))
	       color_index = CMAP_SIZE-1;
	    b = buckets +
	       (int) (width * (p[0] - bounds[0]) * size[0]) +
		  width * (int) (height * (p[1] - bounds[1]) * size[1]);	    
	    for (k = 0; k < 4; k++)
	       bump_no_overflow(b[0][k], cmap[color_index][k], short);
	 }
      }
      
      if (1) {
	 double k1 =(cp.contrast * cp.brightness *
		     PREFILTER_WHITE * 268.0 *
		     temporal_filter[batch_num]) / 256;
	 double area = image_width * image_height / (ppux * ppuy);
	 double k2 = (oversample * oversample * nbatches) /
	    (cp.contrast * area * cp.white_level * sample_density);
 
	 /* log intensity in hsv space */
	 for (j = 0; j < height; j++)
	    for (i = 0; i < width; i++) {
	       abucket *a = accumulate + i + j * width;
	       bucket *b = buckets + i + j * width;
	       double c[4], ls;
	       c[0] = (double) b[0][0];
	       c[1] = (double) b[0][1];
	       c[2] = (double) b[0][2];
	       c[3] = (double) b[0][3];
	       if (0.0 == c[3])
		 continue;
	       
	       ls = (k1 * log(1.0 + c[3] * k2))/c[3];
	       c[0] *= ls;
	       c[1] *= ls;
	       c[2] *= ls;
	       c[3] *= ls;

	       bump_no_overflow(a[0][0], c[0] + 0.5, accum_t);
	       bump_no_overflow(a[0][1], c[1] + 0.5, accum_t);
	       bump_no_overflow(a[0][2], c[2] + 0.5, accum_t);
	       bump_no_overflow(a[0][3], c[3] + 0.5, accum_t);
	    }
      }
   }
   /*
    * filter the accumulation buffer down into the image
    */
   if (1) {
      int x, y;
      double t[4];
      double g = 1.0 / spec->cps[0].gamma;
      y = 0;
      for (j = 0; j < image_height; j++) {
	 if (progress && !(j&15))
	   (*progress)(0.5+0.5*j/(double)image_height);
	 x = 0;
	 for (i = 0; i < image_width; i++) {
	    int ii, jj, a;
	    unsigned char *p;
	    t[0] = t[1] = t[2] = t[3] = 0.0;
	    for (ii = 0; ii < filter_width; ii++)
	       for (jj = 0; jj < filter_width; jj++) {
		  double k = filter[ii + jj * filter_width];
		  abucket *a = accumulate + x + ii + (y + jj) * width;
		  
		  t[0] += k * a[0][0];
		  t[1] += k * a[0][1];
		  t[2] += k * a[0][2];
		  t[3] += k * a[0][3];
	       }
	    p = out + nchan * (i + j * out_width);
	    a = 256.0 * pow((double) t[0] / PREFILTER_WHITE, g) + 0.5;
	    if (a < 0) a = 0; else if (a > 255) a = 255;
	    p[0] = a;
	    a = 256.0 * pow((double) t[1] / PREFILTER_WHITE, g) + 0.5;
	    if (a < 0) a = 0; else if (a > 255) a = 255;
	    p[1] = a;
	    a = 256.0 * pow((double) t[2] / PREFILTER_WHITE, g) + 0.5;
	    if (a < 0) a = 0; else if (a > 255) a = 255;
	    p[2] = a;
	    if (nchan > 3) {
	      a = 256.0 * pow((double) t[3] / PREFILTER_WHITE, g) + 0.5;
	      if (a < 0) a = 0; else if (a > 255) a = 255;
	      p[3] = a;
	    }
	    x += oversample;
	 }
	 y += oversample;
      }
   }
   
   free(filter);
}