/* rotozoom.c - 
   Copyright (C) 2000 Tijs van Bakel and Jorik Blaas.
   Tijs van Bakel <smoke@casema.net>
   Jorik Blaas <jrk@panic.et.tudelft.nl>
 
 This file is part of a silly intro
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2, or (at your option)
 any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with This program; see the file COPYING.  If not, write to
 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */

#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "rotozoom.h"

#include "crap_png.h"
#include "crap_mod.h"
#include "crap_font.h"

/* palette handed to crap_png_load() together with "rotozoom.png" in
   create_fx_rotozoom(); fx_rotozoom() passes it to crap_setpalette()
   on its first call. */
Palette texture_palette;
/* 256x512 image created and zero-filled by create_fx_rotozoom(); the
   loaded png is blitted into it at (0,128). */
Image* texture_image;
/* 256x256 window at (0,128) of texture_image, set up by
   create_fx_rotozoom(); this is the texture fx_rotozoom() hands to
   draw_rotozoom(). */
Image texture_subimage;

/* we use the cache optimization technique as described by Pascal aka
   Niklas Beisert (sp?) from Cubic Team.  (the original source is
   available from http://www.hornet.org/, the Cubic homepage is at
   http://www.cubic.org/).

   note that i'm using a p100 myself, so i have NO idea how this
   technique works on modern fancy hardware with modern fancy
   caches. please let me know if you're into this.

   the general idea is to split up the innerloop into 8x8 pixelblocks,
   so that the cache won't stall as much when the
   dudx/dudy/dvdx/dvdy's are too big (if they are too big, the texture
   is traversed at too large intervals, and the cache can't
   read pixels ahead as happily). experimenting with 16x16 and 4x4
   blocks didn't really help much.

   --
   smoke, thinking of giving credit where credit is due :-) */

/* draw a rotated and zoomed copy of the texture `src' into `dest',
   one 8x8 pixel block at a time.

   dest   destination image; dest->buffer is written using
          dest->stride as the distance between rows.  only whole 8x8
          blocks are drawn (dest->width/8 by dest->height/8 of them).
   src    texture; src->buffer is read with a fixed row pitch of 256
          bytes, and texture coordinate (0,0) is placed at byte
          128*256 + 128 of that buffer.
   angle  rotation, 65536 units make a full turn.
   r      texture step per screen pixel in 8.8 fixed point
          (256 means one texel per pixel).

   texture coordinates are kept as fixed point ints with 8 fractional
   bits: `u >> 8' is the texel column, `v & ~255' is the texel row
   already multiplied by the 256 byte pitch.

   NOTE(review): the start offsets (-160, -100) and the clipping
   bounds below are hard-coded; they look tuned for a 320x200 screen
   and for one particular texture - confirm before reusing this with
   other sizes.

   NOTE(review): texel reads are not clamped to the texture; the
   caller is expected to provide a buffer with enough padding around
   the 256x256 area. */
void
draw_rotozoom ( Image* dest, Image* src,
           float angle, float r )
{
  /* screencoords inside one block */
  int x,y;

  /* texture coords inside one block */
  int u,v;
  int u0,v0; /* initial values */
  int dudx,dvdx,dudy,dvdy;
  float fdudx,fdvdx,fdudy,fdvdy;

  /* block coordinates */
  int bx,by;
  int bu,bv;
  int bu0,bv0; /* initial values */

  int dudx_shl_3,dvdx_shl_3,dudy_shl_3,dvdy_shl_3;

  /* per-pixel texture deltas for one 8x8 block */
  int precalcu[8*8];
  int precalcv[8*8];

  const int* pu_ptr;
  const int* pv_ptr;

  const uint8* src_ptr;
  uint8* dest_ptr;

  const uint8* src_buf = src->buffer;
  uint8* dest_buf = dest->buffer;

  int ofsh;
  int dest_stride = dest->stride;

  /* the same sine/cosine pair feeds all four gradients */
  double c = cos(angle/65536.0*2*M_PI);
  double s = sin(angle/65536.0*2*M_PI);

  fdudx = c * r;
  fdvdx = -s * r;

  fdudy = s * r;
  fdvdy = c * r;

  /* the per-block step is computed first and the per-pixel step is
     derived from it, so both come from the same truncated value */
  dudx_shl_3 = fdudx * 8.0;
  dudy_shl_3 = fdudy * 8.0;
  dvdx_shl_3 = fdvdx * 8.0;
  dvdy_shl_3 = fdvdy * 8.0;

  dudx = dudx_shl_3 >> 3;
  dvdx = dvdx_shl_3 >> 3;
  dudy = dudy_shl_3 >> 3;
  dvdy = dvdy_shl_3 >> 3;

  /* texture coordinate of the top-left screen pixel */
  bu0 = (-100.0*fdudy + -160.0*fdudx);
  bv0 = (-100.0*fdvdy + -160.0*fdvdx);

  /* precalc one block's worth of du.dv's */

  u0 = 0;
  v0 = 0;

  for (y = 0; y < 8; y++)
    {
      u = u0;
      v = v0;
      for (x = 0; x < 8; x++)
        {
          precalcu[x+y*8] = u;
          precalcv[x+y*8] = v;
          u += dudx;
          v += dvdx;
        }
      u0 += dudy;
      v0 += dvdy;
    }

  /* draw rotated image in blocks of 8x8 pixels */
  for (by = 0; by < dest->height/8; by++)
    {
      bu = bu0;
      bv = bv0;
      for (bx = 0; bx < dest->width/8; bx++)
        {
          /* rows are dest_stride bytes apart; this used to be a
             hard-coded 320 which broke for any other stride */
          dest_ptr = dest_buf + (bx<<3) + dest_stride*(by<<3);

          /* primitive clipping, hand-tuned for the original texture:
             blocks starting outside these bounds are drawn as colour 0 */
          if ( (bu < -(80<<8)) || (bu > (120<<8)) || (bv < -(130<<8)) || (bv > (40<<8)) )
            {
              for ( y = 8; y; y-- )
                {
                  memset ( dest_ptr, 0, 8 );
                  dest_ptr += dest_stride;
                }
            }
          else
            {
              /* fractional parts stay in u0/v0 so they carry into the
                 per-pixel sums; the whole parts go into the base pointer */
              u0 = bu & 255;
              v0 = bv & 255;

              ofsh = (bu >> 8) + (bv & ~255);

              src_ptr = src_buf + ofsh + 128*256 + 128;

              pu_ptr = precalcu;
              pv_ptr = precalcv;

              for ( y = 8; y; y-- )
                {
                  for ( x = 0; x < 8; x++ )
                    {
                      *dest_ptr++ = src_ptr[((u0+*pu_ptr++)>>8) + ((v0+*pv_ptr++)&~255)];
                    }

                  /* 8 pixels were written, move to the start of the next row */
                  dest_ptr += dest_stride - 8;
                }
            }

          bu += dudx_shl_3;
          bv += dvdx_shl_3;
        }
      bu0 += dudy_shl_3;
      bv0 += dvdy_shl_3;
    }
}

/* i discovered a buffer overflow here and there were basically three
   ways to solve it:
   
   1) clip better (add a maximal diff to the block borders)
   2) use &65535 in the innerloop, thereby slowing things down
   3) allocate a big empty void, and use a subimage in the middle
      of that so that overflows are of no harm

   of these options, 3 is obviously the ugliest. guess which one i
   chose to use.. (hint: this is why the source image is padded out to
   256x512 instead of the usual 256x256.)

   please send in patches for the fastest method, i really don't like
   allocating another 256x256. */

/* set up the rotozoom effect: allocate its state block and build the
   padded texture that fx_rotozoom() draws from.

   "rotozoom.png" is loaded together with texture_palette, then
   blitted at (0,128) into a freshly created, zero-filled 256x512
   image (texture_image).  texture_subimage is set to the 256x256
   window at (0,128) of that image.

   dest is not used.

   returns the newly allocated, zero-initialised Rotozoom_data, or
   NULL if the allocation failed.  the texture globals are set up in
   either case. */
Rotozoom_data* create_fx_rotozoom ( Image* dest )
{
  Rotozoom_data* data;
  Image image;

  (void) dest;

  /* calloc instead of malloc, so the caller never gets a block with
     indeterminate contents */
  data = calloc ( 1, sizeof *data );

  /* NOTE(review): the results of crap_png_load() and
     crap_image_create() are not checked, and `image' is never
     released here - confirm against the crap_* API whether either
     can fail or owns memory that should be freed. */
  crap_png_load ( &image, &texture_palette, "rotozoom.png" );
  texture_image = crap_image_create ( 256, 512 );
  crap_image_fill ( texture_image, 0 );
  crap_image_blit ( texture_image, 0, 128, &image );

  crap_image_subimage ( texture_image, &texture_subimage, 0, 128, 256, 256 );

  return data;
}

/* advance the rotozoom animation by one frame and draw it into dest.

   the very first call (per process, the flag is a function static)
   also clears dest to 0 and installs texture_palette through
   crap_setpalette().

   every call bumps an internal frame counter by one and derives from
   it:
     - the angle, sin(counter/64) * 65535, so it swings back and forth
     - the zoom, cos(counter/32) * 256 + 384, so it stays in 128..640
   and then renders texture_subimage with draw_rotozoom().

   data is not used. */
void fx_rotozoom ( Image* dest, Rotozoom_data* data )
{
  static int initialised = 0;
  static float counter = 0.0;
  double t;
  float zoom;
  float spin;

  (void) data;

  if ( !initialised )
    {
      initialised = 1;
      crap_image_fill ( dest, 0 );
      crap_setpalette ( &texture_palette, 0, 128 );
    }

  counter += 1.0;
  t = counter;

  /* funky rotations */
  zoom = cos ( t / 32.0 ) * 256.0 + 384.0;
  spin = sin ( t / 64.0 ) * 65535.0;

  draw_rotozoom ( dest, &texture_subimage, spin, zoom );
}
