/*
	Twilight Prophecy 3D/Multimedia SDK
	A multi-platform development system for virtual reality and multimedia.

	Copyright (C) 1997-2001 by Twilight 3D Finland Oy Ltd.

	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 2 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

	Please read the file LICENSE.TXT for additional details.


	source: 
		surface blitter implementation

	revision history:
		Jun/30/1999 - Jukka Liimatta - initial revision
		Jul/06/1999 - Mikael Kalms - generic and c++ inners added
		Jul/14/1999 - Mikael Kalms - specialized x86 inners added
		Jan/24/2001 - Jukka Liimatta - renaissance build
		Jun/14/2001 - Jukka Liimatta - blitter setup fixes (possible division by zero, etc. ;-)
*/
/*
  - Indexed->Indexed [stretch]blit does not check palette at all.
	It should either fail or remap when blitting between pictures with
	different palettes
  - Generic Alpha->Alpha blits zero-extend the lower bits of alpha.
	It should rather repeat the alpha bits over and over again in the
	lower bits...? [Ideally this should be done for all components, but
	it is most important for alpha]
  - blitters[] table is never released, possible memory leak?
*/
/*
	Blitting is supported between all possible formats, except
	!Indexed->Indexed (that situation is handled by Bitmap::Quantize() instead).

    Combined format conversion + stretch operations perform point sampling.
	Pure stretching can perform point sampling on all formats, and
	  bilinear filtering on !Indexed formats.

	Indexed->!Indexed format conversion (+ possible stretch) will first
	convert the palette to the output format, and then remap (+ possibly
	stretch) the image.

	Current inners:

	* Generic C inners. These are quite slow, but will work with *any* Bitmaps.
	* Specialized C inners; stretch and/or format conversion between all
		formats listed in pixelformat.hpp are accelerated.

	MK, 09/07/1999
*/
#include <stddef.h>
#include <prcore/prcore.hpp>
using namespace prcore;



//////////////////////////////////////////////////////
// code generation config                          //
////////////////////////////////////////////////////

#define ENABLE_SPECIALIZED_C_INNERS

#if defined( PRCORE_X86_SIMD )
#define ENABLE_SPECIALIZED_X86_INNERS
#define ENABLE_SPECIALIZED_X86_MMX_INNERS
#endif // PRCORE_X86_SIMD


//////////////////////////////////////////////////////
// macros                                          //
////////////////////////////////////////////////////

// Assembles a 24-bit value from the three consecutive bytes at ptr.
// With PRCORE_LITTLE_ENDIAN defined, byte 0 is the least significant byte;
// otherwise byte 0 is the most significant byte.
static inline uint32 read24(const void* ptr)
{
	const uint8* bytes = (const uint8*) ptr;
	#ifdef PRCORE_LITTLE_ENDIAN
	return bytes[0] | (bytes[1] << 8) | (bytes[2] << 16);
	#else
	return (bytes[0] << 16) | (bytes[1] << 8) | bytes[2];
	#endif
}

static inline void write24(void* ptr, uint32 data)
{
	#ifdef PRCORE_LITTLE_ENDIAN
	((uint8*) ptr)[0] = data & 0xff;
	((uint8*) ptr)[1] = (data >> 8) & 0xff;
	((uint8*) ptr)[2] = (data >> 16) & 0xff;
	#else
	((uint8*) ptr)[0] = (data >> 16) & 0xff;
	((uint8*) ptr)[1] = (data >> 8) & 0xff;
	((uint8*) ptr)[2] = data & 0xff;
	#endif
}


//////////////////////////////////////////////////////
// blitter class                                   //
////////////////////////////////////////////////////

	// Blit operation kinds. The values compose like bit flags:
	// BLIT_STRETCH_REMAP (3) == BLIT_STRETCH | BLIT_REMAP and
	// BLIT_STRETCH_BILINEAR (5) == BLIT_STRETCH | BLIT_BILINEAR.
	// NOTE(review): presumably these are the values passed as the 'type'
	// argument of Blitter's constructor - confirm against its definition.
	enum
	{
		BLIT_STRETCH = 1,			// Point-sampling stretch
		BLIT_REMAP = 2,				// Format conversion
		BLIT_STRETCH_REMAP = 3,		// Point-sampling stretch & format conversion
		BLIT_BILINEAR = 4,			// Bilinear bit on its own (see BLIT_STRETCH_BILINEAR)
		BLIT_STRETCH_BILINEAR = 5	// Bilinear filtered stretch
	};

	// Per-channel conversion recipe. The row inners in this file compute a
	// destination channel as ((pixel >> right) << left) & mask.
	struct BlitMask
	{
		int32	left, right;	// shift counts: 'right' is applied first, then 'left'
		uint32	mask;			// AND mask applied after both shifts
	};

	// Per-row arguments handed to a BlitFunc. The row inners cast dest/src to
	// the pixel storage type they operate on and process 'width' pixels.
	// NOTE(review): the external x86 inners receive this struct too, so its
	// layout is presumably relied upon by assembly - confirm before changing.
	struct InnerInfo
	{
		uint8*	dest;				// start of the destination row
		uint8*	src;				// start of the source row
		uint8*	src2;				// used by bilinear only
		int		width;				// pixel count; the do/while inners need width >= 1
		uint32	ustart, ustep;		// used by point sampled + bilinear only
		uint32	vfrac;				// used by bilinear only
	};

	// Forward declaration: the function pointer types below refer to Blitter.
	class Blitter;

	// Row function: converts/stretches one row described by the InnerInfo.
	typedef void (*BlitFunc)(Blitter*, const InnerInfo*);
	// Palette refresh hook, given the blitter and the destination/source formats.
	// NOTE(review): the meaning of the bool result is not visible here - confirm.
	typedef bool (*BlitUpdatePalFunc)(Blitter& blitter, const PixelFormat& dest, const PixelFormat& src);

	// information holder class for a particular format conversion/stretch
	//
	// The row inners read the BlitMask members to move each channel from its
	// source position to its destination position, OR in 'alphaor', and use
	// 'palremap' / 'palmono' as lookup tables.
	// NOTE(review): external x86 inners take a Blitter*, so member order and
	// sizes are presumably part of an assembly contract - confirm before editing.
	class Blitter
	{
		public:

		// Per-channel shift/mask recipes; 'index' is used for palette indices
		// and 'intens' for intensity output by the generic inners.
		BlitMask	red, green, blue, alpha, index, intens;
		uint32		alphaor;			// constant OR-ed into every pixel written by the generic inners
		PixelFormat	dest, source;
		int			convtype;
		BlitFunc	func;				// 'Perform blit on a row'-function
		BlitUpdatePalFunc updatepal;	// 'Update palette before blitting'-function
		// 1024 bytes = 256 entries of up to 4 bytes; the inners reinterpret this
		// buffer as uint8 / uint16 / uint32 tables or as packed 3-byte entries.
		char		palremap[1024];		// Indexed->!Indexed remapping table
		// Shared by all Blitter instances; inners read the .r/.g/.b fields.
static	Color32		palmono[256];		// Direct->Intensity RGB weighting table

		Blitter(const PixelFormat& destination, const PixelFormat& source, int type);
		~Blitter();

		// Runs the selected row function on one row.
inline	void	Blit(const InnerInfo* info) { func( this, info ); }
	};


// Storage for the static weighting table declared in Blitter.
Color32 Blitter::palmono[256];
// NOTE(review): presumably a 32-bit format with red/green/blue/alpha masks in
// that order (i.e. ARGB8888) - confirm against the PixelFormat constructor.
static PixelFormat palformat(32,0x00ff0000,0x0000ff00,0x000000ff,0xff000000);

#ifdef ENABLE_SPECIALIZED_X86_MMX_INNERS
// Starts out false; where it is set is not visible in this part of the file.
static bool mmxfound = false;
#endif // ENABLE_SPECIALIZED_X86_MMX_INNERS


//////////////////////////////////////////////////////
// external inners ( mmx )                         //
////////////////////////////////////////////////////

// Declarations of row inners and symbols implemented outside this file with
// C linkage. Only compiled when ENABLE_SPECIALIZED_X86_INNERS is defined; the
// *_mmx variants additionally require ENABLE_SPECIALIZED_X86_MMX_INNERS.
#ifdef ENABLE_SPECIALIZED_X86_INNERS
extern "C"
{
	// NOTE(review): "smc" presumably stands for self-modifying code, with these
	// two symbols delimiting the region - confirm against the assembly source.
	extern void* inner_stretch_smc_start;
	extern int inner_stretch_smc_size;
	void inner_stretch_rgb565_bilinear_x86(Blitter* blitter, const InnerInfo* info);
#ifdef ENABLE_SPECIALIZED_X86_MMX_INNERS
	void inner_stretch_rgb888_bilinear_x86_mmx(Blitter* blitter, const InnerInfo* info);
#endif // ENABLE_SPECIALIZED_X86_MMX_INNERS
	void inner_stretch_argb1555_bilinear_x86(Blitter* blitter, const InnerInfo* info);
	void inner_stretch_argb4444_bilinear_x86(Blitter* blitter, const InnerInfo* info);
#ifdef ENABLE_SPECIALIZED_X86_MMX_INNERS
	void inner_stretch_argb8888_bilinear_x86_mmx(Blitter* blitter, const InnerInfo* info);
#endif // ENABLE_SPECIALIZED_X86_MMX_INNERS
};
#endif // ENABLE_SPECIALIZED_X86_INNERS


//////////////////////////////////////////////////////
// generic innerloops                              //
////////////////////////////////////////////////////

	// remap_rgba_rgba template

// Generic direct-colour -> direct-colour row conversion.
//
// S is the source pixel storage type, D the destination pixel storage type.
// For each of info.width pixels, every channel (red, green, blue, alpha) is
// computed as ((pixel >> right) << left) & mask from the matching BlitMask in
// blitter; the four results are OR-ed together with blitter.alphaor and
// stored to the destination row.
//
// The source pixel is loaded into a local exactly once per iteration. Reading
// *src for some channels while applying *src++ for another inside a single
// expression is unsequenced (undefined behaviour): a compiler may advance src
// before the remaining channels are read and mix two pixels.
//
// The do/while loop always converts at least one pixel, so info.width is
// expected to be >= 1.
template <class S, class D>
inline void remap_rgba_rgba(Blitter& blitter, const InnerInfo& info)
{
	S* src = (S*)info.src;
	D* dest = (D*)info.dest;
	int count = info.width;
	do
	{
		const S pixel = *src++;
		*dest++ = 
			(((pixel >> blitter.red.right  ) << blitter.red.left  ) & blitter.red.mask  ) | 
			(((pixel >> blitter.green.right) << blitter.green.left) & blitter.green.mask) | 
			(((pixel >> blitter.blue.right ) << blitter.blue.left ) & blitter.blue.mask ) |
			(((pixel >> blitter.alpha.right) << blitter.alpha.left) & blitter.alpha.mask) | 
			blitter.alphaor;
	} while( --count );
}

	// remap_rgba_rgba inners

// Row inner: remap_rgba_rgba instantiated for uint8 source and uint8 destination pixels.
static void inner_remap_1rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint8,uint8>( state, row );
}

// Row inner: remap_rgba_rgba instantiated for uint8 source and uint16 destination pixels.
static void inner_remap_1rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint8,uint16>( state, row );
}

// Row inner: 1-byte direct-colour source -> 3-byte direct-colour destination.
//
// Each channel is computed as ((pixel >> right) << left) & mask, the results
// are OR-ed with blitter->alphaor and written with write24().
//
// The source pixel is loaded once per iteration; mixing reads of *src with
// *src++ in one expression is unsequenced and therefore undefined behaviour.
// The do/while loop always converts at least one pixel (info->width >= 1).
static void inner_remap_1rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint8* src = (uint8*) info->src;
	uint8* dest = (uint8*) info->dest;
	uint32 col;
	do
	{
		const uint8 pixel = *src++;
		col = (((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask)
			| (((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask)
			| (((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask)
			| (((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask)
			| blitter->alphaor;
		write24( dest, col );
		dest += 3;
	} while ( --x );
}

// Row inner: remap_rgba_rgba instantiated for uint8 source and uint32 destination pixels.
static void inner_remap_1rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint8,uint32>( state, row );
}

// Row inner: remap_rgba_rgba instantiated for uint16 source and uint8 destination pixels.
static void inner_remap_2rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint16,uint8>( state, row );
}

// Row inner: remap_rgba_rgba instantiated for uint16 source and uint16 destination pixels.
static void inner_remap_2rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint16,uint16>( state, row );
}

// Row inner: 2-byte direct-colour source -> 3-byte direct-colour destination.
//
// Each channel is computed as ((pixel >> right) << left) & mask, the results
// are OR-ed with blitter->alphaor and written with write24().
//
// The source pixel is loaded once per iteration; mixing reads of *src with
// *src++ in one expression is unsequenced and therefore undefined behaviour.
// The do/while loop always converts at least one pixel (info->width >= 1).
static void inner_remap_2rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint16* src = (uint16*) info->src;
	uint8* dest = (uint8*) info->dest;
	uint32 col;
	do
	{
		const uint16 pixel = *src++;
		col = (((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask)
			| (((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask)
			| (((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask)
			| (((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask)
			| blitter->alphaor;
		write24( dest, col );
		dest += 3;
	} while ( --x );
}

// Row inner: remap_rgba_rgba instantiated for uint16 source and uint32 destination pixels.
static void inner_remap_2rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint16,uint32>( state, row );
}

// Row inner: 3-byte direct-colour source -> 1-byte direct-colour destination.
// Every source pixel is fetched with read24(); each channel is moved with
// ((pixel >> right) << left) & mask and the result is OR-ed with alphaor.
// At least one pixel is always processed.
static void inner_remap_3rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 r = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 g = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 b = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = r | g | b | a | blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: 3-byte direct-colour source -> 2-byte direct-colour destination.
// Every source pixel is fetched with read24(); each channel is moved with
// ((pixel >> right) << left) & mask and the result is OR-ed with alphaor.
// At least one pixel is always processed.
static void inner_remap_3rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 r = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 g = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 b = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = r | g | b | a | blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: 3-byte direct-colour source -> 3-byte direct-colour destination.
// Pixels are fetched with read24(), each channel is moved with
// ((pixel >> right) << left) & mask, OR-ed with alphaor and stored with
// write24(). At least one pixel is always processed.
static void inner_remap_3rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 r = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 g = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 b = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted = r | g | b | a | blitter->alphaor;
		in += 3;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: 3-byte direct-colour source -> 4-byte direct-colour destination.
// Every source pixel is fetched with read24(); each channel is moved with
// ((pixel >> right) << left) & mask and the result is OR-ed with alphaor.
// At least one pixel is always processed.
static void inner_remap_3rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 r = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 g = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 b = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = r | g | b | a | blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: remap_rgba_rgba instantiated for uint32 source and uint8 destination pixels.
static void inner_remap_4rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint32,uint8>( state, row );
}

// Row inner: remap_rgba_rgba instantiated for uint32 source and uint16 destination pixels.
static void inner_remap_4rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint32,uint16>( state, row );
}

// Row inner: 4-byte direct-colour source -> 3-byte direct-colour destination.
//
// Each channel is computed as ((pixel >> right) << left) & mask, the results
// are OR-ed with blitter->alphaor and written with write24().
//
// The source pixel is loaded once per iteration; mixing reads of *src with
// *src++ in one expression is unsequenced and therefore undefined behaviour.
// The do/while loop always converts at least one pixel (info->width >= 1).
static void inner_remap_4rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32* src = (uint32*) info->src;
	uint8* dest = (uint8*) info->dest;
	uint32 col;
	do
	{
		const uint32 pixel = *src++;
		col = (((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask)
			| (((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask)
			| (((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask)
			| (((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask)
			| blitter->alphaor;
		write24( dest, col );
		dest += 3;
	} while ( --x );
}

// Row inner: remap_rgba_rgba instantiated for uint32 source and uint32 destination pixels.
static void inner_remap_4rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_rgba<uint32,uint32>( state, row );
}


	// remap_pa_ta template

// Generic palettized(+alpha) -> direct-colour(+alpha) row conversion.
//
// S is the source pixel storage type, D the destination pixel storage type.
// blitter.palremap is viewed as a table of D entries. For each of info.width
// pixels the palette index is extracted as ((pixel >> right) << left) & mask
// using blitter.index, the table entry is OR-ed with the alpha channel
// (extracted the same way using blitter.alpha) and with blitter.alphaor.
//
// The source pixel is loaded into a local exactly once per iteration. Reading
// *src for the index while applying *src++ for alpha inside a single
// expression is unsequenced (undefined behaviour): a compiler may advance src
// first and take index and alpha from different pixels.
//
// The do/while loop always converts at least one pixel, so info.width is
// expected to be >= 1.
template <class S, class D>
inline void remap_pa_ta(Blitter& blitter, const InnerInfo& info)
{
	S* src = (S*)info.src;
	D* dest = (D*)info.dest;
	D* pal = (D*)blitter.palremap;
	int count = info.width;
	do
	{
		const S pixel = *src++;
		*dest++ =
			pal[(((pixel >> blitter.index.right) << blitter.index.left) & blitter.index.mask)] | 
			(((pixel >> blitter.alpha.right) << blitter.alpha.left) & blitter.alpha.mask) | 
			blitter.alphaor;
	} while ( --count );
}

	// remap_pa_ta inners

// Row inner: remap_pa_ta instantiated for uint8 source and uint8 destination pixels.
static void inner_remap_1pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint8,uint8>( state, row );
}

// Row inner: remap_pa_ta instantiated for uint8 source and uint16 destination pixels.
static void inner_remap_1pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint8,uint16>( state, row );
}

// Row inner: 1-byte palettized(+alpha) source -> 3-byte direct-colour destination.
// The palette index is extracted with blitter->index, scaled by 3 to address
// a packed 3-byte entry in palremap, and the entry is OR-ed with the alpha
// channel and alphaor before being stored with write24().
// At least one pixel is always processed.
static void inner_remap_1pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint8 pixel = *in++;
		const int entry = (((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask) * 3;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted = read24( &table[entry] ) | a | blitter->alphaor;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: 1-byte palettized(+alpha) source -> 4-byte direct-colour destination.
//
// Hand-written equivalent of remap_pa_ta<uint8,uint32> (see the TODO below):
// palremap is viewed as a uint32 table, indexed by the value extracted with
// blitter->index, and the entry is OR-ed with the alpha channel and alphaor.
//
// The source pixel is loaded once per iteration; reading *src for the index
// while applying *src++ for alpha in one expression is unsequenced and
// therefore undefined behaviour.
// The do/while loop always converts at least one pixel (info->width >= 1).
static void inner_remap_1pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* src = (uint8*)info->src;
	uint32* dest = (uint32*)info->dest;
	uint32* pal = (uint32*)blitter->palremap;
	int count = info->width;
	do
	{
		const uint8 pixel = *src++;
		*dest++ =
			pal[(((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask)] | 
			(((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask) | 
			blitter->alphaor;
	} while ( --count );
	
	// TODO: incredible, the above code, when instantiating the remap_pa_ta<S,D> template
	// fails to compile correctly with MSVC++6 SP5.
	// Investigate with better time if any other template instantiation fails!

//	remap_pa_ta<uint8,uint32>( *blitter, *info );
}

// Row inner: remap_pa_ta instantiated for uint16 source and uint8 destination pixels.
static void inner_remap_2pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint16,uint8>( state, row );
}

// Row inner: remap_pa_ta instantiated for uint16 source and uint16 destination pixels.
static void inner_remap_2pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint16,uint16>( state, row );
}

// Row inner: 2-byte palettized(+alpha) source -> 3-byte direct-colour destination.
// The palette index is extracted with blitter->index, scaled by 3 to address
// a packed 3-byte entry in palremap, and the entry is OR-ed with the alpha
// channel and alphaor before being stored with write24().
// At least one pixel is always processed.
static void inner_remap_2pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint16 pixel = *in++;
		const int entry = (((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask) * 3;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted = read24( &table[entry] ) | a | blitter->alphaor;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: remap_pa_ta instantiated for uint16 source and uint32 destination pixels.
static void inner_remap_2pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint16,uint32>( state, row );
}

// Row inner: 3-byte palettized(+alpha) source -> 1-byte direct-colour destination.
// Pixels are fetched with read24(); the index extracted with blitter->index
// selects a uint8 entry of palremap, which is OR-ed with the alpha channel
// and alphaor. At least one pixel is always processed.
static void inner_remap_3pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 slot = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[slot] | a | blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: 3-byte palettized(+alpha) source -> 2-byte direct-colour destination.
// Pixels are fetched with read24(); the index extracted with blitter->index
// selects a uint16 entry of palremap, which is OR-ed with the alpha channel
// and alphaor. At least one pixel is always processed.
static void inner_remap_3pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint16* table = (uint16*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 slot = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[slot] | a | blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: 3-byte palettized(+alpha) source -> 3-byte direct-colour destination.
// Pixels are fetched with read24(); the index extracted with blitter->index
// is scaled by 3 to address a packed 3-byte palremap entry, which is OR-ed
// with the alpha channel and alphaor and stored with write24().
// At least one pixel is always processed.
static void inner_remap_3pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const int entry = (((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask) * 3;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted = read24( &table[entry] ) | a | blitter->alphaor;
		in += 3;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: 3-byte palettized(+alpha) source -> 4-byte direct-colour destination.
// Pixels are fetched with read24(); the index extracted with blitter->index
// selects a uint32 entry of palremap, which is OR-ed with the alpha channel
// and alphaor. At least one pixel is always processed.
static void inner_remap_3pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32* table = (uint32*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 slot = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[slot] | a | blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: remap_pa_ta instantiated for uint32 source and uint8 destination pixels.
static void inner_remap_4pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint32,uint8>( state, row );
}

// Row inner: remap_pa_ta instantiated for uint32 source and uint16 destination pixels.
static void inner_remap_4pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint32,uint16>( state, row );
}

// Row inner: 4-byte palettized(+alpha) source -> 3-byte direct-colour destination.
// The palette index is extracted with blitter->index, scaled by 3 to address
// a packed 3-byte entry in palremap, and the entry is OR-ed with the alpha
// channel and alphaor before being stored with write24().
// At least one pixel is always processed.
static void inner_remap_4pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	int remaining = info->width;
	do
	{
		const uint32 pixel = *in++;
		const int entry = (((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask) * 3;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted = read24( &table[entry] ) | a | blitter->alphaor;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: remap_pa_ta instantiated for uint32 source and uint32 destination pixels.
static void inner_remap_4pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_pa_ta<uint32,uint32>( state, row );
}


	// remap_rgba_ia template

// Generic direct-colour(+alpha) -> intensity(+alpha) row conversion.
//
// S is the source pixel storage type, D the destination pixel storage type.
// Each colour channel is extracted as ((pixel >> right) << left) & mask and
// used as an index into Blitter::palmono; the .r, .g and .b fields of the
// three entries are summed. The sum is positioned with blitter.intens, the
// alpha channel with blitter.alpha, and both are OR-ed with blitter.alphaor.
// At least one pixel is always processed.
template <class S, class D>
inline void remap_rgba_ia(Blitter& blitter, const InnerInfo& info)
{
	S* in = (S*)info.src;
	D* out = (D*)info.dest;
	Color32* weights = Blitter::palmono;
	int remaining = info.width;
	do
	{
		const S pixel = *in++;
		const uint32 sum =
			weights[((pixel >> blitter.red.right  ) << blitter.red.left  ) & blitter.red.mask  ].r +
			weights[((pixel >> blitter.green.right) << blitter.green.left) & blitter.green.mask].g +
			weights[((pixel >> blitter.blue.right ) << blitter.blue.left ) & blitter.blue.mask ].b;
		const uint32 a = ((pixel >> blitter.alpha.right) << blitter.alpha.left) & blitter.alpha.mask;
		*out++ =
			(((sum >> blitter.intens.right) << blitter.intens.left) & blitter.intens.mask) |
			a |
			blitter.alphaor;
	} while( --remaining );
}

	// remap_rgba_ia inners

// Row inner: remap_rgba_ia instantiated for uint8 source and uint8 destination pixels.
static void inner_remap_1rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint8,uint8>( state, row );
}

// Row inner: remap_rgba_ia instantiated for uint8 source and uint16 destination pixels.
static void inner_remap_1rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint8,uint16>( state, row );
}

// Row inner: 1-byte direct-colour source -> 3-byte intensity(+alpha) destination.
// The colour channels index Blitter::palmono and the .r/.g/.b fields of the
// entries are summed; the sum is positioned with blitter->intens, combined
// with the alpha channel and alphaor, and stored with write24().
// At least one pixel is always processed.
static void inner_remap_1rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint8 pixel = *in++;
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted =
			(((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: remap_rgba_ia instantiated for uint8 source and uint32 destination pixels.
static void inner_remap_1rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint8,uint32>( state, row );
}

// Row inner: remap_rgba_ia instantiated for uint16 source and uint8 destination pixels.
static void inner_remap_2rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint16,uint8>( state, row );
}

// Row inner: remap_rgba_ia instantiated for uint16 source and uint16 destination pixels.
static void inner_remap_2rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint16,uint16>( state, row );
}

// Row inner: 2-byte direct-colour source -> 3-byte intensity(+alpha) destination.
// The colour channels index Blitter::palmono and the .r/.g/.b fields of the
// entries are summed; the sum is positioned with blitter->intens, combined
// with the alpha channel and alphaor, and stored with write24().
// At least one pixel is always processed.
static void inner_remap_2rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint16 pixel = *in++;
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted =
			(((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: remap_rgba_ia instantiated for uint16 source and uint32 destination pixels.
static void inner_remap_2rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint16,uint32>( state, row );
}

// Row inner: 3-byte direct-colour source -> 1-byte intensity(+alpha) destination.
// Pixels are fetched with read24(); the colour channels index
// Blitter::palmono and the .r/.g/.b fields are summed. The sum is positioned
// with blitter->intens and OR-ed with the alpha channel and alphaor.
// At least one pixel is always processed.
static void inner_remap_3rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = (((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: 3-byte direct-colour source -> 2-byte intensity(+alpha) destination.
// Pixels are fetched with read24(); the colour channels index
// Blitter::palmono and the .r/.g/.b fields are summed. The sum is positioned
// with blitter->intens and OR-ed with the alpha channel and alphaor.
// At least one pixel is always processed.
static void inner_remap_3rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = (((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: 3-byte direct-colour source -> 3-byte intensity(+alpha) destination.
// Pixels are fetched with read24(); the colour channels index
// Blitter::palmono and the .r/.g/.b fields are summed. The sum is positioned
// with blitter->intens, OR-ed with the alpha channel and alphaor, and stored
// with write24(). At least one pixel is always processed.
static void inner_remap_3rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted =
			(((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		in += 3;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: 3-byte direct-colour source -> 4-byte intensity(+alpha) destination.
// Pixels are fetched with read24(); the colour channels index
// Blitter::palmono and the .r/.g/.b fields are summed. The sum is positioned
// with blitter->intens and OR-ed with the alpha channel and alphaor.
// At least one pixel is always processed.
static void inner_remap_3rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = read24( in );
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = (((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		in += 3;
	} while ( --remaining );
}

// Row inner: remap_rgba_ia instantiated for uint32 source and uint8 destination pixels.
static void inner_remap_4rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint32,uint8>( state, row );
}

// Row inner: remap_rgba_ia instantiated for uint32 source and uint16 destination pixels.
static void inner_remap_4rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint32,uint16>( state, row );
}

// Row inner: 4-byte direct-colour source -> 3-byte intensity(+alpha) destination.
// The colour channels index Blitter::palmono and the .r/.g/.b fields of the
// entries are summed; the sum is positioned with blitter->intens, combined
// with the alpha channel and alphaor, and stored with write24().
// At least one pixel is always processed.
static void inner_remap_4rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;
	do
	{
		const uint32 pixel = *in++;
		const uint32 sum =
			Blitter::palmono[((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask].r
			+ Blitter::palmono[((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask].g
			+ Blitter::palmono[((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask].b;
		const uint32 a = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		const uint32 converted =
			(((sum >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask)
			| a
			| blitter->alphaor;
		write24( out, converted );
		out += 3;
	} while ( --remaining );
}

// Row inner: remap_rgba_ia instantiated for uint32 source and uint32 destination pixels.
static void inner_remap_4rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	Blitter& state = *blitter;
	const InnerInfo& row = *info;
	remap_rgba_ia<uint32,uint32>( state, row );
}


//////////////////////////////////////////////////////
// specialized innerloops                          //
////////////////////////////////////////////////////

#ifdef ENABLE_SPECIALIZED_C_INNERS

// Specialized row inner: each source byte indexes palremap viewed as a uint8
// table; the entry is OR-ed with alphaor and written as one byte.
// At least one pixel is always processed.
static void inner_remap_p8_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	const uint32 fill = blitter->alphaor;
	int remaining = info->width;
	do
	{
		const uint32 slot = *in++;
		*out++ = table[slot] | fill;
	} while ( --remaining );
}

// Specialized row inner: each source byte indexes palremap viewed as a uint16
// table; the entry is OR-ed with alphaor and written as one uint16.
// At least one pixel is always processed.
static void inner_remap_p8_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint16* table = (uint16*) blitter->palremap;
	const uint32 fill = blitter->alphaor;
	int remaining = info->width;
	do
	{
		const uint32 slot = *in++;
		*out++ = table[slot] | fill;
	} while ( --remaining );
}

// Specialized row inner: each source byte selects a packed 3-byte entry of
// palremap (byte offset index * 3); the entry is OR-ed with alphaor and
// stored with write24(). At least one pixel is always processed.
static void inner_remap_p8_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint8* table = (uint8*) blitter->palremap;
	const uint32 fill = blitter->alphaor;
	int remaining = info->width;
	do
	{
		const uint32 slot = *in++;
		const uint32 entry = read24( &table[slot * 3] );
		write24( out, entry | fill );
		out += 3;
	} while ( --remaining );
}

// Specialized row inner: each source byte indexes palremap viewed as a uint32
// table; the entry is OR-ed with alphaor and written as one uint32.
// At least one pixel is always processed.
static void inner_remap_p8_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32* table = (uint32*) blitter->palremap;
	const uint32 fill = blitter->alphaor;
	int remaining = info->width;
	do
	{
		const uint32 slot = *in++;
		*out++ = table[slot] | fill;
	} while ( --remaining );
}

// Specialized row inner (8-bit intensity -> RGB332, going by the name).
// The top three bits of the source byte fill bits 7-5 and 4-2 of the output
// and its top two bits fill bits 1-0. The blitter argument is not used.
// At least one pixel is always processed.
static void inner_remap_i8_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 level = *in++;
		const uint32 top3 = level & 0xe0;
		const uint32 top2 = level & 0xc0;
		*out++ = top3 | (top3 >> 3) | (top2 >> 6);
	}
	while ( --remaining );
}

// Specialized row inner (8-bit intensity -> RGB565, going by the name).
// The top five bits of the source byte fill bits 15-11 and 4-0 of the output
// and its top six bits fill bits 10-5. The blitter argument is not used.
// At least one pixel is always processed.
static void inner_remap_i8_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 level = *in++;
		const uint32 top5 = level & 0xf8;
		const uint32 top6 = level & 0xfc;
		*out++ = (top5 << 8) | (top6 << 3) | (top5 >> 3);
	}
	while ( --remaining );
}

// Specialized row inner (8-bit intensity -> RGB888, going by the name).
// The source byte is replicated into all three bytes of a 24-bit value that
// is stored with write24(). The blitter argument is not used.
// At least one pixel is always processed.
static void inner_remap_i8_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 level = *in++;
		const uint32 grey = (level << 16) | (level << 8) | level;
		write24( out, grey );
		out += 3;
	}
	while ( --remaining );
}

// Specialized row inner (8-bit intensity -> ARGB1555, going by the name).
// The top five bits of the source byte fill bits 14-10, 9-5 and 4-0 of the
// output and bit 15 (0x8000) is always set. The blitter argument is not used.
// At least one pixel is always processed.
static void inner_remap_i8_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 level = *in++;
		const uint32 top5 = (level & 0xf8);
		*out++ = (top5 << 7) | (top5 << 2) | (top5 >> 3) | 0x8000;
	}
	while ( --remaining );
}

// Specialized row inner (8-bit intensity -> ARGB4444, going by the name).
// The top four bits of the source byte fill bits 11-8, 7-4 and 3-0 of the
// output and bits 15-12 (0xf000) are always set. The blitter argument is not
// used. At least one pixel is always processed.
static void inner_remap_i8_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 level = *in++;
		const uint32 top4 = (level & 0xf0);
		*out++ = (top4 << 4) | top4 | (top4 >> 4) | 0xf000;
	}
	while ( --remaining );
}

// Specialized row inner (8-bit intensity -> ARGB8888, going by the name).
// The source byte is replicated into the three low bytes of the output and
// the top byte is set to 0xff. The blitter argument is not used.
// At least one pixel is always processed.
static void inner_remap_i8_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 level = *in++;
		const uint32 grey = (level << 16) | (level << 8) | level;
		*out++ = grey | 0xff000000;
	}
	while ( --remaining );
}

// Specialized row inner (RGB332 -> 8-bit intensity, going by the name).
// Each source field (bits 7-5, 4-2, 1-0) is moved to the top of a byte and
// used to index Blitter::palmono; the .r, .g and .b fields of the three
// entries are added to form the output byte. The blitter argument is not
// used. At least one pixel is always processed.
static void inner_remap_rgb332_i8(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 redSlot   = pixel & 0xe0;
		const uint32 greenSlot = (pixel & 0x1c) << 3;
		const uint32 blueSlot  = (pixel & 0x03) << 6;
		*out++ = Blitter::palmono[redSlot].r + Blitter::palmono[greenSlot].g
				 + Blitter::palmono[blueSlot].b;
	}
	while ( --remaining );
}

// Specialized row inner (RGB332 -> RGB565, going by the name).
// Source bits 7-5 move to output bits 15-13, bits 4-2 to bits 10-8 and
// bits 1-0 to bits 4-3; the remaining output bits stay zero.
// The blitter argument is not used. At least one pixel is always processed.
static void inner_remap_rgb332_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 r = (pixel & 0xe0) << 8;
		const uint32 g = (pixel & 0x1c) << 6;
		const uint32 b = (pixel & 0x03) << 3;
		*out++ = r | g | b;
	}
	while ( --remaining );
}

// Specialized row inner (RGB332 -> RGB888, going by the name).
// Source bits 7-5 move to bits 23-21, bits 4-2 to bits 15-13 and bits 1-0 to
// bits 7-6 of a 24-bit value stored with write24(); other bits stay zero.
// The blitter argument is not used. At least one pixel is always processed.
static void inner_remap_rgb332_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 r = (pixel & 0xe0) << 16;
		const uint32 g = (pixel & 0x1c) << 11;
		const uint32 b = (pixel & 0x03) << 6;
		write24( out, r | g | b );
		out += 3;
	}
	while ( --remaining );
}

// Specialized row inner (RGB332 -> ARGB1555, going by the name).
// Source bits 7-5 move to output bits 14-12, bits 4-2 to bits 9-7 and
// bits 1-0 to bits 4-3; bit 15 (0x8000) is always set.
// The blitter argument is not used. At least one pixel is always processed.
static void inner_remap_rgb332_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 r = (pixel & 0xe0) << 7;
		const uint32 g = (pixel & 0x1c) << 5;
		const uint32 b = (pixel & 0x03) << 3;
		*out++ = r | g | b | 0x8000;
	}
	while ( --remaining );
}

// Specialized row inner (RGB332 -> ARGB4444, going by the name).
// Source bits 7-5 move to output bits 11-9, bits 4-2 to bits 7-5 and
// bits 1-0 to bits 3-2; bits 15-12 (0xf000) are always set.
// The blitter argument is not used. At least one pixel is always processed.
static void inner_remap_rgb332_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 r = (pixel & 0xe0) << 4;
		const uint32 g = (pixel & 0x1c) << 3;
		const uint32 b = (pixel & 0x03) << 2;
		*out++ = r | g | b | 0xf000;
	}
	while ( --remaining );
}

// Specialized row inner (RGB332 -> ARGB8888, going by the name).
// Source bits 7-5 move to output bits 23-21, bits 4-2 to bits 15-13 and
// bits 1-0 to bits 7-6; the top byte is set to 0xff.
// The blitter argument is not used. At least one pixel is always processed.
static void inner_remap_rgb332_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 r = (pixel & 0xe0) << 16;
		const uint32 g = (pixel & 0x1c) << 11;
		const uint32 b = (pixel & 0x03) << 6;
		*out++ = r | g | b | 0xff000000;
	}
	while ( --remaining );
}

// Specialized row inner (RGB565 -> 8-bit intensity, going by the name).
// Each source field (bits 15-11, 10-5, 4-0) is moved to the top of a byte
// and used to index Blitter::palmono; the .r, .g and .b fields of the three
// entries are added to form the output byte. The blitter argument is not
// used. At least one pixel is always processed.
static void inner_remap_rgb565_i8(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 redSlot   = (pixel & 0xf800) >> 8;
		const uint32 greenSlot = (pixel & 0x07e0) >> 3;
		const uint32 blueSlot  = (pixel & 0x001f) << 3;
		*out++ = Blitter::palmono[redSlot].r
				 + Blitter::palmono[greenSlot].g
				 + Blitter::palmono[blueSlot].b;
	}
	while ( --remaining );
}

// Specialized row inner (RGB565 -> RGB332, going by the name).
// The top three bits of source bits 15-11 move to output bits 7-5, the top
// three bits of bits 10-5 to bits 4-2 and the top two bits of bits 4-0 to
// bits 1-0. The blitter argument is not used.
// At least one pixel is always processed.
static void inner_remap_rgb565_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 r = (pixel & 0xe000) >> 8;
		const uint32 g = (pixel & 0x0700) >> 6;
		const uint32 b = (pixel & 0x0018) >> 3;
		*out++ = r | g | b;
	}
	while ( --remaining );
}

// Expands a row of info->width RGB565 pixels to packed 3-byte RGB888.
// Components are shifted into the top bits of their 8-bit channels; the low
// bits stay zero.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb565_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0xf800) << 8;
		const uint32 green = (pixel & 0x07e0) << 5;
		const uint32 blue  = (pixel & 0x001f) << 3;
		write24( out, red | green | blue );
		out += 3;
	}
	while ( --remaining );
}

// Converts a row of info->width RGB565 pixels to ARGB1555 with the alpha bit
// always set. Red and the upper five green bits move down one position (the
// lowest green bit is dropped); blue stays where it is.
// When bit 1 of the source and destination addresses match, the middle of
// the row is handled two pixels at a time through 32-bit loads and stores.
// NOTE(review): the (int) pointer casts are only used to inspect address
// bit 1, but they presume a pointer may be cast to int -- confirm for
// 64-bit builds.
static void inner_remap_rgb565_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	uint32 pixel;

	if ( ((int) in ^ (int) out) & 2 )
	{
		// Address bit 1 differs: convert one pixel at a time.
		do
		{
			pixel = *in++;
			*out++ = ((pixel & 0xffc0) >> 1) | (pixel & 0x1f) | 0x8000;
		}
		while ( --remaining );
		return;
	}

	if ( (int) in & 2 )
	{
		// Single leading pixel before the 32-bit loop.
		pixel = *in++;
		*out++ = ((pixel & 0xffc0) >> 1) | (pixel & 0x1f) | 0x8000;
		remaining--;
	}

	// Two pixels per iteration.
	for ( int pairs = (remaining >> 1); pairs != 0; --pairs )
	{
		pixel = *((uint32*) in);
		in += 2;
		*((uint32*) out) = ((pixel & 0xffc0ffc0) >> 1) | (pixel & 0x001f001f) | 0x80008000;
		out += 2;
	}

	if ( remaining & 1 )
	{
		// Single trailing pixel.
		pixel = *in++;
		*out++ = ((pixel & 0xffc0) >> 1) | (pixel & 0x1f) | 0x8000;
	}
}

// Converts a row of info->width RGB565 pixels to ARGB4444 with alpha forced
// to 0xf. Each colour channel keeps its top four bits.
// When bit 1 of the source and destination addresses match, the middle of
// the row is handled two pixels at a time through 32-bit loads and stores.
// NOTE(review): the (int) pointer casts are only used to inspect address
// bit 1, but they presume a pointer may be cast to int -- confirm for
// 64-bit builds.
static void inner_remap_rgb565_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	uint32 pixel;

	if ( ((int) in ^ (int) out) & 2 )
	{
		// Address bit 1 differs: convert one pixel at a time.
		do
		{
			pixel = *in++;
			*out++ = ((pixel & 0xf000) >> 4) | ((pixel & 0x0780) >> 3)
					 | ((pixel & 0x001e) >> 1) | 0xf000;
		}
		while ( --remaining );
		return;
	}

	if ( (int) in & 2 )
	{
		// Single leading pixel before the 32-bit loop.
		pixel = *in++;
		*out++ = ((pixel & 0xf000) >> 4) | ((pixel & 0x0780) >> 3)
				 | ((pixel & 0x001e) >> 1) | 0xf000;
		remaining--;
	}

	// Two pixels per iteration.
	for ( int pairs = (remaining >> 1); pairs != 0; --pairs )
	{
		pixel = *((uint32*) in);
		in += 2;
		*((uint32*) out) = ((pixel & 0xf000f000) >> 4) | ((pixel & 0x07800780) >> 3)
						   | ((pixel & 0x001e001e) >> 1) | 0xf000f000;
		out += 2;
	}

	if ( remaining & 1 )
	{
		// Single trailing pixel.
		pixel = *in++;
		*out++ = ((pixel & 0xf000) >> 4) | ((pixel & 0x0780) >> 3)
				 | ((pixel & 0x001e) >> 1) | 0xf000;
	}
}

// Expands a row of info->width RGB565 pixels to ARGB8888 with alpha forced
// to 0xff. Components are shifted into the top bits of their 8-bit channels.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb565_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0xf800) << 8;
		const uint32 green = (pixel & 0x07e0) << 5;
		const uint32 blue  = (pixel & 0x001f) << 3;
		*out++ = red | green | blue | 0xff000000;
	}
	while ( --remaining );
}

// Converts a row of info->width packed 3-byte RGB888 pixels to 8-bit values
// by summing the r, g and b entries of Blitter::palmono, indexed by the
// pixel's red, green and blue bytes respectively.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb888_i8(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( in );
		in += 3;
		const uint32 redIndex   = (pixel & 0x00ff0000) >> 16;
		const uint32 greenIndex = (pixel & 0x0000ff00) >> 8;
		const uint32 blueIndex  = pixel & 0x000000ff;
		*out++ = Blitter::palmono[redIndex].r
				 + Blitter::palmono[greenIndex].g
				 + Blitter::palmono[blueIndex].b;
	}
	while ( --remaining );
}

// Reduces a row of info->width packed 3-byte RGB888 pixels to RGB332 by
// keeping the top 3/3/2 bits of red/green/blue.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb888_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( in );
		in += 3;
		const uint32 red   = (pixel & 0x00e00000) >> 16;
		const uint32 green = (pixel & 0x0000e000) >> 11;
		const uint32 blue  = (pixel & 0x000000c0) >> 6;
		*out++ = red | green | blue;
	}
	while ( --remaining );
}

// Reduces a row of info->width packed 3-byte RGB888 pixels to RGB565 by
// keeping the top 5/6/5 bits of red/green/blue.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb888_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( in );
		in += 3;
		const uint32 red   = (pixel & 0x00f80000) >> 8;
		const uint32 green = (pixel & 0x0000fc00) >> 5;
		const uint32 blue  = (pixel & 0x000000f8) >> 3;
		*out++ = red | green | blue;
	}
	while ( --remaining );
}

// Reduces a row of info->width packed 3-byte RGB888 pixels to ARGB1555,
// keeping the top five bits of each channel and setting the alpha bit.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb888_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( in );
		in += 3;
		const uint32 red   = (pixel & 0x00f80000) >> 9;
		const uint32 green = (pixel & 0x0000f800) >> 6;
		const uint32 blue  = (pixel & 0x000000f8) >> 3;
		*out++ = red | green | blue | 0x8000;
	}
	while ( --remaining );
}

// Reduces a row of info->width packed 3-byte RGB888 pixels to ARGB4444,
// keeping the top four bits of each channel and forcing alpha to 0xf.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb888_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( in );
		in += 3;
		const uint32 red   = (pixel & 0x00f00000) >> 12;
		const uint32 green = (pixel & 0x0000f000) >> 8;
		const uint32 blue  = (pixel & 0x000000f0) >> 4;
		*out++ = red | green | blue | 0xf000;
	}
	while ( --remaining );
}

// Widens a row of info->width packed 3-byte RGB888 pixels to ARGB8888 by
// OR-ing 0xff into the alpha byte of each pixel read with read24().
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_rgb888_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = read24( in ) | 0xff000000;
		in += 3;
		if ( --remaining == 0 )
			break;
	}
}

// Converts a row of info->width ARGB1555 pixels to 8-bit values by summing
// the r, g and b entries of Blitter::palmono, each looked up with the 5-bit
// component moved into the top bits of an 8-bit index. Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb1555_i8(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 redIndex   = (pixel & 0x7c00) >> 7;
		const uint32 greenIndex = (pixel & 0x03e0) >> 2;
		const uint32 blueIndex  = (pixel & 0x001f) << 3;
		*out++ = Blitter::palmono[redIndex].r
				 + Blitter::palmono[greenIndex].g
				 + Blitter::palmono[blueIndex].b;
	}
	while ( --remaining );
}

// Reduces a row of info->width ARGB1555 pixels to RGB332 by keeping the top
// 3/3/2 bits of red/green/blue. Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb1555_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x7000) >> 7;
		const uint32 green = (pixel & 0x0380) >> 5;
		const uint32 blue  = (pixel & 0x0018) >> 3;
		*out++ = red | green | blue;
	}
	while ( --remaining );
}

// Converts a row of info->width ARGB1555 pixels to RGB565. The alpha bit is
// masked off, then adding the red+green bits to the pixel doubles them,
// i.e. shifts red and green up by one position while blue stays in place.
// When bit 1 of the source and destination addresses match, the middle of
// the row is handled two pixels at a time through 32-bit loads and stores.
// NOTE(review): the (int) pointer casts are only used to inspect address
// bit 1, but they presume a pointer may be cast to int -- confirm for
// 64-bit builds.
static void inner_remap_argb1555_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	uint32 pixel;

	if ( ((int) in ^ (int) out) & 2 )
	{
		// Address bit 1 differs: convert one pixel at a time.
		do
		{
			pixel = *in++ & 0x7fff;
			*out++ = pixel + (pixel & 0xffe0);
		}
		while ( --remaining );
		return;
	}

	if ( (int) in & 2 )
	{
		// Single leading pixel before the 32-bit loop.
		pixel = *in++ & 0x7fff;
		*out++ = pixel + (pixel & 0xffe0);
		remaining--;
	}

	// Two pixels per iteration.
	for ( int pairs = (remaining >> 1); pairs != 0; --pairs )
	{
		pixel = *((uint32*) in) & 0x7fff7fff;
		in += 2;
		*((uint32*) out) = pixel + (pixel & 0xffe0ffe0);
		out += 2;
	}

	if ( remaining & 1 )
	{
		// Single trailing pixel.
		pixel = *in++ & 0x7fff;
		*out++ = pixel + (pixel & 0xffe0);
	}
}

// Expands a row of info->width ARGB1555 pixels to packed 3-byte RGB888.
// Each 5-bit component lands in the top bits of its 8-bit channel; alpha is
// ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb1555_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x7c00) << 9;
		const uint32 green = (pixel & 0x03e0) << 6;
		const uint32 blue  = (pixel & 0x001f) << 3;
		write24( out, red | green | blue );
		out += 3;
	}
	while ( --remaining );
}

// Converts a row of info->width ARGB1555 pixels to ARGB4444. Each colour
// channel keeps its top four bits and the single alpha bit is expanded to
// 0x0 or 0xf.
// When bit 1 of the source and destination addresses match, the middle of
// the row is handled two pixels at a time through 32-bit loads and stores.
// In that packed path the alpha of each of the two pixels is expanded
// independently: (bits & 0x80008000) >> 15 yields 0x00010001 at most, and
// multiplying by 0xf000 turns every set bit into a full 0xf nibble without
// carrying between the two halves.
// The per-pixel loop is bottom-tested, so the row must contain at least
// one pixel.
// NOTE(review): the (int) pointer casts are only used to inspect address
// bit 1, but they presume a pointer may be cast to int -- confirm for
// 64-bit builds.
static void inner_remap_argb1555_argb4444(Blitter* blitter, const InnerInfo* info)
{
	int count = info->width;
	uint16* src = (uint16*) info->src;
	uint16* dest = (uint16*) info->dest;
	uint32 col;

	if ( ((int) src ^ (int) dest) & 2 ) // Misaligned?
	{
		do
		{
			col = *src++;
			*dest++ = ((col & 0x7800) >> 3) | ((col & 0x03c0) >> 2) | ((col & 0x001e) >> 1)
					  | (((col & 0x8000) >> 15) * 0xf000);
		}
		while ( --count );
	}
	else
	{
		if ( (int) src & 2 ) // Odd first uint16?
		{
			col = *src++;
			*dest++ = ((col & 0x7800) >> 3) | ((col & 0x03c0) >> 2) | ((col & 0x001e) >> 1)
					  | (((col & 0x8000) >> 15) * 0xf000);
			count--;
		}

		int count2 = (count >> 1);
		if ( count2 ) // Middle uint32s?
			do
			{
				col = *((uint32*) src);
				src += 2;
				// Alpha is taken from BOTH packed pixels (bits 15 and 31).
				*((uint32*) dest) = ((col & 0x78007800) >> 3) | ((col & 0x03c003c0) >> 2)
									| ((col & 0x001e001e) >> 1)
									| (((col & 0x80008000) >> 15) * 0xf000);
				dest += 2;
			}
			while ( --count2 );

		if ( count & 1 ) // Odd last uint16?
		{
			col = *src++;
			*dest++ = ((col & 0x7800) >> 3) | ((col & 0x03c0) >> 2) | ((col & 0x001e) >> 1)
					  | (((col & 0x8000) >> 15) * 0xf000);
		}
	}
}

// Expands a row of info->width ARGB1555 pixels to ARGB8888. Each 5-bit
// colour component lands in the top bits of its 8-bit channel, and the
// alpha byte becomes 0xff when the source alpha bit is set, 0x00 otherwise.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb1555_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x7c00) << 9;
		const uint32 green = (pixel & 0x03e0) << 6;
		const uint32 blue  = (pixel & 0x001f) << 3;
		const uint32 alpha = (pixel & 0x8000) ? 0xff000000 : 0;
		*out++ = red | green | blue | alpha;
	}
	while ( --remaining );
}

// Converts a row of info->width ARGB4444 pixels to 8-bit values by summing
// the r, g and b entries of Blitter::palmono. Each 4-bit component is moved
// into the top nibble of an 8-bit index, matching how the green and blue
// lookups here (and the other *_i8 inners) scale their indices; the red
// index previously used the raw 4-bit value (0..15). Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb4444_i8(Blitter* blitter, const InnerInfo* info)
{
	int count = info->width;
	uint16* src = (uint16*) info->src;
	uint8* dest = (uint8*) info->dest;
	uint32 col;

	do
	{
		col = *src++;
		*dest++ =   Blitter::palmono[(col & 0x0f00) >> 4].r
				  + Blitter::palmono[col & 0x00f0].g
				  + Blitter::palmono[(col & 0x000f) << 4].b;
	}
	while ( --count );
}

// Reduces a row of info->width ARGB4444 pixels to RGB332 by keeping the top
// 3/3/2 bits of red/green/blue. Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb4444_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x0e00) >> 4;
		const uint32 green = (pixel & 0x00e0) >> 3;
		const uint32 blue  = (pixel & 0x000c) >> 2;
		*out++ = red | green | blue;
	}
	while ( --remaining );
}

// Converts a row of info->width ARGB4444 pixels to RGB565. Each 4-bit colour
// component is placed in the top bits of its destination field; alpha is
// dropped.
// When bit 1 of the source and destination addresses match, the middle of
// the row is handled two pixels at a time through 32-bit loads and stores.
// NOTE(review): the (int) pointer casts are only used to inspect address
// bit 1, but they presume a pointer may be cast to int -- confirm for
// 64-bit builds.
static void inner_remap_argb4444_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	uint32 pixel;

	if ( ((int) in ^ (int) out) & 2 )
	{
		// Address bit 1 differs: convert one pixel at a time.
		do
		{
			pixel = *in++;
			*out++ = ((pixel & 0x0f00) << 4) | ((pixel & 0x00f0) << 3) | ((pixel & 0x000f) << 1);
		}
		while ( --remaining );
		return;
	}

	if ( (int) in & 2 )
	{
		// Single leading pixel before the 32-bit loop.
		pixel = *in++;
		*out++ = ((pixel & 0x0f00) << 4) | ((pixel & 0x00f0) << 3) | ((pixel & 0x000f) << 1);
		remaining--;
	}

	// Two pixels per iteration.
	for ( int pairs = (remaining >> 1); pairs != 0; --pairs )
	{
		pixel = *((uint32*) in);
		in += 2;
		*((uint32*) out) = ((pixel & 0x0f000f00) << 4) | ((pixel & 0x00f000f0) << 3)
						   | ((pixel & 0x000f000f) << 1);
		out += 2;
	}

	if ( remaining & 1 )
	{
		// Single trailing pixel.
		pixel = *in++;
		*out++ = ((pixel & 0x0f00) << 4) | ((pixel & 0x00f0) << 3) | ((pixel & 0x000f) << 1);
	}
}

// Expands a row of info->width ARGB4444 pixels to packed 3-byte RGB888.
// Each 4-bit colour component lands in the top nibble of its 8-bit channel;
// alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb4444_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x0f00) << 12;
		const uint32 green = (pixel & 0x00f0) << 8;
		const uint32 blue  = (pixel & 0x000f) << 4;
		write24( out, red | green | blue );
		out += 3;
	}
	while ( --remaining );
}

// Converts a row of info->width ARGB4444 pixels to ARGB1555. Each 4-bit
// colour component is placed in the top bits of its 5-bit field, and the
// destination alpha bit is the top bit of the source alpha nibble.
// When bit 1 of the source and destination addresses match, the middle of
// the row is handled two pixels at a time through 32-bit loads and stores.
// NOTE(review): the (int) pointer casts are only used to inspect address
// bit 1, but they presume a pointer may be cast to int -- confirm for
// 64-bit builds.
static void inner_remap_argb4444_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;
	uint32 pixel;

	if ( ((int) in ^ (int) out) & 2 )
	{
		// Address bit 1 differs: convert one pixel at a time.
		do
		{
			pixel = *in++;
			*out++ = ((pixel & 0x0f00) << 3) | ((pixel & 0x00f0) << 2) | ((pixel & 0x000f) << 1)
					 | (pixel & 0x8000);
		}
		while ( --remaining );
		return;
	}

	if ( (int) in & 2 )
	{
		// Single leading pixel before the 32-bit loop.
		pixel = *in++;
		*out++ = ((pixel & 0x0f00) << 3) | ((pixel & 0x00f0) << 2) | ((pixel & 0x000f) << 1)
				 | (pixel & 0x8000);
		remaining--;
	}

	// Two pixels per iteration.
	for ( int pairs = (remaining >> 1); pairs != 0; --pairs )
	{
		pixel = *((uint32*) in);
		in += 2;
		*((uint32*) out) = ((pixel & 0x0f000f00) << 3) | ((pixel & 0x00f000f0) << 2)
						   | ((pixel & 0x000f000f) << 1)
						   | (pixel & 0x80008000);
		out += 2;
	}

	if ( remaining & 1 )
	{
		// Single trailing pixel.
		pixel = *in++;
		*out++ = ((pixel & 0x0f00) << 3) | ((pixel & 0x00f0) << 2) | ((pixel & 0x000f) << 1)
				 | (pixel & 0x8000);
	}
}

// Expands a row of info->width ARGB4444 pixels to ARGB8888. Colour nibbles
// land in the top nibble of their 8-bit channels (low nibble zero), while
// the alpha nibble is written into both nibbles of the alpha byte.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb4444_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 alphaNibble = pixel & 0xf000;
		const uint32 red   = (pixel & 0x0f00) << 12;
		const uint32 green = (pixel & 0x00f0) << 8;
		const uint32 blue  = (pixel & 0x000f) << 4;
		*out++ = red | green | blue | (alphaNibble << 16) | (alphaNibble << 12);
	}
	while ( --remaining );
}

// Converts a row of info->width ARGB8888 pixels to 8-bit values by summing
// the r, g and b entries of Blitter::palmono, indexed by the pixel's red,
// green and blue bytes respectively. Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb8888_i8(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 redIndex   = (pixel & 0x00ff0000) >> 16;
		const uint32 greenIndex = (pixel & 0x0000ff00) >> 8;
		const uint32 blueIndex  = pixel & 0x000000ff;
		*out++ = Blitter::palmono[redIndex].r
				 + Blitter::palmono[greenIndex].g
				 + Blitter::palmono[blueIndex].b;
	}
	while ( --remaining );
}

// Reduces a row of info->width ARGB8888 pixels to RGB332 by keeping the top
// 3/3/2 bits of red/green/blue. Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb8888_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x00e00000) >> 16;
		const uint32 green = (pixel & 0x0000e000) >> 11;
		const uint32 blue  = (pixel & 0x000000c0) >> 6;
		*out++ = red | green | blue;
	}
	while ( --remaining );
}

// Reduces a row of info->width ARGB8888 pixels to RGB565 by keeping the top
// 5/6/5 bits of red/green/blue. Alpha is ignored.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb8888_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint16* out = (uint16*) info->dest;
	int remaining = info->width;

	do
	{
		const uint32 pixel = *in++;
		const uint32 red   = (pixel & 0x00f80000) >> 8;
		const uint32 green = (pixel & 0x0000fc00) >> 5;
		const uint32 blue  = (pixel & 0x000000f8) >> 3;
		*out++ = red | green | blue;
	}
	while ( --remaining );
}

// Narrows a row of info->width ARGB8888 pixels to packed 3-byte pixels by
// handing each 32-bit source pixel to write24() and advancing the
// destination by three bytes.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb8888_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	int remaining = info->width;

	for ( ;; )
	{
		const uint32 pixel = *in++;
		write24( out, pixel );
		out += 3;
		if ( --remaining == 0 )
			break;
	}
}

// Reduces a row of info->width ARGB8888 pixels to ARGB1555. The top five
// bits of each colour channel are kept, and the destination alpha bit is
// the top bit of the source alpha byte (bit 31 moved to bit 15). Previously
// the alpha bit was unconditionally set, discarding source transparency,
// unlike the other alpha-to-alpha inners in this file.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb8888_argb1555(Blitter* blitter, const InnerInfo* info)
{
	int count = info->width;
	uint32* src = (uint32*) info->src;
	uint16* dest = (uint16*) info->dest;
	uint32 col;

	do
	{
		col = *src++;
		*dest++ = ((col & 0x00f80000) >> 9) | ((col & 0x0000f800) >> 6)
				  | ((col & 0x000000f8) >> 3) | ((col >> 16) & 0x8000);
	}
	while ( --count );
}

// Reduces a row of info->width ARGB8888 pixels to ARGB4444 by keeping the
// top four bits of every channel, including alpha (bits 31..28 moved to
// bits 15..12). Previously alpha was unconditionally forced to 0xf,
// discarding source transparency, unlike the other alpha-to-alpha inners
// in this file.
// The loop is bottom-tested, so the row must contain at least one pixel.
static void inner_remap_argb8888_argb4444(Blitter* blitter, const InnerInfo* info)
{
	int count = info->width;
	uint32* src = (uint32*) info->src;
	uint16* dest = (uint16*) info->dest;
	uint32 col;

	do
	{
		col = *src++;
		*dest++ = ((col & 0x00f00000) >> 12) | ((col & 0x0000f000) >> 8)
				  | ((col & 0x000000f0) >> 4) | ((col >> 16) & 0xf000);
	}
	while ( --count );
}

#endif


//////////////////////////////////////////////////////
// stretch/generic innerloops                      //
////////////////////////////////////////////////////

// Point-sampled horizontal stretch of 1-byte pixels with no format change.
// Writes info->width pixels; each one copies the source pixel at index
// (u >> 16), where u starts at info->ustart and grows by info->ustep after
// every pixel. The counter is tested after the body, so info->width must be
// at least 1.
static void inner_stretch_1ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = in[pos >> 16];
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of 2-byte pixels with no format change.
// Writes info->width pixels; each one copies the source pixel at index
// (u >> 16), where u starts at info->ustart and grows by info->ustep after
// every pixel. The counter is tested after the body, so info->width must be
// at least 1.
static void inner_stretch_2ia(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = in[pos >> 16];
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of packed 3-byte pixels with no format
// change. Writes info->width pixels; each one is read with read24() from
// byte offset (u >> 16) * 3 and stored with write24(). u starts at
// info->ustart and grows by info->ustep after every pixel. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		write24( out, read24( &in[(pos >> 16) * 3] ) );
		out += 3;
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of 4-byte pixels with no format change.
// Writes info->width pixels; each one copies the source pixel at index
// (u >> 16), where u starts at info->ustart and grows by info->ustep after
// every pixel. The counter is tested after the body, so info->width must be
// at least 1.
static void inner_stretch_4ia(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = in[pos >> 16];
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of 1-byte pixels with no format change.
// Writes info->width pixels; each one copies the source pixel at index
// (u >> 16), where u starts at info->ustart and grows by info->ustep after
// every pixel. The counter is tested after the body, so info->width must be
// at least 1.
static void inner_stretch_1rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = in[pos >> 16];
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of 2-byte pixels with no format change.
// Writes info->width pixels; each one copies the source pixel at index
// (u >> 16), where u starts at info->ustart and grows by info->ustep after
// every pixel. The counter is tested after the body, so info->width must be
// at least 1.
static void inner_stretch_2rgba(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = in[pos >> 16];
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of packed 3-byte pixels with no format
// change. Writes info->width pixels; each one is read with read24() from
// byte offset (u >> 16) * 3 and stored with write24(). u starts at
// info->ustart and grows by info->ustep after every pixel. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_3rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		write24( out, read24( &in[(pos >> 16) * 3] ) );
		out += 3;
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}

// Point-sampled horizontal stretch of 4-byte pixels with no format change.
// Writes info->width pixels; each one copies the source pixel at index
// (u >> 16), where u starts at info->ustart and grows by info->ustep after
// every pixel. The counter is tested after the body, so info->width must be
// at least 1.
static void inner_stretch_4rgba(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	for ( ;; )
	{
		*out++ = in[pos >> 16];
		pos += info->ustep;
		if ( --remaining == 0 )
			break;
	}
}


//////////////////////////////////////////////////////
// stretch+remap innerloops                        //
////////////////////////////////////////////////////

// Point-sampled stretch combined with a generic RGBA conversion, 1-byte
// source pixels to 1-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_1rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 1-byte
// source pixels to 2-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_1rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 1-byte
// source pixels to packed 3-byte destination pixels (stored via write24()).
// For each of info->width output pixels the source pixel at index
// (u >> 16) is fetched; every channel is shifted right, then left, then
// masked using the blitter's per-channel right/left/mask fields, and
// blitter->alphaor is OR-ed in. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_1rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		write24( out, red | green | blue | alpha | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 1-byte
// source pixels to 4-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_1rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 2-byte
// source pixels to 1-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_2rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 2-byte
// source pixels to 2-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_2rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 2-byte
// source pixels to packed 3-byte destination pixels (stored via write24()).
// For each of info->width output pixels the source pixel at index
// (u >> 16) is fetched; every channel is shifted right, then left, then
// masked using the blitter's per-channel right/left/mask fields, and
// blitter->alphaor is OR-ed in. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_2rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		write24( out, red | green | blue | alpha | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 2-byte
// source pixels to 4-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_2rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, packed
// 3-byte source pixels to 1-byte destination pixels. For each of
// info->width output pixels the source pixel is read with read24() from
// byte offset (u >> 16) * 3; every channel is shifted right, then left,
// then masked using the blitter's per-channel right/left/mask fields, and
// blitter->alphaor is OR-ed in. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_3rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( &in[(pos >> 16) * 3] );
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, packed
// 3-byte source pixels to 2-byte destination pixels. For each of
// info->width output pixels the source pixel is read with read24() from
// byte offset (u >> 16) * 3; every channel is shifted right, then left,
// then masked using the blitter's per-channel right/left/mask fields, and
// blitter->alphaor is OR-ed in. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_3rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( &in[(pos >> 16) * 3] );
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, packed
// 3-byte source pixels to packed 3-byte destination pixels (stored via
// write24()). For each of info->width output pixels the source pixel is
// read with read24() from byte offset (u >> 16) * 3; every channel is
// shifted right, then left, then masked using the blitter's per-channel
// right/left/mask fields, and blitter->alphaor is OR-ed in. u starts at
// info->ustart and advances by info->ustep. The counter is tested after
// the body, so info->width must be at least 1.
static void inner_stretch_remap_3rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( &in[(pos >> 16) * 3] );
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		write24( out, red | green | blue | alpha | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, packed
// 3-byte source pixels to 4-byte destination pixels. For each of
// info->width output pixels the source pixel is read with read24() from
// byte offset (u >> 16) * 3; every channel is shifted right, then left,
// then masked using the blitter's per-channel right/left/mask fields, and
// blitter->alphaor is OR-ed in. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_3rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = read24( &in[(pos >> 16) * 3] );
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 4-byte
// source pixels to 1-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_4rgba_1rgba(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 4-byte
// source pixels to 2-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_4rgba_2rgba(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 4-byte
// source pixels to packed 3-byte destination pixels (stored via write24()).
// For each of info->width output pixels the source pixel at index
// (u >> 16) is fetched; every channel is shifted right, then left, then
// masked using the blitter's per-channel right/left/mask fields, and
// blitter->alphaor is OR-ed in. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_4rgba_3rgba(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		write24( out, red | green | blue | alpha | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch combined with a generic RGBA conversion, 4-byte
// source pixels to 4-byte destination pixels. For each of info->width
// output pixels the source pixel at index (u >> 16) is fetched; every
// channel is shifted right, then left, then masked using the blitter's
// per-channel right/left/mask fields, and blitter->alphaor is OR-ed in.
// u starts at info->ustart and advances by info->ustep. The counter is
// tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_4rgba_4rgba(Blitter* blitter, const InnerInfo* info)
{
	uint32* in = (uint32*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 red   = ((pixel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		const uint32 green = ((pixel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		const uint32 blue  = ((pixel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = red | green | blue | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 1-byte source pixels to
// 1-byte destination pixels. For each of info->width output pixels the
// source pixel at index (u >> 16) is fetched, its index field is extracted
// with blitter->index (right/left/mask) and looked up in blitter->palremap
// read as 1-byte entries; the result is OR-ed with the remapped alpha bits
// and blitter->alphaor. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_1pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* table = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[entry] | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 1-byte source pixels to
// 2-byte destination pixels. For each of info->width output pixels the
// source pixel at index (u >> 16) is fetched, its index field is extracted
// with blitter->index (right/left/mask) and looked up in blitter->palremap
// read as 2-byte entries; the result is OR-ed with the remapped alpha bits
// and blitter->alphaor. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_1pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint16* table = (uint16*) blitter->palremap;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[entry] | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 1-byte source pixels to
// packed 3-byte destination pixels (stored via write24()). For each of
// info->width output pixels the source pixel at index (u >> 16) is
// fetched, its index field is extracted with blitter->index
// (right/left/mask) and used to read a 3-byte entry from blitter->palremap
// with read24(); the result is OR-ed with the remapped alpha bits and
// blitter->alphaor. u starts at info->ustart and advances by info->ustep.
// The counter is tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_1pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint8* table = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		write24( out, read24( &table[entry * 3] ) | alpha | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 1-byte source pixels to
// 4-byte destination pixels. For each of info->width output pixels the
// source pixel at index (u >> 16) is fetched, its index field is extracted
// with blitter->index (right/left/mask) and looked up in blitter->palremap
// read as 4-byte entries; the result is OR-ed with the remapped alpha bits
// and blitter->alphaor. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_1pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* in = (uint8*) info->src;
	uint32* table = (uint32*) blitter->palremap;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[entry] | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 2-byte source pixels to
// 1-byte destination pixels. For each of info->width output pixels the
// source pixel at index (u >> 16) is fetched, its index field is extracted
// with blitter->index (right/left/mask) and looked up in blitter->palremap
// read as 1-byte entries; the result is OR-ed with the remapped alpha bits
// and blitter->alphaor. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_2pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* table = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[entry] | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 2-byte source pixels to
// 2-byte destination pixels. For each of info->width output pixels the
// source pixel at index (u >> 16) is fetched, its index field is extracted
// with blitter->index (right/left/mask) and looked up in blitter->palremap
// read as 2-byte entries; the result is OR-ed with the remapped alpha bits
// and blitter->alphaor. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_2pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint16* table = (uint16*) blitter->palremap;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[entry] | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 2-byte source pixels to
// packed 3-byte destination pixels (stored via write24()). For each of
// info->width output pixels the source pixel at index (u >> 16) is
// fetched, its index field is extracted with blitter->index
// (right/left/mask) and used to read a 3-byte entry from blitter->palremap
// with read24(); the result is OR-ed with the remapped alpha bits and
// blitter->alphaor. u starts at info->ustart and advances by info->ustep.
// The counter is tested after the body, so info->width must be at least 1.
static void inner_stretch_remap_2pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint8* table = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		write24( out, read24( &table[entry * 3] ) | alpha | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Point-sampled stretch with a palette lookup, 2-byte source pixels to
// 4-byte destination pixels. For each of info->width output pixels the
// source pixel at index (u >> 16) is fetched, its index field is extracted
// with blitter->index (right/left/mask) and looked up in blitter->palremap
// read as 4-byte entries; the result is OR-ed with the remapped alpha bits
// and blitter->alphaor. u starts at info->ustart and advances by
// info->ustep. The counter is tested after the body, so info->width must
// be at least 1.
static void inner_stretch_remap_2pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint16* in = (uint16*) info->src;
	uint32* table = (uint32*) blitter->palremap;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = in[pos >> 16];
		const uint32 entry = ((pixel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		const uint32 alpha = ((pixel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;
		*out++ = table[entry] | alpha | blitter->alphaor;
		pos += info->ustep;
	}
	while ( --remaining );
}

// Stretch + palette remap, 3-byte source texels -> 1-byte destination pixels.
// Source texels are fetched with read24 at byte offset (position >> 16) * 3.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* remap = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = remap[slot] | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 3-byte source texels -> 2-byte destination pixels.
// Source texels are fetched with read24 at byte offset (position >> 16) * 3.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* remap = (uint16*) blitter->palremap;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = remap[slot] | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 3-byte source texels -> 3-byte destination pixels.
// Both the source texels and the remap entries are packed 3-byte values read
// with read24; the result is stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* remap = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		write24( out, read24( &remap[slot * 3] ) | alphabits | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 3-byte source texels -> 4-byte destination pixels.
// Source texels are fetched with read24 at byte offset (position >> 16) * 3.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* remap = (uint32*) blitter->palremap;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = remap[slot] | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 4-byte source texels -> 1-byte destination pixels.
// Samples the source at (position >> 16), looks the texel's index field up in
// blitter->palremap and ORs in the repositioned alpha field plus blitter->alphaor.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4pa_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint8* remap = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = remap[slot] | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 4-byte source texels -> 2-byte destination pixels.
// Samples the source at (position >> 16), looks the texel's index field up in
// blitter->palremap and ORs in the repositioned alpha field plus blitter->alphaor.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4pa_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint16* remap = (uint16*) blitter->palremap;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = remap[slot] | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 4-byte source texels -> 3-byte destination pixels.
// The remap table is addressed as packed 3-byte entries (slot * 3) through read24,
// and the result is stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4pa_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint8* remap = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		write24( out, read24( &remap[slot * 3] ) | alphabits | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + palette remap, 4-byte source texels -> 4-byte destination pixels.
// Samples the source at (position >> 16), looks the texel's index field up in
// blitter->palremap and ORs in the repositioned alpha field plus blitter->alphaor.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4pa_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint32* remap = (uint32*) blitter->palremap;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 slot = ((texel >> blitter->index.right) << blitter->index.left) & blitter->index.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = remap[slot] | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 1-byte source texels -> 1-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_1rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 1-byte source texels -> 2-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_1rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 1-byte source texels -> 3-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field, combined with the alpha field and
// blitter->alphaor, and stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_1rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		write24( out, intensbits | alphabits | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 1-byte source texels -> 4-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_1rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 2-byte source texels -> 1-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_2rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 2-byte source texels -> 2-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_2rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 2-byte source texels -> 3-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field, combined with the alpha field and
// blitter->alphaor, and stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_2rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		write24( out, intensbits | alphabits | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 2-byte source texels -> 4-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_2rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 3-byte source texels -> 1-byte destination pixels.
// Source texels are fetched with read24 at byte offset (position >> 16) * 3; their
// red/green/blue fields index Blitter::palmono and the summed .r/.g/.b entries become
// the intensity field. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 3-byte source texels -> 2-byte destination pixels.
// Source texels are fetched with read24 at byte offset (position >> 16) * 3; their
// red/green/blue fields index Blitter::palmono and the summed .r/.g/.b entries become
// the intensity field. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 3-byte source texels -> 3-byte destination pixels.
// Source texels are fetched with read24 and results stored with write24; the texel's
// red/green/blue fields index Blitter::palmono and the summed .r/.g/.b entries become
// the intensity field. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		write24( out, intensbits | alphabits | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 3-byte source texels -> 4-byte destination pixels.
// Source texels are fetched with read24 at byte offset (position >> 16) * 3; their
// red/green/blue fields index Blitter::palmono and the summed .r/.g/.b entries become
// the intensity field. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_3rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = read24( &source[(pos >> 16) * 3] );
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 4-byte source texels -> 1-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4rgba_1ia(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 4-byte source texels -> 2-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4rgba_2ia(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 4-byte source texels -> 3-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field, combined with the alpha field and
// blitter->alphaor, and stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4rgba_3ia(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		write24( out, intensbits | alphabits | blitter->alphaor );
		out += 3;
		pos += info->ustep;
	} while ( --remaining != 0 );
}

// Stretch + RGB(A) -> intensity(+alpha), 4-byte source texels -> 4-byte destination pixels.
// The texel's red/green/blue fields index Blitter::palmono; the summed .r/.g/.b entries
// are repositioned into the intensity field and combined with the alpha field and
// blitter->alphaor. The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_4rgba_4ia(Blitter* blitter, const InnerInfo* info)
{
	uint32* source = (uint32*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 texel = source[pos >> 16];
		uint32 ri = ((texel >> blitter->red.right) << blitter->red.left) & blitter->red.mask;
		uint32 gi = ((texel >> blitter->green.right) << blitter->green.left) & blitter->green.mask;
		uint32 bi = ((texel >> blitter->blue.right) << blitter->blue.left) & blitter->blue.mask;
		uint32 mono = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		uint32 intensbits = ((mono >> blitter->intens.right) << blitter->intens.left) & blitter->intens.mask;
		uint32 alphabits = ((texel >> blitter->alpha.right) << blitter->alpha.left) & blitter->alpha.mask;

		*out = intensbits | alphabits | blitter->alphaor;
		++out;
		pos += info->ustep;
	} while ( --remaining != 0 );
}


//////////////////////////////////////////////////////
// stretch+remap specialized innerloops            //
////////////////////////////////////////////////////

#ifdef ENABLE_SPECIALIZED_C_INNERS

// Stretching remap of 8-bit source values into 1-byte destination pixels.
// Each sampled byte is used directly as an index into blitter->palremap and
// blitter->alphaor is OR'ed onto the looked-up value.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_p8_1ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* remap = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 orbits = blitter->alphaor;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 slot = source[pos >> 16];
		*out = remap[slot] | orbits;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching remap of 8-bit source values into 2-byte destination pixels.
// Each sampled byte is used directly as an index into blitter->palremap and
// blitter->alphaor is OR'ed onto the looked-up value.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_p8_2ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* remap = (uint16*) blitter->palremap;
	uint16* out = (uint16*) info->dest;
	uint32 orbits = blitter->alphaor;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 slot = source[pos >> 16];
		*out = remap[slot] | orbits;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching remap of 8-bit source values into 3-byte destination pixels.
// The remap table holds packed 3-byte entries: entry N is fetched with read24 at
// byte offset N * 3, OR'ed with blitter->alphaor and stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_p8_3ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* remap = (uint8*) blitter->palremap;
	uint8* out = (uint8*) info->dest;
	uint32 orbits = blitter->alphaor;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 slot = source[pos >> 16];
		write24( out, read24( &remap[slot * 3] ) | orbits );
		out += 3;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching remap of 8-bit source values into 4-byte destination pixels.
// Each sampled byte is used directly as an index into blitter->palremap and
// blitter->alphaor is OR'ed onto the looked-up value.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_p8_4ta(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* remap = (uint32*) blitter->palremap;
	uint32* out = (uint32*) info->dest;
	uint32 orbits = blitter->alphaor;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 slot = source[pos >> 16];
		*out = remap[slot] | orbits;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 8-bit intensity into 3:3:2 RGB bytes.
// The top three intensity bits fill the red (bits 7-5) and green (bits 4-2) fields,
// the top two fill the blue field (bits 1-0).
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_i8_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 level = source[pos >> 16];
		uint32 top3 = level & 0xe0;

		*out = top3 | (top3 >> 3) | ((level & 0xc0) >> 6);
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 8-bit intensity into 5:6:5 RGB words.
// The top five intensity bits fill red (bits 15-11) and blue (bits 4-0),
// the top six fill green (bits 10-5).
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_i8_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 level = source[pos >> 16];
		uint32 top5 = level & 0xf8;

		*out = (top5 << 8) | ((level & 0xfc) << 3) | (top5 >> 3);
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 8-bit intensity into 3-byte RGB pixels.
// The intensity byte is replicated into all three channels and stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_i8_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 level = source[pos >> 16];

		write24( out, (level << 16) | (level << 8) | level );
		out += 3;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 8-bit intensity into 1:5:5:5 ARGB words.
// The top five intensity bits are replicated into the three colour fields and
// the alpha bit (0x8000) is always set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_i8_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 level = source[pos >> 16];
		uint32 top5 = level & 0xf8;

		*out = 0x8000 | (top5 << 7) | (top5 << 2) | (top5 >> 3);
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 8-bit intensity into 4:4:4:4 ARGB words.
// The top four intensity bits are replicated into the three colour fields and
// the alpha nibble (0xf000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_i8_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 level = source[pos >> 16];
		uint32 top4 = level & 0xf0;

		*out = 0xf000 | (top4 << 4) | top4 | (top4 >> 4);
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 8-bit intensity into 8:8:8:8 ARGB dwords.
// The intensity byte is replicated into the three colour bytes and the
// alpha byte (0xff000000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_i8_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 level = source[pos >> 16];

		*out = 0xff000000 | (level << 16) | (level << 8) | level;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3:3:2 RGB bytes into 8-bit intensity.
// Each channel is moved to the top of an 8-bit value and used to index
// Blitter::palmono; the .r, .g and .b entries are summed into the output byte.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb332_i8(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 ri = px & 0xe0;
		uint32 gi = (px & 0x1c) << 3;
		uint32 bi = (px & 0x03) << 6;

		*out = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3:3:2 RGB bytes into 5:6:5 RGB words.
// Each channel is shifted so its most significant bit lands on the top bit of the
// destination field; the remaining low bits of each field stay zero.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb332_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xe0) << 8;
		uint32 g = (px & 0x1c) << 6;
		uint32 b = (px & 0x03) << 3;

		*out = r | g | b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3:3:2 RGB bytes into 3-byte RGB pixels.
// Each channel is shifted to the top of its destination byte (low bits zero)
// and the packed value is stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb332_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xe0) << 16;
		uint32 g = (px & 0x1c) << 11;
		uint32 b = (px & 0x03) << 6;

		write24( out, r | g | b );
		out += 3;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3:3:2 RGB bytes into 1:5:5:5 ARGB words.
// Each channel is shifted to the top of its 5-bit destination field and the
// alpha bit (0x8000) is always set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb332_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xe0) << 7;
		uint32 g = (px & 0x1c) << 5;
		uint32 b = (px & 0x03) << 3;

		*out = r | g | b | 0x8000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3:3:2 RGB bytes into 4:4:4:4 ARGB words.
// Each channel is shifted to the top of its 4-bit destination field and the
// alpha nibble (0xf000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb332_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xe0) << 4;
		uint32 g = (px & 0x1c) << 3;
		uint32 b = (px & 0x03) << 2;

		*out = r | g | b | 0xf000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3:3:2 RGB bytes into 8:8:8:8 ARGB dwords.
// Each channel is shifted to the top of its destination byte (low bits zero)
// and the alpha byte (0xff000000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb332_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xe0) << 16;
		uint32 g = (px & 0x1c) << 11;
		uint32 b = (px & 0x03) << 6;

		*out = r | g | b | 0xff000000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 5:6:5 RGB words into 8-bit intensity.
// Each channel is aligned to the top of an 8-bit value and used to index
// Blitter::palmono; the .r, .g and .b entries are summed into the output byte.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb565_i8(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 ri = (px & 0xf800) >> 8;
		uint32 gi = (px & 0x07e0) >> 3;
		uint32 bi = (px & 0x001f) << 3;

		*out = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 5:6:5 RGB words into 3:3:2 RGB bytes.
// Only the most significant bits of each source channel are kept
// (3 of red, 3 of green, 2 of blue).
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb565_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xe000) >> 8;
		uint32 g = (px & 0x0700) >> 6;
		uint32 b = (px & 0x0018) >> 3;

		*out = r | g | b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 5:6:5 RGB words into 3-byte RGB pixels.
// Each channel is shifted to the top of its destination byte (low bits zero)
// and the packed value is stored with write24.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb565_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xf800) << 8;
		uint32 g = (px & 0x07e0) << 5;
		uint32 b = (px & 0x001f) << 3;

		write24( out, r | g | b );
		pos += step;
		out += 3;
	} while ( --remaining != 0 );
}

// Stretching conversion of 5:6:5 RGB words into 1:5:5:5 ARGB words.
// Red and the upper five green bits move down one bit position (dropping the
// lowest green bit), blue is copied unchanged and the alpha bit (0x8000) is set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb565_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 redgreen = (px & 0xffc0) >> 1;
		uint32 b = px & 0x1f;

		*out = redgreen | b | 0x8000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 5:6:5 RGB words into 4:4:4:4 ARGB words.
// The top four bits of each source channel are kept and the alpha nibble
// (0xf000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb565_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xf000) >> 4;
		uint32 g = (px & 0x0780) >> 3;
		uint32 b = (px & 0x001e) >> 1;

		*out = r | g | b | 0xf000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 5:6:5 RGB words into 8:8:8:8 ARGB dwords.
// Each channel is shifted to the top of its destination byte (low bits zero)
// and the alpha byte (0xff000000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb565_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0xf800) << 8;
		uint32 g = (px & 0x07e0) << 5;
		uint32 b = (px & 0x001f) << 3;

		*out = r | g | b | 0xff000000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3-byte RGB pixels into 8-bit intensity.
// Source pixels are fetched with read24; each 8-bit channel indexes
// Blitter::palmono and the .r, .g and .b entries are summed into the output byte.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb888_i8(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = read24( &source[(pos >> 16) * 3] );
		uint32 ri = (px & 0x00ff0000) >> 16;
		uint32 gi = (px & 0x0000ff00) >> 8;
		uint32 bi = px & 0x000000ff;

		*out = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3-byte RGB pixels into 3:3:2 RGB bytes.
// Source pixels are fetched with read24; only the most significant bits of each
// channel are kept (3 of red, 3 of green, 2 of blue).
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb888_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = read24( &source[(pos >> 16) * 3] );
		uint32 r = (px & 0x00e00000) >> 16;
		uint32 g = (px & 0x0000e000) >> 11;
		uint32 b = (px & 0x000000c0) >> 6;

		*out = r | g | b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3-byte RGB pixels into 5:6:5 RGB words.
// Source pixels are fetched with read24; the top 5/6/5 bits of red/green/blue
// are kept.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb888_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = read24( &source[(pos >> 16) * 3] );
		uint32 r = (px & 0x00f80000) >> 8;
		uint32 g = (px & 0x0000fc00) >> 5;
		uint32 b = (px & 0x000000f8) >> 3;

		*out = r | g | b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3-byte RGB pixels into 1:5:5:5 ARGB words.
// Source pixels are fetched with read24; the top five bits of each channel are
// kept and the alpha bit (0x8000) is always set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb888_argb1555(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = read24( &source[(pos >> 16) * 3] );
		uint32 r = (px & 0x00f80000) >> 9;
		uint32 g = (px & 0x0000f800) >> 6;
		uint32 b = (px & 0x000000f8) >> 3;

		*out = r | g | b | 0x8000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3-byte RGB pixels into 4:4:4:4 ARGB words.
// Source pixels are fetched with read24; the top four bits of each channel are
// kept and the alpha nibble (0xf000) is always fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb888_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = read24( &source[(pos >> 16) * 3] );
		uint32 r = (px & 0x00f00000) >> 12;
		uint32 g = (px & 0x0000f000) >> 8;
		uint32 b = (px & 0x000000f0) >> 4;

		*out = r | g | b | 0xf000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 3-byte RGB pixels into 8:8:8:8 ARGB dwords.
// Each source pixel is fetched with read24 and stored with the alpha byte
// (0xff000000) fully set.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_rgb888_argb8888(Blitter* blitter, const InnerInfo* info)
{
	uint8* source = (uint8*) info->src;
	uint32* out = (uint32*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint8* texel = &source[(pos >> 16) * 3];

		*out = read24( texel ) | 0xff000000;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 1:5:5:5 ARGB words into 8-bit intensity.
// Each 5-bit colour channel is aligned to the top of an 8-bit value and used to
// index Blitter::palmono; the .r, .g and .b entries are summed into the output
// byte. The source alpha bit is not used.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_argb1555_i8(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 ri = (px & 0x7c00) >> 7;
		uint32 gi = (px & 0x03e0) >> 2;
		uint32 bi = (px & 0x001f) << 3;

		*out = Blitter::palmono[ri].r + Blitter::palmono[gi].g + Blitter::palmono[bi].b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 1:5:5:5 ARGB words into 3:3:2 RGB bytes.
// Only the most significant bits of each colour channel are kept
// (3 of red, 3 of green, 2 of blue); the source alpha bit is not used.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_argb1555_rgb332(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0x7000) >> 7;
		uint32 g = (px & 0x0380) >> 5;
		uint32 b = (px & 0x0018) >> 3;

		*out = r | g | b;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 1:5:5:5 ARGB words into 5:6:5 RGB words.
// After masking off the alpha bit, adding the red+green part (bits 5 and up) to
// the pixel doubles it, i.e. moves red and green up by one bit while blue stays
// in place; the lowest green bit of the result is zero.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_argb1555_rgb565(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 rgb = source[pos >> 16] & 0x7fff;
		uint32 redgreen = rgb & 0xffe0;

		*out = rgb + redgreen;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 1:5:5:5 ARGB words into 3-byte RGB pixels.
// Each 5-bit colour channel is shifted to the top of its destination byte
// (low bits zero) and the packed value is stored with write24; the source
// alpha bit is not used.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_argb1555_rgb888(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0x7c00) << 9;
		uint32 g = (px & 0x03e0) << 6;
		uint32 b = (px & 0x001f) << 3;

		write24( out, r | g | b );
		out += 3;
		pos += step;
	} while ( --remaining != 0 );
}

// Stretching conversion of 1:5:5:5 ARGB words into 4:4:4:4 ARGB words.
// The top four bits of each colour channel are kept. The single alpha bit is
// expanded to a full nibble: 0x10000 - 0x1000 yields 0xf000 when alpha is set,
// and 0x10000 - 0 yields 0x10000, whose low 16 bits are zero once the value is
// stored into the 16-bit destination.
// The loop body always runs once, so info->width is expected to be >= 1.
static void inner_stretch_remap_argb1555_argb4444(Blitter* blitter, const InnerInfo* info)
{
	uint16* source = (uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		uint32 px = source[pos >> 16];
		uint32 r = (px & 0x7800) >> 3;
		uint32 g = (px & 0x03c0) >> 2;
		uint32 b = (px & 0x001e) >> 1;
		uint32 a = 0x10000 - ((px & 0x8000) >> 3);

		*out = r | g | b | a;
		++out;
		pos += step;
	} while ( --remaining != 0 );
}

static void inner_stretch_remap_argb1555_argb8888(Blitter* blitter, const InnerInfo* info)
{
	// ARGB1555 -> ARGB8888, point sampled along a 16.16 fixed-point source position.
	const uint16* texels = (const uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x7c00) << 9;	// -> bits 23..19
		const uint32 green = (pixel & 0x03e0) << 6;	// -> bits 15..11
		const uint32 blue = (pixel & 0x001f) << 3;	// -> bits 7..3

		// negating the isolated alpha bit sets every bit above it, so the
		// 1-bit alpha becomes either 0x00 or 0xff in the top byte
		const uint32 alpha = (-int(pixel & 0x8000)) & 0xff000000;

		*out++ = red | green | blue | alpha;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb4444_i8(Blitter* blitter, const InnerInfo* info)
{
	// ARGB4444 -> 8-bit intensity, point sampled along a 16.16 fixed-point
	// source position. Each 4-bit colour channel is expanded to the top
	// nibble of an 8-bit value (0x00..0xf0) and used to index
	// Blitter::palmono[], whose r/g/b members hold that value scaled by the
	// respective greyscale weight; the three contributions are summed.
	// Source alpha is ignored.
	int count = info->width;
	uint32 u = info->ustart;
	uint32 ustep = info->ustep;
	uint16* src = (uint16*) info->src;
	uint8* dest = (uint8*) info->dest;
	uint32 col;

	do
	{
		col = src[u >> 16];
		// red sits in bits 11..8, so it must be shifted down by 4 (not 8)
		// to land in bits 7..4 like green and blue do
		*dest++ = Blitter::palmono[(col & 0x0f00) >> 4].r
				  + Blitter::palmono[col & 0x00f0].g
				  + Blitter::palmono[(col & 0x000f) << 4].b;
		u += ustep;
	}
	while ( --count );
}

static void inner_stretch_remap_argb4444_rgb332(Blitter* blitter, const InnerInfo* info)
{
	// ARGB4444 -> RGB332, point sampled along a 16.16 fixed-point source position.
	const uint16* texels = (const uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x0e00) >> 4;	// top 3 red bits   -> bits 7..5
		const uint32 green = (pixel & 0x00e0) >> 3;	// top 3 green bits -> bits 4..2
		const uint32 blue = (pixel & 0x000c) >> 2;	// top 2 blue bits  -> bits 1..0
		*out++ = red | green | blue;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb4444_rgb565(Blitter* blitter, const InnerInfo* info)
{
	// ARGB4444 -> RGB565, point sampled. Each 4-bit channel goes into the top
	// bits of the wider destination channel; the extra low bits stay zero.
	const uint16* texels = (const uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x0f00) << 4;	// -> bits 15..12
		const uint32 green = (pixel & 0x00f0) << 3;	// -> bits 10..7
		const uint32 blue = (pixel & 0x000f) << 1;	// -> bits 4..1
		*out++ = red | green | blue;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb4444_rgb888(Blitter* blitter, const InnerInfo* info)
{
	// ARGB4444 -> RGB888, point sampled. Each 4-bit channel becomes the high
	// nibble of its destination byte; alpha is dropped.
	const uint16* texels = (const uint16*) info->src;
	uint8* out = (uint8*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x0f00) << 12;	// -> bits 23..20
		const uint32 green = (pixel & 0x00f0) << 8;	// -> bits 15..12
		const uint32 blue = (pixel & 0x000f) << 4;	// -> bits 7..4
		write24( out, red | green | blue );
		out += 3;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb4444_argb1555(Blitter* blitter, const InnerInfo* info)
{
	// ARGB4444 -> ARGB1555, point sampled along a 16.16 fixed-point source position.
	const uint16* texels = (const uint16*) info->src;
	uint16* out = (uint16*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x0f00) << 3;	// -> bits 14..11
		const uint32 green = (pixel & 0x00f0) << 2;	// -> bits 9..6
		const uint32 blue = (pixel & 0x000f) << 1;	// -> bits 4..1
		const uint32 alpha = pixel & 0x8000;		// top alpha bit stays at bit 15
		*out++ = red | green | blue | alpha;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb4444_argb8888(Blitter* blitter, const InnerInfo* info)
{
	// ARGB4444 -> ARGB8888, point sampled. Colour channels fill only the high
	// nibble of their destination byte; alpha is replicated into both nibbles.
	const uint16* texels = (const uint16*) info->src;
	uint32* out = (uint32*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 a4 = pixel & 0xf000;
		const uint32 red = (pixel & 0x0f00) << 12;		// -> bits 23..20
		const uint32 green = (pixel & 0x00f0) << 8;		// -> bits 15..12
		const uint32 blue = (pixel & 0x000f) << 4;		// -> bits 7..4
		const uint32 alpha = (a4 << 16) | (a4 << 12);	// -> bits 31..28 and 27..24
		*out++ = red | green | blue | alpha;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb8888_i8(Blitter* blitter, const InnerInfo* info)
{
	// ARGB8888 -> 8-bit intensity, point sampled. Each colour channel indexes
	// Blitter::palmono[] and the r/g/b table members are summed; alpha is ignored.
	const uint32* texels = (const uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x00ff0000) >> 16;
		const uint32 green = (pixel & 0x0000ff00) >> 8;
		const uint32 blue = pixel & 0x000000ff;
		*out++ = Blitter::palmono[red].r
				 + Blitter::palmono[green].g
				 + Blitter::palmono[blue].b;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb8888_rgb332(Blitter* blitter, const InnerInfo* info)
{
	// ARGB8888 -> RGB332, point sampled along a 16.16 fixed-point source position.
	const uint32* texels = (const uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x00e00000) >> 16;		// top 3 red bits   -> bits 7..5
		const uint32 green = (pixel & 0x0000e000) >> 11;	// top 3 green bits -> bits 4..2
		const uint32 blue = (pixel & 0x000000c0) >> 6;		// top 2 blue bits  -> bits 1..0
		*out++ = red | green | blue;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb8888_rgb565(Blitter* blitter, const InnerInfo* info)
{
	// ARGB8888 -> RGB565, point sampled along a 16.16 fixed-point source position.
	const uint32* texels = (const uint32*) info->src;
	uint16* out = (uint16*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		const uint32 red = (pixel & 0x00f80000) >> 8;	// top 5 red bits   -> bits 15..11
		const uint32 green = (pixel & 0x0000fc00) >> 5;	// top 6 green bits -> bits 10..5
		const uint32 blue = (pixel & 0x000000f8) >> 3;	// top 5 blue bits  -> bits 4..0
		*out++ = red | green | blue;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb8888_rgb888(Blitter* blitter, const InnerInfo* info)
{
	// ARGB8888 -> RGB888, point sampled. The whole 32-bit source pixel is
	// handed to write24(), and the destination advances three bytes per pixel.
	const uint32* texels = (const uint32*) info->src;
	uint8* out = (uint8*) info->dest;
	const uint32 step = info->ustep;
	uint32 pos = info->ustart;
	int remaining = info->width;

	do
	{
		const uint32 pixel = texels[pos >> 16];
		write24( out, pixel );
		out += 3;
		pos += step;
	}
	while ( --remaining );
}

static void inner_stretch_remap_argb8888_argb1555(Blitter* blitter, const InnerInfo* info)
{
	// ARGB8888 -> ARGB1555, point sampled along a 16.16 fixed-point source
	// position (the upper 16 bits of u select the source pixel).
	// The top 5 bits of each colour channel are kept, and the most
	// significant source alpha bit becomes the 1-bit destination alpha, in
	// line with the other alpha-to-alpha converters in this file.
	int count = info->width;
	uint32 u = info->ustart;
	uint32 ustep = info->ustep;
	uint32* src = (uint32*) info->src;
	uint16* dest = (uint16*) info->dest;
	uint32 col;

	do
	{
		col = src[u >> 16];
		*dest++ = ((col & 0x00f80000) >> 9) | ((col & 0x0000f800) >> 6)
				  | ((col & 0x000000f8) >> 3)
				  | ((col & 0x80000000) >> 16);	// alpha bit 31 -> bit 15
		u += ustep;
	}
	while ( --count );
}

static void inner_stretch_remap_argb8888_argb4444(Blitter* blitter, const InnerInfo* info)
{
	// ARGB8888 -> ARGB4444, point sampled along a 16.16 fixed-point source
	// position (the upper 16 bits of u select the source pixel).
	// The high nibble of every channel, alpha included, is kept so that the
	// source transparency survives the conversion like it does in the other
	// alpha-to-alpha converters in this file.
	int count = info->width;
	uint32 u = info->ustart;
	uint32 ustep = info->ustep;
	uint32* src = (uint32*) info->src;
	uint16* dest = (uint16*) info->dest;
	uint32 col;

	do
	{
		col = src[u >> 16];
		*dest++ = ((col & 0x00f00000) >> 12) | ((col & 0x0000f000) >> 8)
				  | ((col & 0x000000f0) >> 4)
				  | ((col & 0xf0000000) >> 16);	// alpha bits 31..28 -> bits 15..12
		u += ustep;
	}
	while ( --count );
}

#endif


//////////////////////////////////////////////////////
// stretch+bilinear generic innerloops             //
////////////////////////////////////////////////////

static void inner_stretch_1ia_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 1-byte intensity+alpha pixels.
	// For every output pixel the samples at idx and idx+1 of the src and src2
	// rows are blended: first along the row using the low 16 bits of the 16.16
	// source position, then between the two rows using info->vfrac.
	const uint8* row0 = (const uint8*) info->src;
	const uint8* row1 = (const uint8*) info->src2;
	uint8* out = (uint8*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int iRight = blitter->intens.right;
	const int iLeft = blitter->intens.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 iMask = blitter->intens.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const uint32 p00 = row0[idx];
		const uint32 p01 = row0[idx + 1];
		const uint32 p10 = row1[idx];
		const uint32 p11 = row1[idx + 1];

		// blend along each row, one 8-bit channel at a time
		const uint32 i0 = (w0 * (((p00 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p01 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 i1 = (w0 * (((p10 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p11 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 i = (v0 * i0 + v1 * i1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts and merge the channels
		*out++ = (((i >> iLeft) << iRight) & iMask)
				 | (((a >> aLeft) << aRight) & aMask);
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_2ia_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 2-byte intensity+alpha pixels.
	// For every output pixel the samples at idx and idx+1 of the src and src2
	// rows are blended: first along the row using the low 16 bits of the 16.16
	// source position, then between the two rows using info->vfrac.
	const uint16* row0 = (const uint16*) info->src;
	const uint16* row1 = (const uint16*) info->src2;
	uint16* out = (uint16*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int iRight = blitter->intens.right;
	const int iLeft = blitter->intens.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 iMask = blitter->intens.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const uint32 p00 = row0[idx];
		const uint32 p01 = row0[idx + 1];
		const uint32 p10 = row1[idx];
		const uint32 p11 = row1[idx + 1];

		// blend along each row, one 8-bit channel at a time
		const uint32 i0 = (w0 * (((p00 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p01 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 i1 = (w0 * (((p10 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p11 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 i = (v0 * i0 + v1 * i1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts and merge the channels
		*out++ = (((i >> iLeft) << iRight) & iMask)
				 | (((a >> aLeft) << aRight) & aMask);
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_3ia_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 3-byte intensity+alpha pixels.
	// Pixels are fetched with read24() and stored with write24(); otherwise
	// the samples at idx and idx+1 of the src and src2 rows are blended along
	// the row by the low 16 bits of the 16.16 source position, then between
	// the rows by info->vfrac.
	uint8* row0 = (uint8*) info->src;
	uint8* row1 = (uint8*) info->src2;
	uint8* out = (uint8*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int iRight = blitter->intens.right;
	const int iLeft = blitter->intens.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 iMask = blitter->intens.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const int byteoffs = idx * 3;		// three bytes per pixel
		const uint32 p00 = read24( &row0[byteoffs] );
		const uint32 p01 = read24( &row0[byteoffs + 3] );
		const uint32 p10 = read24( &row1[byteoffs] );
		const uint32 p11 = read24( &row1[byteoffs + 3] );

		// blend along each row, one 8-bit channel at a time
		const uint32 i0 = (w0 * (((p00 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p01 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 i1 = (w0 * (((p10 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p11 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 i = (v0 * i0 + v1 * i1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts, merge the channels and store 3 bytes
		const uint32 pixel = (((i >> iLeft) << iRight) & iMask)
							 | (((a >> aLeft) << aRight) & aMask);
		write24( out, pixel );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_4ia_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 4-byte intensity+alpha pixels.
	// For every output pixel the samples at idx and idx+1 of the src and src2
	// rows are blended: first along the row using the low 16 bits of the 16.16
	// source position, then between the two rows using info->vfrac.
	const uint32* row0 = (const uint32*) info->src;
	const uint32* row1 = (const uint32*) info->src2;
	uint32* out = (uint32*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int iRight = blitter->intens.right;
	const int iLeft = blitter->intens.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 iMask = blitter->intens.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const uint32 p00 = row0[idx];
		const uint32 p01 = row0[idx + 1];
		const uint32 p10 = row1[idx];
		const uint32 p11 = row1[idx + 1];

		// blend along each row, one 8-bit channel at a time
		const uint32 i0 = (w0 * (((p00 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p01 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 i1 = (w0 * (((p10 >> iRight) << iLeft) & 0xff)
						   + w1 * (((p11 >> iRight) << iLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 i = (v0 * i0 + v1 * i1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts and merge the channels
		*out++ = (((i >> iLeft) << iRight) & iMask)
				 | (((a >> aLeft) << aRight) & aMask);
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_1rgba_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 1-byte RGBA pixels.
	// For every output pixel the samples at idx and idx+1 of the src and src2
	// rows are blended channel by channel: first along the row using the low
	// 16 bits of the 16.16 source position, then between the two rows using
	// info->vfrac.
	const uint8* row0 = (const uint8*) info->src;
	const uint8* row1 = (const uint8*) info->src2;
	uint8* out = (uint8*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int rRight = blitter->red.right;
	const int rLeft = blitter->red.left;
	const int gRight = blitter->green.right;
	const int gLeft = blitter->green.left;
	const int bRight = blitter->blue.right;
	const int bLeft = blitter->blue.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 rMask = blitter->red.mask;
	const uint32 gMask = blitter->green.mask;
	const uint32 bMask = blitter->blue.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const uint32 p00 = row0[idx];
		const uint32 p01 = row0[idx + 1];
		const uint32 p10 = row1[idx];
		const uint32 p11 = row1[idx + 1];

		// blend along each row, one 8-bit channel at a time
		const uint32 r0 = (w0 * (((p00 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p01 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 r1 = (w0 * (((p10 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p11 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 g0 = (w0 * (((p00 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p01 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 g1 = (w0 * (((p10 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p11 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 b0 = (w0 * (((p00 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p01 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 b1 = (w0 * (((p10 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p11 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 r = (v0 * r0 + v1 * r1) >> 16;
		const uint32 g = (v0 * g0 + v1 * g1) >> 16;
		const uint32 b = (v0 * b0 + v1 * b1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts and merge the channels
		*out++ = (((r >> rLeft) << rRight) & rMask)
				 | (((g >> gLeft) << gRight) & gMask)
				 | (((b >> bLeft) << bRight) & bMask)
				 | (((a >> aLeft) << aRight) & aMask);
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_2rgba_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 2-byte RGBA pixels.
	// For every output pixel the samples at idx and idx+1 of the src and src2
	// rows are blended channel by channel: first along the row using the low
	// 16 bits of the 16.16 source position, then between the two rows using
	// info->vfrac.
	const uint16* row0 = (const uint16*) info->src;
	const uint16* row1 = (const uint16*) info->src2;
	uint16* out = (uint16*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int rRight = blitter->red.right;
	const int rLeft = blitter->red.left;
	const int gRight = blitter->green.right;
	const int gLeft = blitter->green.left;
	const int bRight = blitter->blue.right;
	const int bLeft = blitter->blue.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 rMask = blitter->red.mask;
	const uint32 gMask = blitter->green.mask;
	const uint32 bMask = blitter->blue.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const uint32 p00 = row0[idx];
		const uint32 p01 = row0[idx + 1];
		const uint32 p10 = row1[idx];
		const uint32 p11 = row1[idx + 1];

		// blend along each row, one 8-bit channel at a time
		const uint32 r0 = (w0 * (((p00 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p01 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 r1 = (w0 * (((p10 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p11 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 g0 = (w0 * (((p00 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p01 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 g1 = (w0 * (((p10 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p11 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 b0 = (w0 * (((p00 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p01 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 b1 = (w0 * (((p10 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p11 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 r = (v0 * r0 + v1 * r1) >> 16;
		const uint32 g = (v0 * g0 + v1 * g1) >> 16;
		const uint32 b = (v0 * b0 + v1 * b1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts and merge the channels
		*out++ = (((r >> rLeft) << rRight) & rMask)
				 | (((g >> gLeft) << gRight) & gMask)
				 | (((b >> bLeft) << bRight) & bMask)
				 | (((a >> aLeft) << aRight) & aMask);
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_3rgba_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 3-byte RGBA pixels.
	// Pixels are fetched with read24() and stored with write24(); otherwise
	// the samples at idx and idx+1 of the src and src2 rows are blended
	// channel by channel along the row by the low 16 bits of the 16.16 source
	// position, then between the rows by info->vfrac.
	uint8* row0 = (uint8*) info->src;
	uint8* row1 = (uint8*) info->src2;
	uint8* out = (uint8*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int rRight = blitter->red.right;
	const int rLeft = blitter->red.left;
	const int gRight = blitter->green.right;
	const int gLeft = blitter->green.left;
	const int bRight = blitter->blue.right;
	const int bLeft = blitter->blue.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 rMask = blitter->red.mask;
	const uint32 gMask = blitter->green.mask;
	const uint32 bMask = blitter->blue.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const int byteoffs = idx * 3;		// three bytes per pixel
		const uint32 p00 = read24( &row0[byteoffs] );
		const uint32 p01 = read24( &row0[byteoffs + 3] );
		const uint32 p10 = read24( &row1[byteoffs] );
		const uint32 p11 = read24( &row1[byteoffs + 3] );

		// blend along each row, one 8-bit channel at a time
		const uint32 r0 = (w0 * (((p00 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p01 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 r1 = (w0 * (((p10 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p11 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 g0 = (w0 * (((p00 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p01 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 g1 = (w0 * (((p10 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p11 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 b0 = (w0 * (((p00 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p01 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 b1 = (w0 * (((p10 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p11 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 r = (v0 * r0 + v1 * r1) >> 16;
		const uint32 g = (v0 * g0 + v1 * g1) >> 16;
		const uint32 b = (v0 * b0 + v1 * b1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts, merge the channels and store 3 bytes
		const uint32 pixel = (((r >> rLeft) << rRight) & rMask)
							 | (((g >> gLeft) << gRight) & gMask)
							 | (((b >> bLeft) << bRight) & bMask)
							 | (((a >> aLeft) << aRight) & aMask);
		write24( out, pixel );
		out += 3;
		pos += info->ustep;
	}
	while ( --remaining );
}

static void inner_stretch_4rgba_bilinear(Blitter* blitter, const InnerInfo* info)
{
	// Generic bilinear stretch for 4-byte RGBA pixels.
	// For every output pixel the samples at idx and idx+1 of the src and src2
	// rows are blended channel by channel: first along the row using the low
	// 16 bits of the 16.16 source position, then between the two rows using
	// info->vfrac.
	const uint32* row0 = (const uint32*) info->src;
	const uint32* row1 = (const uint32*) info->src2;
	uint32* out = (uint32*) info->dest;

	// shift counts applied before a channel is masked to 8 bits, and the mask
	// applied after the blended value has been shifted back
	const int rRight = blitter->red.right;
	const int rLeft = blitter->red.left;
	const int gRight = blitter->green.right;
	const int gLeft = blitter->green.left;
	const int bRight = blitter->blue.right;
	const int bLeft = blitter->blue.left;
	const int aRight = blitter->alpha.right;
	const int aLeft = blitter->alpha.left;
	const uint32 rMask = blitter->red.mask;
	const uint32 gMask = blitter->green.mask;
	const uint32 bMask = blitter->blue.mask;
	const uint32 aMask = blitter->alpha.mask;

	uint32 pos = info->ustart;
	int remaining = info->width;
	do
	{
		const uint32 idx = pos >> 16;
		const uint32 w1 = pos & 0xffff;		// weight of sample idx+1
		const uint32 w0 = 0x10000 - w1;		// weight of sample idx
		const uint32 v1 = info->vfrac;		// weight of the src2 row
		const uint32 v0 = 0x10000 - v1;		// weight of the src row

		const uint32 p00 = row0[idx];
		const uint32 p01 = row0[idx + 1];
		const uint32 p10 = row1[idx];
		const uint32 p11 = row1[idx + 1];

		// blend along each row, one 8-bit channel at a time
		const uint32 r0 = (w0 * (((p00 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p01 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 r1 = (w0 * (((p10 >> rRight) << rLeft) & 0xff)
						   + w1 * (((p11 >> rRight) << rLeft) & 0xff)) >> 16;
		const uint32 g0 = (w0 * (((p00 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p01 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 g1 = (w0 * (((p10 >> gRight) << gLeft) & 0xff)
						   + w1 * (((p11 >> gRight) << gLeft) & 0xff)) >> 16;
		const uint32 b0 = (w0 * (((p00 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p01 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 b1 = (w0 * (((p10 >> bRight) << bLeft) & 0xff)
						   + w1 * (((p11 >> bRight) << bLeft) & 0xff)) >> 16;
		const uint32 a0 = (w0 * (((p00 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p01 >> aRight) << aLeft) & 0xff)) >> 16;
		const uint32 a1 = (w0 * (((p10 >> aRight) << aLeft) & 0xff)
						   + w1 * (((p11 >> aRight) << aLeft) & 0xff)) >> 16;

		// blend between the two rows
		const uint32 r = (v0 * r0 + v1 * r1) >> 16;
		const uint32 g = (v0 * g0 + v1 * g1) >> 16;
		const uint32 b = (v0 * b0 + v1 * b1) >> 16;
		const uint32 a = (v0 * a0 + v1 * a1) >> 16;

		// undo the alignment shifts and merge the channels
		*out++ = (((r >> rLeft) << rRight) & rMask)
				 | (((g >> gLeft) << gRight) & gMask)
				 | (((b >> bLeft) << bRight) & bMask)
				 | (((a >> aLeft) << aRight) & aMask);
		pos += info->ustep;
	}
	while ( --remaining );
}


//////////////////////////////////////////////////////
// stretch+bilinear specialized innerloops         //
////////////////////////////////////////////////////

#ifdef ENABLE_SPECIALIZED_C_INNERS

// Bilinear stretch for 8-bit single-channel pixels.
// Uses 8-bit blend weights (0..0x100) and packs the src-row and src2-row
// samples of one column into a single 32-bit word, so one multiply blends
// both rows horizontally at once.
static void inner_stretch_i8_bilinear(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32 u = info->ustart;
	uint32 uinte, ufrac1, ufrac2;
	// vertical weights; the two always sum to 0x100
	uint32 vfrac2 = info->vfrac >> 8;
	uint32 vfrac1 = 0x100 - vfrac2;
	uint8* src1 = (uint8*) info->src;
	uint8* src2 = (uint8*) info->src2;
	uint8* dest = (uint8*) info->dest;
	uint32 pix1, pix2, pix3;
	do
	{
		// integer source index, and horizontal weights from bits 15..8 of u
		uinte = (u >> 16);
		ufrac2 = (u & 0xff00) >> 8;
		ufrac1 = 0x100 - ufrac2;
		// src-row sample in bits 23..16, src2-row sample in bits 7..0
		pix1 = (src1[uinte] << 16) | src2[uinte];
		pix2 = (src1[uinte + 1] << 16) | src2[uinte + 1];
		// horizontal blend: results end up in bits 31..24 (src row) and
		// bits 15..8 (src2 row)
		pix3 = ufrac1 * pix1 + ufrac2 * pix2;
		// vertical blend of the two row results; the value lands in bits 23..16
		*dest++ = ((vfrac1 * ((pix3 & 0xff000000) >> 16)
					+ vfrac2 * (pix3 & 0x0000ff00)) & 0x00ff0000) >> 16;
		u += info->ustep;
	} while ( --x );
}

// Bilinear stretch for RGB565 pixels.
// Uses 5-bit blend weights (0..0x20). The src-row pixel (upper half) and the
// src2-row pixel (lower half) are packed into one 32-bit word, which is then
// split into two words with alternating channels so each channel has spare
// bits above it for the multiply.
static void inner_stretch_rgb565_bilinear(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32 u = info->ustart;
	uint32 uinte, ufrac1, ufrac2;
	// vertical weights; the two always sum to 0x20
	uint32 vfrac2 = info->vfrac >> 11;
	uint32 vfrac1 = 0x20 - vfrac2;
	uint16* src1 = (uint16*) info->src;
	uint16* src2 = (uint16*) info->src2;
	uint16* dest = (uint16*) info->dest;
	uint32 pix1a, pix1b, pix2a, pix2b;
	uint32 colrbg, colgrb;
	do
	{
		// integer source index, and horizontal weights from bits 15..11 of u
		uinte = (u >> 16);
		ufrac2 = (u & 0xf800) >> 11;
		ufrac1 = 0x20 - ufrac2;
		// upper 16 bits: src-row pixel, lower 16 bits: src2-row pixel
		pix1a = (src1[uinte] << 16) | src2[uinte];
		pix2a = (src1[uinte + 1] << 16) | src2[uinte + 1];
		// "b" words: src-row green + src2-row red/blue
		pix1b = pix1a & 0x07e0f81f;
		pix2b = pix2a & 0x07e0f81f;
		// "a" words: src-row red/blue + src2-row green, pre-shifted down by 5
		// so the multiply by a 5-bit weight cannot overflow
		pix1a = (pix1a & 0xf81f07e0) >> 5;
		pix2a = (pix2a & 0xf81f07e0) >> 5;

		// horizontal blend
		// colrbg: src-row red/blue (upper half) and src2-row green (lower half)
		colrbg = ((ufrac1 * pix1a + ufrac2 * pix2a) & 0xf81f07e0);
		// colgrb: src-row green (upper half) and src2-row red/blue (lower half)
		colgrb = ((ufrac1 * pix1b + ufrac2 * pix2b) & 0xfc1f03e0) >> 5;
		// vertical blend: red/blue in the first term, green in the second
		*dest++ = (((vfrac1 * ((colrbg & 0xf81f0000) >> 16)
					+ vfrac2 * (colgrb & 0x0000f81f)) & 0x001f03e0) >> 5)
			      | (((vfrac1 * ((colgrb & 0x07e00000) >> 16)
				      + vfrac2 * (colrbg & 0x000007e0)) & 0x0000fc00) >> 5);
		u += info->ustep;
	} while ( --x );
}

// Bilinear stretch for RGB888 pixels (3 bytes per pixel).
// Uses 8-bit blend weights (0..0x100). Each fetched pixel is split into a
// red+blue word and a green word, so two channels can share one multiply
// without spilling into each other.
static void inner_stretch_rgb888_bilinear(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32 u = info->ustart;
	uint32 uinte, ufrac1, ufrac2;
	// vertical weights; the two always sum to 0x100
	uint32 vfrac2 = info->vfrac >> 8;
	uint32 vfrac1 = 0x100 - vfrac2;
	uint8* src1 = (uint8*) info->src;
	uint8* src2 = (uint8*) info->src2;
	uint8* dest = (uint8*) info->dest;
	uint32 pix11a, pix11b, pix12a, pix12b, pix21a, pix21b, pix22a, pix22b;
	uint32 col1g, col2g, col1rb, col2rb;
	int offs;
	do
	{
		// integer source index, and horizontal weights from bits 15..8 of u
		uinte = (u >> 16);
		ufrac2 = (u & 0xff00) >> 8;
		ufrac1 = 0x100 - ufrac2;
		// byte offset of the source pixel; its right-hand neighbour is 3 bytes on
		offs = uinte * 3;
		pix11a = read24( &src1[offs] );
		pix12a = read24( &src1[offs + 3] );
		pix21a = read24( &src2[offs] );
		pix22a = read24( &src2[offs + 3] );
		// "b" words keep red and blue (bits 23..16 and 7..0)
		pix11b = pix11a & 0x00ff00ff;
		pix12b = pix12a & 0x00ff00ff;
		pix21b = pix21a & 0x00ff00ff;
		pix22b = pix22a & 0x00ff00ff;
		// "a" words keep green (bits 15..8)
		pix11a = pix11a & 0x0000ff00;
		pix12a = pix12a & 0x0000ff00;
		pix21a = pix21a & 0x0000ff00;
		pix22a = pix22a & 0x0000ff00;

		// horizontal blend per row: green ends up in bits 23..16, red/blue
		// are shifted back down to bits 23..16 and 7..0
		col1g = (ufrac1 * pix11a + ufrac2 * pix12a) & 0x00ff0000;
		col2g = (ufrac1 * pix21a + ufrac2 * pix22a) & 0x00ff0000;
		col1rb = ((ufrac1 * pix11b + ufrac2 * pix12b) & 0xff00ff00) >> 8;
		col2rb = ((ufrac1 * pix21b + ufrac2 * pix22b) & 0xff00ff00) >> 8;
		// vertical blend, then move every channel back to its RGB888 position
		write24( dest, (((vfrac1 * col1g + vfrac2 * col2g) & 0xff000000) >> 16)
					   | (((vfrac1 * col1rb + vfrac2 * col2rb) & 0xff00ff00) >> 8) );
		dest += 3;
		u += info->ustep;
	} while ( --x );
}

// Bilinear stretch for ARGB1555 pixels.
// Uses 5-bit blend weights (0..0x20). Colour channels are blended with the
// same packed two-row scheme as the RGB565 loop; the alpha bit is not
// blended but copied from the src-row sample at the integer source index.
static void inner_stretch_argb1555_bilinear(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32 u = info->ustart;
	uint32 uinte, ufrac1, ufrac2;
	// vertical weights; the two always sum to 0x20
	uint32 vfrac2 = info->vfrac >> 11;
	uint32 vfrac1 = 0x20 - vfrac2;
	uint16* src1 = (uint16*) info->src;
	uint16* src2 = (uint16*) info->src2;
	uint16* dest = (uint16*) info->dest;
	uint32 pix1a, pix1b, pix2a, pix2b;
	uint32 colrbg, colgrb;
	uint32 alpha;
	do
	{
		// integer source index, and horizontal weights from bits 15..11 of u
		uinte = (u >> 16);
		ufrac2 = (u & 0xf800) >> 11;
		ufrac1 = 0x20 - ufrac2;
		// upper 16 bits: src-row pixel, lower 16 bits: src2-row pixel
		pix1a = (src1[uinte] << 16) | src2[uinte];
		pix2a = (src1[uinte + 1] << 16) | src2[uinte + 1];
		// alpha bit of the src-row sample, moved from bit 31 to bit 15
		alpha = (pix1a & 0x80000000) >> 16;
		// "b" words: src-row green + src2-row red/blue
		pix1b = pix1a & 0x03e07c1f;
		pix2b = pix2a & 0x03e07c1f;
		// "a" words: src-row red/blue + src2-row green, pre-shifted down by 5
		pix1a = (pix1a & 0x7c1f03e0) >> 5;
		pix2a = (pix2a & 0x7c1f03e0) >> 5;

		// horizontal blend
		// colrbg: src-row red/blue (upper half) and src2-row green (lower half)
		colrbg = ((ufrac1 * pix1a + ufrac2 * pix2a) & 0x7c1f03e0);
		// colgrb: src-row green (upper half) and src2-row red/blue (lower half)
		colgrb = ((ufrac1 * pix1b + ufrac2 * pix2b) & 0x7c0f83e0) >> 5;
		// vertical blend: red/blue in the first term, green in the second,
		// then the unblended alpha bit is merged in
		*dest++ = (((vfrac1 * ((colrbg & 0x7c1f0000) >> 16) 
					+ vfrac2 * (colgrb & 0x00007c1f)) & 0x000f83e0) >> 5)
			      | (((vfrac1 * ((colgrb & 0x03e00000) >> 16)
				      + vfrac2 * (colrbg & 0x000003e0)) & 0x00007c00) >> 5)
				  | alpha;
		u += info->ustep;
	} while ( --x );
}

// Bilinear stretch for ARGB4444 pixels.
// Uses 4-bit blend weights (0..0x10). The src-row pixel (upper half) and the
// src2-row pixel (lower half) are packed into one 32-bit word and split into
// an alpha+green word and a red+blue word, each holding one nibble per byte.
static void inner_stretch_argb4444_bilinear(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32 u = info->ustart;
	uint32 uinte, ufrac1, ufrac2;
	// vertical weights; the two always sum to 0x10
	uint32 vfrac2 = info->vfrac >> 12;
	uint32 vfrac1 = 0x10 - vfrac2;
	uint16* src1 = (uint16*) info->src;
	uint16* src2 = (uint16*) info->src2;
	uint16* dest = (uint16*) info->dest;
	uint32 pix1a, pix1b, pix2a, pix2b;
	uint32 colag, colrb;
	do
	{
		// integer source index, and horizontal weights from bits 15..12 of u
		uinte = (u >> 16);
		ufrac2 = (u & 0xf000) >> 12;
		ufrac1 = 0x10 - ufrac2;
		// upper 16 bits: src-row pixel, lower 16 bits: src2-row pixel
		pix1a = (src1[uinte] << 16) | src2[uinte];
		pix2a = (src1[uinte + 1] << 16) | src2[uinte + 1];
		// "b" words: red and blue nibbles of both rows
		pix1b = pix1a & 0x0f0f0f0f;
		pix2b = pix2a & 0x0f0f0f0f;
		// "a" words: alpha and green nibbles of both rows, moved to the low
		// nibble of each byte
		pix1a = (pix1a & 0xf0f0f0f0) >> 4;
		pix2a = (pix2a & 0xf0f0f0f0) >> 4;

		// horizontal blend; results are normalised back to the low nibbles
		colag = ((ufrac1 * pix1a + ufrac2 * pix2a) & 0xf0f0f0f0) >> 4;
		colrb = ((ufrac1 * pix1b + ufrac2 * pix2b) & 0xf0f0f0f0) >> 4;
		// vertical blend: alpha/green stay in the high nibbles, red/blue are
		// shifted down into the low nibbles
		*dest++ = ((vfrac1 * (colag >> 16) + vfrac2 * (colag & 0x0f0f)) & 0x0000f0f0)
			      | (((vfrac1 * (colrb >> 16) + vfrac2 * (colrb & 0x0f0f)) & 0x0000f0f0) >> 4);
		u += info->ustep;
	} while ( --x );
}

// Bilinear stretch for ARGB8888 pixels.
// Uses 8-bit blend weights (0..0x100). Each pixel is split into an
// alpha+green word and a red+blue word (one channel per 16-bit lane), so two
// channels are blended by a single multiply.
static void inner_stretch_argb8888_bilinear(Blitter* blitter, const InnerInfo* info)
{
	int x = info->width;
	uint32 u = info->ustart;
	uint32 uinte, ufrac1, ufrac2;
	// vertical weights; the two always sum to 0x100
	uint32 vfrac2 = info->vfrac >> 8;
	uint32 vfrac1 = 0x100 - vfrac2;
	uint32* src1 = (uint32*) info->src;
	uint32* src2 = (uint32*) info->src2;
	uint32* dest = (uint32*) info->dest;
	uint32 pix11a, pix11b, pix12a, pix12b, pix21a, pix21b, pix22a, pix22b;
	uint32 col1ag, col2ag, col1rb, col2rb;
	do
	{
		// integer source index, and horizontal weights from bits 15..8 of u
		uinte = (u >> 16);
		ufrac2 = (u & 0xff00) >> 8;
		ufrac1 = 0x100 - ufrac2;
		// the 2x2 neighbourhood: two samples from src, two from src2
		pix11a = src1[uinte];
		pix12a = src1[uinte + 1];
		pix21a = src2[uinte];
		pix22a = src2[uinte + 1];
		// "b" words keep red and blue (bits 23..16 and 7..0)
		pix11b = pix11a & 0x00ff00ff;
		pix12b = pix12a & 0x00ff00ff;
		pix21b = pix21a & 0x00ff00ff;
		pix22b = pix22a & 0x00ff00ff;
		// "a" words keep alpha and green, shifted down to the same lanes
		pix11a = (pix11a & 0xff00ff00) >> 8;
		pix12a = (pix12a & 0xff00ff00) >> 8;
		pix21a = (pix21a & 0xff00ff00) >> 8;
		pix22a = (pix22a & 0xff00ff00) >> 8;

		// horizontal blend per row, normalised back to bits 23..16 and 7..0
		col1ag = ((ufrac1 * pix11a + ufrac2 * pix12a) & 0xff00ff00) >> 8;
		col2ag = ((ufrac1 * pix21a + ufrac2 * pix22a) & 0xff00ff00) >> 8;
		col1rb = ((ufrac1 * pix11b + ufrac2 * pix12b) & 0xff00ff00) >> 8;
		col2rb = ((ufrac1 * pix21b + ufrac2 * pix22b) & 0xff00ff00) >> 8;
		// vertical blend: alpha/green stay in the upper byte of each lane,
		// red/blue are shifted down into the lower byte
		*dest++ = (((vfrac1 * col1ag + vfrac2 * col2ag) & 0xff00ff00))
			      | (((vfrac1 * col1rb + vfrac2 * col2rb) & 0xff00ff00) >> 8);
		u += info->ustep;
	} while ( --x );
}

#endif // ENABLE_SPECIALIZED_C_INNERS


//////////////////////////////////////////////////////
// blitter implementation                          //
////////////////////////////////////////////////////

// Size of the blitters[] table below, expressed as a power of two.
static const int BlitHashBits = 6;						// Must be 1 or larger
static const int BlitHashSize = (1 << BlitHashBits);
// Table of Blitter pointers, zero-filled by BitmapBlitterInitialize().
// NOTE(review): the names suggest it is indexed by a hash of the blit
// parameters - confirm against the lookup code.
static Blitter* blitters[BlitHashSize];


static void BitmapBlitterInitialize()
{
	// start out with an empty blitter table
	memset( blitters, 0, sizeof(blitters) );

	// build the RGB->greyscale lookup: entry n holds n scaled by the weight
	// of each colour channel (0.299 red, 0.587 green, 0.114 blue), truncated
	int level = 0;
	while ( level < 256 )
	{
		Blitter::palmono[level].r = (uint8) (level * 0.299f);
		Blitter::palmono[level].g = (uint8) (level * 0.587f);
		Blitter::palmono[level].b = (uint8) (level * 0.114f);
		Blitter::palmono[level].a = 0;
		++level;
	}

#ifdef ENABLE_SPECIALIZED_X86_INNERS
#ifdef PRCORE_PLATFORM_WIN32
	// request read/write/execute access for the region that starts at
	// inner_stretch_smc_start (presumably self-modifying inner loops)
	unsigned long oldprotect;
	VirtualProtect( &inner_stretch_smc_start,
		inner_stretch_smc_size, PAGE_EXECUTE_READWRITE, &oldprotect );
#endif
#endif // ENABLE_SPECIALIZED_X86_INNERS

#ifdef ENABLE_SPECIALIZED_X86_MMX_INNERS
	// remember whether IsMMX() reports MMX support
	mmxfound = IsMMX();
#endif

}


static inline void MakeBlitMask(BlitMask& out, uint32 src_mask, uint32 dst_mask )
{
	// Computes the shift pair and mask that move one channel from its
	// position in the source pixel to its position in the destination pixel.

	// a channel that is absent on either side contributes nothing
	if ( !src_mask || !dst_mask )
	{
		out.left = out.right = out.mask = 0;
		return;
	}

	// signed distance between the top bits of the two masks:
	// positive means the channel moves up, otherwise it moves down
	const int delta = HighestBit( dst_mask ) - HighestBit( src_mask );
	out.left = ( delta > 0 ) ? delta : 0;
	out.right = ( delta > 0 ) ? 0 : -delta;

	// source bits that survive the move and fall inside the destination channel
	out.mask = ((src_mask >> out.right) << out.left) & dst_mask;
}


static inline void MakeBlitMaskBilinear(BlitMask& out, uint32 mask)
{
	// Computes the shift pair that brings the top bit of a channel to bit 7,
	// plus the part of the channel mask that an 8-bit value can cover.

	if ( !mask )
	{
		// channel not present
		// NOTE(review): right = 32 is out of range as a shift count for a
		// 32-bit operand; confirm that users of this value tolerate it
		// (the resulting mask of 0 discards the channel either way).
		out.left = 0;
		out.right = 32;
		out.mask = 0;
		return;
	}

	// signed distance from the channel's top bit to bit 7
	const int delta = 7 - HighestBit( mask );
	out.left = ( delta > 0 ) ? delta : 0;
	out.right = ( delta > 0 ) ? 0 : -delta;

	// keep only the channel bits reachable from an 8-bit value
	out.mask = mask & ((0xff >> out.left) << out.right);
}


static bool UpdatePalGenericRemap(class Blitter& b, const PixelFormat& dest, const PixelFormat& source)
{
	if ( !source.IsIndexed() )
		return true;

	if ( dest.IsIndexed() ) // Indexed -> Indexed
	{
		// only partial support for Indexed -> Indexed so far
		return true;
	}
	else // Indexed -> !Indexed
	{
		Blitter blitter( dest, palformat, BLIT_REMAP );

		InnerInfo info;
		info.src = (uint8*) source.GetPalette();
		info.dest = (uint8*) b.palremap;
		info.width = 256;
		blitter.Blit( &info );
	}

	return true;
}


// Sets up a blitter that converts and/or stretches scanlines from 'source'
// format into 'dest' format.
//
//   dest, source - pixel formats of the destination and source surfaces
//   type         - 0, BLIT_REMAP, BLIT_STRETCH, BLIT_STRETCH_REMAP or
//                  BLIT_STRETCH_BILINEAR
//
// On return 'func' points at the selected innerloop, or is NULL when the
// combination is not handled here (a format wider than 4 bytes,
// !Indexed -> Indexed, type 0 or an unknown type). 'updatepal' is set for
// Indexed -> !Indexed remaps, whose palette lookup table has to be rebuilt
// before blitting. 'convtype', 'source' and 'dest' are always initialized so
// that the blitter cache can compare them safely.
Blitter::Blitter(const PixelFormat& dest, const PixelFormat& source, int type)
{
	func = NULL;
	updatepal = NULL;

	// record the request first: GetBlitter() compares these members even
	// for blitters whose construction bails out early below
	convtype = type;
	this->source = source;
	this->dest = dest;

	// formats for which specialized innerloops exist; the position in this
	// table is the row/column index into the specialized function tables
	static PixelFormat stdformats[8] = 
	{ 
		PixelFormat( PALETTE8( NULL ) ),
		PixelFormat( INTENSITY8 ), 
		PixelFormat( RGB332 ), 
		PixelFormat( RGB565 ), 
		PixelFormat( RGB888 ), 
		PixelFormat( ARGB1555 ), 
		PixelFormat( ARGB4444 ), 
		PixelFormat( ARGB8888 ) 
	};
	const int numformats = (int)(sizeof( stdformats ) / sizeof( stdformats[0] ));

	if ( source.GetBytes() > 4 || dest.GetBytes() > 4 )
		return;

	if ( !source.IsIndexed() && dest.IsIndexed() )	// !Indexed -> Indexed is not supported
		return;

	// locate both formats in the standard format table (-1 = not a standard format)
	int sourceidx = -1;
	int destidx = -1;
	for ( int fmt = 0; fmt < numformats; fmt++ )
	{
		if ( sourceidx == -1 && stdformats[fmt] == source )
			sourceidx = fmt;
		if ( destidx == -1 && stdformats[fmt] == dest )
			destidx = fmt;
	}

	switch ( convtype )
	{
		case 0:
			break;

		case BLIT_REMAP:
			if ( source.IsIndexed() && dest.IsIndexed() ) // Indexed -> Indexed is fully supported,
			{											  // except for remapping between different
				MakeBlitMask( red, source.GetIntensityMask(), dest.GetIntensityMask() ); // palettes
				MakeBlitMask( green, 0, 0 );
				MakeBlitMask( blue, 0, 0 );
				MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
				alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

				static BlitFunc funcptrs[4][4] =
				{
					{ inner_remap_1rgba_1rgba, inner_remap_1rgba_2rgba, inner_remap_1rgba_3rgba, inner_remap_1rgba_4rgba},
					{ inner_remap_2rgba_1rgba, inner_remap_2rgba_2rgba, inner_remap_2rgba_3rgba, inner_remap_2rgba_4rgba},
					{ inner_remap_3rgba_1rgba, inner_remap_3rgba_2rgba, inner_remap_3rgba_3rgba, inner_remap_3rgba_4rgba},
					{ inner_remap_4rgba_1rgba, inner_remap_4rgba_2rgba, inner_remap_4rgba_3rgba, inner_remap_4rgba_4rgba}
				};

				func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
			}
			else if ( source.IsIndexed() ) // Indexed -> !Indexed is fully supported
			{
				// same mask setup whether or not the destination is an intensity format
				MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
				MakeBlitMask( index, source.GetIntensityMask(), 0x000000ff );
				alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

				/* // debug dump of the computed masks (disabled)
				
				char buffy[256];
				sprintf( buffy, "r: %d %d %08x\ng: %d %d %08x\nb: %d %d %08x\n"
								"a: %d %d %08x\ni: %d %d %08x\n"
								"source: %08x %08x %08x %08x\n"
								"dest: %08x %08x %08x %08x\n",
					red.left, red.right, red.mask,
					green.left, green.right, green.mask,
					blue.left, blue.right, blue.mask,
					alpha.left, alpha.right, alpha.mask,
					index.left, index.right, index.mask,
					source.GetRedMask(), source.GetGreenMask(), source.GetBlueMask(), source.GetAlphaMask(),
					dest.GetRedMask(), dest.GetGreenMask(), dest.GetBlueMask(), dest.GetAlphaMask() );
				MessageBox( NULL, buffy, "P2?", MB_OK );
				*/

#ifdef ENABLE_SPECIALIZED_C_INNERS

				if (sourceidx == 0 && destidx != -1)
				{
					static BlitFunc funcptrs[4] =
					{ inner_remap_p8_1ta, inner_remap_p8_2ta, inner_remap_p8_3ta, 
					  inner_remap_p8_4ta };

					updatepal = &UpdatePalGenericRemap;

					func = funcptrs[dest.GetBytes() - 1];
					return;
				}

#endif				

				static BlitFunc funcptrs[4][4] =
				{
					{ inner_remap_1pa_1ta, inner_remap_1pa_2ta, inner_remap_1pa_3ta, inner_remap_1pa_4ta},
					{ inner_remap_2pa_1ta, inner_remap_2pa_2ta, inner_remap_2pa_3ta, inner_remap_2pa_4ta},
					{ inner_remap_3pa_1ta, inner_remap_3pa_2ta, inner_remap_3pa_3ta, inner_remap_3pa_4ta},
					{ inner_remap_4pa_1ta, inner_remap_4pa_2ta, inner_remap_4pa_3ta, inner_remap_4pa_4ta}
				};

				updatepal = &UpdatePalGenericRemap;

				func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
			}
			else // !Indexed -> !Indexed is fully supported
			{

#ifdef ENABLE_SPECIALIZED_C_INNERS

				if (sourceidx != -1 && destidx != -1)
				{
					// [source format][dest format]; 0 = no specialized loop, use the generic one
					static BlitFunc funcptrs[8][8] =
					{
						{ 0, 0, 0, 0, 0, 0, 0, 0 },
						{ 0, 0, inner_remap_i8_rgb332, inner_remap_i8_rgb565,
						  inner_remap_i8_rgb888, inner_remap_i8_argb1555,
						  inner_remap_i8_argb4444, inner_remap_i8_argb8888 },
						{ 0, inner_remap_rgb332_i8, 0, inner_remap_rgb332_rgb565,
						  inner_remap_rgb332_rgb888, inner_remap_rgb332_argb1555,
						  inner_remap_rgb332_argb4444, inner_remap_rgb332_argb8888 },
						{ 0, inner_remap_rgb565_i8, inner_remap_rgb565_rgb332, 0,
						  inner_remap_rgb565_rgb888, inner_remap_rgb565_argb1555,
						  inner_remap_rgb565_argb4444, inner_remap_rgb565_argb8888 },
						{ 0, inner_remap_rgb888_i8, inner_remap_rgb888_rgb332,
						  inner_remap_rgb888_rgb565, 0, inner_remap_rgb888_argb1555,
						  inner_remap_rgb888_argb4444, inner_remap_rgb888_argb8888 },
						{ 0, inner_remap_argb1555_i8, inner_remap_argb1555_rgb332,
						  inner_remap_argb1555_rgb565, inner_remap_argb1555_rgb888, 0,
						  inner_remap_argb1555_argb4444, inner_remap_argb1555_argb8888 },
						{ 0, inner_remap_argb4444_i8, inner_remap_argb4444_rgb332,
						  inner_remap_argb4444_rgb565, inner_remap_argb4444_rgb888,
						  inner_remap_argb4444_argb1555, 0, inner_remap_argb4444_argb8888 },
						{ 0, inner_remap_argb8888_i8, inner_remap_argb8888_rgb332,
						  inner_remap_argb8888_rgb565, inner_remap_argb8888_rgb888,
						  inner_remap_argb8888_argb1555, inner_remap_argb8888_argb4444, 0 },
					};

					func = funcptrs[sourceidx][destidx];
					if ( func )
						return;
				}

#endif

				if ( dest.IsIntensity() )
				{
					MakeBlitMask( red,	(source.IsIntensity() ? source.GetIntensityMask() : source.GetRedMask()), 0x000000ff );
					MakeBlitMask( green,(source.IsIntensity() ? source.GetIntensityMask() : source.GetGreenMask()), 0x000000ff );
					MakeBlitMask( blue,	(source.IsIntensity() ? source.GetIntensityMask() : source.GetBlueMask()), 0x000000ff );
					MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
					MakeBlitMask( intens, 0x000000ff, dest.GetIntensityMask() );
					alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

					static BlitFunc funcptrs[4][4] =
					{
						{ inner_remap_1rgba_1ia, inner_remap_1rgba_2ia, inner_remap_1rgba_3ia, inner_remap_1rgba_4ia},
						{ inner_remap_2rgba_1ia, inner_remap_2rgba_2ia, inner_remap_2rgba_3ia, inner_remap_2rgba_4ia},
						{ inner_remap_3rgba_1ia, inner_remap_3rgba_2ia, inner_remap_3rgba_3ia, inner_remap_3rgba_4ia},
						{ inner_remap_4rgba_1ia, inner_remap_4rgba_2ia, inner_remap_4rgba_3ia, inner_remap_4rgba_4ia}
					};

					func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
				}
				else
				{
					MakeBlitMask( red, (source.IsIntensity() ? source.GetIntensityMask() : source.GetRedMask()), dest.GetRedMask() );
					MakeBlitMask( green, (source.IsIntensity() ? source.GetIntensityMask() : source.GetGreenMask()), dest.GetGreenMask() );
					MakeBlitMask( blue, (source.IsIntensity() ? source.GetIntensityMask() : source.GetBlueMask()), dest.GetBlueMask() );
					MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
					alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

					/* // debug dump of the computed masks (disabled)

					char buffy[256];
					sprintf( buffy, "r: %d %d %08x\ng: %d %d %08x\nb: %d %d %08x\n"
									"a: %d %d %08x %08x\ni: %d %d %08x\n"
									"source: %08x %08x %08x %08x\n"
									"dest: %08x %08x %08x %08x\n",
						red.left, red.right, red.mask,
						green.left, green.right, green.mask,
						blue.left, blue.right, blue.mask,
						alpha.left, alpha.right, alpha.mask, alphaor,
						index.left, index.right, index.mask,
						source.GetRedMask(), source.GetGreenMask(), source.GetBlueMask(), source.GetAlphaMask(),
						dest.GetRedMask(), dest.GetGreenMask(), dest.GetBlueMask(), dest.GetAlphaMask() );
					MessageBox( NULL, buffy, "Direct2Direct", MB_OK );
					*/
					static BlitFunc funcptrs[4][4] =
					{
						{ inner_remap_1rgba_1rgba, inner_remap_1rgba_2rgba, inner_remap_1rgba_3rgba, inner_remap_1rgba_4rgba},
						{ inner_remap_2rgba_1rgba, inner_remap_2rgba_2rgba, inner_remap_2rgba_3rgba, inner_remap_2rgba_4rgba},
						{ inner_remap_3rgba_1rgba, inner_remap_3rgba_2rgba, inner_remap_3rgba_3rgba, inner_remap_3rgba_4rgba},
						{ inner_remap_4rgba_1rgba, inner_remap_4rgba_2rgba, inner_remap_4rgba_3rgba, inner_remap_4rgba_4rgba}
					};

					func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
				}
			}
			break;

		case BLIT_STRETCH:
			if ( source.IsIndexed() || source.IsIntensity() )
			{
				static BlitFunc funcptrs[4] =
				{ inner_stretch_1ia, inner_stretch_2ia, inner_stretch_3ia, inner_stretch_4ia };

				func = funcptrs[source.GetBytes() - 1];
			}
			else // Direct
			{
				static BlitFunc funcptrs[4] =
				{ inner_stretch_1rgba, inner_stretch_2rgba, inner_stretch_3rgba, inner_stretch_4rgba };

				func = funcptrs[source.GetBytes() - 1];
			}

			break;

		case BLIT_STRETCH_REMAP:

			if ( source.IsIndexed() && dest.IsIndexed() ) // Indexed -> Indexed is fully supported,
			{											  // except for remapping between different
				MakeBlitMask( red, source.GetIntensityMask(), dest.GetIntensityMask() ); // palettes
				MakeBlitMask( green, 0, 0 );
				MakeBlitMask( blue, 0, 0 );
				MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
				alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

				// the stretching variants are required here: the caller feeds
				// the destination width and u/v steps, which the plain
				// inner_remap_* loops would ignore
				static BlitFunc funcptrs[4][4] =
				{
					{ inner_stretch_remap_1rgba_1rgba, inner_stretch_remap_1rgba_2rgba,
					  inner_stretch_remap_1rgba_3rgba, inner_stretch_remap_1rgba_4rgba },
					{ inner_stretch_remap_2rgba_1rgba, inner_stretch_remap_2rgba_2rgba,
					  inner_stretch_remap_2rgba_3rgba, inner_stretch_remap_2rgba_4rgba },
					{ inner_stretch_remap_3rgba_1rgba, inner_stretch_remap_3rgba_2rgba,
					  inner_stretch_remap_3rgba_3rgba, inner_stretch_remap_3rgba_4rgba },
					{ inner_stretch_remap_4rgba_1rgba, inner_stretch_remap_4rgba_2rgba,
					  inner_stretch_remap_4rgba_3rgba, inner_stretch_remap_4rgba_4rgba }
				};

				func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
			}
			else if ( source.IsIndexed() ) // Indexed -> !Indexed is fully supported
			{
				// same mask setup whether or not the destination is an intensity format
				MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
				MakeBlitMask( index, source.GetIntensityMask(), 0x000000ff );
				alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

#ifdef ENABLE_SPECIALIZED_C_INNERS

				if (sourceidx == 0 && destidx != -1)
				{
					static BlitFunc funcptrs[4] =
					{ inner_stretch_remap_p8_1ta, inner_stretch_remap_p8_2ta,
					  inner_stretch_remap_p8_3ta, inner_stretch_remap_p8_4ta };

					updatepal = &UpdatePalGenericRemap;

					func = funcptrs[dest.GetBytes() - 1];
					return;
				}

#endif

				static BlitFunc funcptrs[4][4] =
				{
					{ inner_stretch_remap_1pa_1ta, inner_stretch_remap_1pa_2ta,
					  inner_stretch_remap_1pa_3ta, inner_stretch_remap_1pa_4ta },
					{ inner_stretch_remap_2pa_1ta, inner_stretch_remap_2pa_2ta,
					  inner_stretch_remap_2pa_3ta, inner_stretch_remap_2pa_4ta },
					{ inner_stretch_remap_3pa_1ta, inner_stretch_remap_3pa_2ta,
					  inner_stretch_remap_3pa_3ta, inner_stretch_remap_3pa_4ta },
					{ inner_stretch_remap_4pa_1ta, inner_stretch_remap_4pa_2ta,
					  inner_stretch_remap_4pa_3ta, inner_stretch_remap_4pa_4ta }
				};

				updatepal = &UpdatePalGenericRemap;

				func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
			}
			else // !Indexed -> !Indexed is fully supported
			{

#ifdef ENABLE_SPECIALIZED_C_INNERS

				if (sourceidx != -1 && destidx != -1)
				{
					// [source format][dest format]; 0 = no specialized loop, use the generic one
					static BlitFunc funcptrs[8][8] =
					{
						{ 0, 0, 0, 0, 0, 0, 0, 0 },
						{ 0, 0, inner_stretch_remap_i8_rgb332, inner_stretch_remap_i8_rgb565,
						  inner_stretch_remap_i8_rgb888, inner_stretch_remap_i8_argb1555,
						  inner_stretch_remap_i8_argb4444, inner_stretch_remap_i8_argb8888 },
						{ 0, inner_stretch_remap_rgb332_i8, 0, inner_stretch_remap_rgb332_rgb565,
						  inner_stretch_remap_rgb332_rgb888, inner_stretch_remap_rgb332_argb1555,
						  inner_stretch_remap_rgb332_argb4444, inner_stretch_remap_rgb332_argb8888 },
						{ 0, inner_stretch_remap_rgb565_i8, inner_stretch_remap_rgb565_rgb332, 0,
						  inner_stretch_remap_rgb565_rgb888, inner_stretch_remap_rgb565_argb1555,
						  inner_stretch_remap_rgb565_argb4444, inner_stretch_remap_rgb565_argb8888 },
						{ 0, inner_stretch_remap_rgb888_i8, inner_stretch_remap_rgb888_rgb332,
						  inner_stretch_remap_rgb888_rgb565, 0, inner_stretch_remap_rgb888_argb1555,
						  inner_stretch_remap_rgb888_argb4444, inner_stretch_remap_rgb888_argb8888 },
						{ 0, inner_stretch_remap_argb1555_i8, inner_stretch_remap_argb1555_rgb332,
						  inner_stretch_remap_argb1555_rgb565, inner_stretch_remap_argb1555_rgb888, 0,
						  inner_stretch_remap_argb1555_argb4444, inner_stretch_remap_argb1555_argb8888 },
						{ 0, inner_stretch_remap_argb4444_i8, inner_stretch_remap_argb4444_rgb332,
						  inner_stretch_remap_argb4444_rgb565, inner_stretch_remap_argb4444_rgb888,
						  inner_stretch_remap_argb4444_argb1555, 0, inner_stretch_remap_argb4444_argb8888 },
						{ 0, inner_stretch_remap_argb8888_i8, inner_stretch_remap_argb8888_rgb332,
						  inner_stretch_remap_argb8888_rgb565, inner_stretch_remap_argb8888_rgb888,
						  inner_stretch_remap_argb8888_argb1555, inner_stretch_remap_argb8888_argb4444, 0 },
					};

					func = funcptrs[sourceidx][destidx];
					if ( func )
						return;
				}

#endif

				if ( dest.IsIntensity() )
				{
					// red must come from the red mask (it used to be built from the green mask)
					MakeBlitMask( red, (source.IsIntensity() ? source.GetIntensityMask() : source.GetRedMask()), 0x000000ff );
					MakeBlitMask( green, (source.IsIntensity() ? source.GetIntensityMask() : source.GetGreenMask()), 0x000000ff );
					MakeBlitMask( blue, (source.IsIntensity() ? source.GetIntensityMask() : source.GetBlueMask()), 0x000000ff );
					MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
					MakeBlitMask( intens, 0x000000ff, dest.GetIntensityMask() );
					alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

					static BlitFunc funcptrs[4][4] =
					{
						{ inner_stretch_remap_1rgba_1ia, inner_stretch_remap_1rgba_2ia,
						  inner_stretch_remap_1rgba_3ia, inner_stretch_remap_1rgba_4ia },
						{ inner_stretch_remap_2rgba_1ia, inner_stretch_remap_2rgba_2ia,
						  inner_stretch_remap_2rgba_3ia, inner_stretch_remap_2rgba_4ia },
						{ inner_stretch_remap_3rgba_1ia, inner_stretch_remap_3rgba_2ia,
						  inner_stretch_remap_3rgba_3ia, inner_stretch_remap_3rgba_4ia },
						{ inner_stretch_remap_4rgba_1ia, inner_stretch_remap_4rgba_2ia,
						  inner_stretch_remap_4rgba_3ia, inner_stretch_remap_4rgba_4ia }
					};

					func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
				}
				else
				{

					MakeBlitMask( red, (source.IsIntensity() ? source.GetIntensityMask() : source.GetRedMask()), dest.GetRedMask() );
					MakeBlitMask( green, (source.IsIntensity() ? source.GetIntensityMask() : source.GetGreenMask()), dest.GetGreenMask() );
					MakeBlitMask( blue, (source.IsIntensity() ? source.GetIntensityMask() : source.GetBlueMask()), dest.GetBlueMask() );
					MakeBlitMask( alpha, source.GetAlphaMask(), dest.GetAlphaMask() );
					alphaor = (source.GetAlphaMask() ? 0 : 0xffffffff) & dest.GetAlphaMask();

					static BlitFunc funcptrs[4][4] =
					{
						{ inner_stretch_remap_1rgba_1rgba, inner_stretch_remap_1rgba_2rgba,
						  inner_stretch_remap_1rgba_3rgba, inner_stretch_remap_1rgba_4rgba },
						{ inner_stretch_remap_2rgba_1rgba, inner_stretch_remap_2rgba_2rgba,
						  inner_stretch_remap_2rgba_3rgba, inner_stretch_remap_2rgba_4rgba },
						{ inner_stretch_remap_3rgba_1rgba, inner_stretch_remap_3rgba_2rgba,
						  inner_stretch_remap_3rgba_3rgba, inner_stretch_remap_3rgba_4rgba },
						{ inner_stretch_remap_4rgba_1rgba, inner_stretch_remap_4rgba_2rgba,
						  inner_stretch_remap_4rgba_3rgba, inner_stretch_remap_4rgba_4rgba }
					};

					func = funcptrs[source.GetBytes() - 1][dest.GetBytes() - 1];
				}
			}
			break;

		case BLIT_STRETCH_BILINEAR:

			// selection order: x86/MMX loops, specialized C loops, generic loops

#ifdef ENABLE_SPECIALIZED_X86_INNERS

			if ( sourceidx != -1 )
			{

	#ifdef ENABLE_SPECIALIZED_X86_MMX_INNERS

				// row 0: plain x86 loops, row 1: MMX loops where available
				static BlitFunc funcptrs[2][8] =
					{
						{ 0, 0, 0, inner_stretch_rgb565_bilinear_x86,
						  0, inner_stretch_argb1555_bilinear_x86,
						  inner_stretch_argb4444_bilinear_x86, 0 },
						{ 0, 0, 0, inner_stretch_rgb565_bilinear_x86,
						  inner_stretch_rgb888_bilinear_x86_mmx, inner_stretch_argb1555_bilinear_x86,
						  inner_stretch_argb4444_bilinear_x86, inner_stretch_argb8888_bilinear_x86_mmx }
					};
					func = funcptrs[mmxfound][sourceidx];

	#else // !ENABLE_SPECIALIZED_X86_MMX_INNERS

				static BlitFunc funcptrs[8] =
					{ 0, 0, 0, inner_stretch_rgb565_bilinear_x86,
					  0, inner_stretch_argb1555_bilinear_x86,
					  inner_stretch_argb4444_bilinear_x86, 0 };
					func = funcptrs[sourceidx];

	#endif // !ENABLE_SPECIALIZED_X86_MMX_INNERS

					if ( func )
						return;
			}

#endif // ENABLE_SPECIALIZED_X86_INNERS

#ifdef ENABLE_SPECIALIZED_C_INNERS

			if ( sourceidx != -1 )
			{
					static BlitFunc funcptrs[8] =
					{ 
						0, inner_stretch_i8_bilinear, 0, inner_stretch_rgb565_bilinear,
						inner_stretch_rgb888_bilinear, inner_stretch_argb1555_bilinear,
						inner_stretch_argb4444_bilinear, inner_stretch_argb8888_bilinear 
					};

					func = funcptrs[sourceidx];

					if ( func )
						return;
			}

#endif // ENABLE_SPECIALIZED_C_INNERS

			if ( source.IsIndexed() )
			{
				// indexed sources fall back to the point sampling stretch;
				// the bilinear flag is dropped from the stored type
				convtype &= ~BLIT_BILINEAR;
				static BlitFunc funcptrs[4] =
				{ 
					inner_stretch_1ia, inner_stretch_2ia, 
					inner_stretch_3ia, inner_stretch_4ia 
				};

				func = funcptrs[source.GetBytes() - 1];
			}
			else if ( source.IsIntensity() )
			{
				MakeBlitMaskBilinear( intens, source.GetIntensityMask() );
				MakeBlitMaskBilinear( alpha,  source.GetAlphaMask() );

				static BlitFunc funcptrs[4] =
				{ inner_stretch_1ia_bilinear, inner_stretch_2ia_bilinear, 
				  inner_stretch_3ia_bilinear, inner_stretch_4ia_bilinear };

				func = funcptrs[source.GetBytes() - 1];
			}
			else // Direct
			{
				MakeBlitMaskBilinear( red,   source.GetRedMask() );
				MakeBlitMaskBilinear( green, source.GetGreenMask() );
				MakeBlitMaskBilinear( blue,  source.GetBlueMask() );
				MakeBlitMaskBilinear( alpha, source.GetAlphaMask() );
				static BlitFunc funcptrs[4] =
				{ 
					inner_stretch_1rgba_bilinear, inner_stretch_2rgba_bilinear, 
					inner_stretch_3rgba_bilinear, inner_stretch_4rgba_bilinear 
				};

				func = funcptrs[source.GetBytes() - 1];
			}

			break;
	}

}

// Destructor: the body is empty, nothing is released explicitly here.
Blitter::~Blitter() {}


// Folds a pixel format into a small hash value: the top bit positions of the
// four component masks and the pixel size in bytes, all XORed together.
static inline int32 HashFormat(const PixelFormat& f)
{
	int32 hash = f.GetBytes();

	hash ^= HighestBit( f.GetAlphaMask() );
	hash ^= HighestBit( f.GetBlueMask() );
	hash ^= HighestBit( f.GetGreenMask() );
	hash ^= HighestBit( f.GetRedMask() );

	return hash;
}


// Returns a blitter for the given format pair and blit type, reusing the one
// cached in blitters[] when it matches and creating (and caching) a new one
// otherwise. A blitter previously stored in the same slot is deleted.
// Returns NULL when allocation or the palette update fails.
static Blitter* GetBlitter(const PixelFormat& dest, const PixelFormat& src, int type)
{
	//  bilinear resampling is not available when remapping
	if ( type & BLIT_REMAP )
		type &= ~BLIT_BILINEAR;

	// cache slot: low bits from the two format hashes, high bits from the blit type
	const int formatbits = ((HashFormat( src ) << 1) ^ HashFormat( dest )) & ((BlitHashSize >> 2) - 1);
	const int typebits = (type & ~BLIT_BILINEAR) << (BlitHashBits - 2);
	const int slot = (formatbits ^ typebits) & (BlitHashSize - 1);

	// reuse the cached blitter when it was built for exactly this request
	Blitter* cached = blitters[slot];
	if ( cached && cached->source == src && cached->dest == dest && cached->convtype == type )
	{
		// refresh the Indexed -> Direct remapping table if the blitter uses one
		if ( cached->updatepal && !cached->updatepal( *cached, dest, src ) )
			return NULL;

		return cached;
	}

	// nothing suitable cached: build a new blitter
	Blitter* fresh = new Blitter( dest, src, type );
	if ( !fresh )
		return NULL;

	// build the Indexed -> !Indexed remapping table if the blitter uses one
	if ( fresh->updatepal && !fresh->updatepal( *fresh, dest, src ) )
	{
		delete fresh;
		return NULL;
	}

	// the new blitter takes over the slot, evicting any previous occupant
	if ( blitters[slot] != NULL )
		delete blitters[slot];

	blitters[slot] = fresh;
	return fresh;
}


//////////////////////////////////////////////////////
// clear innerloops                                //
////////////////////////////////////////////////////

// Scanline clear routine: for each of 'length' pixels starting at 'src',
// keeps the pixel bits selected by 'mask' and ORs in the bits of 'or_'.
typedef void (*ClearFunc)(void *src, int length, uint32 or_, uint32 mask);

// Clears 'length' consecutive 1-byte pixels starting at 'src': each pixel
// keeps the bits selected by 'mask' and gets the bits of 'or_' ORed in.
// Does nothing when 'length' is zero or negative.
static void inner_clear_1(void *src, int length, uint32 or_, uint32 mask)
{
	uint8* pixel = (uint8*) src;

	// read and write through the same pointer value, then advance; the old
	// "*p++ = (*p & mask) | or_" form left the read unsequenced with the increment
	for ( ; length > 0; --length, ++pixel )
		*pixel = (uint8) ((*pixel & mask) | or_);
}

// Clears 'length' consecutive 2-byte pixels starting at 'src': each pixel
// keeps the bits selected by 'mask' and gets the bits of 'or_' ORed in.
// Does nothing when 'length' is zero or negative.
static void inner_clear_2(void *src, int length, uint32 or_, uint32 mask)
{
	uint16* pixel = (uint16*) src;

	// read and write through the same pointer value, then advance; the old
	// "*p++ = (*p & mask) | or_" form left the read unsequenced with the increment
	for ( ; length > 0; --length, ++pixel )
		*pixel = (uint16) ((*pixel & mask) | or_);
}

// Clears 'length' consecutive 3-byte pixels starting at 'src': each pixel
// keeps the bits selected by 'mask' and gets the bits of 'or_' ORed in.
// Pixels are accessed through read24()/write24().
// Does nothing when 'length' is zero or negative.
static void inner_clear_3(void *src, int length, uint32 or_, uint32 mask)
{
	uint8* pixel = (uint8*) src;

	// a plain counted loop: the old do/while ran away when length was <= 0
	for ( ; length > 0; --length, pixel += 3 )
		write24( pixel, (read24( pixel ) & mask) | or_ );
}

// Clears 'length' consecutive 4-byte pixels starting at 'src': each pixel
// keeps the bits selected by 'mask' and gets the bits of 'or_' ORed in.
// Does nothing when 'length' is zero or negative.
static void inner_clear_4(void *src, int length, uint32 or_, uint32 mask)
{
	uint32* pixel = (uint32*) src;

	// read and write through the same pointer value, then advance; the old
	// "*p++ = (*p & mask) | or_" form left the read unsequenced with the increment
	for ( ; length > 0; --length, ++pixel )
		*pixel = (*pixel & mask) | or_;
}


//////////////////////////////////////////////////////
// surface interface                               //
////////////////////////////////////////////////////

void Surface::Clear(const Color32& color, ClearMode mode)
{
	ClearFunc clearfuncs[4] =
	{ inner_clear_1, inner_clear_2, inner_clear_3, inner_clear_4 };

	// choose clear function
	ClearFunc func = clearfuncs[format.GetBytes() - 1];

	unsigned int or_, mask;

	if ( format.IsIndexed() )
	{
		int bestdist = 256 * 256 * 3;
		int bestidx = 0;
		Color32* pal = format.GetPalette();

		for ( int i = 0; i < 256; i++ )
		{
			int newdist =   ((uint32)pal[i].r - (uint32)color.r) * ((uint32)pal[i].r - (uint32)color.r)
						  + ((uint32)pal[i].g - (uint32)color.g) * ((uint32)pal[i].g - (uint32)color.g)
						  + ((uint32)pal[i].b - (uint32)color.b) * ((uint32)pal[i].b - (uint32)color.b);
			if ( newdist < bestdist )
			{
				bestdist = newdist;
				bestidx = i;
			}
		}

		mask = ((mode & CLEAR_COLOR) ? format.GetIntensityMask() : 0) |
			   ((mode & CLEAR_ALPHA) ? format.GetAlphaMask() : 0);
		BlitMask maski, maska;
		MakeBlitMask( maski, 0xff, format.GetIntensityMask() );
		MakeBlitMask( maska, 0xff000000, format.GetAlphaMask() );
		or_ = ((((bestidx >> maski.right) << maski.left) & maski.mask)
			  | (((((color & 0xff000000) >> 24) >> maska.right) << maska.left)
			       & maska.mask)) & mask;

	}
	else if ( format.IsIntensity() )
	{
		mask = ((mode & CLEAR_COLOR) ? format.GetIntensityMask() : 0) |
			   ((mode & CLEAR_ALPHA) ? format.GetAlphaMask() : 0);
		BlitMask maski, maska;
		MakeBlitMask( maski, 0xff, format.GetIntensityMask() );
		MakeBlitMask( maska, 0xff000000, format.GetAlphaMask() );
		uint32 c = Blitter::palmono[(color & 0xff0000) >> 16].r
						 + Blitter::palmono[(color & 0xff00) >> 8].g
						 + Blitter::palmono[color & 0xff].b;
		or_ = ((((c >> maski.right) << maski.left) & maski.mask)
			  | (((c >> maska.right) << maska.left) & maska.mask)) & mask;
	}
	else // Direct
	{
		mask = ((mode & CLEAR_COLOR) ? (format.GetRedMask() | format.GetGreenMask() | format.GetBlueMask()) : 0) |
			   ((mode & CLEAR_ALPHA) ? (format.GetAlphaMask()) : 0);
		BlitMask maskr, maskg, maskb, maska;
		MakeBlitMask( maskr, palformat.GetRedMask(), format.GetRedMask() );
		MakeBlitMask( maskg, palformat.GetGreenMask(), format.GetGreenMask() );
		MakeBlitMask( maskb, palformat.GetBlueMask(), format.GetBlueMask() );
		MakeBlitMask( maska, 0xff000000, format.GetAlphaMask() );
		uint32 c = color;
		or_ = ((((c >> maskr.right) << maskr.left) & maskr.mask)
			  | (((c >> maskg.right) << maskg.left) & maskg.mask)
			  | (((c >> maskb.right) << maskb.left) & maskb.mask)
			  | (((c >> maska.right) << maska.left) & maska.mask)) & mask;
	}

	// clear
	for ( int y = 0; y < height; y++ )
		func( image + y * pitch, width, or_, ~mask );

}


void Surface::Blit(const Surface& source, BlitMode mode)
{

	static bool init = false;
	if ( !init )
	{
		BitmapBlitterInitialize();
		init = true;
	}

	
	// validate surfaces
	if ( !(width && height && image && source.width && source.height && source.image) )
		return;

	bool difsize = width != source.width || height != source.height;

	// choose blitter mode
	int type = 0;
	
	if ( source.format != format )
		type |= BLIT_REMAP;

	if ( mode==BLIT_SCALE && difsize )
		type |= BLIT_STRETCH;

	if ( mode==BLIT_BILINEAR_SCALE && difsize )
		type |= BLIT_BILINEAR | BLIT_STRETCH;
	


	// plain row-by-row copying if !conversion && !stretching
	if ( !type )
	{
		int minwidth = min( width, source.width );
		int minheight = min( height, source.height );

		int length = minwidth * format.GetBytes();
		uint8* s = source.image;
		uint8* d = image;
		for ( int y = 0; y < minheight; y++ )
		{
			memcpy(d,s,length);
			s += source.pitch;
			d += pitch;
		}
	}
	else
	{
		Blitter* blitter;

		// get a suitable blitter
		if ( !(blitter = GetBlitter( format, source.format, type )) )
			return;

		InnerInfo innerinfo;
		innerinfo.dest = (uint8*)image;

		// stretching
		if ( blitter->convtype & BLIT_STRETCH )
		{
			uint32 vstep, v;

			// bilinear stretch has special UV calculations
			if ( blitter->convtype & BLIT_BILINEAR )
			{
				if ( width < source.width )
				{
					innerinfo.ustep = (source.width << 16) / width;
					innerinfo.ustart = (innerinfo.ustep >> 1) - 0x8000;
				}
				else
				{
					int div = width - 1;
					if ( !div ) div = 1;

					innerinfo.ustep = ((source.width - 1) << 16) / div;
					innerinfo.ustart = 0;
				}

				if ( height < source.height )
				{
					vstep = (source.height << 16) / height;
					v = (vstep >> 1) - 0x8000;
				}
				else
				{
					int div = height - 1;
					if ( !div ) div = 1;

					vstep = ((source.height - 1) << 16) / div;
					v = 0;
				}
			}
			else // !bilinear
			{
				innerinfo.ustep = (source.width << 16) / width;
				innerinfo.ustart = (innerinfo.ustep >> 1);
				vstep = (source.height << 16) / height;
				v = vstep >> 1;
			}

			innerinfo.width = width;
			
			for ( int y = 0; y < height; y++ )
			{
				innerinfo.src = (uint8*)source.image + ((v >> 16) * source.pitch);
				innerinfo.src2 = innerinfo.src + source.pitch;

				// clamp lower scanline - just in case
				// the delta calculation above should ensure this is not required,
				// but, rgb888_bilinear innerloops pagefault -- fix this performance/quality
				// caveat sometime. ;-)
				if ( y==(height-1) ) innerinfo.src2 = innerinfo.src;

				innerinfo.vfrac = v & 0xffff;
				blitter->Blit( &innerinfo );
				v += vstep;
				innerinfo.dest += pitch;
			}
		}
		else // conversion && !stretching
		{
			int minwidth = min( width, source.width );
			int minheight = min( height, source.height );

			innerinfo.width = minwidth;

			innerinfo.src = (uint8*)source.image;
			for ( int y = 0; y < minheight; y++ )
			{
				blitter->Blit( &innerinfo );
				innerinfo.dest += pitch;
				innerinfo.src += source.pitch;
			}
		}
	}
}
