//-------------------------------------------------------------------------------------
//
// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies.  Intel makes no representations about the
// suitability of this software for any purpose.  THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//

/////////////////////////////////////////////////////////////////////////////
// Based upon:
//
// Approximate Math Library for SSE / SSE2
//  Header File
//  Version 2.0
//  Author Alex Klimovitski, Intel GmbH
/////////////////////////////////////////////////////////////////////////////
#include <emmintrin.h>

#include "AMaths.h"
#include "AMaths_internal.h"

#ifdef AMATHS_ASM

// am_pow_ss -- approximate scalar pow(x, y) on the low float of each argument.
//
// Inputs   : x is read from xmm0 and y from xmm1; neither is loaded from the
//            stack by this routine.
//            NOTE(review): this relies on the compiler passing the two __m128
//            arguments in xmm0/xmm1 -- confirm against the target toolchain.
// Returns  : low float of xmm0.  A base that is zero or negative returns 0
//            (the whole register is cleared on that path).
// Clobbers : eax, ecx, edx, xmm1-xmm7, flags.
// Stack    : naked function, no frame.  One 4-byte scratch slot is reserved
//            with "sub esp, 4" and released before returning; the original
//            code used [esp - 4] without reserving it, which is unsafe because
//            memory below esp may be overwritten at any time on 32-bit x86.
//            "ret 32" pops 32 bytes of arguments.
//
// Method, as implemented below (the _ps_* constants are defined outside this
// file; their exact values are not visible here):
//   1. Clamp x from below to _ps_am_min_norm_pos.
//   2. m = (x & _ps_am_inv_mant_mask) | 1.0, e = (bits(x) >> 23) - 0x7f.
//   3. t = 2 * (m - 1) / (m + 1), evaluated with rcpss.
//   4. r = t + t * P(t^2) / Q(t^2), with P, Q built from _ps_log_p*/_ps_log_q*.
//   5. z = r * (y * _ps_log2_c0) + e * y, clamped to [_ps_exp2_lo, _ps_exp2_hi].
//   6. n = z + 0.5 truncated, minus 1 when z + 0.5 is negative;  f = z - n.
//   7. result = float_from_bits((n + 0x7f) << 23) * (1 + 2 * Pe / (Qe - Pe)),
//      with Pe, Qe built from _ps_exp2_p*/_ps_exp2_q* evaluated at f.
__m128 __declspec(naked) __stdcall am_pow_ss(__m128 x, __m128 y)
{
	__asm
	{
		// ---- base check and clamp ------------------------------------------
		xorps	xmm7, xmm7
		comiss	xmm7, xmm0                 // flags for 0 ? x; carry set when 0 < x
		movss	xmm7, _ps_am_inv_mant_mask // movss/maxss leave the flags intact
		maxss	xmm0, _ps_am_min_norm_pos  // cut off denormalized stuff
		jnc		l_zerobase                 // x <= 0: return 0 (stack untouched)

		// Reserve a real scratch slot instead of writing below esp.  Done after
		// the branch so the early-out path needs no cleanup; the flags changed
		// by "sub" are not consumed by anything that follows.
		sub		esp, 4

		movss	xmm3, _ps_am_1
		movss	[esp], xmm0                // raw bits of clamped x, reloaded into edx

		// ---- split x: xmm0 = m, edx = exponent -----------------------------
		andps	xmm0, xmm7
		orps	xmm0, xmm3                 // m = (x & inv_mant_mask) | 1.0
		movss	xmm7, xmm0

		addss	xmm7, xmm3                 // m + 1
		subss	xmm0, xmm3                 // m - 1
		mov		edx, [esp]                 // edx = bits of clamped x
		rcpss	xmm7, xmm7                 // approximate 1 / (m + 1)
		mulss	xmm0, xmm7
		addss	xmm0, xmm0                 // t = 2 * (m - 1) / (m + 1)

		shr		edx, 23                    // biased exponent field (sign bit is 0 here)

		movss	xmm4, _ps_log_p0
		movss	xmm6, _ps_log_q0

		sub		edx, 0x7f                  // e = exponent field minus the bias 0x7f
		movss	xmm2, xmm0
		mulss	xmm2, xmm2                 // t^2

		// ---- rational polynomial in t^2 (Horner), interleaved with e * y ----
		mulss	xmm4, xmm2
		movss	xmm5, _ps_log_p1
		mulss	xmm6, xmm2
		cvtsi2ss	xmm3, edx              // (float)e
		movss	xmm7, _ps_log_q1

		addss	xmm4, xmm5
		mulss	xmm3, xmm1                 // e * y
		addss	xmm6, xmm7

		movss	xmm5, _ps_log_p2
		mulss	xmm4, xmm2
		movss	xmm7, _ps_log_q2
		mulss	xmm6, xmm2

		addss	xmm4, xmm5
		mulss	xmm1, _ps_log2_c0          // y * log2_c0
		addss	xmm6, xmm7                 // Q = (q0 * t^2 + q1) * t^2 + q2

		mulss	xmm4, xmm2                 // P = ((p0 * t^2 + p1) * t^2 + p2) * t^2
		rcpss	xmm6, xmm6                 // approximate 1 / Q

		mulss	xmm6, xmm0                 // t / Q
		mulss	xmm4, xmm6                 // P * t / Q
		movss	xmm6, _ps_exp2_hi
		addss	xmm0, xmm4                 // r = t + P * t / Q
		movss	xmm4, _ps_exp2_lo
		xorps	xmm7, xmm7                 // 0.0 for the sign test below
		movss	xmm5, _ps_am_0p5
		mulss	xmm0, xmm1                 // r * (y * log2_c0)

		addss	xmm0, xmm3                 // z = r * y * log2_c0 + e * y
		xor		ecx, ecx

		// ---- clamp z and split it into integer n and remainder f -----------
		minss	xmm0, xmm6                 // z = min(z, exp2_hi)
		mov		edx, 1
		maxss	xmm0, xmm4                 // z = max(z, exp2_lo)

		addss	xmm5, xmm0                 // z + 0.5

		comiss	xmm5, xmm7
		cvttss2si	eax, xmm5              // truncate toward zero (flags untouched)
		cmovc	ecx, edx  // 'c' is 'lt' for comiss
		sub		eax, ecx                   // n: subtract 1 when z + 0.5 was negative

		cvtsi2ss	xmm5, eax
		add		eax, 0x7f                  // re-bias n for the exponent field

		subss	xmm0, xmm5                 // f = z - n

		movss	xmm2, xmm0
		mulss	xmm2, xmm2                 // f^2

		// ---- rational approximation evaluated at f -------------------------
		movss	xmm6, _ps_exp2_q0
		movss	xmm4, _ps_exp2_p0

		mulss	xmm6, xmm2
		movss	xmm7, _ps_exp2_q1
		mulss	xmm4, xmm2
		movss	xmm5, _ps_exp2_p1

		shl		eax, 23                    // eax = (n + 0x7f) << 23, a float bit pattern
		addss	xmm6, xmm7                 // Qe = q0 * f^2 + q1
		addss	xmm4, xmm5

		movss	xmm5, _ps_exp2_p2
		mulss	xmm4, xmm2

		addss	xmm4, xmm5

		mulss	xmm4, xmm0                 // Pe = ((p0 * f^2 + p1) * f^2 + p2) * f

		mov		[esp], eax                 // move the bit pattern to xmm0 via the slot
		subss	xmm6, xmm4                 // Qe - Pe
		movss	xmm7, _ps_am_1
		rcpss	xmm6, xmm6                 // approximate 1 / (Qe - Pe)
		mulss	xmm4, xmm6
		movss	xmm0, [esp]                // xmm0 = float with exponent field n + 0x7f
		addss	xmm4, xmm4
		addss	xmm4, xmm7                 // 1 + 2 * Pe / (Qe - Pe)

		mulss	xmm0, xmm4                 // scale by the exponent-only float

		add		esp, 4                     // release the scratch slot
		ret		32

l_zerobase:
		// Base was zero or negative: return 0.  esp was never adjusted here.
		xorps	xmm0, xmm0

		ret		32
	}
}

#endif