; Byte offset, relative to the current vector pointer, that the matbyvec
; loop passes to prefetcht0. The %ifndef lets a value defined earlier
; (e.g. on the assembler command line) override the default.
%ifndef PREFETCH_DISTANCE
%define PREFETCH_DISTANCE 174
%endif

; Uncomment to assemble the matbyvec loop without its prefetcht0 instruction.
;%define NO_PREFETCH
; When defined, matbyvec executes cli on entry and sti just before returning.
%define DISABLE_INTS

; NOTE: This is an attempt at an SSE implementation.

section .text

;-----------------------------------------------------------------------
; int matbyvec (const float* matrix, const float* vectors, int num_vec, int* output)
;
; Transforms each input point by the first three rows of `matrix`,
; divides x' and y' by z' and stores the pair as 32-bit integers.
;
; ABI:   32-bit stdcall (ret 16 removes the four dword arguments)
; In:    matrix  - 12 floats are read (rows A..D, E..H, I..L) with movups,
;                  so no alignment is required
;        vectors - num_vec entries of 16 bytes each, [ x y z w ]; the w
;                  element is never used (the D/H/L column is added as is).
;                  Loaded with movaps: must be 16-byte aligned.
;        num_vec - number of points; zero or negative processes nothing
;        output  - receives num_vec pairs of 32-bit ints [ X Y ]
; Out:   eax = low dword of the TSC at exit minus low dword of the TSC
;              taken at entry
; Clobb: ecx, edx, xmm0-xmm7, mm0, flags (ebx is saved and restored)
; Notes: - 1/z' is produced by rcpss, i.e. it is a reciprocal approximation.
;        - With DISABLE_INTS defined the routine is bracketed by cli/sti,
;          which are privileged (IOPL-sensitive) instructions.
;-----------------------------------------------------------------------
global _matbyvec@16
_matbyvec@16:

%ifdef DISABLE_INTS
			cli
%endif
			pxor	mm0, mm0
			push	ebx
			rdtsc
			push	eax		; remember starting TSC (low dword)

		; Stack here: [esp]=start TSC, [esp+4]=saved ebx, [esp+8]=return address
			mov	eax, [esp+12]	; matrix
			mov	edx, [esp+16]	; vectors
			mov	ecx, [esp+20]	; num_vec
			mov	ebx, [esp+24]	; output

		; The loop counts down with dec/jnz, so a zero or negative count
		; would wrap around and run off the end of the buffers.
			test	ecx, ecx
			jle	.done

;	[ x']	[ A B C D ]   [ x ]
;	[ y'] = [ E F G H ] * [ y ]
;	[ z']	[ I J K L ]   [ z ]
;	[ 1 ]	[ 0 0 0 1 ]   [ 1 ]
;
;	final = [ X Y ] = [ x'/z' y'/z' ]
;
; input:  eax - matrix [ a11 a12 a13 a14 a21 .. a34 ] (three rows are read)
;	  edx - array of 3D points [ x y z w ]
;	  ecx - number of 3D points (> 0 here)
; output: ebx - destination - array of 2D points
; matrix and the 3D points contain 32bit floats, the 2D points are 32bit ints
;
; Register comments list elements from highest to lowest, e.g. [ D C B A ]
; means A is element 0.

		; Nice that we can load this only once
			movups	xmm0, [eax]	; [ D C B A ]
			movups	xmm1, [eax+16]	; [ H G F E ]
			movups	xmm2, [eax+32]	; [ L K J I ]

		; Now we have to transpose the matrix for our algo:
		; xmm4..xmm7 end up holding one matrix column each in their
		; three low elements (the top element is a don't-care).
		; Each shufps rotates a row down by one element so the next
		; column lands in element 0.
			movaps	xmm4, xmm0
			unpcklps xmm4, xmm2	; [ J B I A ]
			unpcklps xmm4, xmm1	; [ F I E A ]
			shufps	xmm0, xmm0, 00111001b ; [ A D C B ]
			shufps	xmm1, xmm1, 00111001b ; [ E H G F ]
			shufps	xmm2, xmm2, 00111001b ; [ I L K J ]
			movaps	xmm5, xmm0
			unpcklps xmm5, xmm2	; [ K C J B ]
			unpcklps xmm5, xmm1	; [ G J F B ]
			shufps	xmm0, xmm0, 00111001b ; [ B A D C ]
			shufps	xmm1, xmm1, 00111001b ; [ F E H G ]
			shufps	xmm2, xmm2, 00111001b ; [ J I L K ]
			movaps	xmm6, xmm0
			unpcklps xmm6, xmm2	; [ L D K C ]
			unpcklps xmm6, xmm1	; [ H K G C ]
			shufps	xmm0, xmm0, 00111001b ; [ C B A D ]
			shufps	xmm1, xmm1, 00111001b ; [ G F E H ]
			shufps	xmm2, xmm2, 00111001b ; [ K J I L ]
			movaps	xmm7, xmm0
			unpcklps xmm7, xmm2	; [ I A L D ]
			unpcklps xmm7, xmm1	; [ E L H D ]

			align	16

.loop:
		; Load vector (aligned load: edx must be a multiple of 16)
			movaps	xmm0, [edx]	; [ w z y x ]

		; Prefetch
%ifndef NO_PREFETCH
			prefetcht0 [edx+PREFETCH_DISTANCE]
%endif
			add	edx, 16

		; Reorganize: broadcast x, y and z into separate registers
			movaps	xmm2, xmm0	; [ w z y x ]
			unpcklps xmm0, xmm0	; [ y y x x ]
			movaps	xmm1, xmm0
			unpcklps xmm0, xmm0	; [ x x x x ]
			unpckhps xmm1, xmm1	; [ y y y y ]
			unpckhps xmm2, xmm2	; [ w w z z ]
			unpcklps xmm2, xmm2	; [ z z z z ]

		; Multiplication (pretty nice)
			mulps	xmm0, xmm4	; [ F*x I*x E*x A*x ]
			mulps	xmm1, xmm5	; [ G*y J*y F*y B*y ]
			mulps	xmm2, xmm6	; [ H*z K*z G*z C*z ]

		; Accumulation (xmm7 adds the D/H/L column unscaled)
			addps	xmm0, xmm1
			addps	xmm2, xmm7
			addps	xmm0, xmm2	; [ ? z' y' x']

		; Division (movhlps only writes the low half of xmm3; the
		; stale upper half is discarded by the unpcklps below)
			movhlps xmm3, xmm0	; [ ? ? ? z']
			rcpss	xmm3, xmm3	; [ ? ? ? 1/z']
			unpcklps xmm3, xmm3	; [ ? ? 1/z' 1/z']
			mulps	xmm0, xmm3	; [ ? ? Y X ]

		; Final result: the two low floats become two 32bit ints
			cvtps2pi mm0, xmm0	; [ Y X ]
			movq	[ebx], mm0
			add	ebx, 8

		; Next point
			dec	ecx
			jnz	.loop

.done:
			rdtsc
			pop	ecx		; starting TSC (low dword)
			sub	eax, ecx	; eax = elapsed cycles (low dword)
			pop	ebx
			emms			; leave MMX state before returning
%ifdef DISABLE_INTS
			sti
%endif
			ret	16


;-----------------------------------------------------------------------
; int check_sse (void)
;
; Check for SSE support.
;
; ABI:   32-bit stdcall, no arguments (plain ret)
; Out:   eax = 1 if CPUID leaf 1 reports SSE (EDX bit 25), otherwise 0.
;        Also returns 0 when the CPUID instruction itself is unavailable
;        or leaf 1 is not implemented.
; Clobb: ecx, edx, flags (ebx is saved and restored; EFLAGS.ID is put
;        back to its original state)
; Note:  only the CPU feature bit is tested here, nothing else.
;-----------------------------------------------------------------------
%define EFLAGS_ID_BIT	00200000h	; EFLAGS bit 21: writable only if CPUID exists
%define CPUID_EDX_SSE	02000000h	; CPUID leaf 1, EDX bit 25: SSE

			align	16
global _check_sse@0
_check_sse@0:		push	ebx			; cpuid overwrites ebx

		; CPUID is only present if the ID bit in EFLAGS can be toggled
			pushfd
			pop	eax			; eax = current EFLAGS
			mov	ecx, eax		; ecx = original EFLAGS
			xor	eax, EFLAGS_ID_BIT	; flip the ID bit
			push	eax
			popfd				; try to write it back
			pushfd
			pop	eax			; eax = EFLAGS after the attempt
			push	ecx
			popfd				; restore the caller's EFLAGS
			xor	eax, ecx		; bits that really changed
			test	eax, EFLAGS_ID_BIT
			jz	.no_sse			; ID bit is stuck: no CPUID, so no SSE

			xor	eax, eax
			cpuid				; eax = highest basic leaf
			cmp	eax, 1
			jb	.no_sse			; unsigned compare: leaf 1 missing
			mov	eax, 1
			cpuid				; edx = feature flags
			test	edx, CPUID_EDX_SSE
			jz	.no_sse
			mov	eax, 1
			jmp	short .ok
.no_sse:		xor	eax, eax
.ok:			pop	ebx
			ret
