; Build-time configuration (NASM preprocessor symbols).
;
; PREFETCH_DISTANCE - byte displacement added to the vector read pointer in
;		      the prefetch instruction of the matbyvec loop.  The value
;		      below is only a default: a definition supplied before
;		      this point takes precedence.
%ifndef PREFETCH_DISTANCE
%define PREFETCH_DISTANCE 196
%endif

; Uncomment to assemble matbyvec without its prefetch instruction.
;%define NO_PREFETCH
; Uncomment to bracket matbyvec with cli/sti.
; NOTE(review): cli/sti are privileged instructions - presumably only usable
; where the caller runs with I/O privilege; confirm before enabling.
;%define DISABLE_INTS

; NOTE: This routine has been optimized for K6-2

section .text

global _matbyvec@16
;-----------------------------------------------------------------------
; int matbyvec (const float* matrix, const float* vectors, int num_vec, int* output)
;
; Transforms an array of 3D points by a matrix, divides x' and y' by z'
; and stores each result as a pair of 32-bit integers.
;
;	[ x']	[ A B C D ]   [ x ]
;	[ y'] = [ E F G H ] * [ y ]
;	[ z']	[ I J K L ]   [ z ]
;	[ 1 ]	[ 0 0 0 1 ]   [ 1 ]
;
;	final = [ X Y ] = [ x'/z' y'/z' ]
;
; ABI:	 32-bit, arguments on the stack, callee removes them (ret 16).
; In:	 matrix  - 32-bit floats, row-major [ A B C D E F G H I J K L .. ];
;		   only the first 12 floats (three rows) are read.
;	 vectors - num_vec points of four 32-bit floats [ x y z w ].
;		   w is multiplied with D, H and L, so it must be 1.0 for
;		   the formula above to hold.
;	 num_vec - number of points; one point is processed per iteration,
;		   so any count works.  Values <= 0 transform nothing.
;	 output  - receives num_vec pairs of 32-bit ints [ X Y ].
; Out:	 eax	 - low 32 bits of the time-stamp counter difference between
;		   entry and exit (rdtsc), i.e. the elapsed cycle count.
; Clobb: ecx, edx, flags, mm0-mm7 (femms is issued on entry and on exit).
;	 ebx is saved and restored.
;
; Notes: * 1/z' comes from pfrcp alone (a reciprocal estimate, no
;	   refinement steps), and pf2id converts to integer, so results
;	   are approximate.
;	 * Exactly num_vec points are read from vectors: the loop preloads
;	   the next point while finishing the current one, therefore the
;	   last point is handled outside the loop without a preload.
;	 * The "n (m)" numbers in the loop comments are the original
;	   author's K6-2 scheduling notes and are kept unchanged.
;-----------------------------------------------------------------------
_matbyvec@16:

%ifdef DISABLE_INTS
			cli
%endif
			femms
			push	ebx		; callee-saved, used as output pointer
			rdtsc
			push	eax		; start time stamp (low dword)

			; two dwords pushed above: first argument is at [esp+12]
			mov	eax, [esp+12]	; eax = matrix
			mov	edx, [esp+16]	; edx = vectors
			mov	ecx, [esp+20]	; ecx = num_vec
			mov	ebx, [esp+24]	; ebx = output

			test	ecx, ecx
			jle	.done		; nothing to transform, touch no memory

			; preload the first point and the third matrix row
			movq	mm0, [edx]	; [ y x ]
			movq	mm1, [edx+8]	; [ 1 z ]
			movq	mm2, [eax+8*4]	; [ J I ]
			movq	mm3, [eax+10*4] ; [ L K ]
			add	edx, 16 	; edx -> next point to load

			dec	ecx		; ecx = points that follow the loaded one
			jz	.last		; single point: no preload allowed

			align	16

; Invariant at .loop: mm0/mm1 hold the current point, mm2/mm3 hold
; [ J I ] / [ L K ], edx points at the next (existing) point and
; ecx > 0 is the number of points still to be loaded.
.loop:
			movq	mm4, [eax]	;  1 (2) [ B A ]
			pfmul	mm2, mm0	;  - (2) [ J*y I*x ]

			movq	mm5, [eax+2*4]	;  2 (2) [ D C ]
			pfmul	mm3, mm1	;  - (2) [ L K*z ]

			movq	mm6, [eax+4*4]	;  3 (2) [ F E ]
			pfmul	mm4, mm0	;  - (2) [ B*y A*x ]

			movq	mm7, [eax+6*4]	;  4 (2) [ H G ]
			pfmul	mm5, mm1	;  - (2) [ D C*z ]

			pfacc	mm2, mm3	;  5 (2) [ K*z+L I*x+J*y ]
			pfmul	mm6, mm0	;  - (2) [ F*y E*x ]

			pfacc	mm4, mm5	;  6 (2) [ C*z+D A*x+B*y ]
			pfmul	mm7, mm1	;  - (2) [ H G*z ]

			movq	mm0, [edx]	;  7 (2) [ next_y next_x ]
			pfacc	mm2, mm2	;  - (2) [ z' z']

			movq	mm1, [edx+8]	;  9 (2) [ 1 next_z ]
			pfacc	mm6, mm7	;  - (2) [ G*z+H E*x+F*y ]

			movq	mm3, [eax+10*4] ;  9 (2) [ L K ]
			pfrcp	mm5, mm2	;  - (2) [ 1/z' 1/z']

			movq	mm2, [eax+8*4]	; 10 (2) [ J I ]
			pfacc	mm4, mm6	;  - (2) [ y' x']

%ifndef NO_PREFETCH
			prefetch [edx+PREFETCH_DISTANCE] ; 11 (2)
%endif
			add	edx, 16 	;  - (1)

			pfmul	mm4, mm5	; 12 (2) [ Y X ]
			pf2id	mm4, mm4	; 14 (2)

			movq	[ebx], mm4	; 15 (2)
			add	ebx, 8		;  - (1)

			dec	ecx		; 14 (1)
			jnz	.loop

; Last point: same arithmetic as the loop body, but nothing is loaded
; from vectors any more, so the input array is never read past its end.
.last:
			movq	mm4, [eax]	; [ B A ]
			pfmul	mm2, mm0	; [ J*y I*x ]

			movq	mm5, [eax+2*4]	; [ D C ]
			pfmul	mm3, mm1	; [ L K*z ]

			movq	mm6, [eax+4*4]	; [ F E ]
			pfmul	mm4, mm0	; [ B*y A*x ]

			movq	mm7, [eax+6*4]	; [ H G ]
			pfmul	mm5, mm1	; [ D C*z ]

			pfacc	mm2, mm3	; [ K*z+L I*x+J*y ]
			pfmul	mm6, mm0	; [ F*y E*x ]

			pfacc	mm4, mm5	; [ C*z+D A*x+B*y ]
			pfmul	mm7, mm1	; [ H G*z ]

			pfacc	mm2, mm2	; [ z' z']
			pfacc	mm6, mm7	; [ G*z+H E*x+F*y ]

			pfrcp	mm5, mm2	; [ 1/z' 1/z']
			pfacc	mm4, mm6	; [ y' x']

			pfmul	mm4, mm5	; [ Y X ]
			pf2id	mm4, mm4

			movq	[ebx], mm4

.done:
			rdtsc
			pop	ecx		; start time stamp
			sub	eax, ecx	; eax = elapsed cycles (low dword)
			pop	ebx
			femms
%ifdef DISABLE_INTS
			sti
%endif
			ret	16


;-----------------------------------------------------------------------
; int check_3dnow (void)
;
; Check for 3DNow! support.
;
; ABI:	 32-bit, no arguments, plain ret.
; Out:	 eax = 1 if the processor reports 3DNow!, 0 otherwise.
;	 0 is also returned when the CPUID instruction itself is not
;	 available or extended function 80000001h is not implemented.
; Clobb: ecx, edx, flags.  ebx is saved and restored.
;-----------------------------------------------------------------------
%define EFLAGS_ID_BIT		00200000h	; EFLAGS bit 21: writable only if CPUID exists
%define CPUID_EXT_MAX_LEAF	80000000h	; returns highest extended function in eax
%define CPUID_EXT_FEATURES	80000001h	; extended feature flags in edx
%define CPUID_EXT_EDX_3DNOW	80000000h	; edx bit 31 of the function above

			align	16
global _check_3dnow@0
_check_3dnow@0:
			push	ebx			; cpuid overwrites ebx

			; Executing cpuid on a processor without it raises an
			; invalid-opcode exception, so first try to flip EFLAGS.ID.
			pushfd
			pop	eax			; eax = current EFLAGS
			mov	ecx, eax		; ecx = original EFLAGS
			xor	eax, EFLAGS_ID_BIT
			push	eax
			popfd				; attempt to change the ID bit
			pushfd
			pop	eax			; eax = EFLAGS after the attempt
			push	ecx
			popfd				; put the original EFLAGS back
			xor	eax, ecx		; bits that actually changed
			test	eax, EFLAGS_ID_BIT
			jz	.no_3dnow		; ID bit stuck: no cpuid

			mov	eax, CPUID_EXT_MAX_LEAF
			cpuid
			cmp	eax, CPUID_EXT_FEATURES
			jb	.no_3dnow		; extended feature function missing

			mov	eax, CPUID_EXT_FEATURES
			cpuid
			test	edx, CPUID_EXT_EDX_3DNOW
			jz	.no_3dnow

			mov	eax, 1
			jmp	short .ok
.no_3dnow:
			xor	eax, eax
.ok:
			pop	ebx
			ret
