;'plattenbau 3000 sse' at 1024x768x8Bit by Kuemmel 
;requires SSE4.1
org 0x100
use16

;---screen geometry of the video mode that is set below
width  = 1024
height = 768

;---tunable parameters (all resolved at assemble time)
effect_speed_shift = 10         ;default: 10, shift count of the shld that derives the mask variation from the frame counter
effect_01          = 0x100      ;default: 100000000b, frame counter bits tested for the 'platte' y-variation; should roughly fit effect_speed_shift
effect_02          = 0x1ff      ;default: 111111111b, frame counter bits tested before the opcode toggles; should roughly fit effect_speed_shift
depth_initial      = 8          ;default: 8, shift count applied to the all-ones words that form the start depth
depth_steps        = 255        ;raycasting steps 0...255 (could be raised, costs +1 Byte)
bg_colour          = 54         ;background colour, index into the standard VGA palette
x_offset           = 512 - 130  ;default 320 (center x: 512)
y_offset           = 384 + 50   ;default 384 (center y: 384)
xy_speed           = 9          ;shift count used when the x/y timer is advanced once per frame
;---create words with 0,1,2,3,4,5,6,7,... for SSE x-offset mask
;   stosw stores ax at es:di and advances di by 2; ax is incremented after every word
;   and cx is the iteration count of LOOP. The table is read back in loopx as [si+16]
;   (si=0x300), where its first 8 words are added to the broadcast x value.
;   NOTE(review): neither ax nor cx is initialised here, so the table only starts at 0
;   (and is at least 8 words long) if the loader hands over ax=0 and a suitable cx -
;   a common size-coding assumption for DOS .COM files; confirm on the target DOS.
;   NOTE(review): es still holds its entry value at this point (presumably the program
;   segment, so that the later ds-relative reads see the table) - confirm.
mov di,0x310			;table start, aligned to 16 Byte (needed for the paddw memory operand)
loop01234567:
   stosw
   inc ax
loop loop01234567

;---switch to 1024x768x8Bit before palette is changed
;   The result of the BIOS call is not checked; the code continues unconditionally.
mov ax,0x4f02			;VESA BIOS function: set video mode
mov bx,0x105			;mode number for 1024x768 with 256 colours (leaves bh=1, see palette code below)
int 10h
push 0xa000
pop es					;es = 0xa000, segment used by the pixel writes (movq [es:di],...)

;---create background colour (palette entry 0)...looks expensive but shorter than done in SSE
;if background colour black is okay this can be skipped totally
;Reads the RGB value of palette entry bg_colour and writes it to palette entry 0.
;Pixels for which no hit is found keep the value 0 (hit_colours is cleared per pixel group),
;so entry 0 acts as the background colour.
mov ax,0x1015		;video BIOS: read individual DAC register
mov bl,bg_colour	;doesn't seem to care that bh is set from before
int 10h 			;get RGB in dh,ch,cl
mov al,10h			;only al is rewritten => function 0x1010 (set DAC register), assumes ah is still 0x10 after the call above
xor bx,bx			;set RGB in dh,ch,cl (clear bh for later also)
int 10h

;---constants/variable address
;   bp   = global frame counter (drives the effect switches and the frame limit)
;   si   = base of a 16 byte scratch buffer at 0x300; the 0,1,2,... word table follows at si+16 (0x310)
;   xmm6 = x/y movement timer, added to x and y inside the depth loop
xor bp,bp			;init global timer to 0...can be omitted if you need two bytes...
mov si,0x300
;add si,si			;=0x200 aligned to 16 Byte...only safe if code size maximum is <=256 Bytes !!!
					;NOTE(review): this alternative presumes si=0x100 on entry and would also require
					;the table (mov di,0x310) and the 0x300 terms in the xor byte[...] patches to move
xorps xmm6,xmm6     ;init sse timer = 0 ...seems zero at FreeDOS fresh start on all tested systems, but just to be safe here...

;---main intro frame loop
;   Per frame: build the hit mask in xmm7, then (every time the effect_02 bits of bp are
;   all zero, which includes the very first frame) patch two opcodes inside depth_loop.
main_loop:
shld ax,bp,effect_speed_shift	;shift the upper effect_speed_shift bits of bp into the low end of ax
and al,00000011b				;keep two of those bits (with the default shift of 10: bits 6 and 7 of bp)
or  al,00010000b		;mask calculation/variation => 000100??b
mov ah,al				;same mask byte in both halves of the word
mov word[si],ax
mov word[si+2],ax		
pshufd xmm7,[si],0		;needed on all 16 bytes (broadcasts the dword at [si] => mask byte in every byte of xmm7)

;change geometry for cube/platte/depth
;Self-modifying code: with si=0x300 the operand si-(0x300-(label-2)) is simply the address
;label-2, i.e. the opcode byte of the instruction assembled directly in front of the label.
;This only works while the instruction in front of effect_loc0 / effect_loc2 keeps its
;current encoding (opcode byte located two bytes before the label).
test bp,effect_02
jne skip_effect_02
   xor byte[si-(0x300-(effect_loc0-2))],1	 ;toggles between 54 and 55 => andps/andnps (opcode byte of 'andps xmm5,xmm4')
   xor byte[si-(0x300-(effect_loc2-2))],4    ;toggles between 0xfd and 0xf9 => psubw/paddw (opcode byte of 'paddw xmm5,xmm6')
   ;xor byte[si-(0x200-(effect_loc1-1))],15	 ;toggles between 7 and 8 => initial depth
   ;NOTE(review): the disabled line above uses 0x200, which matches the disabled 'add si,si' variant, not si=0x300
skip_effect_02:

;---per-frame setup and start of the row loop
;   dx = next video bank number, di = offset inside the 64K window at es,
;   cx = y relative to the projection centre, running from -height+y_offset up to y_offset-1
cwd						;init bank dx, ax is always positive here (ah=000100??b from the mask)
xor di,di				;init screen bank
mov cx,-height+y_offset
loopy:
  mov word[si],cx
  mov word[si+2],cx
  pshufd xmm3,[si],0	;y	|y  |y	|y  |y	|y	|y	|y

  ;change geometry for 'platte'
  ;while any effect_01 bit of the frame counter is set, the mask words are added to y
  test bp,effect_01
  jz skip_effect_01
     paddw xmm3,xmm7
  skip_effect_01:

  ;switch screenbank if needed, needs bx=0, is zero here always
  ;due to width=1024 => 65536/1024=64 it's safe to do this outside x loop
  ;di is 0 at the start of the frame and again whenever it wrapped around after 64 rows
  ;NOTE(review): passing dx as a simple running bank number presumes a window granularity of 64K - confirm
  test di,di
  jnz skip_bank_switch
	mov ax,0x4f05		;VESA BIOS function: display window control, window position in dx
	int 10h
	inc dx				;bank to select at the next wrap
  skip_bank_switch:

  ;---x loop: renders 8 horizontally adjacent pixels per iteration, one pixel per word lane
  ;   ax   = x relative to the projection centre, running from -width+x_offset in steps of 8
  ;   xmm0 = scratch / blend mask    xmm1 = hit colours       xmm2 = constant -1 words
  ;   xmm3 = y of this row           xmm4 = current depth     xmm5 = scratch / current colour
  ;   xmm6 = x/y timer               xmm7 = hit mask bytes
  mov ax,-width+x_offset
  loopx:
    pcmpeqw xmm2,xmm2		;all bits to 1 => = xmm2 = -1|-1|...
	movaps xmm4,xmm2
	psllw  xmm4,depth_initial	;start depth: -1 shifted left by depth_initial in every word
	effect_loc1:				;only referenced by the disabled initial-depth toggle in main_loop
	mov word[si],ax
	mov word[si+2],ax
	pshufd xmm0,[si],0			;x in all 8 words
	paddw  xmm0,[si+16]		;x = x+0|x+1|x+2|x+3|x+4|x+5|x+6|x+7
	movaps [si],xmm0		;store aligned x as there's no more regs available
	xorps  xmm1,xmm1		;hit_colours = 0
	mov bl,depth_steps			;depth => 255 steps (bh is expected to be 0, so bx = depth_steps)
	depth_loop:
		paddw	xmm4,xmm2		;depth = depth - 1
		movaps	xmm0,[si]		;get x
		movaps	xmm5,xmm3	    ;get y
		pmullw	xmm0,xmm4		;x = (x-center)*depth	could be outside loop, but eats bytes
		pmullw	xmm5,xmm4		;y = (y-center)*depth	could be outside loop, but eats bytes
		paddw	xmm0,xmm6		;x + timer				could be outside loop, but eats bytes
		paddw	xmm5,xmm6	    ;y + timer				could be outside loop, but eats bytes
		effect_loc2:			;the paddw directly above is patched at runtime (paddw <=> psubw), keep it adjacent
		andps	xmm5,xmm0
		psraw	xmm5,10
		andps	xmm5,xmm4	    ;initial plattenbau geometry
		effect_loc0:			;the andps directly above is patched at runtime (andps <=> andnps), keep it adjacent
		dec bx					;reordered; sets the zero flag that the jnz below consumes
		packsswb xmm5,xmm5		;current color from words to bytes
		movaps	xmm0,xmm1		;hit_colours
		andnps	xmm0,xmm7		;mask only if hit_colour NOT set already
		andps	xmm0,xmm5		;check if hit occurred => if current color contains the mask
		pcmpeqb xmm0,xmm7		;if hit occurred set byte to 11111111 
		pblendvb xmm1,xmm5,xmm0 ;SSE4.1 update only the 11111111 bytes of hit_colours
	jnz depth_loop		;using LOOP is at least 10% slower !!! (the SSE instructions after dec bx leave the flags untouched)
	movq [es:di],xmm1	;plot all 8 pixel bytes
    add ax,8			;8 pixels per x loop
	add di,8
  cmp ax,x_offset
  jl loopx			;far jmps could be optimized by a call subroutine, put huge penalty on speed...
;---end of the row loop: next y until y_offset is reached (height rows in total)
inc cx
cmp cx,y_offset
jne loopy		;far jmps could be optimized by a call subroutine, put huge penalty on speed...

xchg ax,dx		;to clear ah for exit, as dh is zero here (dx only holds the small bank counter)

;---vsync for timing & flicker reduce (disabled)
;mov dx,03dah
;vsync:
;  in al,dx
;  test al,8
;jz vsync

;---timing for x/y movement
;   xmm2 still holds -1 in every word from the x loop, so the shifted value is negative and
;   the subtraction moves the timer forward by 1 shl xy_speed per frame (16 bit wrap-around)
psllw xmm2,xy_speed	;adjust speed
psubw xmm6,xmm2 	;inc x/y timer
inc bp				;next frame

cmp bp,10000000000b	;stop on its own after 1024 frames
je exit

;---leave the frame loop on a key, then restore text mode and return
check_keyboard:
in al,0x60		;read a byte from the keyboard controller data port
dec ax			;ah is 0 here (see xchg ax,dx), so ax becomes 0 only if the byte read was 1
jnz main_loop	;far jmps could be optimized by a call subroutine, put huge penalty on speed...
				;NOTE(review): value 1 is presumably the ESC make code - confirm for the target keyboard
exit:
mov al,3		;set text mode...can be omitted if needed, costs 4 bytes... (ah=0 on both paths => BIOS set video mode 3)
int 10h
ret				;NOTE(review): relies on the usual .COM convention that a near return ends the program - confirm