;-----------------------------------------------------------
;Bubble Universe DOS - port & minor mods by Kuemmel 11/2023
;Thanks Jin X and TomCat for testing and support !
;
;learn about the original here:
;https://stardot.org.uk/forums/viewtopic.php?t=25833
;seems to be from ZXDunny, but even he said he didn't
;know where he got the algorithm from
;-----------------------------------------------------------
;CONST n=200,r=TAU/235:
;x, y, v, t=0, sz=200,sw=SCRW/sz,sh=SCRH/sz:
;WINDOW DEPTH 0,32:
;ORIGIN -sw,-sh TO sw,sh:
;SCREEN LOCK:
;DO:
;  CLS 0:
;  FOR i=0 TO n; j=0 TO n:
;    u=SIN(i+v)+SIN(r*i+x),
;    v=COS(i+v)+COS(r*i+x),
;    x=u+t:
;    PLOT INK RGBTOINT(i,j,99);u,v:
;  NEXT j;i:
;  t+=.025:
;  WAIT SCREEN:
;LOOP
;
;Modifications by me: n=255, BLUE component = ABS(v)*127, dt and sz are adjusted to fit here

;df16 - emit a bfloat16 constant: only the upper 16 bits of the 32-bit float encoding of n are stored
;  n     (required) value to encode
;  name  (optional) label to attach to the constant
;  fp32l (default 1) non-zero: name becomes a dword label one word below the emitted word, so a 32-bit
;        float access through name sees the emitted word as its high half and whatever word precedes it
;        as its (undefined) low half; zero: name becomes a word label at the emitted word itself
;macro by Jin X
macro df16 n*, name, fp32l=1
{
  local f32bits
  f32bits = dword (n)               ;full 32-bit encoding of n (emits nothing, so $ is unchanged)
  if ~ name eq                      ;define a label only when a name was passed
    if fp32l
      label name dword at $-2
    else
      label name word at $
    end if
  end if
  dw f32bits shr 16                 ;store just the high word
}

org 100h
;---parameters
x_res = 640
y_res = 480

;---screen mode stuff by Jin X, set screen mode and get LFB address
;The four candidate mode bytes below are the very first bytes after org 100h, so they
;are also executed as instructions on entry before the code that follows them.
;NOTE(review): they should decode to aas / inc dx / adc ah,[bx+di], which this program
;can tolerate - confirm against a disassembly.
;The probe loop reads the same bytes back with lodsb. SI is never initialised in this
;file, so this presumably relies on the DOS .COM entry value SI = 0x100 - verify.
    db 0x3F,0x42,0x12,0x21 ;640x480x32Bit modes (for VMware,QEMU/VirtualBox,Intel,AMD)
    mov di,0x200  ;where the mode information block is stored
@@: lodsb         ;al = next candidate mode byte
    xchg ax,cx    ;cl = candidate mode byte
    mov ch,0x41   ;cx = 0x41xx: 0x41 selects the LFB variant of the mode (presumably 0x01 would be the windowed variant)
    mov ax,0x4F01
    int 0x10      ;get mode info INT 0x10, ax=0x4f01, cx=mode, es:di=256 byte buffer; if successful ax = 0x004f
    sahf          ;flags = ah, so CF = bit 0 of ah (clear when ah = 0x00 as in the success value above)
    jc @B         ;call failed: try the next byte. NOTE(review): no end-of-list check, after the 4th byte it keeps reading code bytes
    mov bx,cx     ;bx = accepted mode number including the LFB flag
    mov ax,0x4F02 ;set video mode INT 0x10, ax=0x4f02, bx=mode, es:di=CRTCInfoBlock
    int 0x10      ;if successful ax = 0x004f (result is not checked)

;---pmode stuff by Jin X
;Switches the CPU state so that AVX instructions and the 32-bit addresses used later
;(back buffer at 0x00200000, LFB through fs) can be used. Interrupts stay disabled
;from here on; there is no matching sti in this file.
cli
pop fs            ;fs instead of es due to some crashes with some gfx cards
                  ;takes one word off the stack (presumably the 0 DOS leaves there, giving fs = 0 - verify);
                  ;this is why the program later exits with int 0x20 instead of ret
mov eax,0x40603
lmsw ax           ;load the low machine-status bits of cr0 from ax (low nibble 0x3 = PE + MP)
mov cr4,eax       ;cr4 = 0x40603 - NOTE(review): bits 9/10/18 should be OSFXSR/OSXMMEXCPT/OSXSAVE, confirm against the CPU manual
xor ecx,ecx       ;ecx = 0 selects XCR0 for xgetbv/xsetbv; also leaves cx = 0, which mainloop expects on entry
xgetbv            ;edx:eax = XCR0
or al,0x7         ;set bits 0..2 (presumably x87, SSE and AVX state - confirm)
xsetbv            ;write XCR0 back

;---init stuff
;Sets up the persistent state of the main loop:
;  FPU stack (top first) = v, u, t, all starting at 0
;  si  = data_stuff (base for all constant accesses)
;  ymm = all zero (ymm1 is later used as the zero source when clearing the back buffer)
;  edi = dword read from 0x228, i.e. offset 0x28 of the mode information block stored at 0x200
fninit                  ;st0          st1         st2         st3         st4     st5     st6     st7
fldz                    ;t=0 
fldz                    ;u=0          t=0
fldz                    ;v=0          u=0         t=0
mov si,data_stuff
vzeroall                ;short way to clear all ymm register, relevant for screen clear
mov edi,[si+(0x200-data_stuff+0x28)] ;init screen address (di + 0x28 => 0x228); si = data_stuff so the data_stuff terms cancel

mainloop:
;One pass of ij_loop plots 65536 points into the back buffer at 0x00200000.
;cx is zero here always cx = i|j  (ch = i, cl = j; cx counts 0..65535 and wraps back to 0)
;FPU stack on entry and after every iteration (top first): v, u, t
;The memory cell at [bp+si] is used as scratch for integer<->FPU transfers.
;NOTE(review): bp is never initialised in this file, so the scratch address depends on
;the entry value of bp - verify it lands in unused memory.
ij_loop:
    ;bubble universe algo
    movzx bx,ch         ;get i
    mov word[bp+si],bx  ;basically not needed to update all the time, but saves space or a loop...
                        ;v            u           t
    fiadd word[bp+si]   ;v+i          u           t  
    fsincos             ;cos(v+i)     sin(v+i)    u           t
    fld dword[si+6]     ;r            cos(v+i)    sin(v+i)    u           t   (dword at si+6: low word is the 127 colour multiplier, high word is r)
    fimul word[bp+si]   ;r*i          cos(v+i)    sin(v+i)    u           t
    fadd  st0,st3       ;r*i+u        cos(v+i)    sin(v+i)    u           t   
    fadd  st0,st4       ;r*i+u+t      cos(v+i)    sin(v+i)    u           t   
    fsincos             ;cos(r*i+u+t) sin(r*i+u+t) cos(v+i)   sin(v+i)    u       t
    faddp st2,st0       ;sin(r*i+u+t) v'=cos+cos  sin(v+i)    u           t   
    faddp st2,st0       ;v'           u'=sin+sin  u           t
    fxch  st1           ;u'           v'          u           t       
    fstp  st2           ;v'           u'          t                       (old u is overwritten by u')

    ;pixel addressing: edx = round(v'*sy)*x_res + round(u'*sx), relative to the screen centre
    fld   st0           ;v'           v'          u'          t
    fmul  dword[si+2]   ;v'*sy        v'          u'          t           (sy is the float at si+2, ramped up once per frame)
    fistp dword[bp+si]  ;v'           u'          t
    mov   edx,dword[bp+si] ;get y
    imul  edx,edx,x_res    ;y*x_res
    fld   st1           ;u'           v'          u'          t
    fimul word[si]      ;u'*sx        v'          u'          t           (sx is the integer word at si)
    fistp dword[bp+si]  ;v'           u'          t
    add   edx,dword[bp+si]              ;add x

    ;pixel colouring
    fld   st0           ;v'           v'          u'          t
    fimul word[si+6]    ;v'*cm        v'          u'          t           (cm = 127, the word at si+6)
    fabs                ;abs(v'*cm)   v'          u'          t
    fistp word[bp+si]   ;v'           u'          t
    mov   bx,cx         ;Red = i, Green = j 
    shl   ebx,8         ;bits 16..23 = i, bits 8..15 = j; bits 24..31 = leftover from the previous ebx contents
    mov   bl,byte[bp+si];Blue  = ABS(v*127) ;not in the original code, brings a bit more colour   

    ;plot RGB pixel  
    ;mov   [es:edi+edx*4+(y_res/2*x_res+x_res/2)*4],ebx  ;center x and y and plot
    mov   [0x00200000+edx*4+(y_res/2*x_res+x_res/2)*4],ebx  ;center x and y and plot (into the back buffer, 4 bytes per pixel)
    inc cx              ;next j; carries into i (ch) when cl wraps
jnz ij_loop             ;until cx wraps to 0 again = all 256*256 i/j pairs done

;End-of-frame work: advance t, wait for the retrace bit, blit the back buffer to the
;screen while clearing it, then grow the vertical scale sy towards its maximum.
;inc timer
fld dword[si+8]         ;dt           v'          u'          t
faddp st3,st0           ;v'           u'          t+dt

;vsync
mov dx,0x03da
vsync:
   in al,dx
   test al,8
jz vsync                ;spin while bit 3 of port 0x3da is clear

;copy memory to screen
xor ebx,ebx             ;ebx = byte offset into the frame
mov cx,x_res*y_res*4/32 ;number of 32-byte chunks per frame (38400, fits in cx)
copy_mem_to_screen_loop:
   vmovdqu ymm0,[0x00200000+ebx] ;get   32 Bytes
   vmovdqa [fs:edi+ebx],ymm0     ;plot  32 Bytes fs instead of es due to some crashes with some gfx cards
   vmovdqu [0x00200000+ebx],ymm1 ;clear 32 Bytes (ymm1 is still zero from vzeroall, nothing else writes it)
   add ebx,32
loop copy_mem_to_screen_loop     ;leaves cx = 0, as required at mainloop

;slow build up for v resize factor, looks better in float
fld dword[si+10]      ;110.0
fld dword[si+2]       ;sy      110.0
fcomi st0,st1         ;compare sy with the maximum; CF is set when sy is below it
jnc skip_inc_y        ;sy has reached the maximum: leave it alone
   fadd dword[si+12]  ;sy+dsy  110.0
   fst  dword[si+2]   ;new sy  110.0
skip_inc_y:
fcompp                ;drop both temporaries, FPU stack is v, u, t again

;exit or loop
;Reads one byte from port 0x60 and keeps running unless it equals 1
;(presumably the ESC key scan code - confirm).
in al,0x60
cbw          ;ax = sign-extended al
dec ax       ;ax becomes 0 only when al was 1
jnz mainloop
mov cr0,eax  ;eax needs clear upper 3 Bits and lower byte here, that is okay due eax init above
             ;ax = 0 here; the upper half of eax has not been written since xgetbv/xsetbv
mov al,0x3   ;exit to text mode (ah is still 0, so this is int 0x10 function 0 with mode 3)
int 0x10
int 0x20     ;needed due to pop es/fs from init code

;Constants and the one variable (sy), all addressed relative to si = data_stuff.
;Numbers in [ ] are the offsets used by the code. The df16 entries store only the high
;word of a float; the code loads each one as a dword that starts one word earlier, so
;the low word of every such float is simply the word stored in front of it.
;The order and sizes below are therefore load-bearing - do not reorder.
data_stuff:
dw   83           ;sx  = x resize factor = sy*0.75 because (4:3)/(16:9)  [ 0] integer word, used with fimul
dd   0            ;sy  = y resize factor                                 [ 2] float, starts at 0 and is ramped up to max sy
dw   127          ;cm = colour multiplier                                [ 6] integer word; doubles as the low word of r
df16 0.0245436926 ;r  = PI*2/256          high word stored at offset 8, loaded as dword [ 6]
df16 0.00175      ;dt = timer offset      high word stored at offset 10, loaded as dword [ 8]
df16 110.0        ;max sy                 high word stored at offset 12, loaded as dword [10]
df16 0.125        ;sy inc                 high word stored at offset 14, loaded as dword [12]