//----------------------------------------------------------
// "confetti tail" by Kuemmel for CodeCraft #4 2021
//----------------------------------------------------------
.syntax unified
.thumb

//--- resolution...also needs to be set at the end for the screen mode string
x_res = 1280 // 800
y_res =  720 // 600
// One frame buffer holds x_res*y_res*4 bytes (one 32-bit word per pixel).
// buffer_size is derived from the resolution instead of being typed in by
// hand: the byte count is rounded up to the next multiple of 0x10000, which
// gives the 0x00??0000 form used as an immediate operand by the buffer clear
// (mov) and by the buffer toggle (eor) further down in this file.
// All sub-expressions are parenthesised on purpose; do not rely on the
// assembler's operator precedence here.
buffer_size = ((((x_res*y_res*4) + 0xffff) >> 16) << 16)
                         // 800*600 => 0x001e0000
                         //1280*720 => 0x00390000

//--- OS routines: numbers passed to the swi instruction, in ascending order
.equ OS_Byte,              0x06
.equ OS_Exit,              0x11
.equ OS_ReadEscapeState,   0x2c
.equ OS_ReadVduVariables,  0x31
.equ OS_RemoveCursors,     0x36
.equ OS_ReadMonotonicTime, 0x42
.equ OS_ScreenMode,        0x65

//--- init screen and get screen start address --------------------
// Sets the screen mode from the string at `mode`, then reuses that same
// memory as the parameter block for OS_ReadVduVariables:
//   mode+0 = 148, mode+4 = -1   (input list, r0 points here)
//   mode+8 = result word        (r1 points here after the stmia write-back)
// The code relies on r1 still holding the address of `mode` after
// OS_ScreenMode and on r7 staying 0 across all three SWIs.
// NOTE(review): reason code 15 presumably means "select mode by string" and
// 148 presumably is the VDU variable number of the screen start address,
// with -1 terminating the list - confirm against the OS documentation.
  movs r7,#0              //for buffer clear and mvns later
  movs r0,#15             // reason code for OS_ScreenMode
  adr.n r1,mode           // r1 = address of the mode string
  swi OS_ScreenMode
  mvns r3,r7              //  -1 (r7 is 0)
  movs r2,#148            // 148
  movs r0,r1              // read = write address
  stmia r1!,{r2,r3}       // store {148,-1} over the string start, r1 = mode+8
  swi OS_ReadVduVariables // screen address at r1 or mode+8
  swi OS_RemoveCursors    // remove cursor

//--- clear first buffer ------------------------------------------
// Stores zero words from the address in buffer_address, walking the byte
// offset r0 downwards in steps of 4. The stores hit offsets buffer_size
// down to 4: the loop ends when r0 reaches 0, so offset 0 itself is never
// written, while the one word at offset buffer_size is.
// Leaves r6 = address loaded from buffer_address, r0 = 0.
  ldr r6,buffer_address
  mov r0,#buffer_size     // a bit more than x_res*y_res*4 Bytes to clear
  buffer_clear_loop:
    str r7,[r6,r0]        // r7 is zero here still
    subs r0,r0,#4         // next lower word, sets Z when the offset hits 0
  bne buffer_clear_loop

//--- init constants ----------------------------------------------
// These three registers are loaded once and only read inside mainloop.
  mov   r10,#x_res        // x_res as multiplier and loop counter
  mov   r11,#16807        // randomizer magic number (multiplier for the random word)
  mov   r12,#0x02020202   // sub pattern: 2 per byte, operand of uqsub8 in the zoom loop

//--- main intro loop ---------------------------------------------
// One pass per frame: plot random pixels, swap buffers, pick a zoom offset,
// run the feedback zoom to the other buffer and the screen, wait, test ESC.
mainloop:

//--- plot random coords within a limited circle ------------------
// Visits every pixel position of the current (1st) buffer. r2 counts rows
// from y_res down to 1 and r3 counts columns from x_res down to 1, while r7
// is the byte offset into the buffer and always advances by 4 per pixel.
// A pixel is written only if
//   (r3 - x_res/2)^2 + (r2 - y_res/2)^2 <= 255
// and bits 9..11 of the freshly generated random word are all zero.
// The random word lives at [r1,#4], i.e. the word at label `mode`, which
// directly follows buffer_address in the data section.
// On exit: r6 = 1st buffer (read by the zoom), r7 = 2nd buffer (written by
// the zoom), r1 = address of buffer_address, and buffer_address holds the
// 2nd buffer so the roles swap on the next frame.
  adr r1,buffer_address //needed also for randomizer
  ldr r6,[r1]           // 1st buffer address
  movs r7,#0            // byte offset of the current pixel
  movs r2,#y_res
  y_loop_content:
     subs r5,r2,#(y_res>>1)    //center y
     muls r5,r5,r5             //y*y
     mov  r3,r10               //column counter = x_res
     x_loop_content:
        subs r4,r3,#(x_res>>1) //center x
        mla  r4,r4,r4,r5       //r = x*x + y*y
        cmp  r4,#255           //limit...use x_res or randomizer magic ?
        bhi skip_plot_random   //inside ?
           ldr r0,[r1,#4]      //get random
           mul r0,r0,r11       //create new random
           str r0,[r1,#4]      //write back random
           tst r0,#0b111000000000 //only plot when bits 9..11 are all clear
           bne skip_random
             str r0,[r6,r7]      //plot random (the random word is the pixel value)
           skip_random:
        skip_plot_random:
        adds r7,r7,#4          //inc plot address
        subs r3,r3,#1
        bne x_loop_content
     subs r2,r2,#1
  bne y_loop_content
  eor r7,r6,#buffer_size // toggle to 2nd buffer address
  str r7,[r1]            // save 2nd buffer address for next frame toggle

//--- random x,y based zoomer -----------------------------------
// Takes bytes 1 and 2 of the stored random word (r1 still points at
// buffer_address, so [r1,#5] and [r1,#6] are inside the word at `mode`)
// and turns each into a small signed offset. r2 = x offset, r4 = y offset.
// These values are only used if the timer test below branches straight to
// zoom_effect; otherwise the circle zoomer overwrites r2 and r4.
  ldrb r2,[r1,#5]   //get random byte for x_offset or sxtb r2,r0,ror#8  // asrs r2,r2,#5
  ldrb r4,[r1,#6]   //get random byte for y_offset or sxtb r4,r0,ror#16 // asrs r4,r4,#5
  lsrs r2,r2,#4     //x_offset range 0...15
  lsrs r4,r4,#4     //y_offset range 0...15
  subs r2,r2,#7     //x_offset range -7...8
  subs r4,r4,#7     //y_offset range -7...8

//--- timer based action: bit 9 of the monotonic time selects the effect ---
// lsrs #10 moves bit 9 of the value returned in r0 into the carry flag.
// Carry set: keep the random offsets in r2/r4 and go straight to the zoom.
// Carry clear: fall through to the circle zoomer, which replaces r2/r4.
// NOTE(review): this section used to say "change every 255 ms". Bit 9 flips
// every 512 timer ticks; if the timer counts centiseconds that is about
// 5.12 s - confirm the tick unit against the OS documentation.
  swi OS_ReadMonotonicTime
  lsrs r0,r0,#10         // trigger effect change
  bcs zoom_effect

//--- circle x,y based zoomer -----------------------------------
// Advances a pair of incremental oscillators kept in sine_cosine_data as
// four words {sine, sine speed, cosine, cosine speed} = r2..r5.
// Each of the 10 steps per frame does: value += speed; speed -= value>>13.
// The updated state is written back, then the two values are scaled down
// into r2 (x offset) and r4 (y offset) for the zoom stage.
// Clobbers r0, r1, r3, r5 (r1 is advanced past the table by the stmia).
  adr r1,sine_cosine_data
  ldmia r1,{r2-r5}       // get former data
  movs r0,#10            // isn't there an easier way !?
  loop_sine:
    adds r2,r2,r3        // calc next sine value
    adds r4,r4,r5        // calc next cosine value
    sub  r3,r3,r2,asr#13 // calc next sine speed
    sub  r5,r5,r4,asr#13 // calc next cosine speed
  subs r0,r0,#1
  bne loop_sine
  stmia r1!,{r2-r5}      // write back
  asrs r2,r2,#13         // sine   scale to -6...+6
  asrs r4,r4,#13         // cosine scale to -6...+6

//--- zoom effect -----------------------------------------------
// Feedback zoom: every destination pixel (x,y) is fetched from the 1st
// buffer at
//   x_new = x + x_offset - (x>>5),  y_new = y + y_offset - (y>>5)
// has 2 subtracted from each of its four bytes with saturation (fade out),
// and is then written both to the 2nd buffer and to the screen.
// In:  r2 = x offset, r4 = y offset (from the random or the circle path),
//      r6 = 1st buffer (source), r7 = 2nd buffer (destination),
//      r10 = x_res, r12 = 0x02020202.
// Uses r0 = screen write pointer (word loaded from mode+8), r1 = y, r3 = x,
//      r5 = scratch, r9 = biased y offset.
// Both destinations are written strictly sequentially, one word per pixel.
// NOTE(review): this assumes the screen memory is linear with exactly
// x_res*4 bytes per row - confirm for the selected mode.
  zoom_effect:
  ldr  r0,mode+8         // get screen address
  adds r2,r2,#19         // x_offset range -7...8 + 11(800) //1280*31/32 => 19
  adds r9,r4,#10         // y_offset range -7...8 +  8(600) // 720*31/32 => 10
  movs r1,#0             // y = 0
  y_loop_zoom:
     adds r4,r1,r9             // add y_offset
     sub  r4,r4,r1,lsr#5       // y_new = y - y>>5  = (31/32=0.96875) * y => move to y_loop later
     movs r3,#0                // x = 0
     x_loop_zoom:
        adds r5,r3,r2          // add x_offset
        sub  r5,r5,r3,lsr#5    // x_new = x - x>>5  = (31/32=0.96875) * x
        mla  r5,r4,r10,r5      // x_new + y_new * x_res
        ldr  r5,[r6,r5,lsl#2]  // p_new = 1st buffer[(x+y*x_res)<<2] check if ldr r5,[r6,r5] possible
        uqsub8 r5,r5,r12       // saturated subtract R-...,G-...,B-...
        stmia r7!,{r5}         // plot to 2nd buffer => feedback
        stmia r0!,{r5}         // plot to screen
        adds r3,r3,#1
     cmp r3,r10                // row done when x == x_res
     bne x_loop_zoom
     adds r1,r1,#1
  cmp r1,#y_res                // frame done when y == y_res
  bne y_loop_zoom

//--- vsync for timing -------------------------------------------
// OS_Byte with r0 = 19 is used here as the per-frame wait.
  movs r0,#19              // r0 preserved, r1 and r2 corrupted
  swi OS_Byte

//--- escape test and exit ---------------------------------------
// Loops back to mainloop while the carry flag comes back clear from
// OS_ReadEscapeState; once it is set the program leaves via OS_Exit.
  swi OS_ReadEscapeState   // check for ESC
  bcc mainloop             // carry clear => keep running
  swi OS_Exit              // exit to OS

//--- data
// The layout matters: the code addresses `mode` as buffer_address+4, so
// the order of buffer_address and mode must not be changed.
.align 2
// Oscillator state for the circle zoomer, loaded/stored as r2..r5:
// sine value, sine speed, cosine value, cosine speed.
sine_cosine_data:
.word 0,512,50363,43
buffer_address:
.word 0x00009000  // address for buffer 1 and 2 that is toggled per frame 0x00009000 <-> 0x00??9000
// Mode string for OS_ScreenMode. After the mode is set this memory is reused:
//   mode+0 : overwritten with 148 by the init code, then serves as the
//            random generator state (so the effective seed is 148, not the text)
//   mode+4 : overwritten with -1 by the init code
//   mode+8 : receives the screen address from OS_ReadVduVariables
mode:
.string "X1280 Y720 C16M" //"32 C16M" //...for video capture...800x600 would be shorter => "32 C16M"
                  // 4 Bytes used also as seed for random generator as string is only used once at start
