comment #/*

cache optimized roto-zoomer (c) '95/96 Niklas Beisert / pascal

Use this in your non commercial productions if you are too lazy,
or use some of the ideas in this "article" if you like, but don't be too
lazy to greet me then! ;)
Spread it only with all associated files:
  FILE_ID.DIZ
  ROTO.ASM
  ROTO.H
  ROTO.OBJ
  MAKEFILE
  ROTOEXAM.CPP
  ROTO.EXE
  TIMER.ASM
  TIMER.H

Send me some money: :)
  pascal@nightmare.harz.de, Niklas Beisert@2:2437/301.44
  Niklas Beisert, Oberstrasse 84, 20149 Hamburg, Germany

Thanks go to jmagic/complex for releasing some source of his asm94 intro.
The texture innerloop is quite fast and pentium optimized, so I used it! ;)
Furthermore thanks go to Daredevil and Tran for pmode/w (lite).


 usage: texturescreen dst,src,x0,y0,xx,xy,yx,yy,xblk,yblk,scrwid,op,dir
                 dst: destination address. eg:0xA0000
                 src: segment aligned (0xijkl0000) 256x256 map
   x0,y0,xx,xy,yx,yy: 16:16 fixpoint transformation matrix
           xblk,yblk: a block is 8x8, so 320x200 would be 40,25
              scrwid: bytes per line
                  op: opcode: 0x89:mov, 0x01:add...
                 dir: blockorder: 0:horizontal first, 1:vertical first

 tested and compiled with watcom c++ 10.0 and tasm 3.1


 transformation: dst[x,y]=src[x0+xx*x+yx*y,y0+xy*x+yy*y]

 history and background:
   Second Reality: you remember the head with the pentagram and the
   lens effect... then a flash and whooopsy, what's that? this is 160x100...
   Why? Otherwise it'd be too slow! But why, This is such a simple effect?
   When I (some time later) recoded this effect I noticed that the framerate
   drops at a certain angle, and the only reason could be the cache.
   The processor cache is organized in a special way to have fast access
   to its memory. So you have cache lines of 16 (32) bytes on a 486 (pentium)
   which are atoms. They all have a tag address field which stores the
   position of the 16 (32) bytes in memory. Then you have 4 (2) ways which
   are in a way 4 (2) equal caches which can be processed at the same time.
   Finally there are 256 sets of one cache line per way. Bits 4-11 (5-12) of
   the address determine the used set for a memory access.
   At a memory access, the address is split into 3 parts: bits 0-3 (0-4)
   determine the byte in a line, bits 4-11 (5-12) determine the set, and
   bits 12-31 (13-31) are the tag address. The tag address is then compared
   to the tag addresses of the 4 (2) lines of the set. If one matches it is
   a cache hit, if not you get a cache miss, 16 (32) bytes are read from
   memory to the least recently used cache line of that set. This takes
   about 23 cycles on a 486dx2-66, while a cache hit takes no extra cycles.
   A cache is most effective if you read the memory in a linear order like
   you do it in a rotozoomer at low angles. You then get one cache miss
   out of 16 (32) memory accesses. Now imagine the angle is exactly 90.
   You would then read the memory in steps of 256, after 8k the first
   cache line is overwritten, so if you process the next line, it is a
   cache miss. This results in 100% cache misses...
   How to optimize it?
   I had several discussions with Scholar / $eeN on this topic. (hiho!)
   We thought about rendering the screen in a different order, so that
   the texture is read in a linear fashion. This would be diagonal lines
   instead of h-lines. But this is not a fast solution either, and more
   complicated anyway. You could also keep prerotated versions of the
   texture, but this would require 2x or more the amount of memory,
   and you are limited to a fixed texture if you do not want to modify
   2 textures all the time.
   The 8x8 block approach was a good compromise. :) You can write dwords,
   and do not need too much memory while the cache contents are not
   destroyed.
   You can also use this 8x8 block approach to optimize movelist-tunnels:
   keep the movelist linear, while you go through the 8x8 blocks.
   And you can do other nice things with 8x8 blocks... ;))))))))))
   Which I cannot tell you yet. probably later! =}
   Cache optimizing seems to be quite stupid for vector engines, but
   it is ESSENTIAL for fast bitmap effects.

   These little assembler fragments show you what cache can do and
   what it cannot:

    fastloop:
      mov dx,12
    l1:
        mov cx,32768
      l2:
          mov ax,[0]
          mov ax,[0]
          mov ax,[0]
          mov ax,[0]
          mov ax,[0]
        dec cx
        jnz l2
      dec dx
      jnz l1

    slowloop:
      mov dx,12
    l3:
         mov cx,32768
       l4:
           mov ax,[2047]
           mov ax,[4095]
           mov ax,[6143]
           mov ax,[8191]
           mov ax,[10239]
         dec cx
         jnz l4
       dec dx
       jnz l3

   On a 486dx2-66 the first loop is about 25-50 times as fast as the second
   one. If you don't believe me, try it yourself.
   On a pentium 3 moves are enough to show the effects of the cache:
           mov ax,[4095]
           mov ax,[8191]
           mov ax,[12287]
   use those in the loop.


 description:
   This rotozoomer (also stretcher if you like...) does not process the
   picture like it is usually done, line by line, but block by block.
   The reason is simple: If you do it line by line you get 100% cache misses
   beyond a certain rotation angle (~60 on a 486dx 8k processor cache).
   100% cache misses mean 64000 (mode 13h) times 23 cycles on a
   486dx2-66 (experimental value), which is > 20ms, and your frame rate
   cannot get any better than 50Hz (ignoring all other operations).
   You might end up at 30Hz at certain angles and 100Hz at others.
   If you do it block by block you can reduce the cache misses to about
   4000 to 8000 (2-5 ms), which has only little effect on the framerate.
   You end up with a rotozoomer which runs at constant 100fps on a dx2-66.
   or constant 350fps on a p120.

   Every 8x8 block is processed in the usual way, ie. 8 pixels, next line,
   8 pixels, next line... The blocks are then drawn either in horizontal or
   vertical order, ie. row by row vs column by column. If used correctly this
   feature reduces the number of cache misses still a bit. You should use
   vertical order, if the angle is about 90 or 270. If it is rather 0 or
   180 use horizontal order.

   I had no problem to use 4 rotozoomers at the same time in our intro
   "LASSE REINB0NG" which was still quite smooth on a 486dx2-66 (>15fps).
   This is the same routine as used in the intro, so believe me, it's fast.
   (only that you can now control the blockorder)
   Nothing was slowed down!

   On a P120 this routine nearly runs in 1 frame / retrace... (did I say
   640x480??? :) )
   That is why there is the bytes/line parameter. You can use this routine
   with segmented screen memory. In 640x480 you should set the virtual screen
   width to 1024. Then the bytes/line is not equal to xblocks*8. You can then
   process a 80x8 blocks (640x64) range with this routine without segment
   changes. If you process 7.5 of these ranges with segment changes in
   between you can fill the screen with this routine.
*/#


.486
model flat,prolog
locals

.code

; adc_ecx: emits "adc ecx,imm32" byte by byte so that the 32-bit immediate
; gets its own label (lab) and can be overwritten at run time.
adc_ecx macro lab
  db 081h           ; opcode 81h: ALU group 1, r/m32 with imm32
  db 0D1h           ; ModRM 11 010 001b: /2 = adc, r/m = ecx
  lab dd 0          ; the immediate operand, initially zero
endm

; adc_edx: emits "adc edx,imm32" byte by byte so that the 32-bit immediate
; gets its own label (lab) and can be overwritten at run time.
adc_edx macro lab
  db 081h           ; opcode 81h: ALU group 1, r/m32 with imm32
  db 0D2h           ; ModRM 11 010 010b: /2 = adc, r/m = edx
  lab dd 0          ; the immediate operand, initially zero
endm

; add_edx: emits "add edx,imm32" byte by byte so that the 32-bit immediate
; gets its own label (lab) and can be overwritten at run time.
add_edx macro lab
  db 081h           ; opcode 81h: ALU group 1, r/m32 with imm32
  db 0C2h           ; ModRM 11 000 010b: /0 = add, r/m = edx
  lab dd 0          ; the immediate operand, initially zero
endm

; add_ecx: emits "add ecx,imm32" byte by byte so that the 32-bit immediate
; gets its own label (lab) and can be overwritten at run time.
add_ecx macro lab
  db 081h           ; opcode 81h: ALU group 1, r/m32 with imm32
  db 0C1h           ; ModRM 11 000 001b: /0 = add, r/m = ecx
  lab dd 0          ; the immediate operand, initially zero
endm

; sub_ecx: emits "sub ecx,imm32" byte by byte so that the 32-bit immediate
; gets its own label (lab) and can be overwritten at run time.
sub_ecx macro lab
  db 081h           ; opcode 81h: ALU group 1, r/m32 with imm32
  db 0E9h           ; ModRM 11 101 001b: /5 = sub, r/m = ecx
  lab dd 0          ; the immediate operand, initially zero
endm

; add_edi: emits "add edi,imm32" byte by byte so that the 32-bit immediate
; gets its own label (lab) and can be overwritten at run time.
add_edi macro lab
  db 081h           ; opcode 81h: ALU group 1, r/m32 with imm32
  db 0C7h           ; ModRM 11 000 111b: /0 = add, r/m = edi
  lab dd 0          ; the immediate operand, initially zero
endm

;-----------------------------------------------------------------------
; textureblock: draws esi destination lines of 8 pixels each, fetching
; one texture byte per pixel and storing two dwords per line.
;
; Every labelled immediate in here (beforedfy0, dfx0dy0_*, dfy0dx0_*,
; dfy7dx0, dfx7dy7, dfy0dx7, scrwidth, afterdfy0) and the first opcode
; byte of op0/op1 are overwritten by texturescreen_ before the call.
;
; In (as loaded by texturescreen_):
;   ebx = texture base address; bh and bl are overwritten with the texel
;         row and column before every fetch, bits 16-31 are never changed
;   edx = texture x fraction in bits 16-31, texture y integer in dl
;   ecx = texture y fraction in bits 16-31, texture x integer in cl
;   edi = destination address of the first line
;   esi = number of lines; must be non-zero because the loop tests the
;         counter only after decrementing it
; Out:
;   edi = advanced by the scrwidth immediate once per line
;   ecx, edx = stepped through all lines (see the steps below)
;   esi = 0; eax, bx and the flags are overwritten
;
; Carry chain: the carry out of the x fraction (add/adc on edx) is
; consumed by the next adc on ecx, whose low byte is the x integer; the
; carry out of the y fraction (adc on ecx) is consumed by the next adc on
; edx, whose low byte is the y integer. mov and bswap do not modify the
; flags, so the carry survives the instructions placed in between.
; For this to work the y fraction in ecx is kept one pixel step ahead of
; everything else: it is pre-stepped on entry and stepped back on exit.
;-----------------------------------------------------------------------
textureblock proc
  ; pre-step the y fraction; the carry it produces belongs to pixel 1
  add_ecx beforedfy0
  mov bh,dl                  ; texel row of pixel 0 (taken before the carry is added)
  adc dl,0                   ; bank the carry of the pre-step in the y integer
  mov bl,cl                  ; texel column of pixel 0

@@block:
    ; --- pixels 0..3 -> first dword ---
    ; plain add: this is the start of a carry chain
    add_edx dfx0dy0_0
    mov al,[ebx]             ; fetch pixel 0
    adc_ecx dfy0dx0_0
    mov bh,dl                ; row/column of pixel 1
    mov bl,cl
    adc_edx dfx0dy0_1
    mov ah,[ebx]             ; fetch pixel 1
    adc_ecx dfy0dx0_1
    mov bh,dl                ; row/column of pixel 2
    bswap eax                ; move pixels 0,1 into the two upper bytes
    mov bl,cl

    adc_edx dfx0dy0_2
    mov ah,[ebx]             ; fetch pixel 2
    adc_ecx dfy0dx0_2
    mov bh,dl                ; row/column of pixel 3
    mov bl,cl
    adc_edx dfx0dy0_3
    mov al,[ebx]             ; fetch pixel 3
    adc_ecx dfy0dx0_3
    mov bh,dl                ; row of pixel 4 (before the banked carry below)
    bswap eax                ; eax = pixels 0,1,2,3 in ascending byte order
    adc dl,0                 ; bank the pending carry: the store below is patched and
                             ; may be a flag-changing opcode (the file header lists add)
    op0 db 089h,007h ;//op [edi],eax
    mov bl,cl                ; column of pixel 4

    ; --- pixels 4..7 -> second dword ---
    ; plain add again: the carry was banked in dl above
    add_edx dfx0dy0_4
    mov al,[ebx]             ; fetch pixel 4
    adc_ecx dfy0dx0_4
    mov bh,dl                ; row/column of pixel 5
    mov bl,cl
    adc_edx dfx0dy0_5
    mov ah,[ebx]             ; fetch pixel 5
    adc_ecx dfy0dx0_5
    mov bh,dl                ; row/column of pixel 6
    bswap eax                ; move pixels 4,5 into the two upper bytes
    mov bl,cl

    adc_edx dfx0dy0_6
    mov ah,[ebx]             ; fetch pixel 6
    ; ecx: x integer takes its last pixel step of this line, while the
    ; y fraction already takes the step to the start of the next line
    adc_ecx dfy7dx0
    mov bh,dl                ; row/column of pixel 7
    mov bl,cl
    ; edx: x fraction and y integer step to the start of the next line
    adc_edx dfx7dy7
    mov al,[ebx]             ; fetch pixel 7
    ; ecx: x integer steps to the next line, y fraction is pre-stepped
    ; by one pixel again (same role as beforedfy0 on entry)
    adc_ecx dfy0dx7
    mov bh,dl                ; row of pixel 0 of the next line
    bswap eax                ; eax = pixels 4,5,6,7 in ascending byte order
    adc dl,0                 ; bank the carry of the pre-step, as on entry
    op1 db 089h,047h,004h ;//op [edi+4],eax
    mov bl,cl                ; column of pixel 0 of the next line

    ; next destination line
    add_edi scrwidth
  dec esi
  jnz @@block

  ; undo the one-pixel pre-step of the y fraction and the carry that was
  ; banked for it, so ecx/edx describe the exact start of the next line
  sub_ecx afterdfy0
  sbb dl,0

  ret
endp

public texturescreen_

;-----------------------------------------------------------------------
; texturescreen_: fills xblocks * yblocks blocks of 8x8 pixels at dst with
; a transformed copy of the texture at src, by patching the immediates of
; textureblock and calling it once per block (dir = 0) or once per column
; of blocks (dir != 0).
;
; Arguments:
;   dst              destination address of the top left pixel
;   src              texture address; only bits 16-31 survive, the low 16
;                    bits are replaced by the texel row and column
;   x0,y0            16:16 fixed point texture position of the first pixel
;   xx,xy            16:16 step of texture x and texture y per destination
;                    pixel (as implemented by the patched immediates below)
;   yx,yy            16:16 step of texture x and texture y per destination
;                    line
;   xblocks,yblocks  size of the area in 8x8 blocks; if either one is not
;                    positive the routine returns without drawing and
;                    without patching anything
;   scrwid           bytes per destination line
;   op               low byte replaces the opcode byte of both stores in
;                    textureblock (the file header lists 89h mov, 01h add)
;   dir              0: blocks are drawn row by row, otherwise one call
;                    per column covering all yblocks*8 lines
;
; The stack copies of xx, xy, yx, yy are multiplied by 8 in place.
; eax, ebx, ecx, edx, esi and edi are overwritten and not restored by any
; instruction in this procedure.
; NOTE(review): confirm that this matches the declaration in ROTO.H.
; The code of textureblock is modified at run time, so the code segment
; must be writable.
;-----------------------------------------------------------------------
texturescreen_ proc dst:dword, src:dword, x0:dword, y0:dword, xx:dword, xy:dword, yx:dword, yy:dword, xblocks:dword, yblocks:dword, scrwid:dword, op:dword, dir:dword
local c1:dword, c2:dword, retxf1:dword, retyf1:dword, retsp1:dword, retxf2:dword, retyf2:dword, retsp2:dword, retxi1:byte, retyi1:byte, retxi2:byte, retyi2:byte
  ; Nothing to draw for an empty area. Without this check the dec/jnz
  ; counters below would start at 0 and wrap around, looping about 2^32
  ; times while writing to memory.
  cmp xblocks,0
  jle @@done
  cmp yblocks,0
  jle @@done

  ; edi = textureblock+128, so the patch offsets below are relative to a
  ; point 128 bytes into the procedure
  lea edi,textureblock+128

  ; bytes per line -> immediate of the "add edi" at the end of each line
  mov eax,scrwid
  mov [edi-128+scrwidth-textureblock],eax

  ; opcode byte of the two stores
  mov al,byte ptr op
  mov [edi-128+op0-textureblock],al
  mov [edi-128+op1-textureblock],al

  ; per-pixel step for edx:
  ; high word = fraction of xx, low byte = integer byte of xy
  mov eax,xx
  shl eax,16
  mov al,byte ptr xy+2
  mov [edi-128+dfx0dy0_0-textureblock],eax
  mov [edi-128+dfx0dy0_1-textureblock],eax
  mov [edi-128+dfx0dy0_2-textureblock],eax
  mov [edi-128+dfx0dy0_3-textureblock],eax
  mov [edi-128+dfx0dy0_4-textureblock],eax
  mov [edi-128+dfx0dy0_5-textureblock],eax
  mov [edi-128+dfx0dy0_6-textureblock],eax
  ; end-of-line step for edx, undoing the 7 pixel steps of the line:
  ; low byte = integer byte of (yy - 7*xy)
  mov ebx,xy
  shl ebx,3
  sub ebx,xy
  neg ebx
  add ebx,yy
  shr ebx,16
  mov al,bl
  ; high word = fraction of (yx - 7*xx): eax already holds xx<<16
  mov ebx,xx
  shl ebx,19
  sub eax,ebx
  mov ebx,yx
  shl ebx,16
  add eax,ebx
  mov [edi-128+dfx7dy7-textureblock],eax

  ; one pixel step of the y fraction alone: used to pre-step ecx on entry
  ; to textureblock and to step it back on exit
  mov eax,xy
  shl eax,16
  mov [edi-128+beforedfy0-textureblock],eax
  mov [edi-128+afterdfy0-textureblock],eax
  ; per-pixel step for ecx:
  ; high word = fraction of xy, low byte = integer byte of xx
  mov al,byte ptr xx+2
  mov [edi-128+dfy0dx0_0-textureblock],eax
  mov [edi-128+dfy0dx0_1-textureblock],eax
  mov [edi-128+dfy0dx0_2-textureblock],eax
  mov [edi-128+dfy0dx0_3-textureblock],eax
  mov [edi-128+dfy0dx0_4-textureblock],eax
  mov [edi-128+dfy0dx0_5-textureblock],eax
  ; last ecx step of a line: y fraction takes a normal pixel step (the
  ; pre-step for the next line), x integer goes back 7 pixels and down
  ; one line: low byte = integer byte of (yx - 7*xx)
  mov ebx,xx
  shl ebx,3
  sub ebx,xx
  neg ebx
  add ebx,yx
  shr ebx,16
  mov al,bl
  mov [edi-128+dfy0dx7-textureblock],eax
  ; second to last ecx step of a line: y fraction goes to the next line,
  ; high word = fraction of (yy - 7*xy); x integer takes a normal pixel
  ; step, low byte = integer byte of xx
  mov ebx,xy
  shl ebx,19
  sub eax,ebx
  mov ebx,yy
  shl ebx,16
  add eax,ebx
  mov al,byte ptr xx+2
  mov [edi-128+dfy7dx0-textureblock],eax

  ; from here on the matrix is needed in units of one block (8 pixels)
  shl xx,3
  shl xy,3
  shl yx,3
  shl yy,3

  cmp dir,0
  jne @@vert

@@horz:
  ; step 1 (after every block): textureblock leaves the position 8 lines
  ; down, so go 8 pixels right and 8 lines back up.
  ; Each step is split into a 16 bit fraction kept in the high word
  ; (ret?f?) and an integer byte (ret?i?).
  mov eax,xx
  sub eax,yx
  rol eax,16
  mov retxi1,al
  and eax,not 65535
  mov retxf1,eax

  mov eax,xy
  sub eax,yy
  rol eax,16
  mov retyi1,al
  and eax,not 65535
  mov retyf1,eax

  ; destination: 8*(1-scrwid) = 8 bytes right, 8 lines up
  mov eax,scrwid
  neg eax
  inc eax
  shl eax,3
  mov retsp1,eax

  ; step 2 (after every row of blocks): back to the left edge and 8 lines
  ; down. The one-operand imul also writes edx, which is not live yet.
  mov eax,xblocks
  neg eax
  imul xx
  add eax,yx
  rol eax,16
  mov retxi2,al
  and eax,not 65535
  mov retxf2,eax

  mov eax,xblocks
  neg eax
  imul xy
  add eax,yy
  rol eax,16
  mov retyi2,al
  and eax,not 65535
  mov retyf2,eax

  ; destination: 8*(scrwid-xblocks) = 8 lines down, xblocks*8 bytes left
  mov eax,xblocks
  neg eax
  add eax,scrwid
  shl eax,3
  mov retsp2,eax

  mov ebx,src
  mov edi,dst

  ; rotate fraction into the high word and integer into the low word,
  ; then swap the integer bytes:
  ; edx = x fraction : y integer (dl), ecx = y fraction : x integer (cl)
  mov edx,x0
  rol edx,16
  mov ecx,y0
  rol ecx,16
  xchg cl,dl

  mov eax,yblocks
  mov c1,eax
@@l1:
    mov eax,xblocks
    mov c2,eax
  @@l2:
      mov esi,8                ; one block = 8 lines
      call textureblock
      ; the carry of each fraction add goes into the matching integer
      add edx,retxf1
      adc cl,retxi1
      add ecx,retyf1
      adc dl,retyi1
      add edi,retsp1
    dec c2
    jnz @@l2

    add edx,retxf2
    adc cl,retxi2
    add ecx,retyf2
    adc dl,retyi2
    add edi,retsp2
  dec c1
  jnz @@l1
  jmp @@done

@@vert:
  ; step after every column: textureblock leaves the position yblocks*8
  ; lines down, so go 8 pixels right and all the way back up
  mov eax,yblocks
  neg eax
  imul eax,yx
  add eax,xx
  rol eax,16
  mov retxi1,al
  and eax,not 65535
  mov retxf1,eax

  mov eax,yblocks
  neg eax
  imul eax,yy
  add eax,xy
  rol eax,16
  mov retyi1,al
  and eax,not 65535
  mov retyf1,eax

  ; destination: 8*(1-yblocks*scrwid) = 8 bytes right, one column up.
  ; The one-operand imul also writes edx, which is not live yet.
  mov eax,yblocks
  neg eax
  imul scrwid
  inc eax
  shl eax,3
  mov retsp1,eax

  mov ebx,src
  mov edi,dst

  ; same register layout as in the horizontal case:
  ; edx = x fraction : y integer (dl), ecx = y fraction : x integer (cl)
  mov edx,x0
  rol edx,16
  mov ecx,y0
  rol ecx,16
  xchg cl,dl

  mov eax,xblocks
  mov c1,eax
@@l3:
    ; one call draws a whole column of yblocks*8 lines
    mov esi,yblocks
    shl esi,3
    call textureblock
    add edx,retxf1
    adc cl,retxi1
    add ecx,retyf1
    adc dl,retyi1
    add edi,retsp1
  dec c1
  jnz @@l3

@@done:
  ret
endp

end
