! ppu.s
! shuboy picture processing unit
! sh2 assembly version
! /mic 2009


.section .data

.global _ppu_reset
.global _ppu_mode
.global _ppu_scanline
.global ppu_draw_scanline
.global __PPU_COPY_TO_IWRAM_START__
.global __PPU_COPY_TO_IWRAM_END__
.global _s_ppu_scanline


! If the symbol PPU_USE_LINEBUFFER is defined, a 160-byte buffer in cache will
! be used for rendering the scanline completely and then copying it to the 32X
! framebuffer, instead of rendering directly to the framebuffer.


! ###########################################################################################################

.include "shuboy.inc"

! ###########################################################################################################


.align 2
__PPU_COPY_TO_IWRAM_START__:

! ---------------------------------------------------------------------------
! ppu_draw_scanline
!
! Renders one 160-pixel scanline. The line is first cleared to palette
! entry 0 of REG_BGP; then OBJs whose priority flag (0x80) is set, the
! background, the window and finally OBJs whose priority flag is clear are
! drawn on top of it.
!
! The line number is taken from _s_ppu_scanline, which is post-incremented
! here and wraps back to 0 once it reaches 144.
!
! In:    r14 = IOREGS base (see the register notes at _pds_bg_tile0). It is
!              not loaded in this routine, so the caller has to provide it.
! Stack: pr, the line pointer and the scanline number are pushed. The helper
!        routines read the scanline number from @r15.
! Clobb: may clobber r0-r13 and T
! ---------------------------------------------------------------------------
ppu_draw_scanline:
	sts.l	pr,@-r15

        mov.l   ___cache_ctl,r2
        mov     #0x19,r1
        mov.b   r1,@r2			! Purge cache	

	! r0 = scanline to draw; store (scanline + 1) % 144 for the next call
	mov.l	_at_s_ppu_scanline,r1
	mov.l	@r1,r0
	mov	#144,r2
	mov	r0,r3
	extu.b	r2,r2		! undo the sign extension of the 8-bit immediate
	add	#1,r3
	cmp/eq	r2,r3
	bf	_pds_no_wrap
	mov	#0,r3
_pds_no_wrap:
	mov.l	r3,@r1
	
.ifdef PPU_USE_LINEBUFFER
	mov.l	FRAMEBUFFER1,r13	! r13 = line buffer
.else
	mov	r0,r1
	add 	#40,r1	
	shll8	r1		! r1 = (scanline + 40)*256
	mov.l	FRAMEBUFFER1,r13
	mov	#2,r2
	add	r1,r13
	shll8	r2		! r2 = 0x200
	shlr2	r1		! r1 = (scanline + 40)*64
	add	r2,r13	
	add	r1,r13		
	add 	#80,r13		! r13 = framebuffer + (scanline + 40)*320 + 0x200 + 80
.endif

	! From here on @r15 = scanline and @(4,r15) = start of the line
	mov.l	r13,@-r15	! save r13  	
	mov.l	r0,@-r15	! save r0 (scanline)

	! Palette setup: expand the three palette registers 0x47..0x49 (four
	! 2-bit entries each) into 12 bytes at dmg_palette. Every entry is
	! stored as (entry & 3) + 1.
	mov.l	_dmg_pal1,r12
	mov	#0x47,r3	! REG_BGP
_pds_setup_palettes:
	mov	r3,r0
	mov.b	@(r0,r14),r1	
	mov	#4,r2
_pds_setup_pal_inner:	
	mov	r1,r0
	shlr2	r1
	and	#3,r0
	add 	#1,r0	
	mov.b	r0,@r12
	dt	r2
	bf/s	_pds_setup_pal_inner
	add	#1,r12		! (delay slot) next palette byte
	add	#1,r3
	mov	r3,r0
	cmp/eq	#0x4A,r0
	bf	_pds_setup_palettes

	mov	r12,r10
	add	#-12,r12	! r12 = start of the expanded palette again

	! First clear the scanline with color 0 from REG_BGP
	mov	#4,r0
	mov.l	@(r0,r15),r13	! framebuffer pointer
	mov.b	@r12,r1
	extu.b	r1,r1
	swap.b	r1,r2
	or	r2,r1		! r1 = colour replicated into both bytes of a word
.ifdef PPU_USE_LINEBUFFER
	mov	r1,r2
	shll16	r1
	or	r2,r1		! r1 = colour replicated into all four bytes
	mov	#40,r2		! 40 longwords = 160 pixels
_pds_clear_scanline:
	mov.l	r1,@r13
	dt	r2
	bf/s	_pds_clear_scanline
	add	#4,r13
.else
	mov	#80,r2		! 80 words = 160 pixels
_pds_clear_scanline:
	mov.w	r1,@r13
	dt	r2
	bf/s	_pds_clear_scanline
	add	#2,r13
.endif	
	mov.l	_ppu_draw_obj,r1
	mov	#0x80,r0	! draw background OBJs
	jsr	@r1
	extu.b	r0,r0		! (delay slot) r0 = 0x80 without sign extension
	
	mov	#4,r0
	mov.l	@(r0,r15),r13	! framebuffer pointer
	
	! r5 = offset of the tile map row inside VRAM
	mov	#0x40,r0
	mov.b	@(r0,r14),r1	! REG_LCDC	
	mov	#0x18,r5
	shll8	r5		! r5 = 0x1800
	extu.b	r1,r0
	and	#8,r0
	shll8	r0
	shlr	r0		! r0 = (REG_LCDC & 8)?0x400:0
	add	r0,r5

	mov	#0x42,r0
	mov.b	@(r0,r14),r2	! REG_SCY
	mov.l	@r15,r0		! r0 = scanline
	extu.b	r2,r1
	add	r0,r1		! r1 = scanline + REG_SCY
	extu.b	r1,r0
	and	#0xF8,r0
	shll2	r0		! r0 = (((scanline + REG_SCY) & 0xFF) >> 3) * 32
	add	r0,r5

	extu.b	r1,r0
	and	#7,r0
	add	r0,r0
	mov	r0,r11		! r11 = ((scanline + REG_SCY) & 7) * 2 = row offset inside a tile

	mov	#0x40,r0
	mov.b	@(r0,r14),r1	! REG_LCDC
	shlr	r1		! T = REG_LCDC bit 0
	bt	_pds_bg_on
	bra	_pds_bg_off
	nop			! delay slot: keeps the PC-relative load below out of it
_pds_bg_on:
	! --- leftmost tile, possibly clipped by the fine X scroll ---
	mov.l	_VRAM2,r4
	mov	#0x43,r0
	mov.b	@(r0,r14),r1	! REG_SCX
	extu.b	r1,r0
	shlr2	r0
	shlr	r0
	and	#0x1F,r0	! r0 = (REG_SCX >> 3) & 0x1F
	add	r4,r0
	mov.b	@(r0,r5),r1	! tile number
	extu.b	r1,r1
	
	mov	#0x40,r0
	mov.b	@(r0,r14),r2	! REG_LCDC	
	extu.b	r2,r0
	tst	#0x10,r0
	bt	_pds_tile0_hi_nt
	shll2	r1
	shll2	r1		! r1 = tile * 16 (unsigned tile number)
	mov	r11,r0
	bra	_pds_tile0_got_pat_ptr
	add	r1,r0		! (delay slot)
_pds_tile0_hi_nt:	
	exts.b 	r1,r1		! tile number is signed in this mode
	shll2 	r1
	shll2 	r1
	mov 	#0x10,r2
	shll8 	r2		! r2 = 0x1000
	mov 	r11,r0
	add 	r1,r0
	add 	r2,r0		! r0 = 0x1000 + tile * 16 + row offset
_pds_tile0_got_pat_ptr:
	mov	#0xFF,r3
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r3,r3
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r3,r1
	xor	r3,r2
_pds_bg_tile0:

! r1: low byte of char (inverted)
! r2: high byte of char (inverted)
! r5: mapbase
! r12: palette
! r13: framebuffer pointer
! r14: IOREGS

	mov	#0x43,r0
	mov.b	@(r0,r14),r3	! REG_SCX
	mov	r3,r0
	and	#7,r0
	mov	r0,r3
	
	mov	#0x80,r9
	extu.b	r9,r9
_pds_tile0_shift:
	! r9 = 0x80 >> (REG_SCX & 7): mask of the first visible pixel
	cmp/pl	r3
	bf	_pds_draw_tile0
	shlr	r9
	bra	_pds_tile0_shift	
	add	#-1,r3		! (delay slot)
	
_pds_draw_tile0:	
	xor	#7,r0
	add	#1,r0		! r0 = 8 - (REG_SCX & 7)
	mov	r0,r7		! number of pixels to draw
	mov	r0,r10		! r10 = number of pixels emitted so far
_pds_tile0_xloop:
	tst	r9,r2
	movt	r0		! r0 = (r2 & r9)?0:1
	tst	r9,r1
	rotcl 	r0		! r0 = (r0 << 1) | ((r1 & r9)?0:1))
	tst	r0,r0
	bt/s	_pds_tile0_transparent	! colour 0 keeps what is already in the line
	shlr	r9		! (delay slot) next pixel mask
	mov.b	@(r0,r12),r3	! palette lookup
	mov.b	r3,@r13
_pds_tile0_transparent:	
	dt	r7
	bf/s	_pds_tile0_xloop
	add	#1,r13

	! --- full 8-pixel tiles until at least 152 pixels have been emitted ---
	mov	#8,r6		! r6 = X offset (in pixels) of the tile to fetch
_pds_tile_loop:
	mov.l	_VRAM2,r4
	mov	#0x43,r0
	mov.b	@(r0,r14),r1	! REG_SCX
	extu.b	r1,r0
	add	r6,r0
	shlr2	r0
	shlr	r0
	and	#0x1F,r0	! r0 = ((REG_SCX + r6) >> 3) & 0x1F
	add	r4,r0
	mov.b	@(r0,r5),r1	! tile number
	extu.b	r1,r1
	mov	#0x40,r0
	mov.b	@(r0,r14),r2	! REG_LCDC
	extu.b	r2,r0
	tst	#0x10,r0
	bt	_pds_loop_hi_nt
	shll2	r1
	shll2	r1
	mov	r11,r0
	bra	_pds_loop_got_pat_ptr
	add	r1,r0		! (delay slot)
_pds_loop_hi_nt:	
	exts.b 	r1,r1
	shll2 	r1
	shll2 	r1
	mov 	#0x10,r2
	shll8 	r2
	mov 	r11,r0
	add 	r1,r0
	add 	r2,r0
_pds_loop_got_pat_ptr:
	mov	#0xFF,r3
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r3,r3
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r3,r1
	xor	r3,r2
	
	mov	#0x80,r9
	mov	#8,r7
	extu.b	r9,r9
_pds_tile_xloop:
	tst	r9,r2
	movt	r0
	tst	r9,r1
	rotcl 	r0		! r0 = 2-bit colour number of this pixel
	tst	r0,r0
	bt/s	_pds_tile_transparent
	shlr	r9		! (delay slot)
	mov.b	@(r0,r12),r3
	mov.b	r3,@r13
_pds_tile_transparent:	
	dt	r7
	bf/s	_pds_tile_xloop
	add	#1,r13
	mov	#152,r0
	add	#8,r6
	add	#8,r10
	extu.b	r0,r0
	cmp/hs	r0,r10		! emitted >= 152 ?
	bf	_pds_tile_loop

	! --- rightmost tile: only the pixels still missing from the line ---
	mov.l	_VRAM2,r4
	mov	#0x43,r0
	mov.b	@(r0,r14),r1	! REG_SCX
	extu.b	r1,r0
	add 	r10,r0
	shlr2	r0
	shlr	r0
	and	#0x1F,r0	! r0 = ((REG_SCX + x)>> 3) & 0x1F
	add	r4,r0
	mov.b	@(r0,r5),r1	! tile number
	extu.b	r1,r1
	mov	#0x40,r0
	mov.b	@(r0,r14),r2	! REG_LCDC	
	extu.b	r2,r0
	tst	#0x10,r0
	bt	_pds_tile19_hi_nt
	shll2	r1
	shll2	r1
	mov	r11,r0
	bra	_pds_tile19_got_pat_ptr
	add	r1,r0		! (delay slot)
_pds_tile19_hi_nt:	
	exts.b 	r1,r1
	shll2 	r1
	shll2 	r1
	mov 	#0x10,r2
	shll8 	r2
	mov 	r11,r0
	add 	r1,r0
	add 	r2,r0
_pds_tile19_got_pat_ptr:
	mov	#0xFF,r3
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r3,r3
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r3,r1
	xor	r3,r2
_pds_bg_tile19:
	mov	#0x43,r0
	mov.b	@(r0,r14),r3	! REG_SCX
	mov	r3,r0
	and	#7,r0
	mov	#0x80,r9
	extu.b	r9,r9
	mov	r0,r7
	tst	r7,r7
	movt	r0
	shll2	r0
	shll	r0
	add	r0,r7		! r7 = (REG_SCX & 7) ? (REG_SCX & 7) : 8
_pds_tile19_xloop:
	tst	r7,r7
	bt	_pds_bg_off
	tst	r9,r2
	movt	r0
	tst	r9,r1
	rotcl 	r0
	tst	r0,r0
	bt/s	_pds_tile19_transparent
	shlr	r9		! (delay slot)
	mov.b	@(r0,r12),r3
	mov.b	r3,@r13
_pds_tile19_transparent:	
	dt	r7
	bra	_pds_tile19_xloop
	add	#1,r13		! (delay slot)
	
_pds_bg_off:

	bsr	ppu_draw_window
	nop

	mov.l	_ppu_draw_obj,r1
	nop
	jsr	@r1
	mov	#0x00,r0	! draw foreground OBJs

	mov.l	@r15+,r0		! r0 = scanline
	mov.l	@r15+,r13		! r13 = start of the line

.ifdef PPU_USE_LINEBUFFER
	! Copy the finished line buffer to its place in the framebuffer
	mov	r0,r1
	add 	#40,r1	
	shll8	r1		! r1 = (scanline + 40)*256
	mov.l	FRAMEBUFFER1_,r13
	mov	#2,r2
	add	r1,r13
	shll8	r2		! r2 = 0x200
	shlr2	r1		! r1 = (scanline + 40)*64
	add	r2,r13	
	add	r1,r13		
	add 	#80,r13	
	mov.l	FRAMEBUFFER1,r12
	mov	#80,r2		! 80 words = 160 pixels
_pds_copy_scanline:
	mov.w	@r12+,r1
	mov.w	r1,@r13
	dt	r2
	bf/s	_pds_copy_scanline
	add	#2,r13
.endif

	lds.l	@r15+,pr
	nop
	rts
	nop
	

! Literal pool and private data of ppu_draw_scanline
.align 2
.ifdef PPU_USE_LINEBUFFER
! With the line buffer enabled, rendering targets FRAMEBUFFER1 and the
! finished line is copied to the address held in FRAMEBUFFER1_.
FRAMEBUFFER1:	.long 0xC0000760 
FRAMEBUFFER1_:	.long 0x24000000
.else
FRAMEBUFFER1:	.long 0x24000000
.endif
_VRAM2:		.long VRAM  !+ UNCACHED_ADDRESS
! Address of ppu_draw_objects inside the copy of this code that lives at
! PPU_CACHE_COPY_ADR (offset relative to __PPU_COPY_TO_IWRAM_START__)
_ppu_draw_obj:	.long PPU_CACHE_COPY_ADR + ppu_draw_objects - __PPU_COPY_TO_IWRAM_START__
_dmg_pal1:	.long dmg_palette  
___cache_ctl:   .long 0xFFFFFE92
! Expanded palettes. ppu_draw_scanline writes the first 12 bytes: four bytes
! each for IO registers 0x47, 0x48 and 0x49.
dmg_palette:	.long 0,0,0, 0,0,0,0,0,0,0,0
_at_s_ppu_scanline: .long _s_ppu_scanline
! Next scanline to be drawn by ppu_draw_scanline (0..143)
_s_ppu_scanline: .long 0


! ###########################################################################################################

! ---------------------------------------------------------------------------
! ppu_draw_window
!
! Draws the window layer of the current scanline if REG_LCDC bit 5 is set
! and scanline >= REG_WY; otherwise returns immediately.
!
! In:    r12  = expanded palette (indexed with the 2-bit colour number)
!        r14  = IOREGS base
!        @r15 = scanline number
! Clobb: r0-r7, r9-r11, r13, T
! Leaf routine: pr is not saved.
! ---------------------------------------------------------------------------
ppu_draw_window:
	mov	#0x40,r0
	mov.b	@(r0,r14),r1	! REG_LCDC
	extu.b	r1,r0
	tst	#0x20,r0	! window enable bit
	bt	_pdw_wnd_off
	mov	#0x4A,r0
	mov.b	@(r0,r14),r1	! REG_WY
	extu.b	r1,r1
	mov.l	@r15,r0		! r0 = scanline
	cmp/hs	r1,r0		! scanline >= REG_WY
	bf	_pdw_wnd_off
	bra	_pdw_wnd_on
	nop
_pdw_wnd_off:
	rts
	nop
_pdw_wnd_on:
.ifdef PPU_USE_LINEBUFFER
	mov.l	FRAMEBUFFER4,r13
.else
	add 	#40,r0
	shll8	r0		! r0 = (scanline + 40)*256
	mov.l	FRAMEBUFFER4,r13
	mov	#2,r2
	add	r0,r13
	shll8	r2		! r2 = 0x200
	shlr2	r0		! r0 = (scanline + 40)*64
	add	r2,r13	
	add	r0,r13		
	add 	#80,r13		! r13 = framebuffer + (scanline + 40)*320 + 0x200 + 80
.endif
	mov	#0x4B,r0
	mov.b	@(r0,r14),r1	! IO register 0x4B (REG_WX per the comment below)
	extu.b	r1,r1
	add	#-6,r1		! NOTE(review): hardware documentation places the window at WX-7 -- confirm that -6 is intended
	extu.b	r1,r10		! r10 = X position of the window on the line (0..255)
	add	r1,r13		! r13 = start of the line + REG_WX - 6

	! r5 = offset of the window tile map row inside VRAM
	mov	#0x40,r0
	mov.b	@(r0,r14),r1	! REG_LCDC	
	mov	#0x18,r5
	shll8	r5		! r5 = 0x1800
	extu.b	r1,r0
	and	#0x40,r0
	shll2	r0
	shll2	r0		! r0 = (REG_LCDC & 0x40)?0x400:0
	add	r0,r5

	mov	#0x4A,r0
	mov.b	@(r0,r14),r1	! REG_WY
	mov.l	@r15,r0		! r0 = scanline
	extu.b	r1,r2
	sub	r2,r0		! r0 = scanline - REG_WY
	extu.b	r0,r0
	extu.b	r0,r1		! r1 = copy of the window line
	and	#0xF8,r0
	shll2	r0		! r0 = (window line >> 3) * 32
	add	r0,r5

	extu.b	r1,r0
	and	#7,r0
	add	r0,r0
	mov	r0,r11		! r11 = (window line & 7) * 2 = row offset inside a tile

	mov	#0,r6		! r6 = X offset (in pixels) inside the window
_pdw_tile_loop:
	mov.l	_VRAM4,r4
	mov	#152,r0
	extu.b	r0,r0
	cmp/hi	r0,r10		! stop once the screen X position exceeds 152
	bt	_pdw_tile_loop_end
	mov	r6,r0
	shlr2	r0
	shlr	r0
	and	#0x1F,r0	! r0 = (r6 >> 3) & 0x1F = tile column inside the window map
	add	r4,r0
	mov.b	@(r0,r5),r1	! tile number
	extu.b	r1,r1
	mov	#0x40,r0
	mov.b	@(r0,r14),r2	! REG_LCDC
	extu.b	r2,r0
	tst	#0x10,r0
	bt	_pdw_loop_hi_nt
	shll2	r1
	shll2	r1		! r1 = tile * 16 (unsigned tile number)
	mov	r11,r0
	bra	_pdw_loop_got_pat_ptr
	add	r1,r0		! (delay slot)
_pdw_loop_hi_nt:	
	exts.b 	r1,r1		! tile number is signed in this mode
	shll2 	r1
	shll2 	r1
	mov 	#0x10,r2
	shll8 	r2		! r2 = 0x1000
	mov 	r11,r0
	add 	r1,r0
	add 	r2,r0
_pdw_loop_got_pat_ptr:
	mov	#0xFF,r3
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r3,r3
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r3,r1		! both pattern bytes are inverted so that
	xor	r3,r2		! tst/movt below yields the pixel bit directly
	
	mov	#0x80,r9	! r9 = pixel mask, leftmost pixel first
	mov	#8,r7
	extu.b	r9,r9
_pdw_tile_xloop:
	tst	r9,r2
	movt	r0
	tst	r9,r1
	rotcl 	r0		! r0 = 2-bit colour number of this pixel
	tst	r0,r0
	bt/s	_pdw_tile_transparent	! colour 0 is skipped
	shlr	r9		! (delay slot) next pixel mask
	mov.b	@(r0,r12),r3	! palette lookup
	mov.b	r3,@r13
_pdw_tile_transparent:	
	dt	r7
	bf/s	_pdw_tile_xloop
	add	#1,r13		! (delay slot)
	mov	#144,r0
	add	#8,r6
	extu.b	r0,r0
	add	#8,r10
	cmp/hi	r0,r6		! leave the loop once r6 > 144
	bf	_pdw_tile_loop
_pdw_tile_loop_end:
	rts
	nop
	

! Literal pool of ppu_draw_window
.align 2
.ifdef PPU_USE_LINEBUFFER
FRAMEBUFFER4:	.long 0xC0000760	! same value as FRAMEBUFFER1 (line buffer)
.else
FRAMEBUFFER4:	.long 0x24000000	! same value as FRAMEBUFFER1 (framebuffer)
.endif
_VRAM4:		.long VRAM !+ UNCACHED_ADDRESS


! ###########################################################################################################


! ---------------------------------------------------------------------------
! ppu_draw_objects
!
! Draws the OBJs (sprites) that intersect the current scanline and whose
! priority flag (Obj.Flags & 0x80) equals r0. Does nothing when REG_LCDC
! bit 1 is clear. REG_LCDC bit 2 selects the 8x16 code path.
!
! The OAM is walked in steps of 4 bytes: Y, X, tile, flags.
! An object is skipped when ((Y - 16) & 0xFF) > scanline or when
! ((X - 8) & 0xFF) >= 160.
!
! In:    r0   = priority to draw (0x80 or 0x00)
!        r12  = expanded palette; the object palettes are read from r12 + 4
!               and r12 + 8
!        r14  = IOREGS base
!        @r15 = scanline number
! Clobb: r0-r5, r7-r11, r13, T
! Leaf routine: pr is not saved.
! ---------------------------------------------------------------------------
ppu_draw_objects:
	mov	r0,r3		! priority

.ifdef PPU_USE_LINEBUFFER
	mov.l	FRAMEBUFFER5,r13
.else
	mov.l	@r15,r0		! scanline
	add 	#40,r0
	shll8	r0		! r0 = (scanline + 40)*256
	mov.l	FRAMEBUFFER5,r13
	mov	#2,r2
	add	r0,r13
	shll8	r2		! r2 = 0x200
	shlr2	r0		! r0 = (scanline + 40)*64
	add	r2,r13	
	add	r0,r13		
	add 	#80,r13		! r13 = framebuffer + (scanline + 40)*320 + 0x200 + 80
.endif

	mov	#0x40,r0
	mov.b	@(r0,r14),r1	! REG_LCDC
	shlr	r1
	shlr	r1
	bt	_pdo_obj_on	! check REG_LCDC bit 1 (OBJ Enable)
	rts
	nop
_pdo_obj_on:
	shlr	r1
	bt	_pdo_size_8x16	! check REG_LCDC bit 2 (OBJ Size)
	mov	#0,r9		! loop counter
	mov.l	_OAM5,r11
_pdo_8x8_loop:
	mov	r9,r0

	add	#3,r0
	mov.b	@(r0,r11),r4	! Obj.Flags
	extu.b	r4,r0
	and	#0x80,r0
	cmp/eq	r0,r3		! do the priorities match ?	
	bf/s	_pdo_8x8_next
	mov	r9,r0		! (delay slot) r0 = offset of this OAM entry
	
	mov.b	@(r0,r11),r1	! Obj.Y
	add	#-16,r1
	extu.b	r1,r1		! r1 = (Obj.Y - 16) & 0xFF = first line of the object
	mov.l	@r15,r2		! scanline
	cmp/hs	r1,r2		! Obj.Y <= scanline ?
	bf	_pdo_8x8_next
	add	#7,r1
	cmp/hs	r2,r1		! Obj.Y+7 >= scanline ?
	bf	_pdo_8x8_next
	add	#-7,r1
	sub	r1,r2		! r2 = scanline - Obj.Y
	add	#1,r0
	mov.b	@(r0,r11),r1	! Obj.X
	add	#-8,r1
	extu.b	r1,r1		! r1 = (Obj.X - 8) & 0xFF = leftmost pixel of the object
!	add	#2,r0
!	mov.b	@(r0,r11),r4	! Obj.Flags
!	extu.b	r4,r0
!	and	#0x80,r0
!	cmp/eq	r0,r3		! do the priorities match ?	
!	bf	_pdo_8x8_next
	mov	#160,r0
	extu.b	r0,r0
	cmp/hs	r0,r1		! Obj.X >= 160 ?
	bt	_pdo_8x8_next
	extu.b	r4,r0
	tst	#0x40,r0	! check vflip
	bt	_pdo_8x8_no_vflip
	mov	#7,r0
	xor	r0,r2		! mirror the row inside the object
_pdo_8x8_no_vflip:
	mov	r9,r0
	add	#2,r0
	mov.b	@(r0,r11),r5	! Obj.Tile
	add	r13,r1
	mov	r1,r8		! r8 = start of the line + Obj.X - 8
	extu.b	r4,r0
	and	#0x10,r0
	shlr2	r0		! r0 = (Obj.Flags & 0x10) ? 4 : 0
	add	r12,r0
	add	#4,r0
	mov	r0,r10		! object palette

	extu.b	r4,r0
	tst	#0x20,r0	! check hflip
	bf	_pdo_8x8_hflip
	
	extu.b	r5,r0
	shll2	r0
	shll2	r0
	mov.l	_VRAM5,r4
	add	r2,r0
	add	r2,r0		! r0 = (Obj.Tile << 4) + ObjLine * 2
	mov	#0xFF,r5
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r5,r5
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r5,r1		! both pattern bytes are inverted so that
	xor	r5,r2		! tst/movt below yields the pixel bit directly
	mov	#0x80,r4	! r4 = pixel mask, leftmost pixel first
	mov	#8,r5
	extu.b	r4,r4
_pdo_8x8_xloop:
	tst	r4,r2
	movt	r0
	tst	r4,r1
	rotcl 	r0		! r0 = 2-bit colour number of this pixel
	tst	r0,r0
	bt/s	_pdo_8x8_transparent	! colour 0 is transparent
	shlr	r4		! (delay slot) next pixel mask
	mov.b	@(r0,r10),r7	! palette lookup
	mov.b	r7,@r8
_pdo_8x8_transparent:
	dt	r5
	bf/s	_pdo_8x8_xloop
	add	#1,r8		! (delay slot)
_pdo_8x8_next:
	mov	#160,r0		! 40 OAM entries * 4 bytes
	add	#4,r9
	extu.b	r0,r0
	cmp/hs	r0,r9
	bf	_pdo_8x8_loop
	rts
	nop

_pdo_8x8_hflip:
	! Same as above, but the pattern is scanned from bit 0 upwards
	extu.b	r5,r0
	shll2	r0
	shll2	r0
	mov.l	_VRAM5,r4
	add	r2,r0
	add	r2,r0		! r0 = (Obj.Tile << 4) + ObjLine * 2
	mov	#0xFF,r5
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r5,r5
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r5,r1
	xor	r5,r2
	mov	#0x01,r4	! r4 = pixel mask, rightmost pattern bit first
	mov	#8,r5
_pdo_8x8_hflip_xloop:
	tst	r4,r2
	movt	r0
	tst	r4,r1
	rotcl 	r0
	tst	r0,r0
	bt/s	_pdo_8x8_hflip_transparent
	shll	r4		! (delay slot) next pixel mask
	mov.b	@(r0,r10),r7
	mov.b	r7,@r8
_pdo_8x8_hflip_transparent:
	dt	r5
	bf/s	_pdo_8x8_hflip_xloop
	add	#1,r8		! (delay slot)
	mov	#160,r0
	add	#4,r9
	extu.b	r0,r0
	cmp/hs	r0,r9
	bf	_pdo_8x8_loop
	rts
	nop
	
_pdo_size_8x16:
	! 8x16 variant: 16 rows per object, bit 0 of the tile number is ignored
	mov	#0,r9		! loop counter
	mov.l	_OAM5,r11
_pdo_8x16_loop:
	mov	r9,r0

	add	#3,r0
	mov.b	@(r0,r11),r4	! Obj.Flags
	extu.b	r4,r0
	and	#0x80,r0
	cmp/eq	r0,r3		! do the priorities match ?	
	bf/s	_pdo_8x16_next
	mov	r9,r0		! (delay slot) r0 = offset of this OAM entry
	
	mov.b	@(r0,r11),r1	! Obj.Y
	add	#-16,r1
	extu.b	r1,r1
	mov.l	@r15,r2		! scanline
	cmp/hs	r1,r2		! Obj.Y <= scanline ?
	bf	_pdo_8x16_next
	add	#15,r1
	cmp/hs	r2,r1		! Obj.Y+15 >= scanline ?
	bf	_pdo_8x16_next
	add	#-15,r1
	sub	r1,r2		! r2 = scanline - Obj.Y
	add	#1,r0
	mov.b	@(r0,r11),r1	! Obj.X
	add	#-8,r1
	extu.b	r1,r1
!	add	#2,r0
!	mov.b	@(r0,r11),r4	! Obj.Flags
!	extu.b	r4,r0
!	and	#0x80,r0
!	cmp/eq	r0,r3		! do the priorities match ?	
!	bf	_pdo_8x16_next
	mov	#160,r0
	extu.b	r0,r0
	cmp/hs	r0,r1		! Obj.X >= 160 ?
	bt	_pdo_8x16_next
	extu.b	r4,r0
	tst	#0x40,r0	! check vflip
	bt	_pdo_8x16_no_vflip
	mov	#15,r0
	xor	r0,r2		! mirror the row inside the object
_pdo_8x16_no_vflip:
	mov	r9,r0
	add	#2,r0
	mov.b	@(r0,r11),r5	! Obj.Tile
	add	r13,r1
	mov	r1,r8		! r8 = start of the line + Obj.X - 8
	extu.b	r4,r0
	and	#0x10,r0
	shlr2	r0		! r0 = (Obj.Flags & 0x10) ? 4 : 0
	add	r12,r0
	add	#4,r0
	mov	r0,r10		! object palette

	extu.b	r4,r0
	tst	#0x20,r0	! check hflip
	bf	_pdo_8x16_hflip
	
	extu.b	r5,r0
	and	#0xFE,r0	! ignore bit 0 of the tile number
	shll2	r0
	shll2	r0
	mov.l	_VRAM5,r4
	add	r2,r0
	add	r2,r0		! r0 = ((Obj.Tile & 0xFE) << 4) + ObjLine * 2
	mov	#0xFF,r5
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r5,r5
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r5,r1
	xor	r5,r2
	mov	#0x80,r4
	mov	#8,r5
	extu.b	r4,r4
_pdo_8x16_xloop:
	tst	r4,r2
	movt	r0
	tst	r4,r1
	rotcl 	r0
	tst	r0,r0
	bt/s	_pdo_8x16_transparent
	shlr	r4		! (delay slot)
	mov.b	@(r0,r10),r7
	mov.b	r7,@r8
_pdo_8x16_transparent:
	dt	r5
	bf/s	_pdo_8x16_xloop
	add	#1,r8		! (delay slot)
_pdo_8x16_next:
	mov	#160,r0
	add	#4,r9
	extu.b	r0,r0
	cmp/hs	r0,r9
	bf	_pdo_8x16_loop
	rts
	nop

_pdo_8x16_hflip:
	extu.b	r5,r0
	and	#0xFE,r0
	shll2	r0
	shll2	r0
	mov.l	_VRAM5,r4
	add	r2,r0
	add	r2,r0		! r0 = ((Obj.Tile & 0xFE) << 4) + ObjLine * 2
	mov	#0xFF,r5
	mov.b	@(r0,r4),r1	! low byte of char
	extu.b	r5,r5
	add	#1,r0
	mov.b	@(r0,r4),r2	! high byte of char
	xor	r5,r1
	xor	r5,r2
	mov	#0x01,r4
	mov	#8,r5
_pdo_8x16_hflip_xloop:
	tst	r4,r2
	movt	r0
	tst	r4,r1
	rotcl 	r0
	tst	r0,r0
	bt/s	_pdo_8x16_hflip_transparent
	shll	r4		! (delay slot)
	mov.b	@(r0,r10),r7
	mov.b	r7,@r8
_pdo_8x16_hflip_transparent:
	dt	r5
	bf/s	_pdo_8x16_hflip_xloop
	add	#1,r8		! (delay slot)
	mov	#160,r0
	add	#4,r9
	extu.b	r0,r0
	cmp/hs	r0,r9
	bf	_pdo_8x16_loop
	rts
	nop

! Literal pool of ppu_draw_objects
.align 2
.ifdef PPU_USE_LINEBUFFER
FRAMEBUFFER5:	.long 0xC0000760	! same value as FRAMEBUFFER1 (line buffer)
.else
FRAMEBUFFER5:	.long 0x24000000	! same value as FRAMEBUFFER1 (framebuffer)
.endif
_OAM5:		.long OAM !+ UNCACHED_ADDRESS
_VRAM5:		.long VRAM !+ UNCACHED_ADDRESS

! approx 1500 bytes in total

__PPU_COPY_TO_IWRAM_END__:


! ###########################################################################################################


! ---------------------------------------------------------------------------
! _ppu_reset
!
! Puts the PPU state back to its initial values:
!   _ppu_currmode = 0, _ppu_scanline = 0, REG_LY = 0, REG_LCDC = 0x91
!
! In:    nothing
! Clobb: r0, r1
! ---------------------------------------------------------------------------
.align 1
_ppu_reset:
	mov	#0,r1			! r1 = 0 for the three cleared locations
	mova	_ppu_currmode,r0
	mov.w	r1,@r0			! _ppu_currmode = 0
	mov.l	_ppu_scanline2,r0
	mov.w	r1,@r0			! _ppu_scanline = 0
	mov.l	_REG_LY2,r0
	mov.b	r1,@r0			! REG_LY = 0
	mov	#0x91,r1		! only the low byte is stored below
	mov.l	_REG_LCDC2,r0
	rts
	mov.b	r1,@r0			! (delay slot) REG_LCDC = 0x91

! Literal pool of _ppu_reset
.align 2
_REG_LCDC2:	.long IOREGS+0x40 + UNCACHED_ADDRESS
_REG_LY2:	.long IOREGS+0x44 + UNCACHED_ADDRESS


! ###########################################################################################################

! ---------------------------------------------------------------------------
! _ppu_mode
!
! Switches the PPU to the mode given in r0 and updates REG_STAT accordingly.
!
!  - mode 0, and the current mode is not 0: advances _ppu_scanline.
!      line 144       -> mode becomes 1, bit 0 of REG_IF is set and, if
!                        enabled in REG_IE and IME is set, _cpu_rst is
!                        called with r0 = 0x40
!      line 152 / 153 -> mode becomes 2 / 3
!      line 154       -> _ppu_scanline wraps to 0
!    Afterwards REG_LY is written and compared with REG_LYC. On a match the
!    coincidence flag (REG_STAT bit 2) is set, otherwise cleared. If
!    REG_STAT bit 6 is set as well, bit 1 of REG_IF is set and, if enabled,
!    _cpu_rst is called with r0 = 0x48.
!  - mode 3: for scanlines below 144 the draw-scanline command is sent
!    through the routine at SLAVECTL_CACHE_COPY_ADR (r0 = command,
!    r1 = scanline).
!  - any other case: only the mode is stored.
!
! In:    r0  = new mode
!        r13 = emulated CPU state word containing the CPU_IME bit
! Out:   _ppu_currmode and the low two bits of REG_STAT hold the mode
! Clobb: r0-r2, r14 (left pointing at the uncached IOREGS), T; r13 is
!        ANDed with 0x0F when an interrupt is dispatched. r7 and pr are
!        preserved. Whatever _cpu_rst and the slave command routine clobber
!        is not visible here.
! ---------------------------------------------------------------------------
.align 1
_ppu_mode:
	sts.l	pr,@-r15
	mov.l	r7,@-r15
	
	mov	r0,r7		! r7 = mode that will be stored at the end
	mov.l	_IOREGS3,r14
	
	cmp/eq	#0,r0
	bt	_pm_cmp_with_currmode
	bra	_pm_not_mode0
	nop	
_pm_cmp_with_currmode:	
	mov.w	_ppu_currmode,r1
	cmp/eq	r0,r1
	bf	_pm_check_if_last_active_line
	bra	_pm_same_mode
	nop
_pm_check_if_last_active_line:	

	! Entering mode 0 from another mode: one more scanline is done
	mov.l	_ppu_scanline2,r0
	mov.w	@r0,r1
	add	#1,r1
	mov	#144,r2
	mov.w	r1,@r0
	extu.b	r2,r2		! undo the sign extension of the 8-bit immediate
	cmp/eq	r2,r1
	bf	_pm_not_vblank_start
	mov	#1,r7		! line 144: mode 1
	mov.b	@(0x0F,r14),r0	! REG_IF
	or	#1,r0
	mov.b	r0,@(0x0F,r14)	! REG_IF
	mov	#0xFF,r0
	extu.b	r0,r0
	mov.b	@(r0,r14),r1	! REG_IE
	extu.b	r1,r0
	tst	#1,r0
	bt/s	_pm_no_vblank_int
	mov	r13,r0		! (delay slot)
	tst	#CPU_IME,r0	! check IME
	bt	_pm_no_vblank_int
	mov.b	@(0x0F,r14),r0
	and	#0xFE,r0
	mov.b	r0,@(0x0F,r14)	! REG_IF
	mov.l	_cpu_rst2,r1
	jsr	@r1
	mov	#0x40,r0	! (delay slot) r0 = 0x40 for _cpu_rst
	mov	#0x0F,r0
	and	r0,r13		! clear IME
_pm_no_vblank_int:
	bra	_pm_check_hblank_irq
	nop
_pm_not_vblank_start:
	mov.l	_ppu_scanline2,r0
	mov.w	@r0,r1
	mov	#152,r0
	extu.b	r0,r0
	cmp/eq	r0,r1
	bf	_pm_not_line152
	bra	_pm_check_hblank_irq
	mov	#2,r7		! (delay slot) line 152: mode 2
_pm_not_line152:
	add	#1,r0
	cmp/eq	r0,r1
	bf	_pm_not_line153
	bra	_pm_check_hblank_irq
	mov	#3,r7		! (delay slot) line 153: mode 3
_pm_not_line153:
	add	#1,r0
	cmp/eq	r0,r1
	bf	_pm_check_hblank_irq
	mov.l	_ppu_scanline2,r0
	mov	#0,r1
	mov.w	r1,@r0		! line 154: start over at line 0
_pm_check_hblank_irq:

! The unconditional branch below skips the H-blank interrupt code, which is
! therefore never executed.
bra _pm_no_hblank_int
nop
	tst	r7,r7
	bf	_pm_check_ly_irq
	mov	#0x41,r0
	mov.b	@(r0,r14),r1	! REG_STAT
	extu.b	r1,r0
	tst	#8,r0
	bt	_pm_check_ly_irq

	mov.b	@(0x0F,r14),r0	! REG_IF
	or	#2,r0
	mov.b	r0,@(0x0F,r14)	! REG_IF
	mov	#0xFF,r0
	extu.b	r0,r0
	mov.b	@(r0,r14),r1	! REG_IE
	extu.b	r1,r0
	tst	#2,r0
	bt/s	_pm_no_hblank_int
	mov	r13,r0
	tst	#CPU_IME,r0	! check IME
	bt	_pm_no_hblank_int
	mov.b	@(0x0F,r14),r0
	and	#0xFD,r0
	mov.b	r0,@(0x0F,r14)	! REG_IF
	mov.l	_cpu_rst2,r1
	jsr	@r1
	mov	#0x48,r0
	mov	#0x0F,r0
	and	r0,r13		! clear IME
_pm_no_hblank_int:

_pm_check_ly_irq:
	mov.l	_ppu_scanline2,r0
	mov.w	@r0,r1
	mov	#0x44,r0
	mov.b	r1,@(r0,r14)	! REG_LY
	add	#1,r0
	mov.b	@(r0,r14),r2	! REG_LYC
	extu.b	r2,r2
	cmp/eq	r2,r1
	bf	_pm_no_lyc_match

	! LY == LYC: the coincidence flag is set no matter whether the LYC
	! interrupt is selected, so that polling REG_STAT bit 2 works.
	mov	#0x41,r0
	mov.b	@(r0,r14),r1	! REG_STAT
	extu.b	r1,r0
	or	#4,r0
	extu.b	r0,r1
	mov	#0x41,r0
	mov.b	r1,@(r0,r14)	! REG_STAT
	mov	r1,r0
	tst	#0x40,r0	! LYC interrupt selected ?
	bt	_pm_no_ly_int
	mov.b	@(0x0F,r14),r0	! REG_IF
	or	#2,r0
	mov.b	r0,@(0x0F,r14)	! REG_IF

	mov	#0xFF,r0
	extu.b	r0,r0
	mov.b	@(r0,r14),r1	! REG_IE
	extu.b	r1,r0
	tst	#2,r0
	bt/s	_pm_no_ly_int
	mov	r13,r0		! (delay slot)
	tst	#CPU_IME,r0	! check IME
	bt	_pm_no_ly_int
	mov.b	@(0x0F,r14),r0
	and	#0xFD,r0
	mov.b	r0,@(0x0F,r14)	! REG_IF
	mov.l	_cpu_rst2,r1
	jsr	@r1
	mov	#0x48,r0	! (delay slot) r0 = 0x48 for _cpu_rst
	mov	#0x0F,r0
	and	r0,r13		! clear IME
_pm_no_ly_int:
	nop
	bra	_pm_same_mode
	nop
_pm_no_lyc_match:
	mov	#0x41,r0
	mov.b	@(r0,r14),r1	! REG_STAT
	extu.b	r1,r0
	and	#0xFB,r0	! clear the coincidence flag
	extu.b	r0,r1
	mov	#0x41,r0
	mov.b	r1,@(r0,r14)	! REG_STAT
	bra	_pm_same_mode
	nop
_pm_not_mode0:

	cmp/eq	#3,r0
	bf	_pm_same_mode
	mov.l	_ppu_scanline2,r1
	mov.w	@r1,r0
	mov	#144,r1
	extu.b	r1,r1
	cmp/hs	r1,r0		! scanline >= 144 ?
	bt	_pm_dont_draw
	mov	r0,r1		! r1 = scanline
	mov.l	_slavectl_send_command_,r2
	mov	#SLAVE_CMD_DRAW_SCANLINE,r0
	jsr	@r2
	nop
_pm_dont_draw:	

_pm_same_mode:

	mov.l	_ppu_currmode2,r0
	mov.w	r7,@r0		! _ppu_currmode = r7

	! REG_STAT = (REG_STAT & 0xFC) | (r7 & 3)
	mov	#0x41,r0
	mov.b	@(r0,r14),r1	! REG_STAT
	extu.b	r1,r0
	and	#0xFC,r0
	extu.b	r0,r1
	mov	r7,r0
	and	#3,r0
	or	r0,r1
	mov	#0x41,r0
	mov.b	r1,@(r0,r14)	! REG_STAT

	mov.l	@r15+,r7	
	lds.l	@r15+,pr
	nop
	rts
	nop

! ###########################################################################################################


! PPU state. Both variables are accessed as 16-bit values at their label;
! the second halfword of each entry is not accessed in this file.
.align 2
_ppu_currmode:	.short 0,0	! mode last stored by _ppu_mode
_ppu_scanline:	.short 0,0	! scanline counter maintained by _ppu_mode

! Literal pool of _ppu_mode (_ppu_scanline2 is also used by _ppu_reset)
.align 2
.long 0				! not referenced through any label in this file
_cpu_rst2:	.long _cpu_rst
_IOREGS3:	.long IOREGS + UNCACHED_ADDRESS
!_ppu_draw_scanline: .long ppu_draw_scanline
_ppu_scanline2:	.long _ppu_scanline + UNCACHED_ADDRESS
_ppu_currmode2:	.long _ppu_currmode
_slavectl_send_command_: .long SLAVECTL_CACHE_COPY_ADR !_slavectl_send_command
