Dreamcast: Optimise vertex transform by a couple of cycles

This commit is contained in:
UnknownShadow200 2024-06-18 08:36:47 +10:00
parent 74f3c424e8
commit 6153ff8c8a
4 changed files with 48 additions and 34 deletions

View File

@ -7,13 +7,10 @@ _DrawColouredQuads:
! Setup
fldi0 fr1 ! U = 0
fldi0 fr2 ! V = 0
mov r4,r3 ! r3 = src
add #-32, r5 ! r5 -= sizeof(VERTEX)
TransformSetup
ViewportTransformSetup _VP_COL_HWIDTH
.TRANSFORM_QUAD:
mov.l CMD_COL_VERT, r1 ! r1 = GPU VERT command
LoadColouredVertex
ProcessVertex1
@ -24,7 +21,7 @@ _DrawColouredQuads:
ProcessVertex3
LoadColouredVertex
ProcessVertex4 CMD_COL_EOS
ProcessVertex4
! CLIPFLAGS TESTING
cmp/eq #0,r0 ! T = r0 == 0 (all points invisible)
@ -44,13 +41,9 @@ _DrawColouredQuads:
bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
nop
add #32, r5 ! r5 += sizeof(VERTEX)
rts ! return after executing instruction in delay slot
mov r5,r0 ! r0 = r5
TransformEnd
.align 2
CMD_COL_VERT: .long 0xe0000000
CMD_COL_EOS: .long 0xf0000000
.align 4
.global _VP_COL_HWIDTH
_VP_COL_HWIDTH: .long 0

View File

@ -5,13 +5,10 @@
_DrawTexturedQuads:
! Setup
mov r4,r3 ! r3 = src
add #-32, r5 ! r5 -= sizeof(VERTEX)
TransformSetup
ViewportTransformSetup _VP_TEX_HWIDTH
.TRANSFORM_QUAD:
mov.l CMD_TEX_VERT, r1 ! r1 = GPU VERT command
LoadTexturedVertex
ProcessVertex1
@ -22,7 +19,7 @@ _DrawTexturedQuads:
ProcessVertex3
LoadTexturedVertex
ProcessVertex4 CMD_TEX_EOS
ProcessVertex4
! CLIPFLAGS TESTING
cmp/eq #0,r0 ! T = r0 == 0 (all points invisible)
@ -42,13 +39,9 @@ _DrawTexturedQuads:
bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
nop
add #32, r5 ! r5 += sizeof(VERTEX)
rts ! return after executing instruction in delay slot
mov r5,r0 ! r0 = r5
TransformEnd
.align 2
CMD_TEX_VERT: .long 0xe0000000
CMD_TEX_EOS: .long 0xf0000000
.align 4
.global _VP_TEX_HWIDTH
_VP_TEX_HWIDTH: .long 0

View File

@ -3,8 +3,8 @@
! =========================================================
! The SH4 can dual issue (i.e. parallel execution) two instructions
! as long as the groups of the two instructions are different:
! * LS - most APU and FPU register load/stores
! * EX - most APU arithmetic instructions
! * LS - most ALU and FPU register load/stores
! * EX - most ALU arithmetic instructions
! * MT - TST, CMP, NOP, MOV Rm,Rn
! * FE - most FPU arithmetic instructions
! * CO - other instructions (NOTE: Cannot be executed in parallel)
@ -14,7 +14,6 @@
! * Latency: Interval between the issue of an instruction and the generation of its result (completion)
! * Latency is also the interval between the execution of two instructions with an interdependent relationship.
! (although different cases may either increase or decrease Latency)
!
! =========================================================
@ -38,6 +37,8 @@
!r5 = dst pointer ARG
!r6 = quads count ARG
!r7 = ?
!r10 = PVR_CMD_VERTEX
!r11 = PVR_CMD_VERTEX_EOL
!fr0 = temp
!fr1 = u
@ -55,6 +56,34 @@
!fv4 = XYZW
! =========================================================
! ========================= TRANSFORM SETUP ===============
! =========================================================
.macro TransformSetup
! Shared prologue for the quad transform routines:
!  - copies the src pointer argument (r4) into r3 and prefetches the first vertex
!  - pushes r10/r11 onto the stack (r15); TransformEnd pops them again,
!    so the two macros must always be used as a pair
!  - moves dst (r5) back by one 32 byte vertex
!  - builds the two command words: r10 = 0xE0000000, r11 = 0xF0000000
! NOTE: 'mov #imm,Rn' sign-extends its 8 bit immediate, so the upper bits of
!  r10/r11 are set until the 24 bit left shift (shll16 + shll8) discards them.
! Instructions from different groups are interleaved where possible (see the
!  MT/LS/EX annotations), so the ordering below is deliberate.
mov r4,r3 ! MT, r3 = src
mov.l r10, @-r15 ! LS, push(r10)
add #-32, r5 ! EX, r5 -= sizeof(VERTEX)
mov.l r11, @-r15 ! LS, push(r11)
mov #0xE0, r10 ! EX, r10 = 0xFF FF FF E0 (immediate is sign-extended)
pref @r3 ! LS, PREFETCH r3 (first vertex)
shll16 r10 ! EX, r10 = 0xFF E0 00 00
shll8 r10 ! EX, r10 = 0xE0 00 00 00 (PVR_CMD_VERTEX)
mov #0xF0, r11 ! EX, r11 = 0xFF FF FF F0 (immediate is sign-extended)
shll16 r11 ! EX, r11 = 0xFF F0 00 00
shll8 r11 ! EX, r11 = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL)
nop ! MT, align to even boundary
.endm
.macro TransformEnd
! Shared epilogue matching TransformSetup:
!  - pops r11 then r10 (reverse of the push order in TransformSetup)
!  - undoes the 32 byte pre-decrement applied to dst (r5)
!  - returns with r0 = r5
mov.l @r15+, r11 ! LS, pop(r11)
mov.l @r15+, r10 ! LS, pop(r10)
add #32, r5 ! EX, r5 += sizeof(VERTEX)
rts ! CO, return after executing instruction in delay slot
mov r5,r0 ! MT, r0 = r5 (delay slot: executes before the return takes effect)
.endm
! =========================================================
! ========================= VERTEX LOADING ================
! =========================================================
@ -108,7 +137,7 @@
fmov.s fr6,@-r5 ! LS, dst->z = Z
fmov.s fr5,@-r5 ! LS, dst->y = Y
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm
.macro ProcessVertex2
@ -124,7 +153,7 @@
fmov.s fr5,@-r5 ! LS, dst->y = Y
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1)
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm
.macro ProcessVertex3
@ -140,11 +169,12 @@
shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr4,@-r5 ! LS, dst->x = X
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2)
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm
.macro ProcessVertex4 eos_addr
.macro ProcessVertex4
fmov.s fr7,@-r5 ! LS, dst->w = W
or r11,r0 ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
fmov.s fr3,@-r5 ! LS, dst->c = C
fneg fr7 ! LS, W = -W
fmov.s fr2,@-r5 ! LS, dst->v = V
@ -154,12 +184,10 @@
fmov.s fr6,@-r5 ! LS, dst->z = Z
shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr5,@-r5 ! LS, dst->y = Y
add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2)
add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l \eos_addr, r1 ! LS, r1 = GPU EOS command
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3)
or r0,r1 ! EX, r1 |= CLIPFLAGS
mov.l r1,@-r5 ! LS, dst->flags = GPU EOS | CLIPFLAGS
mov.l r0,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS
.endm

View File

@ -156,7 +156,7 @@ Although the regular linux compilation flags will work fine, to take full advan
## Compiling - macOS
```cc -fno-math-errno *.c interop_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```
```cc -fno-math-errno *.c Window_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```
Note: You may need to install Xcode before you can compile ClassiCube