Dreamcast: Optimise vertex transform by a couple of cycles

This commit is contained in:
UnknownShadow200 2024-06-18 08:36:47 +10:00
parent 74f3c424e8
commit 6153ff8c8a
4 changed files with 48 additions and 34 deletions

View File

@ -7,13 +7,10 @@ _DrawColouredQuads:
! Setup ! Setup
fldi0 fr1 ! U = 0 fldi0 fr1 ! U = 0
fldi0 fr2 ! V = 0 fldi0 fr2 ! V = 0
mov r4,r3 ! r3 = src TransformSetup
add #-32, r5 ! r5 -= sizeof(VERTEX)
ViewportTransformSetup _VP_COL_HWIDTH ViewportTransformSetup _VP_COL_HWIDTH
.TRANSFORM_QUAD: .TRANSFORM_QUAD:
mov.l CMD_COL_VERT, r1 ! r1 = GPU VERT command
LoadColouredVertex LoadColouredVertex
ProcessVertex1 ProcessVertex1
@ -24,7 +21,7 @@ _DrawColouredQuads:
ProcessVertex3 ProcessVertex3
LoadColouredVertex LoadColouredVertex
ProcessVertex4 CMD_COL_EOS ProcessVertex4
! CLIPFLAGS TESTING ! CLIPFLAGS TESTING
cmp/eq #0,r0 ! T = r0 == 0 (all points invisible) cmp/eq #0,r0 ! T = r0 == 0 (all points invisible)
@ -44,13 +41,9 @@ _DrawColouredQuads:
bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
nop nop
add #32, r5 ! r5 += sizeof(VERTEX) TransformEnd
rts ! return after executing instruction in delay slot
mov r5,r0 ! r0 = r5
.align 2 .align 4
CMD_COL_VERT: .long 0xe0000000
CMD_COL_EOS: .long 0xf0000000
.global _VP_COL_HWIDTH .global _VP_COL_HWIDTH
_VP_COL_HWIDTH: .long 0 _VP_COL_HWIDTH: .long 0

View File

@ -5,13 +5,10 @@
_DrawTexturedQuads: _DrawTexturedQuads:
! Setup ! Setup
mov r4,r3 ! r3 = src TransformSetup
add #-32, r5 ! r5 -= sizeof(VERTEX)
ViewportTransformSetup _VP_TEX_HWIDTH ViewportTransformSetup _VP_TEX_HWIDTH
.TRANSFORM_QUAD: .TRANSFORM_QUAD:
mov.l CMD_TEX_VERT, r1 ! r1 = GPU VERT command
LoadTexturedVertex LoadTexturedVertex
ProcessVertex1 ProcessVertex1
@ -22,7 +19,7 @@ _DrawTexturedQuads:
ProcessVertex3 ProcessVertex3
LoadTexturedVertex LoadTexturedVertex
ProcessVertex4 CMD_TEX_EOS ProcessVertex4
! CLIPFLAGS TESTING ! CLIPFLAGS TESTING
cmp/eq #0,r0 ! T = r0 == 0 (all points invisible) cmp/eq #0,r0 ! T = r0 == 0 (all points invisible)
@ -42,13 +39,9 @@ _DrawTexturedQuads:
bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
nop nop
add #32, r5 ! r5 += sizeof(VERTEX) TransformEnd
rts ! return after executing instruction in delay slot
mov r5,r0 ! r0 = r5
.align 2 .align 4
CMD_TEX_VERT: .long 0xe0000000
CMD_TEX_EOS: .long 0xf0000000
.global _VP_TEX_HWIDTH .global _VP_TEX_HWIDTH
_VP_TEX_HWIDTH: .long 0 _VP_TEX_HWIDTH: .long 0

View File

@ -3,8 +3,8 @@
! ========================================================= ! =========================================================
! The SH4 can dual issue (i.e. parallel execution) two instructions ! The SH4 can dual issue (i.e. parallel execution) two instructions
! as long as the groups of the two instructions are different: ! as long as the groups of the two instructions are different:
! * LS - most APU and FPU register load/stores ! * LS - most ALU and FPU register load/stores
! * EX - most APU arithmetic instructions ! * EX - most ALU arithmetic instructions
! * MT - TST, CMP, NOP, MOV Rm,Rn ! * MT - TST, CMP, NOP, MOV Rm,Rn
! * FE - most FPU arithmetic instructions ! * FE - most FPU arithmetic instructions
! * CO - other instructions (NOTE: Cannot be executed in parallel) ! * CO - other instructions (NOTE: Cannot be executed in parallel)
@ -14,7 +14,6 @@
! * Latency: Interval between the issue of an instruction and the generation of its result (completion) ! * Latency: Interval between the issue of an instruction and the generation of its result (completion)
! * Latency is also the interval between the execution of two instructions with an interdependent relationship. ! * Latency is also the interval between the execution of two instructions with an interdependent relationship.
! (although different cases may either increase or decrease Latency) ! (although different cases may either increase or decrease Latency)
!
! ========================================================= ! =========================================================
@ -38,6 +37,8 @@
!r5 = dst pointer ARG !r5 = dst pointer ARG
!r6 = quads count ARG !r6 = quads count ARG
!r7 = ? !r7 = ?
!r10 = PVR_CMD_VERTEX
!r11 = PVR_CMD_VERTEX_EOL
!fr0 = temp !fr0 = temp
!fr1 = u !fr1 = u
@ -55,6 +56,34 @@
!fv4 = XYZW !fv4 = XYZW
! =========================================================
! ========================= TRANSFORM SETUP ===============
! =========================================================
! Common prologue for the quad transform routines.
!   In:  r4 = src pointer, r5 = dst pointer
!   Out: r3  = src pointer (copy of r4)
!        r5  = dst pointer moved back by 32 bytes (one VERTEX)
!        r10 = 0xE0000000 (PVR_CMD_VERTEX)
!        r11 = 0xF0000000 (PVR_CMD_VERTEX_EOL)
!   Stack: pushes r10 then r11 onto r15; TransformEnd pops them again,
!          so every use of this macro must be matched by TransformEnd.
! The command words are built from an 8-bit immediate plus shifts, so no
! literal pool load is needed. Each line notes the instruction's issue group
! (see the dual-issue notes at the top of this file).
.macro TransformSetup
mov r4,r3 ! MT, r3 = src
mov.l r10, @-r15 ! LS, push(r10)
add #-32, r5 ! EX, r5 -= sizeof(VERTEX)
mov.l r11, @-r15 ! LS, push(r11)
mov #0xE0, r10 ! EX, r10 = 0xFF FF FF E0 (MOV #imm sign-extends the 8-bit immediate)
pref @r3 ! LS, PREFETCH r3 (first vertex)
shll16 r10 ! EX, r10 = 0xFF E0 00 00
shll8 r10 ! EX, r10 = 0xE0 00 00 00 (PVR_CMD_VERTEX) - sign-extension bits are shifted out
mov #0xF0, r11 ! EX, r11 = 0xFF FF FF F0 (MOV #imm sign-extends the 8-bit immediate)
shll16 r11 ! EX, r11 = 0xFF F0 00 00
shll8 r11 ! EX, r11 = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL) - sign-extension bits are shifted out
nop ! MT, align to even boundary (pads the macro to 12 instructions)
.endm
! Common epilogue for the quad transform routines; counterpart of TransformSetup.
!   Restores r11 and r10 from the stack (reverse of the order TransformSetup
!   pushed them), adds 32 (one VERTEX) back onto the dst pointer in r5,
!   and returns with r0 = r5.
.macro TransformEnd
mov.l @r15+, r11 ! LS, pop(r11) - pushed last, so popped first
mov.l @r15+, r10 ! LS, pop(r10)
add #32, r5 ! EX, r5 += sizeof(VERTEX)
rts ! CO, return after executing instruction in delay slot
mov r5,r0 ! MT, r0 = r5 (delay slot: executes before the return takes effect)
.endm
! ========================================================= ! =========================================================
! ========================= VERTEX LOADING ================ ! ========================= VERTEX LOADING ================
! ========================================================= ! =========================================================
@ -108,7 +137,7 @@
fmov.s fr6,@-r5 ! LS, dst->z = Z fmov.s fr6,@-r5 ! LS, dst->z = Z
fmov.s fr5,@-r5 ! LS, dst->y = Y fmov.s fr5,@-r5 ! LS, dst->y = Y
fmov.s fr4,@-r5 ! LS, dst->x = X fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm .endm
.macro ProcessVertex2 .macro ProcessVertex2
@ -124,7 +153,7 @@
fmov.s fr5,@-r5 ! LS, dst->y = Y fmov.s fr5,@-r5 ! LS, dst->y = Y
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1) or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1)
fmov.s fr4,@-r5 ! LS, dst->x = X fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm .endm
.macro ProcessVertex3 .macro ProcessVertex3
@ -140,11 +169,12 @@
shll2 r2 ! EX, tmp = tmp << 2 shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr4,@-r5 ! LS, dst->x = X fmov.s fr4,@-r5 ! LS, dst->x = X
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2) or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2)
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm .endm
.macro ProcessVertex4 eos_addr .macro ProcessVertex4
fmov.s fr7,@-r5 ! LS, dst->w = W fmov.s fr7,@-r5 ! LS, dst->w = W
or r11,r0 ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
fmov.s fr3,@-r5 ! LS, dst->c = C fmov.s fr3,@-r5 ! LS, dst->c = C
fneg fr7 ! LS, W = -W fneg fr7 ! LS, W = -W
fmov.s fr2,@-r5 ! LS, dst->v = V fmov.s fr2,@-r5 ! LS, dst->v = V
@ -154,12 +184,10 @@
fmov.s fr6,@-r5 ! LS, dst->z = Z fmov.s fr6,@-r5 ! LS, dst->z = Z
shll2 r2 ! EX, tmp = tmp << 2 shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr5,@-r5 ! LS, dst->y = Y fmov.s fr5,@-r5 ! LS, dst->y = Y
add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
fmov.s fr4,@-r5 ! LS, dst->x = X fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l \eos_addr, r1 ! LS, r1 = GPU EOS command
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3) or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3)
or r0,r1 ! EX, r1 |= CLIPFLAGS mov.l r0,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS
mov.l r1,@-r5 ! LS, dst->flags = GPU EOS | CLIPFLAGS
.endm .endm

View File

@ -156,7 +156,7 @@ Although the regular linux compilation flags will work fine, to take full advan
## Compiling - macOS ## Compiling - macOS
```cc -fno-math-errno *.c interop_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc``` ```cc -fno-math-errno *.c Window_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```
Note: You may need to install Xcode before you can compile ClassiCube Note: You may need to install Xcode before you can compile ClassiCube