Dreamcast: Optimise vertex transform by a couple of cycles

2025-09-11 16:45:48 -04:00 · 2024-06-18 08:36:47 +10:00 · 2024-06-18 08:36:47 +10:00 · 6153ff8c8a
commit 6153ff8c8a
parent 74f3c424e8
4 changed files with 48 additions and 34 deletions
--- a/misc/dreamcast/DrawColouredQuads.S
+++ b/misc/dreamcast/DrawColouredQuads.S
@ -7,13 +7,10 @@ _DrawColouredQuads:
 ! Setup
    fldi0 fr1     ! U = 0
    fldi0 fr2     ! V = 0
-    mov r4,r3     ! r3  = src
-    add #-32, r5  ! r5 -= sizeof(VERTEX)
+    TransformSetup
    ViewportTransformSetup _VP_COL_HWIDTH

 .TRANSFORM_QUAD:
-    mov.l CMD_COL_VERT, r1 ! r1  = GPU VERT command
-
    LoadColouredVertex
    ProcessVertex1

@ -24,7 +21,7 @@ _DrawColouredQuads:
    ProcessVertex3

    LoadColouredVertex
-    ProcessVertex4 CMD_COL_EOS
+    ProcessVertex4

 ! CLIPFLAGS TESTING
    cmp/eq   #0,r0 ! T = r0 == 0 (all points invisible)
@ -44,13 +41,9 @@ _DrawColouredQuads:
    bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
    nop
    
-    add #32, r5     ! r5 += sizeof(VERTEX)
-    rts             ! return after executing instruction in delay slot
-    mov r5,r0       ! r0 = r5
+    TransformEnd

-.align 2
-CMD_COL_VERT: .long 0xe0000000
-CMD_COL_EOS:  .long 0xf0000000
+.align 4

 .global _VP_COL_HWIDTH
 _VP_COL_HWIDTH:  .long 0
--- a/misc/dreamcast/DrawTexturedQuads.S
+++ b/misc/dreamcast/DrawTexturedQuads.S
@ -5,13 +5,10 @@

 _DrawTexturedQuads:
 ! Setup
-    mov r4,r3     ! r3  = src
-    add #-32, r5  ! r5 -= sizeof(VERTEX)
+    TransformSetup
    ViewportTransformSetup _VP_TEX_HWIDTH

 .TRANSFORM_QUAD:
-    mov.l CMD_TEX_VERT, r1 ! r1  = GPU VERT command
-
    LoadTexturedVertex
    ProcessVertex1

@ -22,7 +19,7 @@ _DrawTexturedQuads:
    ProcessVertex3

    LoadTexturedVertex
-    ProcessVertex4 CMD_TEX_EOS
+    ProcessVertex4

 ! CLIPFLAGS TESTING
    cmp/eq  #0,r0  ! T = r0 == 0 (all points invisible)
@ -42,13 +39,9 @@ _DrawTexturedQuads:
    bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
    nop
    
-    add #32, r5     ! r5 += sizeof(VERTEX)
-    rts             ! return after executing instruction in delay slot
-    mov r5,r0       ! r0 = r5
+    TransformEnd

-.align 2
-CMD_TEX_VERT: .long 0xe0000000
-CMD_TEX_EOS:  .long 0xf0000000
+.align 4

 .global _VP_TEX_HWIDTH
 _VP_TEX_HWIDTH:  .long 0
--- a/misc/dreamcast/ViewportTransform.S
+++ b/misc/dreamcast/ViewportTransform.S
@ -3,8 +3,8 @@
 ! =========================================================
 ! The SH4 can dual issue (i.e. parallel execution) two instructions
 ! as long as the groups of the two instructions are different:
-! * LS - most APU and FPU register load/stores
-! * EX - most APU arithmetic instructions
+! * LS - most ALU and FPU register load/stores
+! * EX - most ALU arithmetic instructions
 ! * MT - TST, CMP, NOP, MOV Rm,Rn
 ! * FE - most FPU arithmetic instructions
 ! * CO - other instructions (NOTE: Cannot be executed in parallel)
@ -14,7 +14,6 @@
 ! * Latency: Interval between the issue of an instruction and the generation of its result (completion)
 ! * Latency is also the interval between the execution of two instructions with an interdependent relationship.
 !   (although different cases may either increase or decrease Latency)
-!


 ! =========================================================
@ -38,6 +37,8 @@
 !r5 = dst pointer ARG
 !r6 = quads count ARG
 !r7 = ?
+!r10  = PVR_CMD_VERTEX
+!r11  = PVR_CMD_VERTEX_EOL

 !fr0  = temp
 !fr1  = u
@ -55,6 +56,34 @@
 !fv4  = XYZW


+! =========================================================
+! ========================= TRANSFORM SETUP ===============
+! =========================================================
+.macro TransformSetup ! Prologue: pushes r10/r11, sets r3 = src, r5 -= 32, prefetches @r3, r10/r11 = PVR vertex commands
+    mov r4,r3        ! MT, r3  = src
+    mov.l r10, @-r15 ! LS, push(r10) (popped again in TransformEnd)
+    add #-32, r5     ! EX, r5 -= sizeof(VERTEX)
+    mov.l r11, @-r15 ! LS, push(r11) (popped again in TransformEnd)
+    mov #0xE0, r10   ! EX, r10 = 0xFF FF FF E0 (imm8 is sign-extended; upper bytes are shifted out below)
+    pref @r3         ! LS, PREFETCH r3 (first vertex)
+    shll16 r10       ! EX, r10 = 0xFF E0 00 00
+    shll8  r10       ! EX, r10 = 0xE0 00 00 00 (PVR_CMD_VERTEX)
+    mov #0xF0, r11   ! EX, r11 = 0xFF FF FF F0 (imm8 is sign-extended; upper bytes are shifted out below)
+    shll16 r11       ! EX, r11 = 0xFF F0 00 00
+    shll8  r11       ! EX, r11 = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL)
+    nop              ! MT, align to even boundary
+.endm
+
+.macro TransformEnd ! Epilogue: pops r11/r10 pushed by TransformSetup, undoes the r5 bias, returns with r0 = r5
+    mov.l @r15+, r11 ! LS, pop(r11) (reverse of the push order in TransformSetup)
+    mov.l @r15+, r10 ! LS, pop(r10)
+
+    add #32, r5      ! EX, r5 += sizeof(VERTEX) (undoes the -32 applied in TransformSetup)
+    rts              ! CO, return after executing instruction in delay slot
+    mov r5,r0        ! MT, r0 = r5 (runs in the rts delay slot, i.e. before control returns)
+.endm
+
+
 ! =========================================================
 ! ========================= VERTEX LOADING ================
 ! =========================================================
@ -108,7 +137,7 @@
    fmov.s  fr6,@-r5 ! LS, dst->z = Z
    fmov.s  fr5,@-r5 ! LS, dst->y = Y
    fmov.s  fr4,@-r5 ! LS, dst->x = X
-    mov.l   r1,@-r5  ! LS, dst->flags = CMD_VERT
+    mov.l   r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
 .endm

 .macro ProcessVertex2
@ -124,7 +153,7 @@
    fmov.s  fr5,@-r5 ! LS, dst->y = Y
    or      r2,r0    ! EX, CLIPFLAGS |= tmp (T << 1)
    fmov.s  fr4,@-r5 ! LS, dst->x = X
-    mov.l   r1,@-r5  ! LS, dst->flags = CMD_VERT
+    mov.l   r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
 .endm

 .macro ProcessVertex3
@ -140,11 +169,12 @@
    shll2   r2       ! EX, tmp = tmp << 2
    fmov.s  fr4,@-r5 ! LS, dst->x = X
    or      r2,r0    ! EX, CLIPFLAGS |= tmp (T << 2)
-    mov.l   r1,@-r5  ! LS, dst->flags = CMD_VERT
+    mov.l   r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
 .endm

-.macro ProcessVertex4 eos_addr
+.macro ProcessVertex4
    fmov.s  fr7,@-r5 ! LS, dst->w = W
+    or      r11,r0   ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
    fmov.s  fr3,@-r5 ! LS, dst->c = C
    fneg    fr7      ! LS, W = -W
    fmov.s  fr2,@-r5 ! LS, dst->v = V
@ -154,12 +184,10 @@
    fmov.s  fr6,@-r5 ! LS, dst->z = Z
    shll2   r2       ! EX, tmp = tmp << 2
    fmov.s  fr5,@-r5 ! LS, dst->y = Y
-    add     r2,r2    ! EX, tmp = (tmp << 2) + (tmp << 2)
+    add     r2,r2    ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
    fmov.s  fr4,@-r5 ! LS, dst->x = X
-    mov.l \eos_addr, r1 ! LS, r1  = GPU EOS command
    or      r2,r0    ! EX, CLIPFLAGS |= tmp (T << 3)
-    or      r0,r1    ! EX, r1 |= CLIPFLAGS
-    mov.l   r1,@-r5  ! LS, dst->flags = GPU EOS | CLIPFLAGS
+    mov.l   r0,@-r5  ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS
 .endm


--- a/readme.md
+++ b/readme.md
@ -156,7 +156,7 @@ Although the regular linux compilation flags will work fine, to take full advan

 ## Compiling - macOS

-```cc -fno-math-errno *.c interop_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```
+```cc -fno-math-errno *.c Window_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```

 Note: You may need to install Xcode before you can compile ClassiCube