N64: Save 68 RCP cycles per quad

2025-09-09 15:28:21 -04:00 · 2025-07-16 22:12:19 +10:00 · 2025-07-16 22:12:19 +10:00 · a52fdf90e7
commit a52fdf90e7
parent abbfe4181b
2 changed files with 185 additions and 95 deletions
--- a/misc/n64/gpu.c
+++ b/misc/n64/gpu.c
@ -41,8 +41,6 @@ enum {
 };
 typedef struct {
 	int16_t  mvp_matrix_i[4][4];
    uint16_t mvp_matrix_f[4][4];
    int16_t vp_scale[4];
    int16_t vp_offset[4];
    uint16_t tex_size[2];
--- a/misc/n64/rsp_gpu.S
+++ b/misc/n64/rsp_gpu.S
@ -1,9 +1,40 @@
 #include <rsp_queue.inc>
 #include <rdpq_macros.h>
-#define MATRIX_SIZE           64
+
 #define GUARD_BAND_FACTOR 2
-    .data
+// 1 << VTX_SHIFT, keep in sync with gpu.c
 #define ONE_W K32
 #define xxxxXXXX h0
 #define yyyyYYYY h1
 #define zzzzZZZZ h2
 #define wwwwWWWW h3
 #define SCREEN_VTX_CS_POSi          0     // X, Y, Z, W (all 32-bit)
 #define SCREEN_VTX_CS_POSf          8     // X, Y, Z, W (all 32-bit)
 #define SCREEN_VTX_X               16
 #define SCREEN_VTX_Y               18
 #define SCREEN_VTX_Z               20
 #define SCREEN_VTX_CLIP_CODE       22
 #define SCREEN_VTX_PADDING         23
 #define SCREEN_VTX_RGBA            24
 #define SCREEN_VTX_S_T             28     // 28 S, 30 T
 #define SCREEN_VTX_W               32     // FIXME: this is duplicated in CS_POS
 #define SCREEN_VTX_INVW            36     // 32-bit
 #define SCREEN_VTX_SIZE            40
 // Bytes 0-39: same layout as the SCREEN_VTX_* fields above
 #define PRIM_VTX_TRCODE            40    // trivial-reject clipping flags (against -w/+w)
 #define PRIM_VTX_SIZE              48
 #define V0_OFFSET (0 * PRIM_VTX_SIZE)    // byte offset of vertex 0 inside VERTEX_CACHE
 #define V1_OFFSET (1 * PRIM_VTX_SIZE)    // byte offset of vertex 1 inside VERTEX_CACHE
 #define V2_OFFSET (2 * PRIM_VTX_SIZE)    // byte offset of vertex 2 inside VERTEX_CACHE
 #define V3_OFFSET (3 * PRIM_VTX_SIZE)    // byte offset of vertex 3 inside VERTEX_CACHE; parenthesized so the macros stay safe inside larger expressions
 .data
    RSPQ_BeginOverlayHeader
        RSPQ_DefineCommand GPUCmd_SetByte,       8   # 0x0
@ -23,9 +54,9 @@ BANNER1: .ascii "Rasky & Snacchus"
    RSPQ_BeginSavedState
    GPU_MATRIX_MVP:         .ds.b   128    # 4 integer + 4 fraction rows, each 8-byte row stored twice by GPUCmd_MatrixLoad (8 * 16 bytes); read back with lqv in GPUCmd_DrawQuad. NOTE(review): lqv only fills all 8 lanes from a 16-byte aligned address - confirm RSPQ_BeginSavedState guarantees that alignment here
 GL_STATE:
-    # This is the GL state that is also used by the pipeline.
+    # This is the GL state that is updated by CPU via GPUCmd_Set commands
    GL_MATRIX_MVP:          .ds.b   MATRIX_SIZE
    GL_VIEWPORT_SCALE:      .half   0,0,0,0
    GL_VIEWPORT_OFFSET:     .half   0,0,0,0
    GL_STATE_TEX_SIZE:      .half   0,0
@ -39,29 +70,12 @@ GL_STATE:
 CLIP_CODE_FACTORS:      .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
 DRAW_TRI_RA:            .word 0
-#define SCREEN_VTX_CS_POSi          0     // X, Y, Z, W (all 32-bit)
+.bss
 #define SCREEN_VTX_CS_POSf          8     // X, Y, Z, W (all 32-bit)
 #define SCREEN_VTX_X               16
 #define SCREEN_VTX_Y               18
 #define SCREEN_VTX_Z               20
 #define SCREEN_VTX_CLIP_CODE       22
 #define SCREEN_VTX_PADDING         23
 #define SCREEN_VTX_RGBA            24
 #define SCREEN_VTX_S_T             28     // 28 S, 30 T
 #define SCREEN_VTX_W               32     // FIXME: this is duplicated in CS_POS
 #define SCREEN_VTX_INVW            36     // 32-bit
 #define SCREEN_VTX_SIZE            40
 	.bss
    .align 3
 #define VERTEX_CACHE_SIZE     4
 //0-39 same as screenvtx
 #define PRIM_VTX_TRCODE            40    // trivial-reject clipping flags (against -w/+w)
 #define PRIM_VTX_SIZE              42
-VERTEX_CACHE:   .dcb.b      PRIM_VTX_SIZE * VERTEX_CACHE_SIZE
+VERTEX_CACHE:   .dcb.b      PRIM_VTX_SIZE * 4
-    .text
+.text
    .func GPUCmd_SetByte
 GPUCmd_SetByte:
@ -104,29 +118,47 @@ GPUCmd_MatrixLoad:
    #define src         s6
    #define dst         s7
-    #define vrhs01_i     $v02
+    #define vmat0_i  $v02
-    #define vrhs01_f     $v03
+    #define vmat1_i  $v03
-    #define vrhs23_i     $v04
+    #define vmat2_i  $v04
-    #define vrhs23_f     $v05
+    #define vmat3_i  $v05
    #define vmat0_f  $v06
    #define vmat1_f  $v07
    #define vmat2_f  $v08
    #define vmat3_f  $v09
    addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
-    addi dst, zero, %lo(GL_MATRIX_MVP)
+    addi dst, zero, %lo(GPU_MATRIX_MVP)
-    # Load the matrix from command parameters (misaligned)
+    // Load the matrix from command parameters
-    lqv vrhs01_i, 0x00,src
+    ldv vmat0_i, 0x00,src
-    lrv vrhs01_i, 0x10,src
+    ldv vmat1_i, 0x08,src
-    lqv vrhs23_i, 0x10,src
+    ldv vmat2_i, 0x10,src
-    lrv vrhs23_i, 0x20,src
+    ldv vmat3_i, 0x18,src
-    lqv vrhs01_f, 0x20,src
+    ldv vmat0_f, 0x20,src
-    lrv vrhs01_f, 0x30,src
+    ldv vmat1_f, 0x28,src
-    lqv vrhs23_f, 0x30,src
+    ldv vmat2_f, 0x30,src
-    lrv vrhs23_f, 0x40,src
+    ldv vmat3_f, 0x38,src
-    sqv vrhs01_i, 0x00,dst
+	// Store the matrices, with each row stored twice
-    sqv vrhs23_i, 0x10,dst
+	// This is used by T&L to transform two vertices at once 
-    sqv vrhs01_f, 0x20,dst
+    sdv vmat0_i, 0x00,dst
    sdv vmat0_i, 0x08,dst
    sdv vmat1_i, 0x10,dst
    sdv vmat1_i, 0x18,dst
    sdv vmat2_i, 0x20,dst
    sdv vmat2_i, 0x28,dst
    sdv vmat3_i, 0x30,dst
    sdv vmat3_i, 0x38,dst
    sdv vmat0_f, 0x40,dst
    sdv vmat0_f, 0x48,dst
    sdv vmat1_f, 0x50,dst
    sdv vmat1_f, 0x58,dst
    sdv vmat2_f, 0x60,dst
    sdv vmat2_f, 0x68,dst
    sdv vmat3_f, 0x70,dst
    jr ra
-    sqv vrhs23_f, 0x30,dst
+    sdv vmat3_f, 0x78,dst
 #undef src
 #undef dst
@ -138,7 +170,6 @@ GPUCmd_DrawQuad:
    #define vtx         a0
    #define mtx_ptr     s0
    #define src_ptr     s4
 	#define vcount      s3
    #define v___        $v01
@ -164,74 +195,134 @@ GPUCmd_DrawQuad:
    addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
    li vtx, %lo(VERTEX_CACHE)
 	li vcount, 4
-    li mtx_ptr, %lo(GL_MATRIX_MVP)
+    li mtx_ptr, %lo(GPU_MATRIX_MVP)
-    ldv vmtx0_i.e0,  0x00,mtx_ptr
+    lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I  m00.I m01.I m02.I m03.I]
-    ldv vmtx1_i.e0,  0x08,mtx_ptr
+    lqv vmtx1_i, 0x10,mtx_ptr // etc
-    ldv vmtx2_i.e0,  0x10,mtx_ptr
+    lqv vmtx2_i, 0x20,mtx_ptr
-    ldv vmtx3_i.e0,  0x18,mtx_ptr
+    lqv vmtx3_i, 0x30,mtx_ptr
-    ldv vmtx0_f.e0,  0x20,mtx_ptr
+    lqv vmtx0_f, 0x40,mtx_ptr
-    ldv vmtx1_f.e0,  0x28,mtx_ptr
+    lqv vmtx1_f, 0x50,mtx_ptr
-    ldv vmtx2_f.e0,  0x30,mtx_ptr
+    lqv vmtx2_f, 0x60,mtx_ptr
-    ldv vmtx3_f.e0,  0x38,mtx_ptr
+    lqv vmtx3_f, 0x70,mtx_ptr
-upload_vertex:
+### VERTICES 0 AND 1 (transformed together: v0 in lanes 0-3, v1 in lanes 4-7)
-	ldv vpos,  0, src_ptr # Load X, Y, Z, W
+	ldv vpos.e0,  0, src_ptr // Load v0 X, Y, Z
-	llv vcol,  8, src_ptr # Load RGBA
+	ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
 	llv vtex, 12, src_ptr # Load U, V
 	# matrix multiply
-    vmudn v___,      vmtx0_f, vpos.h0
+    vmudn v___,      vmtx0_f, vpos.xxxxXXXX
-    vmadh v___,      vmtx0_i, vpos.h0
+    vmadh v___,      vmtx0_i, vpos.xxxxXXXX
-    vmadn v___,      vmtx1_f, vpos.h1
+    vmadn v___,      vmtx1_f, vpos.yyyyYYYY
-    vmadh v___,      vmtx1_i, vpos.h1
+    vmadh v___,      vmtx1_i, vpos.yyyyYYYY
-    vmadn v___,      vmtx2_f, vpos.h2
+    vmadn v___,      vmtx2_f, vpos.zzzzZZZZ
-    vmadh v___,      vmtx2_i, vpos.h2
+    vmadh v___,      vmtx2_i, vpos.zzzzZZZZ
-    vmadn v___,      vmtx3_f, vpos.h3
+    vmadn v___,      vmtx3_f, ONE_W
-    vmadh vcspos_i,  vmtx3_i, vpos.h3
+    vmadh vcspos_i,  vmtx3_i, ONE_W
    vmadn vcspos_f,  vzero,   vzero
-    slv vcol, SCREEN_VTX_RGBA, vtx
+	llv vcol.e0,  8, src_ptr // Load v0 RGBA
-    slv vtex, SCREEN_VTX_S_T,  vtx
+	llv vtex.e0, 12, src_ptr // Load v0 U, V
 	llv vcol.e2, 24, src_ptr // Load v1 RGBA
 	llv vtex.e2, 28, src_ptr // Load v1 U, V
    # 32-bit right shift by 5, to keep the clip space coordinates unscaled
    vmudm vcspos_i, vcspos_i, K2048
    vmadl vcspos_f, vcspos_f, K2048
-	addi vcount,  -1
+    slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
-	addi src_ptr, 16
+    slv vtex.e0, SCREEN_VTX_S_T  + V0_OFFSET, vtx
-
+    slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
-    sdv vcspos_i, SCREEN_VTX_CS_POSi,vtx
+    slv vtex.e2, SCREEN_VTX_S_T  + V1_OFFSET, vtx
    sdv vcspos_f, SCREEN_VTX_CS_POSf,vtx
    # Calculate and store clipping flags against CS.W.
    # These will be used for trivial rejections.
-    vch v___, vcspos_i, vcspos_i.w
+    vch v___, vcspos_i, vcspos_i.wwwwWWWW
-    vcl v___, vcspos_f, vcspos_f.w
+    vcl v___, vcspos_f, vcspos_f.wwwwWWWW
    cfc2 t0, COP2_CTRL_VCC
-    andi t0, 0x707   # Isolate X/Y/Z flags
+    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
-    # Compress flags to 8 bit
+###################### VERTICES 2 AND 3 (transformed together: v2 in lanes 0-3, v3 in lanes 4-7)
-    srl t1, t0, 5
+	ldv vpos.e0, 32, src_ptr // Load v2 X, Y, Z
-    andi t0, 0x7
+	ldv vpos.e4, 48, src_ptr // Load v3 X, Y, Z
-    or t0, t1
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
-    sb t0, PRIM_VTX_TRCODE(vtx)
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
    andi t2, t2, 0x7   // Isolate lo clip flags
    or   t2, t1        // Merge clip flags (compressed to 6 bits)
-	bnez vcount, upload_vertex
+	# matrix multiply
-	addi vtx, PRIM_VTX_SIZE
+    vmudn v___,      vmtx0_f, vpos.xxxxXXXX
    vmadh v___,      vmtx0_i, vpos.xxxxXXXX
    vmadn v___,      vmtx1_f, vpos.yyyyYYYY
    sb   t2, (PRIM_VTX_TRCODE + V0_OFFSET)(vtx)
    vmadh v___,      vmtx1_i, vpos.yyyyYYYY
 	srl  t0, t0, 4
    vmadn v___,      vmtx2_f, vpos.zzzzZZZZ
    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
    vmadh v___,      vmtx2_i, vpos.zzzzZZZZ
    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
    vmadn v___,      vmtx3_f, ONE_W
    andi t2, t2, 0x7   // Isolate lo clip flags
    vmadh vcspos_i,  vmtx3_i, ONE_W
    or   t2, t1        // Merge clip flags (compressed to 6 bits)
    vmadn vcspos_f,  vzero,   vzero
    sb   t2, (PRIM_VTX_TRCODE + V1_OFFSET)(vtx)
 	llv vcol.e4, 40, src_ptr # Load v2 RGBA
 	llv vtex.e4, 44, src_ptr # Load v2 U, V
 	llv vcol.e6, 56, src_ptr # Load v3 RGBA
 	llv vtex.e6, 60, src_ptr # Load v3 U, V
    # 32-bit right shift by 5, to keep the clip space coordinates unscaled
    vmudm vcspos_i, vcspos_i, K2048
    vmadl vcspos_f, vcspos_f, K2048
    slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
    slv vtex.e4, SCREEN_VTX_S_T  + V2_OFFSET, vtx
    slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
    slv vtex.e6, SCREEN_VTX_S_T  + V3_OFFSET, vtx
    # Calculate and store clipping flags against CS.W.
    # These will be used for trivial rejections.
    vch v___, vcspos_i, vcspos_i.wwwwWWWW
    vcl v___, vcspos_f, vcspos_f.wwwwWWWW
    cfc2 t0, COP2_CTRL_VCC
    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
    andi t2, t2, 0x7   // Isolate lo clip flags
    or   t2, t1        // Merge clip flags (compressed to 6 bits)
    sb   t2, (PRIM_VTX_TRCODE + V2_OFFSET)(vtx)
 ###################### VERTEX 3
    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
 	srl  t0, t0, 4
    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
    andi t2, t2, 0x7   // Isolate lo clip flags
    or   t2, t1        // Merge clip flags (compressed to 6 bits)
    sb   t2, (PRIM_VTX_TRCODE + V3_OFFSET)(vtx)
 	# now do the actual drawing
-	li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE
+	li a1, %lo(VERTEX_CACHE) + V0_OFFSET
-	li a2, %lo(VERTEX_CACHE) + 1*PRIM_VTX_SIZE
+	li a2, %lo(VERTEX_CACHE) + V1_OFFSET
 	jal GPUCmd_DrawTriangle
-	li a3, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE
+	li a3, %lo(VERTEX_CACHE) + V2_OFFSET
-	li a1, %lo(VERTEX_CACHE) + 0*PRIM_VTX_SIZE
+	li a1, %lo(VERTEX_CACHE) + V0_OFFSET
-	li a2, %lo(VERTEX_CACHE) + 2*PRIM_VTX_SIZE
+	li a2, %lo(VERTEX_CACHE) + V2_OFFSET
 	jal GPUCmd_DrawTriangle
-	li a3, %lo(VERTEX_CACHE) + 3*PRIM_VTX_SIZE
+	li a3, %lo(VERTEX_CACHE) + V3_OFFSET
    j RSPQ_Loop
    nop
@ -352,13 +443,14 @@ GL_CalcClipCodes:
    vch v___, vguard_i, vguard_i.w
    vcl v___, vguard_f, vguard_f.w
    cfc2 t0, COP2_CTRL_VCC
-    andi t0, 0x707
+    andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
-    srl t1, t0, 5
+    srl  t1, t2, 5     // Shift hi flags to be aligned next to lo flags
-    andi t0, 0x7
+    andi t2, t2, 0x7   // Isolate lo clip flags
-    or t0, t1
+    or   t2, t1        // Merge clip flags (compressed to 6 bits)
    jr ra
-    sb t0,  SCREEN_VTX_CLIP_CODE(dst)
+    sb t2,  SCREEN_VTX_CLIP_CODE(dst)
    #undef dst
    #undef vcspos_i