N64: Save 3 cycles in RSP T&L loop

2025-09-23 04:34:58 -04:00 · 2025-07-19 14:53:42 +10:00 · 2025-07-19 14:53:42 +10:00 · af4494284d
commit af4494284d
parent 00a1a49405
2 changed files with 56 additions and 79 deletions
--- a/misc/n64/gpu.c
+++ b/misc/n64/gpu.c
@ -29,15 +29,14 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
 };

 enum {
-    GPU_CMD_SET_BYTE         = 0x0,
-    GPU_CMD_SET_SHORT        = 0x1,
-    GPU_CMD_SET_WORD         = 0x2,
-    GPU_CMD_SET_LONG         = 0x3,
+    GPU_CMD_SET_SHORT        = 0x0,
+    GPU_CMD_SET_WORD         = 0x1,
+    GPU_CMD_SET_LONG         = 0x2,

-    GPU_CMD_DRAW_QUAD        = 0x4,
-    GPU_CMD_MATRIX_LOAD      = 0x5,
+    GPU_CMD_DRAW_QUAD        = 0x3,
+    GPU_CMD_MATRIX_LOAD      = 0x4,

-	GPU_CMD_PUSH_RDP         = 0x6,
+	GPU_CMD_PUSH_RDP         = 0x5,
 };

 typedef struct {
@ -49,12 +48,6 @@ typedef struct {
    uint16_t tri_cull;
 } __attribute__((aligned(8), packed)) gpu_state;

-__attribute__((always_inline))
-static inline void gpu_set_byte(uint32_t offset, uint8_t value)
-{
-    rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value);
-}
-
 __attribute__((always_inline))
 static inline void gpu_set_short(uint32_t offset, uint16_t value)
 {
@ -82,9 +75,6 @@ static inline void gpu_push_rdp(uint32_t a1, uint64_t a2)
    rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2);
 }

-
-static float gpu_vp_scale[3];
-static float gpu_vp_offset[3];
 static bool  gpu_texturing;
 static void* gpu_pointer;
 static int   gpu_stride;
@ -191,34 +181,28 @@ static void gpuDrawArrays(uint32_t first, uint32_t count)
    }
 }

-static void gpuDepthRange(float n, float f)
-{
-    gpu_vp_scale[2]  = (f - n) * 0.5f;
-    gpu_vp_offset[2] = n + (f - n) * 0.5f;
-
-    gpu_set_short(offsetof(gpu_state, vp_scale[2]),  gpu_vp_scale[2]  * 4);
-    gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4);
-}
-
 static void gpuViewport(int x, int y, int w, int h)
 {
-    gpu_vp_scale[0]  = w * 0.5f;
-    gpu_vp_scale[1]  = h * -0.5f;
-    gpu_vp_offset[0] = x + w * 0.5f;
-    gpu_vp_offset[1] = y + h * 0.5f;
+    float vp_scale_x  = w * 0.5f;
+    float vp_scale_y  = h * -0.5f;
+    float vp_scale_z  = 0.5f;
+
+    float vp_offset_x = x + w * 0.5f;
+    float vp_offset_y = y + h * 0.5f;
+    float vp_offset_z = 0.5f;

    // Screen coordinates are s13.2
    #define SCREEN_XY_SCALE   4.0f
    #define SCREEN_Z_SCALE    32767.0f

    // * 2.0f to compensate for RSP reciprocal missing 1 bit
-    uint16_t scale_x  = gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f;
-    uint16_t scale_y  = gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f;
-    uint16_t scale_z  = gpu_vp_scale[2] * SCREEN_Z_SCALE  * 2.0f;
+    uint16_t scale_x  = vp_scale_x * SCREEN_XY_SCALE * 2.0f;
+    uint16_t scale_y  = vp_scale_y * SCREEN_XY_SCALE * 2.0f;
+    uint16_t scale_z  = vp_scale_z * SCREEN_Z_SCALE  * 2.0f;

-    uint16_t offset_x = gpu_vp_offset[0] * SCREEN_XY_SCALE;
-    uint16_t offset_y = gpu_vp_offset[1] * SCREEN_XY_SCALE;
-    uint16_t offset_z = gpu_vp_offset[2] * SCREEN_Z_SCALE;
+    uint16_t offset_x = vp_offset_x * SCREEN_XY_SCALE;
+    uint16_t offset_y = vp_offset_y * SCREEN_XY_SCALE;
+    uint16_t offset_z = vp_offset_z * SCREEN_Z_SCALE;

    gpu_set_long( 
        offsetof(gpu_state, vp_scale), 
@ -236,7 +220,6 @@ static void gpuSetCullFace(bool enabled) {

 static void gpu_init() {
    gpup_id = rspq_overlay_register(&rsp_gpu);
-    gpuDepthRange(0, 1);
 }

 static void gpu_close() {
--- a/misc/n64/rsp_gpu.S
+++ b/misc/n64/rsp_gpu.S
@ -35,15 +35,14 @@
 .data

    RSPQ_BeginOverlayHeader
-        RSPQ_DefineCommand GPUCmd_SetByte,       8   # 0x0
-        RSPQ_DefineCommand GPUCmd_SetShort,      8   # 0x1
-        RSPQ_DefineCommand GPUCmd_SetWord,       8   # 0x2
-        RSPQ_DefineCommand GPUCmd_SetLong,       12  # 0x3
+        RSPQ_DefineCommand GPUCmd_SetShort,      8   # 0x0
+        RSPQ_DefineCommand GPUCmd_SetWord,       8   # 0x1
+        RSPQ_DefineCommand GPUCmd_SetLong,       12  # 0x2

-        RSPQ_DefineCommand GPUCmd_DrawQuad,      68  # 0x4
-        RSPQ_DefineCommand GPUCmd_MatrixLoad,    68  # 0x5
+        RSPQ_DefineCommand GPUCmd_DrawQuad,      68  # 0x3
+        RSPQ_DefineCommand GPUCmd_MatrixLoad,    68  # 0x4

-        RSPQ_DefineCommand GPUCmd_PushRDP,       12  # 0x6
+        RSPQ_DefineCommand GPUCmd_PushRDP,       12  # 0x5
    RSPQ_EndOverlayHeader

    .align 4
@ -74,12 +73,6 @@ VERTEX_CACHE:   .dcb.b      SCREEN_VTX_SIZE * 4

 .text

-    .func GPUCmd_SetByte
-GPUCmd_SetByte:
-    jr ra
-    sb a1, %lo(GL_STATE)(a0)
-    .endfunc
-
    .func GPUCmd_SetShort
 GPUCmd_SetShort:
    jr ra
@ -344,7 +337,7 @@ GL_TnL:
    .align 3
    .func GPUCmd_DrawQuad
 GPUCmd_DrawQuad:
-    #define vtx         a0
+    #define vtx_ptr     a0
    #define mtx_ptr     s0
    #define src_ptr     s4

@ -373,9 +366,12 @@ GPUCmd_DrawQuad:
 	// t5 is used by GL_ClipTriangle

    addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
-    li vtx, %lo(VERTEX_CACHE)
+    li vtx_ptr,   %lo(VERTEX_CACHE)
+    li mtx_ptr,   %lo(GPU_MATRIX_MVP)
+
+	ldv vpos.e0,  0, src_ptr // Load v0 X, Y, Z
+	ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z

-    li mtx_ptr, %lo(GPU_MATRIX_MVP)
    lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I  m00.I m01.I m02.I m03.I]
    lqv vmtx1_i, 0x10,mtx_ptr // etc
    lqv vmtx2_i, 0x20,mtx_ptr
@ -388,9 +384,6 @@ GPUCmd_DrawQuad:
 // ########################
 // Vertex 0 and 1 transform
 // ########################
-	ldv vpos.e0,  0, src_ptr // Load v0 X, Y, Z
-	ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
-
 	// matrix multiply
    vmudn v___,      vmtx0_f, vpos.xxxxXXXX
    vmadh v___,      vmtx0_i, vpos.xxxxXXXX
@ -407,25 +400,25 @@ GPUCmd_DrawQuad:
 	llv vcol.e2, 24, src_ptr // Load v1 RGBA
 	llv vtex.e2, 28, src_ptr // Load v1 U, V

-    # 32-bit right shift by 5, to keep the clip space coordinates unscaled
+    // 32-bit right shift by 5, to keep the clip space coordinates unscaled
    vmudm vcspos_i, vcspos_i, K2048
    vmadl vcspos_f, vcspos_f, K2048

-    slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
-    slv vtex.e0, SCREEN_VTX_S_T  + V0_OFFSET, vtx
-    slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
-    slv vtex.e2, SCREEN_VTX_S_T  + V1_OFFSET, vtx
+    slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
+    slv vtex.e0, SCREEN_VTX_S_T  + V0_OFFSET, vtx_ptr
+    slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
+    slv vtex.e2, SCREEN_VTX_S_T  + V1_OFFSET, vtx_ptr

-    # Calculate and store clipping flags against CS.W.
-    # These will be used for trivial rejections.
+    // Calculate and store clipping flags against CS.W.
+    // These will be used for trivial rejections.
    vch v___, vcspos_i, vcspos_i.wwwwWWWW
    vcl v___, vcspos_f, vcspos_f.wwwwWWWW

    cfc2 tmp, COP2_CTRL_VCC
-    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
-    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
-    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
-    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
+    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx_ptr
+    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx_ptr
+    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx_ptr
+    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx_ptr

 // ########################
 // Vertex 2 and 3 transform
@ -437,7 +430,7 @@ GPUCmd_DrawQuad:
 	srl  tmp, tmp, 4
    andi v1_cflags, tmp, XYZ_CLIP_FLAGS

-	# matrix multiply
+	// matrix multiply
    vmudn v___,      vmtx0_f, vpos.xxxxXXXX
    vmadh v___,      vmtx0_i, vpos.xxxxXXXX
    vmadn v___,      vmtx1_f, vpos.yyyyYYYY
@ -453,31 +446,32 @@ GPUCmd_DrawQuad:
 	llv vcol.e6, 56, src_ptr # Load v3 RGBA
 	llv vtex.e6, 60, src_ptr # Load v3 U, V

-    # 32-bit right shift by 5, to keep the clip space coordinates unscaled
+    // 32-bit right shift by 5, to keep the clip space coordinates unscaled
    vmudm vcspos_i, vcspos_i, K2048
    vmadl vcspos_f, vcspos_f, K2048

-    slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
-    slv vtex.e4, SCREEN_VTX_S_T  + V2_OFFSET, vtx
-    slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
-    slv vtex.e6, SCREEN_VTX_S_T  + V3_OFFSET, vtx
+    slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
+    slv vtex.e4, SCREEN_VTX_S_T  + V2_OFFSET, vtx_ptr
+    slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
+    slv vtex.e6, SCREEN_VTX_S_T  + V3_OFFSET, vtx_ptr

-    # Calculate and store clipping flags against CS.W.
-    # These will be used for trivial rejections.
+    // Calculate and store clipping flags against CS.W.
+    // These will be used for trivial rejections.
    vch v___, vcspos_i, vcspos_i.wwwwWWWW
    vcl v___, vcspos_f, vcspos_f.wwwwWWWW

    cfc2 tmp, COP2_CTRL_VCC
-    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
-    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
-    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
-    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
+    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx_ptr
+    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx_ptr
+    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
+    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr

    andi v2_cflags, tmp, XYZ_CLIP_FLAGS
 	srl  tmp, tmp, 4
    andi v3_cflags, tmp, XYZ_CLIP_FLAGS
+
    #undef src_ptr
-    #undef vtx
+    #undef vtx_ptr
    #undef v___     

    #undef vmtx0_i