From af4494284d8f4e79f814e4b85975e6d0d400e996 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sat, 19 Jul 2025 14:53:42 +1000 Subject: [PATCH] N64: Save 3 cycles in RSP T&L loop --- misc/n64/gpu.c | 55 +++++++++++-------------------- misc/n64/rsp_gpu.S | 80 +++++++++++++++++++++------------------------- 2 files changed, 56 insertions(+), 79 deletions(-) diff --git a/misc/n64/gpu.c b/misc/n64/gpu.c index ebd11031a..201bf6ffb 100644 --- a/misc/n64/gpu.c +++ b/misc/n64/gpu.c @@ -29,15 +29,14 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){ }; enum { - GPU_CMD_SET_BYTE = 0x0, - GPU_CMD_SET_SHORT = 0x1, - GPU_CMD_SET_WORD = 0x2, - GPU_CMD_SET_LONG = 0x3, + GPU_CMD_SET_SHORT = 0x0, + GPU_CMD_SET_WORD = 0x1, + GPU_CMD_SET_LONG = 0x2, - GPU_CMD_DRAW_QUAD = 0x4, - GPU_CMD_MATRIX_LOAD = 0x5, + GPU_CMD_DRAW_QUAD = 0x3, + GPU_CMD_MATRIX_LOAD = 0x4, - GPU_CMD_PUSH_RDP = 0x6, + GPU_CMD_PUSH_RDP = 0x5, }; typedef struct { @@ -49,12 +48,6 @@ typedef struct { uint16_t tri_cull; } __attribute__((aligned(8), packed)) gpu_state; -__attribute__((always_inline)) -static inline void gpu_set_byte(uint32_t offset, uint8_t value) -{ - rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value); -} - __attribute__((always_inline)) static inline void gpu_set_short(uint32_t offset, uint16_t value) { @@ -82,9 +75,6 @@ static inline void gpu_push_rdp(uint32_t a1, uint64_t a2) rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2); } - -static float gpu_vp_scale[3]; -static float gpu_vp_offset[3]; static bool gpu_texturing; static void* gpu_pointer; static int gpu_stride; @@ -191,34 +181,28 @@ static void gpuDrawArrays(uint32_t first, uint32_t count) } } -static void gpuDepthRange(float n, float f) -{ - gpu_vp_scale[2] = (f - n) * 0.5f; - gpu_vp_offset[2] = n + (f - n) * 0.5f; - - gpu_set_short(offsetof(gpu_state, vp_scale[2]), gpu_vp_scale[2] * 4); - gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4); -} - static void gpuViewport(int x, int y, int w, int h) { - gpu_vp_scale[0] = w * 0.5f; - gpu_vp_scale[1] = h * -0.5f; - gpu_vp_offset[0] = x + w * 0.5f; - gpu_vp_offset[1] = y + h * 0.5f; + float vp_scale_x = w * 0.5f; + float vp_scale_y = h * -0.5f; + float vp_scale_z = 0.5f; + + float vp_offset_x = x + w * 0.5f; + float vp_offset_y = y + h * 0.5f; + float vp_offset_z = 0.5f; // Screen coordinates are s13.2 #define SCREEN_XY_SCALE 4.0f #define SCREEN_Z_SCALE 32767.0f // * 2.0f to compensate for RSP reciprocal missing 1 bit - uint16_t scale_x = gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f; - uint16_t scale_y = gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f; - uint16_t scale_z = gpu_vp_scale[2] * SCREEN_Z_SCALE * 2.0f; + uint16_t scale_x = vp_scale_x * SCREEN_XY_SCALE * 2.0f; + uint16_t scale_y = vp_scale_y * SCREEN_XY_SCALE * 2.0f; + uint16_t scale_z = vp_scale_z * SCREEN_Z_SCALE * 2.0f; - uint16_t offset_x = gpu_vp_offset[0] * SCREEN_XY_SCALE; - uint16_t offset_y = gpu_vp_offset[1] * SCREEN_XY_SCALE; - uint16_t offset_z = gpu_vp_offset[2] * SCREEN_Z_SCALE; + uint16_t offset_x = vp_offset_x * SCREEN_XY_SCALE; + uint16_t offset_y = vp_offset_y * SCREEN_XY_SCALE; + uint16_t offset_z = vp_offset_z * SCREEN_Z_SCALE; gpu_set_long( offsetof(gpu_state, vp_scale), @@ -236,7 +220,6 @@ static void gpuSetCullFace(bool enabled) { static void gpu_init() { gpup_id = rspq_overlay_register(&rsp_gpu); - gpuDepthRange(0, 1); } static void gpu_close() { diff --git a/misc/n64/rsp_gpu.S b/misc/n64/rsp_gpu.S index 5b084acf8..4a6969af4 100644 --- a/misc/n64/rsp_gpu.S +++ b/misc/n64/rsp_gpu.S @@ -35,15 +35,14 @@ .data RSPQ_BeginOverlayHeader - RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0 - RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x1 - RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2 - RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3 + RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0 + RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1 + RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2 - RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x4 - RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x5 + RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3 + RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x4 - RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x6 + RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x5 RSPQ_EndOverlayHeader .align 4 @@ -74,12 +73,6 @@ VERTEX_CACHE: .dcb.b SCREEN_VTX_SIZE * 4 .text - .func GPUCmd_SetByte -GPUCmd_SetByte: - jr ra - sb a1, %lo(GL_STATE)(a0) - .endfunc - .func GPUCmd_SetShort GPUCmd_SetShort: jr ra @@ -344,7 +337,7 @@ GL_TnL: .align 3 .func GPUCmd_DrawQuad GPUCmd_DrawQuad: - #define vtx a0 + #define vtx_ptr a0 #define mtx_ptr s0 #define src_ptr s4 @@ -373,9 +366,12 @@ GPUCmd_DrawQuad: // t5 is used by GL_ClipTriangle addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 - li vtx, %lo(VERTEX_CACHE) + li vtx_ptr, %lo(VERTEX_CACHE) + li mtx_ptr, %lo(GPU_MATRIX_MVP) + + ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z + ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z - li mtx_ptr, %lo(GPU_MATRIX_MVP) lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I] lqv vmtx1_i, 0x10,mtx_ptr // etc lqv vmtx2_i, 0x20,mtx_ptr @@ -388,9 +384,6 @@ GPUCmd_DrawQuad: // ######################## // Vertex 0 and 1 transform // ######################## - ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z - ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z - // matrix multiply vmudn v___, vmtx0_f, vpos.xxxxXXXX vmadh v___, vmtx0_i, vpos.xxxxXXXX @@ -407,25 +400,25 @@ GPUCmd_DrawQuad: llv vcol.e2, 24, src_ptr // Load v1 RGBA llv vtex.e2, 28, src_ptr // Load v1 U, V - # 32-bit right shift by 5, to keep the clip space coordinates unscaled + // 32-bit right shift by 5, to keep the clip space coordinates unscaled vmudm vcspos_i, vcspos_i, K2048 vmadl vcspos_f, vcspos_f, K2048 - slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx - slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx - slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx - slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx + slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr + slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr + slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr + slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr - # Calculate and store clipping flags against CS.W. - # These will be used for trivial rejections. + // Calculate and store clipping flags against CS.W. + // These will be used for trivial rejections. vch v___, vcspos_i, vcspos_i.wwwwWWWW vcl v___, vcspos_f, vcspos_f.wwwwWWWW cfc2 tmp, COP2_CTRL_VCC - sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx - sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx - sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx - sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx + sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx_ptr + sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx_ptr + sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx_ptr + sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx_ptr // ######################## // Vertex 2 and 3 transform @@ -437,7 +430,7 @@ GPUCmd_DrawQuad: srl tmp, tmp, 4 andi v1_cflags, tmp, XYZ_CLIP_FLAGS - # matrix multiply + // matrix multiply vmudn v___, vmtx0_f, vpos.xxxxXXXX vmadh v___, vmtx0_i, vpos.xxxxXXXX vmadn v___, vmtx1_f, vpos.yyyyYYYY @@ -453,31 +446,32 @@ GPUCmd_DrawQuad: llv vcol.e6, 56, src_ptr # Load v3 RGBA llv vtex.e6, 60, src_ptr # Load v3 U, V - # 32-bit right shift by 5, to keep the clip space coordinates unscaled + // 32-bit right shift by 5, to keep the clip space coordinates unscaled vmudm vcspos_i, vcspos_i, K2048 vmadl vcspos_f, vcspos_f, K2048 - slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx - slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx - slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx - slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx + slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr + slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr + slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr + slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr - # Calculate and store clipping flags against CS.W. - # These will be used for trivial rejections. + // Calculate and store clipping flags against CS.W. + // These will be used for trivial rejections. vch v___, vcspos_i, vcspos_i.wwwwWWWW vcl v___, vcspos_f, vcspos_f.wwwwWWWW cfc2 tmp, COP2_CTRL_VCC - sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx - sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx - sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx - sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx + sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx_ptr + sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx_ptr + sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr + sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr andi v2_cflags, tmp, XYZ_CLIP_FLAGS srl tmp, tmp, 4 andi v3_cflags, tmp, XYZ_CLIP_FLAGS + #undef src_ptr - #undef vtx + #undef vtx_ptr #undef v___ #undef vmtx0_i