N64: Save 3 cycles in RSP T&L loop

This commit is contained in:
UnknownShadow200 2025-07-19 14:53:42 +10:00
parent 00a1a49405
commit af4494284d
2 changed files with 56 additions and 79 deletions

View File

@ -29,15 +29,14 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
}; };
enum { enum {
GPU_CMD_SET_BYTE = 0x0, GPU_CMD_SET_SHORT = 0x0,
GPU_CMD_SET_SHORT = 0x1, GPU_CMD_SET_WORD = 0x1,
GPU_CMD_SET_WORD = 0x2, GPU_CMD_SET_LONG = 0x2,
GPU_CMD_SET_LONG = 0x3,
GPU_CMD_DRAW_QUAD = 0x4, GPU_CMD_DRAW_QUAD = 0x3,
GPU_CMD_MATRIX_LOAD = 0x5, GPU_CMD_MATRIX_LOAD = 0x4,
GPU_CMD_PUSH_RDP = 0x6, GPU_CMD_PUSH_RDP = 0x5,
}; };
typedef struct { typedef struct {
@ -49,12 +48,6 @@ typedef struct {
uint16_t tri_cull; uint16_t tri_cull;
} __attribute__((aligned(8), packed)) gpu_state; } __attribute__((aligned(8), packed)) gpu_state;
__attribute__((always_inline))
static inline void gpu_set_byte(uint32_t offset, uint8_t value)
{
rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value);
}
__attribute__((always_inline)) __attribute__((always_inline))
static inline void gpu_set_short(uint32_t offset, uint16_t value) static inline void gpu_set_short(uint32_t offset, uint16_t value)
{ {
@ -82,9 +75,6 @@ static inline void gpu_push_rdp(uint32_t a1, uint64_t a2)
rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2); rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2);
} }
static float gpu_vp_scale[3];
static float gpu_vp_offset[3];
static bool gpu_texturing; static bool gpu_texturing;
static void* gpu_pointer; static void* gpu_pointer;
static int gpu_stride; static int gpu_stride;
@ -191,34 +181,28 @@ static void gpuDrawArrays(uint32_t first, uint32_t count)
} }
} }
static void gpuDepthRange(float n, float f)
{
gpu_vp_scale[2] = (f - n) * 0.5f;
gpu_vp_offset[2] = n + (f - n) * 0.5f;
gpu_set_short(offsetof(gpu_state, vp_scale[2]), gpu_vp_scale[2] * 4);
gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4);
}
static void gpuViewport(int x, int y, int w, int h) static void gpuViewport(int x, int y, int w, int h)
{ {
gpu_vp_scale[0] = w * 0.5f; float vp_scale_x = w * 0.5f;
gpu_vp_scale[1] = h * -0.5f; float vp_scale_y = h * -0.5f;
gpu_vp_offset[0] = x + w * 0.5f; float vp_scale_z = 0.5f;
gpu_vp_offset[1] = y + h * 0.5f;
float vp_offset_x = x + w * 0.5f;
float vp_offset_y = y + h * 0.5f;
float vp_offset_z = 0.5f;
// Screen coordinates are s13.2 // Screen coordinates are s13.2
#define SCREEN_XY_SCALE 4.0f #define SCREEN_XY_SCALE 4.0f
#define SCREEN_Z_SCALE 32767.0f #define SCREEN_Z_SCALE 32767.0f
// * 2.0f to compensate for RSP reciprocal missing 1 bit // * 2.0f to compensate for RSP reciprocal missing 1 bit
uint16_t scale_x = gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f; uint16_t scale_x = vp_scale_x * SCREEN_XY_SCALE * 2.0f;
uint16_t scale_y = gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f; uint16_t scale_y = vp_scale_y * SCREEN_XY_SCALE * 2.0f;
uint16_t scale_z = gpu_vp_scale[2] * SCREEN_Z_SCALE * 2.0f; uint16_t scale_z = vp_scale_z * SCREEN_Z_SCALE * 2.0f;
uint16_t offset_x = gpu_vp_offset[0] * SCREEN_XY_SCALE; uint16_t offset_x = vp_offset_x * SCREEN_XY_SCALE;
uint16_t offset_y = gpu_vp_offset[1] * SCREEN_XY_SCALE; uint16_t offset_y = vp_offset_y * SCREEN_XY_SCALE;
uint16_t offset_z = gpu_vp_offset[2] * SCREEN_Z_SCALE; uint16_t offset_z = vp_offset_z * SCREEN_Z_SCALE;
gpu_set_long( gpu_set_long(
offsetof(gpu_state, vp_scale), offsetof(gpu_state, vp_scale),
@ -236,7 +220,6 @@ static void gpuSetCullFace(bool enabled) {
static void gpu_init() { static void gpu_init() {
gpup_id = rspq_overlay_register(&rsp_gpu); gpup_id = rspq_overlay_register(&rsp_gpu);
gpuDepthRange(0, 1);
} }
static void gpu_close() { static void gpu_close() {

View File

@ -35,15 +35,14 @@
.data .data
RSPQ_BeginOverlayHeader RSPQ_BeginOverlayHeader
RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0 RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x1 RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2 RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x4 RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x5 RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x4
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x6 RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x5
RSPQ_EndOverlayHeader RSPQ_EndOverlayHeader
.align 4 .align 4
@ -74,12 +73,6 @@ VERTEX_CACHE: .dcb.b SCREEN_VTX_SIZE * 4
.text .text
.func GPUCmd_SetByte
GPUCmd_SetByte:
jr ra
sb a1, %lo(GL_STATE)(a0)
.endfunc
.func GPUCmd_SetShort .func GPUCmd_SetShort
GPUCmd_SetShort: GPUCmd_SetShort:
jr ra jr ra
@ -344,7 +337,7 @@ GL_TnL:
.align 3 .align 3
.func GPUCmd_DrawQuad .func GPUCmd_DrawQuad
GPUCmd_DrawQuad: GPUCmd_DrawQuad:
#define vtx a0 #define vtx_ptr a0
#define mtx_ptr s0 #define mtx_ptr s0
#define src_ptr s4 #define src_ptr s4
@ -373,9 +366,12 @@ GPUCmd_DrawQuad:
// t5 is used by GL_ClipTriangle // t5 is used by GL_ClipTriangle
addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64 addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
li vtx, %lo(VERTEX_CACHE) li vtx_ptr, %lo(VERTEX_CACHE)
li mtx_ptr, %lo(GPU_MATRIX_MVP)
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
li mtx_ptr, %lo(GPU_MATRIX_MVP)
lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I] lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I]
lqv vmtx1_i, 0x10,mtx_ptr // etc lqv vmtx1_i, 0x10,mtx_ptr // etc
lqv vmtx2_i, 0x20,mtx_ptr lqv vmtx2_i, 0x20,mtx_ptr
@ -388,9 +384,6 @@ GPUCmd_DrawQuad:
// ######################## // ########################
// Vertex 0 and 1 transform // Vertex 0 and 1 transform
// ######################## // ########################
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
// matrix multiply // matrix multiply
vmudn v___, vmtx0_f, vpos.xxxxXXXX vmudn v___, vmtx0_f, vpos.xxxxXXXX
vmadh v___, vmtx0_i, vpos.xxxxXXXX vmadh v___, vmtx0_i, vpos.xxxxXXXX
@ -407,25 +400,25 @@ GPUCmd_DrawQuad:
llv vcol.e2, 24, src_ptr // Load v1 RGBA llv vcol.e2, 24, src_ptr // Load v1 RGBA
llv vtex.e2, 28, src_ptr // Load v1 U, V llv vtex.e2, 28, src_ptr // Load v1 U, V
# 32-bit right shift by 5, to keep the clip space coordinates unscaled // 32-bit right shift by 5, to keep the clip space coordinates unscaled
vmudm vcspos_i, vcspos_i, K2048 vmudm vcspos_i, vcspos_i, K2048
vmadl vcspos_f, vcspos_f, K2048 vmadl vcspos_f, vcspos_f, K2048
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
# Calculate and store clipping flags against CS.W. // Calculate and store clipping flags against CS.W.
# These will be used for trivial rejections. // These will be used for trivial rejections.
vch v___, vcspos_i, vcspos_i.wwwwWWWW vch v___, vcspos_i, vcspos_i.wwwwWWWW
vcl v___, vcspos_f, vcspos_f.wwwwWWWW vcl v___, vcspos_f, vcspos_f.wwwwWWWW
cfc2 tmp, COP2_CTRL_VCC cfc2 tmp, COP2_CTRL_VCC
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx_ptr
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx_ptr
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx_ptr
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx_ptr
// ######################## // ########################
// Vertex 2 and 3 transform // Vertex 2 and 3 transform
@ -437,7 +430,7 @@ GPUCmd_DrawQuad:
srl tmp, tmp, 4 srl tmp, tmp, 4
andi v1_cflags, tmp, XYZ_CLIP_FLAGS andi v1_cflags, tmp, XYZ_CLIP_FLAGS
# matrix multiply // matrix multiply
vmudn v___, vmtx0_f, vpos.xxxxXXXX vmudn v___, vmtx0_f, vpos.xxxxXXXX
vmadh v___, vmtx0_i, vpos.xxxxXXXX vmadh v___, vmtx0_i, vpos.xxxxXXXX
vmadn v___, vmtx1_f, vpos.yyyyYYYY vmadn v___, vmtx1_f, vpos.yyyyYYYY
@ -453,31 +446,32 @@ GPUCmd_DrawQuad:
llv vcol.e6, 56, src_ptr # Load v3 RGBA llv vcol.e6, 56, src_ptr # Load v3 RGBA
llv vtex.e6, 60, src_ptr # Load v3 U, V llv vtex.e6, 60, src_ptr # Load v3 U, V
# 32-bit right shift by 5, to keep the clip space coordinates unscaled // 32-bit right shift by 5, to keep the clip space coordinates unscaled
vmudm vcspos_i, vcspos_i, K2048 vmudm vcspos_i, vcspos_i, K2048
vmadl vcspos_f, vcspos_f, K2048 vmadl vcspos_f, vcspos_f, K2048
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
# Calculate and store clipping flags against CS.W. // Calculate and store clipping flags against CS.W.
# These will be used for trivial rejections. // These will be used for trivial rejections.
vch v___, vcspos_i, vcspos_i.wwwwWWWW vch v___, vcspos_i, vcspos_i.wwwwWWWW
vcl v___, vcspos_f, vcspos_f.wwwwWWWW vcl v___, vcspos_f, vcspos_f.wwwwWWWW
cfc2 tmp, COP2_CTRL_VCC cfc2 tmp, COP2_CTRL_VCC
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx_ptr
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx_ptr
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr
andi v2_cflags, tmp, XYZ_CLIP_FLAGS andi v2_cflags, tmp, XYZ_CLIP_FLAGS
srl tmp, tmp, 4 srl tmp, tmp, 4
andi v3_cflags, tmp, XYZ_CLIP_FLAGS andi v3_cflags, tmp, XYZ_CLIP_FLAGS
#undef src_ptr #undef src_ptr
#undef vtx #undef vtx_ptr
#undef v___ #undef v___
#undef vmtx0_i #undef vmtx0_i