N64: Save 3 cycles in RSP T&L loop

This commit is contained in:
UnknownShadow200 2025-07-19 14:53:42 +10:00
parent 00a1a49405
commit af4494284d
2 changed files with 56 additions and 79 deletions

View File

@ -29,15 +29,14 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
};
// Command identifiers passed to rspq_write()/rdpq_write() for the rsp_gpu overlay.
// NOTE(review): these values presumably have to match the order of the
// RSPQ_DefineCommand entries in the overlay's header table - confirm whenever
// a command is added, removed or renumbered.
enum {
GPU_CMD_SET_BYTE = 0x0,
GPU_CMD_SET_SHORT = 0x1,
GPU_CMD_SET_WORD = 0x2,
GPU_CMD_SET_LONG = 0x3,
GPU_CMD_SET_SHORT = 0x0,
GPU_CMD_SET_WORD = 0x1,
GPU_CMD_SET_LONG = 0x2,
GPU_CMD_DRAW_QUAD = 0x4,
GPU_CMD_MATRIX_LOAD = 0x5,
GPU_CMD_DRAW_QUAD = 0x3,
GPU_CMD_MATRIX_LOAD = 0x4,
GPU_CMD_PUSH_RDP = 0x6,
GPU_CMD_PUSH_RDP = 0x5,
};
typedef struct {
@ -49,12 +48,6 @@ typedef struct {
uint16_t tri_cull;
} __attribute__((aligned(8), packed)) gpu_state;
// Issues a GPU_CMD_SET_BYTE command to the overlay identified by gpup_id,
// carrying `offset` and `value` as its two arguments.
// NOTE(review): `offset` is presumably a byte offset into the RSP-side
// gpu_state (e.g. obtained with offsetof) - confirm against the callers.
__attribute__((always_inline))
static inline void gpu_set_byte(uint32_t offset, uint8_t value)
{
rspq_write(gpup_id, GPU_CMD_SET_BYTE, offset, value);
}
__attribute__((always_inline))
static inline void gpu_set_short(uint32_t offset, uint16_t value)
{
@ -82,9 +75,6 @@ static inline void gpu_push_rdp(uint32_t a1, uint64_t a2)
rdpq_write(2, gpup_id, GPU_CMD_PUSH_RDP, 0, a1, a2);
}
static float gpu_vp_scale[3];
static float gpu_vp_offset[3];
static bool gpu_texturing;
static void* gpu_pointer;
static int gpu_stride;
@ -191,34 +181,28 @@ static void gpuDrawArrays(uint32_t first, uint32_t count)
}
}
// Converts the depth range [n, f] into a Z scale/offset pair, caches it in
// gpu_vp_scale[2] / gpu_vp_offset[2], and uploads both values to the
// vp_scale[2] / vp_offset[2] fields of gpu_state.
static void gpuDepthRange(float n, float f)
{
// Scale is half the extent of the range, offset is its midpoint
gpu_vp_scale[2] = (f - n) * 0.5f;
gpu_vp_offset[2] = n + (f - n) * 0.5f;
// The float products below are implicitly converted to the uint16_t
// parameter of gpu_set_short.
// NOTE(review): the * 4 factor equals SCREEN_XY_SCALE, whereas gpuViewport
// scales the Z components by SCREEN_Z_SCALE - confirm which is intended.
gpu_set_short(offsetof(gpu_state, vp_scale[2]), gpu_vp_scale[2] * 4);
gpu_set_short(offsetof(gpu_state, vp_offset[2]), gpu_vp_offset[2] * 4);
}
static void gpuViewport(int x, int y, int w, int h)
{
gpu_vp_scale[0] = w * 0.5f;
gpu_vp_scale[1] = h * -0.5f;
gpu_vp_offset[0] = x + w * 0.5f;
gpu_vp_offset[1] = y + h * 0.5f;
float vp_scale_x = w * 0.5f;
float vp_scale_y = h * -0.5f;
float vp_scale_z = 0.5f;
float vp_offset_x = x + w * 0.5f;
float vp_offset_y = y + h * 0.5f;
float vp_offset_z = 0.5f;
// Screen coordinates are s13.2
#define SCREEN_XY_SCALE 4.0f
#define SCREEN_Z_SCALE 32767.0f
// * 2.0f to compensate for RSP reciprocal missing 1 bit
uint16_t scale_x = gpu_vp_scale[0] * SCREEN_XY_SCALE * 2.0f;
uint16_t scale_y = gpu_vp_scale[1] * SCREEN_XY_SCALE * 2.0f;
uint16_t scale_z = gpu_vp_scale[2] * SCREEN_Z_SCALE * 2.0f;
uint16_t scale_x = vp_scale_x * SCREEN_XY_SCALE * 2.0f;
uint16_t scale_y = vp_scale_y * SCREEN_XY_SCALE * 2.0f;
uint16_t scale_z = vp_scale_z * SCREEN_Z_SCALE * 2.0f;
uint16_t offset_x = gpu_vp_offset[0] * SCREEN_XY_SCALE;
uint16_t offset_y = gpu_vp_offset[1] * SCREEN_XY_SCALE;
uint16_t offset_z = gpu_vp_offset[2] * SCREEN_Z_SCALE;
uint16_t offset_x = vp_offset_x * SCREEN_XY_SCALE;
uint16_t offset_y = vp_offset_y * SCREEN_XY_SCALE;
uint16_t offset_z = vp_offset_z * SCREEN_Z_SCALE;
gpu_set_long(
offsetof(gpu_state, vp_scale),
@ -236,7 +220,6 @@ static void gpuSetCullFace(bool enabled) {
// Registers the rsp_gpu overlay with rspq, stores the returned id in gpup_id,
// and then applies a depth range of [0, 1].
static void gpu_init() {
gpup_id = rspq_overlay_register(&rsp_gpu);
// Must come after registration: gpuDepthRange sends commands using gpup_id
gpuDepthRange(0, 1);
}
static void gpu_close() {

View File

@ -35,15 +35,14 @@
.data
// Command table for this overlay: each entry names a handler label followed by
// a numeric argument, and the trailing hex comment records the command index
// the entry is expected to occupy.
// NOTE(review): the numeric argument is presumably the command size in bytes,
// and the indices presumably must stay in sync with the GPU_CMD_* enum on the
// C side - confirm both when editing this table.
RSPQ_BeginOverlayHeader
RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x1
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x4
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x5
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x4
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x6
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x5
RSPQ_EndOverlayHeader
.align 4
@ -74,12 +73,6 @@ VERTEX_CACHE: .dcb.b SCREEN_VTX_SIZE * 4
.text
// Handler for the SetByte command: writes the low byte of a1 to the
// address GL_STATE + a0, then returns to the caller.
// NOTE(review): a0/a1 presumably hold the offset and value sent by
// gpu_set_byte on the C side - confirm against the rspq argument convention.
.func GPUCmd_SetByte
GPUCmd_SetByte:
jr ra
// Executed in the branch delay slot of the jr above
sb a1, %lo(GL_STATE)(a0)
.endfunc
.func GPUCmd_SetShort
GPUCmd_SetShort:
jr ra
@ -344,7 +337,7 @@ GL_TnL:
.align 3
.func GPUCmd_DrawQuad
GPUCmd_DrawQuad:
#define vtx a0
#define vtx_ptr a0
#define mtx_ptr s0
#define src_ptr s4
@ -373,9 +366,12 @@ GPUCmd_DrawQuad:
// t5 is used by GL_ClipTriangle
addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
li vtx, %lo(VERTEX_CACHE)
li vtx_ptr, %lo(VERTEX_CACHE)
li mtx_ptr, %lo(GPU_MATRIX_MVP)
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
li mtx_ptr, %lo(GPU_MATRIX_MVP)
lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I]
lqv vmtx1_i, 0x10,mtx_ptr // etc
lqv vmtx2_i, 0x20,mtx_ptr
@ -388,9 +384,6 @@ GPUCmd_DrawQuad:
// ########################
// Vertex 0 and 1 transform
// ########################
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
// matrix multiply
vmudn v___, vmtx0_f, vpos.xxxxXXXX
vmadh v___, vmtx0_i, vpos.xxxxXXXX
@ -407,25 +400,25 @@ GPUCmd_DrawQuad:
llv vcol.e2, 24, src_ptr // Load v1 RGBA
llv vtex.e2, 28, src_ptr // Load v1 U, V
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
// 32-bit right shift by 5, to keep the clip space coordinates unscaled
vmudm vcspos_i, vcspos_i, K2048
vmadl vcspos_f, vcspos_f, K2048
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
# Calculate and store clipping flags against CS.W.
# These will be used for trivial rejections.
// Calculate and store clipping flags against CS.W.
// These will be used for trivial rejections.
vch v___, vcspos_i, vcspos_i.wwwwWWWW
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
cfc2 tmp, COP2_CTRL_VCC
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx_ptr
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx_ptr
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx_ptr
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx_ptr
// ########################
// Vertex 2 and 3 transform
@ -437,7 +430,7 @@ GPUCmd_DrawQuad:
srl tmp, tmp, 4
andi v1_cflags, tmp, XYZ_CLIP_FLAGS
# matrix multiply
// matrix multiply
vmudn v___, vmtx0_f, vpos.xxxxXXXX
vmadh v___, vmtx0_i, vpos.xxxxXXXX
vmadn v___, vmtx1_f, vpos.yyyyYYYY
@ -453,31 +446,32 @@ GPUCmd_DrawQuad:
llv vcol.e6, 56, src_ptr # Load v3 RGBA
llv vtex.e6, 60, src_ptr # Load v3 U, V
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
// 32-bit right shift by 5, to keep the clip space coordinates unscaled
vmudm vcspos_i, vcspos_i, K2048
vmadl vcspos_f, vcspos_f, K2048
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
# Calculate and store clipping flags against CS.W.
# These will be used for trivial rejections.
// Calculate and store clipping flags against CS.W.
// These will be used for trivial rejections.
vch v___, vcspos_i, vcspos_i.wwwwWWWW
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
cfc2 tmp, COP2_CTRL_VCC
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx_ptr
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx_ptr
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr
andi v2_cflags, tmp, XYZ_CLIP_FLAGS
srl tmp, tmp, 4
andi v3_cflags, tmp, XYZ_CLIP_FLAGS
#undef src_ptr
#undef vtx
#undef vtx_ptr
#undef v___
#undef vmtx0_i