N64: Optimise T&L further (complex world down to 10.3 ms)

This commit is contained in:
UnknownShadow200 2025-07-19 20:27:29 +10:00
parent af4494284d
commit da9b8209d6
2 changed files with 50 additions and 61 deletions

View File

@ -30,7 +30,7 @@ static rsp_ucode_t rsp_gpu = (rsp_ucode_t){
enum {
GPU_CMD_SET_SHORT = 0x0,
GPU_CMD_SET_WORD = 0x1,
GPU_CMD_SET_TEX_WORD = 0x1,
GPU_CMD_SET_LONG = 0x2,
GPU_CMD_DRAW_QUAD = 0x3,
@ -42,8 +42,8 @@ enum {
/*
 * CPU-side layout of the state block that the gpu_set_* helpers address
 * through offsetof(). Presumably mirrors the GL_STATE data in the RSP
 * overlay field for field -- confirm.
 *
 * NOTE(review): this listing shows pre- and post-change lines together with
 * no +/- markers. tex_size and tex_offset each appear twice; the [2] forms
 * look like the removed declarations and the [8] forms like their
 * replacements -- confirm against the real diff.
 */
typedef struct {
int16_t vp_scale[4];
int16_t vp_offset[4];
uint16_t tex_size[2];
uint16_t tex_offset[2];
/* Eight halfwords = four 32-bit words, which is the span GPUCmd_SetTexWord
 * fills with one repeated value. */
uint16_t tex_size[8];
uint16_t tex_offset[8];
uint16_t tri_cmd;
uint16_t tri_cull;
} __attribute__((aligned(8), packed)) gpu_state;
@ -55,9 +55,9 @@ static inline void gpu_set_short(uint32_t offset, uint16_t value)
}
/*
 * Queues a 32-bit state write through rspq_write, passing the state offset
 * and the value as the command arguments.
 *
 * offset: byte offset into gpu_state (visible callers pass offsetof()).
 * value:  32-bit word to write.
 *
 * NOTE(review): old and new lines are shown together without +/- markers.
 * The gpu_set_word signature and the GPU_CMD_SET_WORD call look like the
 * removed lines -- confirm against the real diff.
 */
__attribute__((always_inline))
static inline void gpu_set_word(uint32_t offset, uint32_t value)
static inline void gpu_set_tex_word(uint32_t offset, uint32_t value)
{
rspq_write(gpup_id, GPU_CMD_SET_WORD, offset, value);
rspq_write(gpup_id, GPU_CMD_SET_TEX_WORD, offset, value);
}
__attribute__((always_inline))
@ -97,12 +97,12 @@ static void gpuUpdateFormat(void)
/*
 * Packs width into the upper 16 bits and height into the lower 16 bits of
 * one word and writes it at the offset of gpu_state.tex_size[0].
 *
 * NOTE(review): old and new lines are shown together without +/- markers;
 * the gpu_set_word call looks like the removed line -- confirm.
 * NOTE(review): width is promoted to int before the shift, so a value of
 * 0x8000 or more would shift into the sign bit -- confirm callers never
 * pass such a width, or cast to uint32_t first.
 */
static void gpuSetTexSize(uint16_t width, uint16_t height)
{
gpu_set_word(offsetof(gpu_state, tex_size[0]), (width << 16) | height);
gpu_set_tex_word(offsetof(gpu_state, tex_size[0]), (width << 16) | height);
}
/*
 * Packs the two offset components into one word (first argument in the upper
 * 16 bits, second in the lower 16 bits) and writes it at the offset of
 * gpu_state.tex_offset[0]. The parameters are named width/height here even
 * though they are stored as a texture offset.
 *
 * NOTE(review): old and new lines are shown together without +/- markers;
 * the gpu_set_word call looks like the removed line -- confirm.
 */
static void gpuSetTexOffset(uint16_t width, uint16_t height)
{
gpu_set_word(offsetof(gpu_state, tex_offset[0]), (width << 16) | height);
gpu_set_tex_word(offsetof(gpu_state, tex_offset[0]), (width << 16) | height);
}

View File

@ -36,7 +36,7 @@
RSPQ_BeginOverlayHeader
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x1
RSPQ_DefineCommand GPUCmd_SetTexWord, 8 # 0x1
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
@ -56,8 +56,8 @@ GL_STATE:
# This is the GL state that is updated by CPU via GPUCmd_Set commands
GL_VIEWPORT_SCALE: .half 0,0,0,0
GL_VIEWPORT_OFFSET: .half 0,0,0,0
GL_STATE_TEX_SIZE: .half 0,0
GL_STATE_TEX_OFFSET: .half 0,0
GL_STATE_TEX_SIZE: .half 0,0, 0,0, 0,0, 0,0
GL_STATE_TEX_OFFSET: .half 0,0, 0,0, 0,0, 0,0
GL_TRI_CMD: .half 0
GL_TRI_CULL: .half 0
@ -79,10 +79,14 @@ GPUCmd_SetShort:
sh a1, %lo(GL_STATE)(a0)
.endfunc
// NOTE(review): old and new lines are shown together without +/- markers.
// The GPUCmd_SetWord .func/label pair and the "+ 0" store that follows
// `jr ra` look like the removed lines -- confirm against the real diff.
.func GPUCmd_SetWord
GPUCmd_SetWord:
// Store 4 times, so can be transformed by 4 vertices later
.func GPUCmd_SetTexWord
GPUCmd_SetTexWord:
// a0 is used as the offset from GL_STATE, a1 as the word being stored.
// The same word is written at +0, +4, +8 and +12 (16 bytes in total).
sw a1, %lo(GL_STATE) + 0(a0)
sw a1, %lo(GL_STATE) + 4(a0)
sw a1, %lo(GL_STATE) + 8(a0)
jr ra
sw a1, %lo(GL_STATE) + 0(a0)
// Fourth store; presumably executes in the delay slot of `jr ra` -- confirm
sw a1, %lo(GL_STATE) + 12(a0)
.endfunc
.func GPUCmd_SetLong
@ -279,39 +283,8 @@ GL_TnL:
#define v___ $v01
#define vcspos_f $v02
#define vcspos_i $v03
#define vtexsize $v06
#define vtexoffset $v07
#define vst $v08
#define vst_i $v28
#define vst_f $v29
move ra2, ra
llv vst, SCREEN_VTX_S_T, vtx # S + T
li t0, %lo(GL_STATE_TEX_SIZE)
llv vtexsize, 0,t0
llv vtexoffset, 4,t0
# Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
#vmudn v___, vst, vtexsize
# vmadh vst, vtexoffset, K1
#vmudn v___, vst, vtexsize
#vmudl vst, vst, vtexsize
vmudn vst_f, vst, vtexsize # ACC = vst * vtexsize, VST_F = ACC & 0xFFFF
#####vmadn vst_f, vtexoffset, K1
vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16
// Shift texture coords right 5 bits
vmudm v___, vst_i, K2048 # ACC = (vst_i << 11)
vmadl vst, vst_f, K2048 # ACC += (vst_f << 11) >> 16, VST = ACC & 0xFFFF
#undef vst_i
#undef vst_f
slv vst, SCREEN_VTX_S_T, vtx
ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx
jal GL_CalcScreenSpace
ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx
@ -321,14 +294,11 @@ GL_TnL:
#undef vcspos_f
#undef vcspos_i
#undef vtexsize
#undef vtexoffset
#undef vtx
#undef v___
#undef vrgba
#undef vst
#undef s
.endfunc
@ -343,6 +313,11 @@ GPUCmd_DrawQuad:
#define v___ $v01
#define vst_i $v12
#define vst_f $v13
#define vtexsize $v14
#define vtexoffset $v15
#define vmtx0_i $v16 // m00 m01 m02 m03
#define vmtx0_f $v17
#define vmtx1_i $v18 // m10 m11 m12 m13
@ -404,10 +379,11 @@ GPUCmd_DrawQuad:
vmudm vcspos_i, vcspos_i, K2048
vmadl vcspos_f, vcspos_f, K2048
li t6, %lo(GL_STATE_TEX_SIZE)
lqv vtexsize, 0x00, t6
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
lqv vtexoffset, 0x10, t6
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
// Calculate and store clipping flags against CS.W.
// These will be used for trivial rejections.
@ -450,10 +426,13 @@ GPUCmd_DrawQuad:
vmudm vcspos_i, vcspos_i, K2048
vmadl vcspos_f, vcspos_f, K2048
// Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
vmudn vst_f, vtex, vtexsize # ACC = vtex * vtexsize, VST_F = ACC & 0xFFFF
#vmadn vst_f,vtexoffset, K1
vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
// Calculate and store clipping flags against CS.W.
// These will be used for trivial rejections.
@ -466,13 +445,19 @@ GPUCmd_DrawQuad:
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr
// Shift texture coords right 5 bits
vmudm v___, vst_i, K2048 # ACC = (vst_i << 11)
vmadl vtex, vst_f, K2048 # ACC += (vst_f << 11) >> 16, vtex = ACC & 0xFFFF
andi v2_cflags, tmp, XYZ_CLIP_FLAGS
srl tmp, tmp, 4
andi v3_cflags, tmp, XYZ_CLIP_FLAGS
#undef src_ptr
#undef vtx_ptr
#undef v___
#undef vst_i
#undef vst_f
#undef vtexsize
#undef vtexoffset
#undef vmtx0_i
#undef vmtx0_f
@ -487,20 +472,24 @@ GPUCmd_DrawQuad:
#undef vcspos_i
#undef vcspos_f
// ########################
// Trivial rejection check
// ########################
// ### Trivial rejection check ###
// If for any plane, all 4 vertices are outside the plane,
// then the quad is out of the viewport and can be trivially rejected
and tmp, v0_cflags, v1_cflags
and tmp, v2_cflags
and tmp, v3_cflags
bnez tmp, JrRa
nop
bnez tmp, JrRa // slv is delay slot
// ### Perform rest of T&L ###
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr
#undef vtx_ptr
#undef v___
#undef vtex
// ########################
// Perform rest of T&L
// ########################
jal GL_TnL
li s3, %lo(VERTEX_CACHE) + V0_OFFSET
jal GL_TnL