N64: Optimise T&L code further (now down to 9.8 ms)

This commit is contained in:
UnknownShadow200 2025-07-20 10:53:03 +10:00
parent da9b8209d6
commit d547f6e0a5
2 changed files with 125 additions and 41 deletions

View File

@ -32,6 +32,13 @@
#define V2_OFFSET 2 * SCREEN_VTX_SIZE #define V2_OFFSET 2 * SCREEN_VTX_SIZE
#define V3_OFFSET 3 * SCREEN_VTX_SIZE #define V3_OFFSET 3 * SCREEN_VTX_SIZE
// Packs the X/Y/Z clip flags of a VCC value into a single 6-bit clip code.
//   In:    t0 = value read from COP2_CTRL_VCC (left unmodified, so the caller
//               may shift it and invoke the macro again)
//   Out:   t2 = t0 bits 0-2 in bits 0-2, t0 bits 8-10 in bits 3-5
//   Clobb: t1
.macro compressClipCodes
    andi t1, t0, 0x700      // Isolate hi X/Y/Z clip flags (bits 8-10)
    srl  t1, t1, 5          // Move hi flags down so they sit right above the lo flags
    andi t2, t0, 0x7        // Isolate lo X/Y/Z clip flags (bits 0-2)
    or   t2, t2, t1         // Merge both groups (compressed to 6 bits)
.endm
.data .data
RSPQ_BeginOverlayHeader RSPQ_BeginOverlayHeader
@ -158,6 +165,11 @@ GPUCmd_MatrixLoad:
#undef dst #undef dst
.endfunc .endfunc
// These vector registers persist across more than one function: the code that
// reads them (e.g. GL_TnL, GL_CalcClipCodes) does not load them itself, so the
// caller has to fill them beforehand.
// GPUCmd_DrawQuad loads each one with ldv into both lane halves (e0 and e4):
//   vviewscale  <- GL_VIEWPORT_SCALE + 0
//   vviewoff    <- GL_VIEWPORT_SCALE + 8
//   vguardscale <- CLIP_CODE_FACTORS + 0
#define vviewscale $v18
#define vviewoff $v19
#define vguardscale $v20
################################################################ ################################################################
# GL_CalcScreenSpace # GL_CalcScreenSpace
# #
@ -174,8 +186,6 @@ GL_CalcScreenSpace:
#define vcspos_i $v03 #define vcspos_i $v03
#define vinvw_f $v23 #define vinvw_f $v23
#define vinvw_i $v24 #define vinvw_i $v24
#define vviewscale $v25
#define vviewoff $v26
#define vscreenpos_i $v27 #define vscreenpos_i $v27
#define vscreenpos_f $v28 #define vscreenpos_f $v28
#define v___ $v29 #define v___ $v29
@ -187,11 +197,6 @@ GL_CalcScreenSpace:
vrcpl vinvw_f.w, vcspos_f.w vrcpl vinvw_f.w, vcspos_f.w
vrcph vinvw_i.w, vzero.e0 vrcph vinvw_i.w, vzero.e0
# Calculate screenspace coords
li t0, %lo(GL_VIEWPORT_SCALE)
ldv vviewscale, 0,t0
ldv vviewoff, 8,t0
vmudl v___, vcspos_f, vinvw_f.w vmudl v___, vcspos_f, vinvw_f.w
vmadm v___, vcspos_i, vinvw_f.w vmadm v___, vcspos_i, vinvw_f.w
vmadn vscreenpos_f, vcspos_f, vinvw_i.w vmadn vscreenpos_f, vcspos_f, vinvw_i.w
@ -214,8 +219,6 @@ GL_CalcScreenSpace:
#undef vcspos_i #undef vcspos_i
#undef vinvw_f #undef vinvw_f
#undef vinvw_i #undef vinvw_i
#undef vviewscale
#undef vviewoff
#undef vscreenpos_i #undef vscreenpos_i
#undef vscreenpos_f #undef vscreenpos_f
#undef v___ #undef v___
@ -242,20 +245,14 @@ GL_CalcClipCodes:
#define v___ $v29 #define v___ $v29
#define w e3 #define w e3
li t0, %lo(CLIP_CODE_FACTORS) vmudn vguard_f, vcspos_f, vguardscale
ldv vguard_i, 0,t0 vmadh vguard_i, vcspos_i, vguardscale
vmudn vguard_f, vcspos_f, vguard_i
vmadh vguard_i, vcspos_i, vguard_i
vch v___, vguard_i, vguard_i.w vch v___, vguard_i, vguard_i.w
vcl v___, vguard_f, vguard_f.w vcl v___, vguard_f, vguard_f.w
cfc2 t0, COP2_CTRL_VCC cfc2 t0, COP2_CTRL_VCC
andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags compressClipCodes
srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags
andi t2, t2, 0x7 // Isolate lo clip flags
or t2, t1 // Merge clip flags (compressed to 6 bits)
jr ra jr ra
sb t2, SCREEN_VTX_CLIP_CODE(dst) sb t2, SCREEN_VTX_CLIP_CODE(dst)
@ -273,39 +270,100 @@ GL_CalcClipCodes:
# GL_TnL # GL_TnL
# #
# Args: # Args:
# s2 = address of the first vertex in DMEM (usually within VERTEX_CACHE)
# s3 = address of the second vertex in DMEM (usually within VERTEX_CACHE)
# #
################################################################ ################################################################
.func GL_TnL
GL_TnL:
    // Processes two vertices per call. The first vertex occupies vector lanes
    // e0-e3 and the second one lanes e4-e7, so each vector op handles both.
    //
    //   In:    s2 = first vertex, s3 = second vertex (addresses in DMEM)
    //          vviewscale, vviewoff and vguardscale must already be loaded in
    //          both lane halves (GPUCmd_DrawQuad does this before calling)
    //   Out:   per vertex: SCREEN_VTX_X (8 bytes), SCREEN_VTX_W, SCREEN_VTX_INVW,
    //          SCREEN_VTX_PADDING (zeroed) and SCREEN_VTX_CLIP_CODE are written
    //   Clobb: t0, t1, t2, $v02, $v03, $v23-$v29, vector compare flags
    //
    // NOTE(review): the instruction order looks hand-scheduled - keep it as is
    // unless the change is re-measured.
    #define vtx1 s2
    #define vtx2 s3
    #define w e3
    #define W e7

    #define v___        $v29
    #define vcspos_f    $v02
    #define vcspos_i    $v03

    #define vinvw_f      $v23
    #define vinvw_i      $v24
    #define vguard_f     $v25
    #define vguard_i     $v26
    #define vscreenpos_i $v27
    #define vscreenpos_f $v28

    // Clip space position: first vertex into lanes e0-e3, second into e4-e7
    ldv vcspos_i.e0, SCREEN_VTX_CS_POSi,vtx1
    ldv vcspos_i.e4, SCREEN_VTX_CS_POSi,vtx2
    ldv vcspos_f.e0, SCREEN_VTX_CS_POSf,vtx1
    ldv vcspos_f.e4, SCREEN_VTX_CS_POSf,vtx2

    # Calculate 32-bit inverse W (first vertex, lane e3)
    vrcph vinvw_i.w, vcspos_i.w
    vrcpl vinvw_f.w, vcspos_f.w
    vrcph vinvw_i.w, vzero.e0

    // Scale the clip space position by the guard factors (input of the clip test)
    vmudn vguard_f, vcspos_f, vguardscale
    vmadh vguard_i, vcspos_i, vguardscale

    # Calculate 32-bit inverse W (second vertex, lane e7)
    vrcph vinvw_i.W, vcspos_i.W
    vrcpl vinvw_f.W, vcspos_f.W
    vrcph vinvw_i.W, vzero.e0

    // Multiply each position by the inverse W of its own vertex
    vmudl v___,         vcspos_f, vinvw_f.wwwwWWWW
    vmadm v___,         vcspos_i, vinvw_f.wwwwWWWW
    vmadn vscreenpos_f, vcspos_f, vinvw_i.wwwwWWWW
    vmadh vscreenpos_i, vcspos_i, vinvw_i.wwwwWWWW

    // Compare the guard-scaled position against the guard-scaled W of the
    // same vertex; the result is read back from VCC further below
    vch v___, vguard_i, vguard_i.wwwwWWWW
    vcl v___, vguard_f, vguard_f.wwwwWWWW

    // Apply viewport scale and offset
    vmudn vscreenpos_f, vscreenpos_f, vviewscale
    vmadh vscreenpos_i, vscreenpos_i, vviewscale
    vadd vscreenpos_i, vviewoff

    // Write back the results for the first vertex
    sdv vscreenpos_i.e0, SCREEN_VTX_X     ,vtx1
    ssv vcspos_i.w,      SCREEN_VTX_W+0   ,vtx1
    ssv vcspos_f.w,      SCREEN_VTX_W+2   ,vtx1
    ssv vinvw_i.w,       SCREEN_VTX_INVW+0,vtx1
    ssv vinvw_f.w,       SCREEN_VTX_INVW+2,vtx1
    sb zero,             SCREEN_VTX_PADDING(vtx1)

    // Write back the results for the second vertex
    sdv vscreenpos_i.e4, SCREEN_VTX_X     ,vtx2
    ssv vcspos_i.W,      SCREEN_VTX_W+0   ,vtx2
    ssv vcspos_f.W,      SCREEN_VTX_W+2   ,vtx2
    ssv vinvw_i.W,       SCREEN_VTX_INVW+0,vtx2
    ssv vinvw_f.W,       SCREEN_VTX_INVW+2,vtx2
    sb zero,             SCREEN_VTX_PADDING(vtx2)

    // Clip codes: compressClipCodes reads t0 and leaves the code in t2
    cfc2 t0, COP2_CTRL_VCC
    compressClipCodes
    sb t2, SCREEN_VTX_CLIP_CODE(vtx1)
    srl t0, t0, 4           // Bring the flags of lanes e4-e7 to where the macro expects them
    compressClipCodes
    jr ra
    sb t2, SCREEN_VTX_CLIP_CODE(vtx2)  // (branch delay slot)

    #undef vinvw_f
    #undef vinvw_i
    #undef vscreenpos_i
    #undef vscreenpos_f
    #undef vguard_i
    #undef vguard_f

    #undef vcspos_f
    #undef vcspos_i
    #undef vtx1
    #undef vtx2
    #undef v___
    #undef w
    #undef W                // was missing: W leaked into the code after this function
.endfunc
.align 3 .align 3
.func GPUCmd_DrawQuad .func GPUCmd_DrawQuad
GPUCmd_DrawQuad: GPUCmd_DrawQuad:
#define vtx_ptr a0 #define vtx_ptr a0
#define mtx_ptr s0 #define mtx_ptr s0
@ -427,9 +485,9 @@ GPUCmd_DrawQuad:
vmadl vcspos_f, vcspos_f, K2048 vmadl vcspos_f, vcspos_f, K2048
// Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active) // Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
vmudn vst_f, vtex, vtexsize # ACC = vtex * vtexsize, VST_F = ACC & 0xFFFF vmudn vst_f, vtex, vtexsize // ACC = vtex * vtexsize, VST_F = ACC & 0xFFFF
#vmadn vst_f,vtexoffset, K1 #vmadn vst_f,vtexoffset, K1
vmadh vst_i, vzero, vzero # ACC += zero * zero, VST_I = ACC >> 16 vmadh vst_i, vzero, vzero // ACC += zero * zero, VST_I = ACC >> 16
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr
@ -490,12 +548,22 @@ GPUCmd_DrawQuad:
#undef v___ #undef v___
#undef vtex #undef vtex
jal GL_TnL // Load viewport factors
li s3, %lo(VERTEX_CACHE) + V0_OFFSET li t0, %lo(GL_VIEWPORT_SCALE)
ldv vviewscale.e0, 0, t0
ldv vviewoff.e0, 8, t0
ldv vviewscale.e4, 0, t0
ldv vviewoff.e4, 8, t0
li t0, %lo(CLIP_CODE_FACTORS)
ldv vguardscale.e0, 0, t0
ldv vguardscale.e4, 0, t0
li s2, %lo(VERTEX_CACHE) + V0_OFFSET
jal GL_TnL jal GL_TnL
li s3, %lo(VERTEX_CACHE) + V1_OFFSET li s3, %lo(VERTEX_CACHE) + V1_OFFSET
jal GL_TnL
li s3, %lo(VERTEX_CACHE) + V2_OFFSET li s2, %lo(VERTEX_CACHE) + V2_OFFSET
jal GL_TnL jal GL_TnL
li s3, %lo(VERTEX_CACHE) + V3_OFFSET li s3, %lo(VERTEX_CACHE) + V3_OFFSET

View File

@ -78,6 +78,10 @@ GL_ClipTriangle:
#define voff1 $v15 #define voff1 $v15
#define vcache0 $v16 #define vcache0 $v16
#define vcache1 $v17 #define vcache1 $v17
// v18,v19 - reserved for viewport scale/offset (vviewscale, vviewoff)
// v20,v21 - reserved for vguard (vguardscale is v20)
#define vguard_f $v27
#define vguard_i $v28
#define v__ $v29 #define v__ $v29
move ra2, ra move ra2, ra
@ -283,9 +287,19 @@ gl_clip_no_swap:
sdv vint_i.e0, SCREEN_VTX_CS_POSi,intersection sdv vint_i.e0, SCREEN_VTX_CS_POSi,intersection
sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection
suv vattr0.e0, SCREEN_VTX_RGBA ,intersection suv vattr0.e0, SCREEN_VTX_RGBA ,intersection
jal GL_CalcClipCodes
slv vattr0.e4, SCREEN_VTX_S_T ,intersection slv vattr0.e4, SCREEN_VTX_S_T ,intersection
# Update clip flags
vmudn vguard_f, vint_f, vguardscale // vint_f is vcspos_f
vmadh vguard_i, vint_i, vguardscale // vint_i is vcspos_i
vch v__, vguard_i, vguard_i.e3 // w
vcl v__, vguard_f, vguard_f.e3 // w
cfc2 t0, COP2_CTRL_VCC
compressClipCodes
sb t2, SCREEN_VTX_CLIP_CODE(intersection)
# Add intersection to the output list # Add intersection to the output list
add t0, out_list, out_count add t0, out_list, out_count
sh intersection, 0(t0) sh intersection, 0(t0)
@ -379,5 +393,7 @@ gl_clip_return:
#undef vattr0 #undef vattr0
#undef vattr1 #undef vattr1
#undef v__ #undef v__
#undef vguard_i
#undef vguard_f
.endfunc .endfunc