N64: Use optimised async triangle drawing (down to 7.8 ms on RSP)

This commit is contained in:
UnknownShadow200 2025-07-20 17:09:11 +10:00
parent d547f6e0a5
commit 6658154f2d

View File

@ -32,6 +32,8 @@
#define V2_OFFSET 2 * SCREEN_VTX_SIZE
#define V3_OFFSET 3 * SCREEN_VTX_SIZE
#define MAX_TRI_CMD_SIZE 0xB0
.macro compressClipCodes
andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags
@ -76,7 +78,12 @@ CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
.bss
.align 3
VERTEX_CACHE: .dcb.b SCREEN_VTX_SIZE * 4
VERTEX_CACHE: .ds.b SCREEN_VTX_SIZE * 4
.align 4
// Enough for all 10 triangle commands in the worst-case clipped quad scenario
TRI_CMD_BUFFER: .ds.b (MAX_TRI_CMD_SIZE * 10)
.text
@ -211,8 +218,10 @@ GL_CalcScreenSpace:
ssv vcspos_f.w, SCREEN_VTX_W+2 ,dst
ssv vinvw_i.w, SCREEN_VTX_INVW+0,dst
ssv vinvw_f.w, SCREEN_VTX_INVW+2,dst
li t0, 0x3F
jr ra
sb zero, SCREEN_VTX_PADDING(dst)
sb t0, SCREEN_VTX_PADDING(dst)
#undef dst
#undef vcspos_f
@ -226,46 +235,6 @@ GL_CalcScreenSpace:
.endfunc
################################################################
# GL_CalcClipCodes
#
# Args:
# s3 = Destination vertex address
# $v02 = Clip space position (fractional part)
# $v03 = Clip space position (integer part)
#
################################################################
.func GL_CalcClipCodes
GL_CalcClipCodes:
// Computes the clip code for one vertex and stores it as a single byte at
// SCREEN_VTX_CLIP_CODE(dst), then returns through ra.
//
// Scalar registers written: t0 (raw compare flags) and t2 (stored value);
// the visible part of compressClipCodes also writes t1.
// Vector registers written: $v27, $v28, $v29.
#define dst s3
#define vcspos_f $v02
#define vcspos_i $v03
#define vguard_f $v27
#define vguard_i $v28
#define v___ $v29
// `w` names vector element 3, so `reg.w` below selects that lane.
#define w e3
// Multiply the clip-space position by vguardscale: vmudn writes vguard_f
// from the fractional input, vmadh accumulates the integer input on top
// and writes vguard_i.
// NOTE(review): vguardscale is defined outside this view; presumably the
// guard-band scale factors (cf. CLIP_CODE_FACTORS) - confirm.
vmudn vguard_f, vcspos_f, vguardscale
vmadh vguard_i, vcspos_i, vguardscale
// Compare every lane against the w lane: vch on the integer part, vcl on
// the fractional part. The vector results land in v___ and are not read
// again in this routine; only the compare flags are consumed below.
vch v___, vguard_i, vguard_i.w
vcl v___, vguard_f, vguard_f.w
// t0 = contents of the VCC control register.
cfc2 t0, COP2_CTRL_VCC
// Macro defined earlier in the file: masks t0 with 0x707 into t2 and then
// shifts the high flags down. Only the start of the macro is visible in
// this view, so the final bit layout of t2 is not confirmed here.
compressClipCodes
jr ra
// Executes in the delay slot of the return above.
sb t2, SCREEN_VTX_CLIP_CODE(dst)
#undef dst
#undef vcspos_i
#undef vcspos_f
#undef vguard_i
#undef vguard_f
#undef v___
#undef w
.endfunc
################################################################
# GL_TnL
#
@ -321,20 +290,21 @@ GL_TnL:
vmudn vscreenpos_f, vscreenpos_f, vviewscale
vmadh vscreenpos_i, vscreenpos_i, vviewscale
vadd vscreenpos_i, vviewoff
li t0, 0x3F
sdv vscreenpos_i.e0, SCREEN_VTX_X ,vtx1
ssv vcspos_i.w, SCREEN_VTX_W+0 ,vtx1
ssv vcspos_f.w, SCREEN_VTX_W+2 ,vtx1
ssv vinvw_i.w, SCREEN_VTX_INVW+0,vtx1
ssv vinvw_f.w, SCREEN_VTX_INVW+2,vtx1
sb zero, SCREEN_VTX_PADDING(vtx1)
sb t0, SCREEN_VTX_PADDING(vtx1)
sdv vscreenpos_i.e4, SCREEN_VTX_X ,vtx2
ssv vcspos_i.W, SCREEN_VTX_W+0 ,vtx2
ssv vcspos_f.W, SCREEN_VTX_W+2 ,vtx2
ssv vinvw_i.W, SCREEN_VTX_INVW+0,vtx2
ssv vinvw_f.W, SCREEN_VTX_INVW+2,vtx2
sb zero, SCREEN_VTX_PADDING(vtx2)
sb t0, SCREEN_VTX_PADDING(vtx2)
cfc2 t0, COP2_CTRL_VCC
compressClipCodes
@ -608,6 +578,7 @@ GPUCmd_DrawQuad:
jal GL_ClipTriangle
move s2, zero
li s3, %lo(TRI_CMD_BUFFER)
beqz v1, gl_draw_triangle_end
addi s2, -6
lhu s5, 0(s1)
@ -622,17 +593,16 @@ gl_draw_single_triangle:
addi vtx3, SCREEN_VTX_X
lhu a0, %lo(GL_TRI_CMD)
jal RDPQ_Triangle_Send_Async
lh v0, %lo(GL_TRI_CULL)
jal RDPQ_Triangle
li s3, %lo(RDPQ_CMD_STAGING)
jal RDPQ_Send
li s4, %lo(RDPQ_CMD_STAGING)
blt s1, s2, gl_draw_clipped_triangles_loop
addi s1, 2
gl_draw_triangle_end:
jal RDPQ_Triangle_Send_End
nop
j RSPQ_Loop
nop
.endfunc
@ -642,28 +612,26 @@ gl_draw_triangle_end:
################################################################
.func DrawQuadForRDP
DrawQuadForRDP:
// Draws the quad held in VERTEX_CACHE slots 0..3 as two triangles,
// (V0, V1, V2) and then (V2, V3, V0), and finally jumps to RSPQ_Loop.
// For each triangle a1/a2/a3 receive the address of the SCREEN_VTX_X
// field of the three vertices, a0 is loaded from GL_TRI_CMD and v0 from
// GL_TRI_CULL.
// The instruction written directly after each jal / j executes in that
// jump's delay slot.
//
// NOTE(review): this listing appears to come from a rendered diff without
// +/- markers. It contains both a RDPQ_Triangle + RDPQ_Send sequence
// (staging through RDPQ_CMD_STAGING) and a RDPQ_Triangle_Send_Async /
// RDPQ_Triangle_Send_End sequence (using TRI_CMD_BUFFER), and a0/v0 are
// loaded more than once per triangle. Confirm against the real file which
// of these lines are live before relying on the order shown here.
// a0 = zero-extended halfword at GL_TRI_CMD.
lhu a0, %lo(GL_TRI_CMD)
// v0 = sign-extended halfword at GL_TRI_CULL.
lh v0, %lo(GL_TRI_CULL)
// s3 = address of TRI_CMD_BUFFER, the .bss area reserved for triangle commands.
li s3, %lo(TRI_CMD_BUFFER)
// First triangle: vertices 0, 1, 2 of the vertex cache.
li a1, %lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_X
li a2, %lo(VERTEX_CACHE) + V1_OFFSET + SCREEN_VTX_X
li a3, %lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_X
jal RDPQ_Triangle
li s3, %lo(RDPQ_CMD_STAGING)
jal RDPQ_Send
li s4, %lo(RDPQ_CMD_STAGING)
lhu a0, %lo(GL_TRI_CMD)
lh v0, %lo(GL_TRI_CULL)
jal RDPQ_Triangle_Send_Async
lhu a0, %lo(GL_TRI_CMD)
// Second triangle: vertices 2, 3, 0 of the vertex cache.
li a1, %lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_X
li a2, %lo(VERTEX_CACHE) + V3_OFFSET + SCREEN_VTX_X
li a3, %lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_X
jal RDPQ_Triangle
li s3, %lo(RDPQ_CMD_STAGING)
lh v0, %lo(GL_TRI_CULL)
jal RDPQ_Triangle_Send_Async
lhu a0, %lo(GL_TRI_CMD)
jal RDPQ_Send
li s4, %lo(RDPQ_CMD_STAGING)
jal RDPQ_Triangle_Send_End
nop
// Both labels alias the jump back to RSPQ_Loop below.
// NOTE(review): presumably these are the exits the triangle routine
// branches to when a triangle is culled or needs clipping - confirm
// against the routine that references them.
RDPQ_Triangle_Cull:
RDPQ_Triangle_Clip:
j RSPQ_Loop
nop
.endfunc