mirror of
https://github.com/ClassiCube/ClassiCube.git
synced 2025-08-03 10:47:39 -04:00
N64: Optimise RSP transform code, reduces time from ~13.6 to ~11.1 ms on a moderately complex world with ~3,500 quads
This commit is contained in:
parent
19a4cb61ea
commit
9f93969211
@ -11,6 +11,8 @@
|
||||
#define zzzzZZZZ h2
|
||||
#define wwwwWWWW h3
|
||||
|
||||
#define XYZ_CLIP_FLAGS 0x707 // Isolate -X/Y/Z and +X/Y/Z clipping flags
|
||||
|
||||
|
||||
#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit)
|
||||
#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit)
|
||||
@ -25,14 +27,10 @@
|
||||
#define SCREEN_VTX_INVW 36 // 32-bit
|
||||
#define SCREEN_VTX_SIZE 40
|
||||
|
||||
//0-39 same as screenvtx
|
||||
#define PRIM_VTX_TRCODE 40 // trivial-reject clipping flags (against -w/+w)
|
||||
#define PRIM_VTX_SIZE 48
|
||||
|
||||
#define V0_OFFSET 0 * PRIM_VTX_SIZE
|
||||
#define V1_OFFSET 1 * PRIM_VTX_SIZE
|
||||
#define V2_OFFSET 2 * PRIM_VTX_SIZE
|
||||
#define V3_OFFSET 3 * PRIM_VTX_SIZE
|
||||
#define V0_OFFSET 0 * SCREEN_VTX_SIZE
|
||||
#define V1_OFFSET 1 * SCREEN_VTX_SIZE
|
||||
#define V2_OFFSET 2 * SCREEN_VTX_SIZE
|
||||
#define V3_OFFSET 3 * SCREEN_VTX_SIZE
|
||||
|
||||
.data
|
||||
|
||||
@ -73,7 +71,7 @@ DRAW_TRI_RA: .word 0
|
||||
.bss
|
||||
.align 3
|
||||
|
||||
VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * 4
|
||||
VERTEX_CACHE: .dcb.b SCREEN_VTX_SIZE * 4
|
||||
|
||||
.text
|
||||
|
||||
@ -164,193 +162,6 @@ GPUCmd_MatrixLoad:
|
||||
#undef dst
|
||||
.endfunc
|
||||
|
||||
.align 3
|
||||
.func GPUCmd_DrawQuad
|
||||
GPUCmd_DrawQuad:
|
||||
#define vtx a0
|
||||
#define mtx_ptr s0
|
||||
#define src_ptr s4
|
||||
|
||||
#define v___ $v01
|
||||
|
||||
#define vmtx0_i $v16 // m00 m01 m02 m03
|
||||
#define vmtx0_f $v17
|
||||
#define vmtx1_i $v18 // m10 m11 m12 m13
|
||||
#define vmtx1_f $v19
|
||||
#define vmtx2_i $v20 // m20 m21 m22 m23
|
||||
#define vmtx2_f $v21
|
||||
#define vmtx3_i $v22 // m30 m31 m32 m33
|
||||
#define vmtx3_f $v23
|
||||
|
||||
#define vpos $v24
|
||||
#define vcol $v25
|
||||
#define vtex $v26
|
||||
#define vcspos_i $v28
|
||||
#define vcspos_f $v29
|
||||
|
||||
#define x e0
|
||||
#define y e1
|
||||
#define z e2
|
||||
#define w e3
|
||||
|
||||
addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
|
||||
li vtx, %lo(VERTEX_CACHE)
|
||||
|
||||
li mtx_ptr, %lo(GPU_MATRIX_MVP)
|
||||
lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I]
|
||||
lqv vmtx1_i, 0x10,mtx_ptr // etc
|
||||
lqv vmtx2_i, 0x20,mtx_ptr
|
||||
lqv vmtx3_i, 0x30,mtx_ptr
|
||||
lqv vmtx0_f, 0x40,mtx_ptr
|
||||
lqv vmtx1_f, 0x50,mtx_ptr
|
||||
lqv vmtx2_f, 0x60,mtx_ptr
|
||||
lqv vmtx3_f, 0x70,mtx_ptr
|
||||
|
||||
### VERTEX 0
|
||||
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
|
||||
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
|
||||
|
||||
# matrix multiply
|
||||
vmudn v___, vmtx0_f, vpos.xxxxXXXX
|
||||
vmadh v___, vmtx0_i, vpos.xxxxXXXX
|
||||
vmadn v___, vmtx1_f, vpos.yyyyYYYY
|
||||
vmadh v___, vmtx1_i, vpos.yyyyYYYY
|
||||
vmadn v___, vmtx2_f, vpos.zzzzZZZZ
|
||||
vmadh v___, vmtx2_i, vpos.zzzzZZZZ
|
||||
vmadn v___, vmtx3_f, ONE_W
|
||||
vmadh vcspos_i, vmtx3_i, ONE_W
|
||||
vmadn vcspos_f, vzero, vzero
|
||||
|
||||
llv vcol.e0, 8, src_ptr // Load v0 RGBA
|
||||
llv vtex.e0, 12, src_ptr // Load v0 U, V
|
||||
llv vcol.e2, 24, src_ptr // Load v1 RGBA
|
||||
llv vtex.e2, 28, src_ptr // Load v1 U, V
|
||||
|
||||
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
vmudm vcspos_i, vcspos_i, K2048
|
||||
vmadl vcspos_f, vcspos_f, K2048
|
||||
|
||||
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
|
||||
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx
|
||||
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
|
||||
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx
|
||||
|
||||
# Calculate and store clipping flags against CS.W.
|
||||
# These will be used for trivial rejections.
|
||||
vch v___, vcspos_i, vcspos_i.wwwwWWWW
|
||||
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
|
||||
|
||||
cfc2 t0, COP2_CTRL_VCC
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
|
||||
|
||||
###################### VERTEX 2
|
||||
ldv vpos.e0, 32, src_ptr // Load v2 X, Y, Z
|
||||
ldv vpos.e4, 48, src_ptr // Load v3 X, Y, Z
|
||||
andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
|
||||
srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags
|
||||
andi t2, t2, 0x7 // Isolate lo clip flags
|
||||
or t2, t1 // Merge clip flags (compressed to 6 bits)
|
||||
|
||||
# matrix multiply
|
||||
vmudn v___, vmtx0_f, vpos.xxxxXXXX
|
||||
vmadh v___, vmtx0_i, vpos.xxxxXXXX
|
||||
vmadn v___, vmtx1_f, vpos.yyyyYYYY
|
||||
sb t2, (PRIM_VTX_TRCODE + V0_OFFSET)(vtx)
|
||||
vmadh v___, vmtx1_i, vpos.yyyyYYYY
|
||||
srl t0, t0, 4
|
||||
vmadn v___, vmtx2_f, vpos.zzzzZZZZ
|
||||
andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
|
||||
vmadh v___, vmtx2_i, vpos.zzzzZZZZ
|
||||
srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags
|
||||
vmadn v___, vmtx3_f, ONE_W
|
||||
andi t2, t2, 0x7 // Isolate lo clip flags
|
||||
vmadh vcspos_i, vmtx3_i, ONE_W
|
||||
or t2, t1 // Merge clip flags (compressed to 6 bits)
|
||||
vmadn vcspos_f, vzero, vzero
|
||||
sb t2, (PRIM_VTX_TRCODE + V1_OFFSET)(vtx)
|
||||
|
||||
llv vcol.e4, 40, src_ptr # Load v2 RGBA
|
||||
llv vtex.e4, 44, src_ptr # Load v2 U, V
|
||||
llv vcol.e6, 56, src_ptr # Load v3 RGBA
|
||||
llv vtex.e6, 60, src_ptr # Load v3 U, V
|
||||
|
||||
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
vmudm vcspos_i, vcspos_i, K2048
|
||||
vmadl vcspos_f, vcspos_f, K2048
|
||||
|
||||
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
|
||||
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx
|
||||
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
|
||||
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx
|
||||
|
||||
# Calculate and store clipping flags against CS.W.
|
||||
# These will be used for trivial rejections.
|
||||
vch v___, vcspos_i, vcspos_i.wwwwWWWW
|
||||
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
|
||||
|
||||
cfc2 t0, COP2_CTRL_VCC
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
|
||||
|
||||
andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
|
||||
srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags
|
||||
andi t2, t2, 0x7 // Isolate lo clip flags
|
||||
or t2, t1 // Merge clip flags (compressed to 6 bits)
|
||||
sb t2, (PRIM_VTX_TRCODE + V2_OFFSET)(vtx)
|
||||
|
||||
###################### VERTEX 3
|
||||
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
|
||||
|
||||
srl t0, t0, 4
|
||||
andi t2, t0, 0x707 // Isolate X/Y/Z clipping flags
|
||||
srl t1, t2, 5 // Shift hi flags to be aligned next to lo flags
|
||||
andi t2, t2, 0x7 // Isolate lo clip flags
|
||||
or t2, t1 // Merge clip flags (compressed to 6 bits)
|
||||
sb t2, (PRIM_VTX_TRCODE + V3_OFFSET)(vtx)
|
||||
|
||||
# now do the actual drawing
|
||||
li a1, %lo(VERTEX_CACHE) + V0_OFFSET
|
||||
li a2, %lo(VERTEX_CACHE) + V1_OFFSET
|
||||
jal GPUCmd_DrawTriangle
|
||||
li a3, %lo(VERTEX_CACHE) + V2_OFFSET
|
||||
|
||||
li a1, %lo(VERTEX_CACHE) + V0_OFFSET
|
||||
li a2, %lo(VERTEX_CACHE) + V2_OFFSET
|
||||
jal GPUCmd_DrawTriangle
|
||||
li a3, %lo(VERTEX_CACHE) + V3_OFFSET
|
||||
|
||||
j RSPQ_Loop
|
||||
nop
|
||||
#undef src_ptr
|
||||
#undef vtx
|
||||
|
||||
#undef x
|
||||
#undef y
|
||||
#undef z
|
||||
#undef w
|
||||
|
||||
#undef v___
|
||||
|
||||
#undef vmtx0_i
|
||||
#undef vmtx0_f
|
||||
#undef vmtx1_i
|
||||
#undef vmtx1_f
|
||||
#undef vmtx2_i
|
||||
#undef vmtx2_f
|
||||
#undef vmtx3_i
|
||||
#undef vmtx3_f
|
||||
|
||||
#undef vpos
|
||||
#undef vcspos_i
|
||||
#undef vcspos_f
|
||||
|
||||
.endfunc
|
||||
|
||||
################################################################
|
||||
# GL_CalcScreenSpace
|
||||
#
|
||||
@ -506,17 +317,11 @@ GL_TnL:
|
||||
#undef vst_i
|
||||
#undef vst_f
|
||||
|
||||
lbu t0, PRIM_VTX_TRCODE(vtx)
|
||||
slv vst, SCREEN_VTX_S_T, vtx
|
||||
|
||||
ldv vcspos_f, SCREEN_VTX_CS_POSf,vtx
|
||||
ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx
|
||||
|
||||
# Mark this vertex as having T&L applied
|
||||
ori t0, 0x80
|
||||
|
||||
jal GL_CalcScreenSpace
|
||||
sb t0, PRIM_VTX_TRCODE(vtx)
|
||||
ldv vcspos_i, SCREEN_VTX_CS_POSi,vtx
|
||||
|
||||
j GL_CalcClipCodes
|
||||
move ra, ra2
|
||||
@ -536,52 +341,237 @@ GL_TnL:
|
||||
.endfunc
|
||||
|
||||
|
||||
.func GPUCmd_DrawTriangle
|
||||
GPUCmd_DrawTriangle:
|
||||
#define vtx1 a1
|
||||
#define vtx2 a2
|
||||
#define vtx3 a3
|
||||
#define trcode1 t6
|
||||
#define trcode2 t7
|
||||
#define trcode3 t8
|
||||
sw ra, %lo(DRAW_TRI_RA) # TODO find a register for this
|
||||
.align 3
|
||||
.func GPUCmd_DrawQuad
|
||||
GPUCmd_DrawQuad:
|
||||
#define vtx a0
|
||||
#define mtx_ptr s0
|
||||
#define src_ptr s4
|
||||
|
||||
# Trivial reject: if all the vertices are out of the same plane (at least one),
|
||||
# the triangle is out of the viewport.
|
||||
# NOTE: This deliberately uses lb instead of lbu so the sign bit is extended.
|
||||
# The MSB of each TR-code is a bit flag that is set if the vertex has already
|
||||
# had T&L applied once.
|
||||
lb trcode1, PRIM_VTX_TRCODE(vtx1)
|
||||
lb trcode2, PRIM_VTX_TRCODE(vtx2)
|
||||
lb trcode3, PRIM_VTX_TRCODE(vtx3)
|
||||
and t0, trcode1, trcode2
|
||||
and t0, trcode3
|
||||
andi t0, 0x3F
|
||||
bnez t0, JrRa
|
||||
#define v___ $v01
|
||||
|
||||
#define vmtx0_i $v16 // m00 m01 m02 m03
|
||||
#define vmtx0_f $v17
|
||||
#define vmtx1_i $v18 // m10 m11 m12 m13
|
||||
#define vmtx1_f $v19
|
||||
#define vmtx2_i $v20 // m20 m21 m22 m23
|
||||
#define vmtx2_f $v21
|
||||
#define vmtx3_i $v22 // m30 m31 m32 m33
|
||||
#define vmtx3_f $v23
|
||||
|
||||
#define vpos $v24
|
||||
#define vcol $v25
|
||||
#define vtex $v26
|
||||
#define vcspos_i $v28
|
||||
#define vcspos_f $v29
|
||||
|
||||
#define tmp t0
|
||||
#define v0_cflags t1
|
||||
#define v1_cflags t2
|
||||
#define v2_cflags t3
|
||||
#define v3_cflags t4
|
||||
|
||||
addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
|
||||
li vtx, %lo(VERTEX_CACHE)
|
||||
|
||||
li mtx_ptr, %lo(GPU_MATRIX_MVP)
|
||||
lqv vmtx0_i, 0x00,mtx_ptr // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I]
|
||||
lqv vmtx1_i, 0x10,mtx_ptr // etc
|
||||
lqv vmtx2_i, 0x20,mtx_ptr
|
||||
lqv vmtx3_i, 0x30,mtx_ptr
|
||||
lqv vmtx0_f, 0x40,mtx_ptr
|
||||
lqv vmtx1_f, 0x50,mtx_ptr
|
||||
lqv vmtx2_f, 0x60,mtx_ptr
|
||||
lqv vmtx3_f, 0x70,mtx_ptr
|
||||
|
||||
// ########################
|
||||
// Vertex 0 and 1 transform
|
||||
// ########################
|
||||
ldv vpos.e0, 0, src_ptr // Load v0 X, Y, Z
|
||||
ldv vpos.e4, 16, src_ptr // Load v1 X, Y, Z
|
||||
|
||||
// matrix multiply
|
||||
vmudn v___, vmtx0_f, vpos.xxxxXXXX
|
||||
vmadh v___, vmtx0_i, vpos.xxxxXXXX
|
||||
vmadn v___, vmtx1_f, vpos.yyyyYYYY
|
||||
vmadh v___, vmtx1_i, vpos.yyyyYYYY
|
||||
vmadn v___, vmtx2_f, vpos.zzzzZZZZ
|
||||
vmadh v___, vmtx2_i, vpos.zzzzZZZZ
|
||||
vmadn v___, vmtx3_f, ONE_W
|
||||
vmadh vcspos_i, vmtx3_i, ONE_W
|
||||
vmadn vcspos_f, vzero, vzero
|
||||
|
||||
llv vcol.e0, 8, src_ptr // Load v0 RGBA
|
||||
llv vtex.e0, 12, src_ptr // Load v0 U, V
|
||||
llv vcol.e2, 24, src_ptr // Load v1 RGBA
|
||||
llv vtex.e2, 28, src_ptr // Load v1 U, V
|
||||
|
||||
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
vmudm vcspos_i, vcspos_i, K2048
|
||||
vmadl vcspos_f, vcspos_f, K2048
|
||||
|
||||
slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx
|
||||
slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx
|
||||
slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx
|
||||
slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx
|
||||
|
||||
# Calculate clipping flags against CS.W (read back from VCC into a scalar register below).
|
||||
# These will be used for trivial rejections.
|
||||
vch v___, vcspos_i, vcspos_i.wwwwWWWW
|
||||
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
|
||||
|
||||
cfc2 tmp, COP2_CTRL_VCC
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx
|
||||
|
||||
// ########################
|
||||
// Vertex 2 and 3 transform
|
||||
// ########################
|
||||
ldv vpos.e0, 32, src_ptr // Load v2 X, Y, Z
|
||||
ldv vpos.e4, 48, src_ptr // Load v3 X, Y, Z
|
||||
|
||||
andi v0_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
srl tmp, tmp, 4
|
||||
andi v1_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
|
||||
# matrix multiply
|
||||
vmudn v___, vmtx0_f, vpos.xxxxXXXX
|
||||
vmadh v___, vmtx0_i, vpos.xxxxXXXX
|
||||
vmadn v___, vmtx1_f, vpos.yyyyYYYY
|
||||
vmadh v___, vmtx1_i, vpos.yyyyYYYY
|
||||
vmadn v___, vmtx2_f, vpos.zzzzZZZZ
|
||||
vmadh v___, vmtx2_i, vpos.zzzzZZZZ
|
||||
vmadn v___, vmtx3_f, ONE_W
|
||||
vmadh vcspos_i, vmtx3_i, ONE_W
|
||||
vmadn vcspos_f, vzero, vzero
|
||||
|
||||
llv vcol.e4, 40, src_ptr # Load v2 RGBA
|
||||
llv vtex.e4, 44, src_ptr # Load v2 U, V
|
||||
llv vcol.e6, 56, src_ptr # Load v3 RGBA
|
||||
llv vtex.e6, 60, src_ptr # Load v3 U, V
|
||||
|
||||
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
|
||||
vmudm vcspos_i, vcspos_i, K2048
|
||||
vmadl vcspos_f, vcspos_f, K2048
|
||||
|
||||
slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx
|
||||
slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx
|
||||
slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx
|
||||
slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx
|
||||
|
||||
# Calculate clipping flags against CS.W (read back from VCC into a scalar register below).
|
||||
# These will be used for trivial rejections.
|
||||
vch v___, vcspos_i, vcspos_i.wwwwWWWW
|
||||
vcl v___, vcspos_f, vcspos_f.wwwwWWWW
|
||||
|
||||
cfc2 tmp, COP2_CTRL_VCC
|
||||
sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx
|
||||
sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx
|
||||
sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx
|
||||
sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx
|
||||
|
||||
andi v2_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
srl tmp, tmp, 4
|
||||
andi v3_cflags, tmp, XYZ_CLIP_FLAGS
|
||||
#undef src_ptr
|
||||
#undef vtx
|
||||
#undef v___
|
||||
|
||||
#undef vmtx0_i
|
||||
#undef vmtx0_f
|
||||
#undef vmtx1_i
|
||||
#undef vmtx1_f
|
||||
#undef vmtx2_i
|
||||
#undef vmtx2_f
|
||||
#undef vmtx3_i
|
||||
#undef vmtx3_f
|
||||
|
||||
#undef vpos
|
||||
#undef vcspos_i
|
||||
#undef vcspos_f
|
||||
|
||||
#define vtx1 a1
|
||||
#define vtx2 a2
|
||||
#define vtx3 a3
|
||||
#define vtx4 a0
|
||||
|
||||
// ########################
|
||||
// Trivial rejection check
|
||||
// ########################
|
||||
// If for any plane, all 4 vertices are outside the plane,
|
||||
// then the quad is out of the viewport and can be trivially rejected
|
||||
and tmp, v0_cflags, v1_cflags
|
||||
and tmp, v2_cflags
|
||||
and tmp, v3_cflags
|
||||
bnez tmp, JrRa
|
||||
nop
|
||||
|
||||
# Perform T&L for each vertex if we haven't already
|
||||
bgezal trcode1, GL_TnL
|
||||
move s3, vtx1
|
||||
// ########################
|
||||
// Perform rest of T&L
|
||||
// ########################
|
||||
jal GL_TnL
|
||||
li s3, %lo(VERTEX_CACHE) + V0_OFFSET
|
||||
jal GL_TnL
|
||||
li s3, %lo(VERTEX_CACHE) + V1_OFFSET
|
||||
jal GL_TnL
|
||||
li s3, %lo(VERTEX_CACHE) + V2_OFFSET
|
||||
jal GL_TnL
|
||||
li s3, %lo(VERTEX_CACHE) + V3_OFFSET
|
||||
|
||||
bgezal trcode2, GL_TnL
|
||||
move s3, vtx2
|
||||
// ########################
|
||||
// Guardband check
|
||||
// ########################
|
||||
// Check if all vertices fit within guardband
|
||||
lbu v0_cflags, (%lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
|
||||
lbu v1_cflags, (%lo(VERTEX_CACHE) + V1_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
|
||||
lbu v2_cflags, (%lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
|
||||
lbu v3_cflags, (%lo(VERTEX_CACHE) + V3_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
|
||||
|
||||
bgezal trcode3, GL_TnL
|
||||
move s3, vtx3
|
||||
or tmp, v0_cflags, v1_cflags
|
||||
or tmp, v2_cflags
|
||||
or tmp, v3_cflags
|
||||
|
||||
lbu t0, SCREEN_VTX_CLIP_CODE(vtx1)
|
||||
lbu t1, SCREEN_VTX_CLIP_CODE(vtx2)
|
||||
lbu t2, SCREEN_VTX_CLIP_CODE(vtx3)
|
||||
or t5, t0, t1
|
||||
or t5, t2
|
||||
beqz tmp, DrawQuadForRDP
|
||||
nop
|
||||
|
||||
#undef tmp
|
||||
#undef v0_cflags
|
||||
#undef v1_cflags
|
||||
#undef v2_cflags
|
||||
#undef v3_cflags
|
||||
|
||||
// ########################
|
||||
// Clipped triangle path
|
||||
// ########################
|
||||
// If not, go with slow clipping path
|
||||
|
||||
# now do the actual drawing
|
||||
li a1, %lo(VERTEX_CACHE) + V0_OFFSET
|
||||
li a2, %lo(VERTEX_CACHE) + V1_OFFSET
|
||||
jal DrawClippedTriangle
|
||||
li a3, %lo(VERTEX_CACHE) + V2_OFFSET
|
||||
|
||||
li a1, %lo(VERTEX_CACHE) + V0_OFFSET
|
||||
li a2, %lo(VERTEX_CACHE) + V2_OFFSET
|
||||
jal DrawClippedTriangle
|
||||
li a3, %lo(VERTEX_CACHE) + V3_OFFSET
|
||||
|
||||
j RSPQ_Loop
|
||||
nop
|
||||
.endfunc
|
||||
|
||||
################################################################
|
||||
# DrawClippedTriangle - Breaks a triangle into one or more clipped tris
|
||||
################################################################
|
||||
.func DrawClippedTriangle
|
||||
DrawClippedTriangle:
|
||||
sw ra, %lo(DRAW_TRI_RA) // TODO find a register for this
|
||||
|
||||
move s1, zero
|
||||
beqz t5, gl_draw_single_triangle
|
||||
move s2, zero
|
||||
|
||||
jal GL_ClipTriangle
|
||||
nop
|
||||
move s2, zero
|
||||
|
||||
beqz v1, gl_draw_triangle_end
|
||||
addi s2, -6
|
||||
@ -611,11 +601,42 @@ gl_draw_triangle_end:
|
||||
lw ra, %lo(DRAW_TRI_RA)
|
||||
jr ra
|
||||
nop
|
||||
.endfunc
|
||||
|
||||
################################################################
|
||||
# DrawQuadForRDP - Draws two triangles for a quad
|
||||
################################################################
|
||||
.func DrawQuadForRDP
|
||||
DrawQuadForRDP:
|
||||
// Emits the quad held in VERTEX_CACHE as two triangles: (V0, V1, V2) and then
// (V2, V3, V0), which is a rotation of (V0, V2, V3) and so has the same winding.
// This label is reached with a branch rather than a call, and it finishes by
// jumping to RSPQ_Loop instead of returning through ra.
// NOTE(review): a0, v0, a1-a3, s3 and s4 appear to be the argument registers of
// RDPQ_Triangle / RDPQ_Send; both routines live outside this view - confirm there.

// ---- Triangle 1: V0, V1, V2 ----
lhu a0, %lo(GL_TRI_CMD) // a0 = GL_TRI_CMD halfword, zero-extended
|
||||
lh v0, %lo(GL_TRI_CULL) // v0 = GL_TRI_CULL halfword, sign-extended
|
||||
li a1, %lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_X // a1 -> V0 at its SCREEN_VTX_X field
|
||||
li a2, %lo(VERTEX_CACHE) + V1_OFFSET + SCREEN_VTX_X // a2 -> V1 at its SCREEN_VTX_X field
|
||||
li a3, %lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_X // a3 -> V2 at its SCREEN_VTX_X field
|
||||
jal RDPQ_Triangle
|
||||
li s3, %lo(RDPQ_CMD_STAGING) // jal delay slot: s3 = staging buffer address
|
||||
|
||||
jal RDPQ_Send
|
||||
li s4, %lo(RDPQ_CMD_STAGING) // jal delay slot: s4 = staging buffer address
|
||||
|
||||
// ---- Triangle 2: V2, V3, V0 ----
// a0 and v0 are loaded again from memory rather than assumed to survive the calls above.
lhu a0, %lo(GL_TRI_CMD)
|
||||
lh v0, %lo(GL_TRI_CULL)
|
||||
li a1, %lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_X // a1 -> V2 at its SCREEN_VTX_X field
|
||||
li a2, %lo(VERTEX_CACHE) + V3_OFFSET + SCREEN_VTX_X // a2 -> V3 at its SCREEN_VTX_X field
|
||||
li a3, %lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_X // a3 -> V0 at its SCREEN_VTX_X field
|
||||
jal RDPQ_Triangle
|
||||
li s3, %lo(RDPQ_CMD_STAGING) // jal delay slot: s3 = staging buffer address
|
||||
|
||||
jal RDPQ_Send
|
||||
li s4, %lo(RDPQ_CMD_STAGING) // jal delay slot: s4 = staging buffer address
|
||||
|
||||
// Quad finished: go back to the command loop.
j RSPQ_Loop
|
||||
nop // branch delay slot, intentionally empty
|
||||
.endfunc
|
||||
|
||||
#undef vtx1
|
||||
#undef vtx2
|
||||
#undef vtx3
|
||||
.endfunc
|
||||
|
||||
#include "rsp_gpu_clipping.inc"
|
||||
#include <rsp_rdpq.inc>
|
||||
|
Loading…
x
Reference in New Issue
Block a user