mirror of
https://github.com/ClassiCube/ClassiCube.git
synced 2025-08-03 18:57:27 -04:00
666 lines
18 KiB
ArmAsm
666 lines
18 KiB
ArmAsm
#include <rsp_queue.inc>
|
|
#include <rdpq_macros.h>
|
|
|
|
#define GUARD_BAND_FACTOR 2
|
|
|
|
// 1 << VTX_SHIFT, keep in sync with gpu.c
|
|
#define ONE_W K32
|
|
|
|
#define xxxxXXXX h0
|
|
#define yyyyYYYY h1
|
|
#define zzzzZZZZ h2
|
|
#define wwwwWWWW h3
|
|
|
|
#define XYZ_CLIP_FLAGS 0x707 // Isolate -X/Y/Z and +X/Y/Z clipping flags
|
|
|
|
|
|
#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit)
|
|
#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit)
|
|
#define SCREEN_VTX_X 16
|
|
#define SCREEN_VTX_Y 18
|
|
#define SCREEN_VTX_Z 20
|
|
#define SCREEN_VTX_CLIP_CODE 22
|
|
#define SCREEN_VTX_PADDING 23
|
|
#define SCREEN_VTX_RGBA 24
|
|
#define SCREEN_VTX_S_T 28 // 28 S, 30 T
|
|
#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS
|
|
#define SCREEN_VTX_INVW 36 // 32-bit
|
|
#define SCREEN_VTX_SIZE 40
|
|
|
|
#define V0_OFFSET 0 * SCREEN_VTX_SIZE
|
|
#define V1_OFFSET 1 * SCREEN_VTX_SIZE
|
|
#define V2_OFFSET 2 * SCREEN_VTX_SIZE
|
|
#define V3_OFFSET 3 * SCREEN_VTX_SIZE
|
|
|
|
#define MAX_TRI_CMD_SIZE 0xB0
|
|
|
|
// compressClipCodes
//
// Packs the clip flags of one vertex into a single 6-bit code.
//
// In:       t0 = flags word. Only the bits selected by XYZ_CLIP_FLAGS
//                (bits 0-2 and bits 8-10) are consumed.
// Out:      t2 = bits 0-2 of t0, OR'd with bits 8-10 of t0 moved down to bits 3-5
// Clobbers: t1
// t0 is left untouched, so a caller can shift it and invoke the macro again.
.macro compressClipCodes
    andi t2, t0, XYZ_CLIP_FLAGS   // Isolate X/Y/Z clipping flags (named mask instead of a repeated 0x707)
    srl  t1, t2, 5                // Shift hi flags (bits 8-10) down so they sit next to the lo flags (bits 3-5)
    andi t2, t2, 0x7              // Keep only the lo flags (bits 0-2) in t2
    or   t2, t1                   // Merge both groups: clip flags compressed to 6 bits
.endm
|
|
|
|
.data
|
|
|
|
RSPQ_BeginOverlayHeader
|
|
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x0
|
|
RSPQ_DefineCommand GPUCmd_SetTexWord, 8 # 0x1
|
|
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x2
|
|
|
|
RSPQ_DefineCommand GPUCmd_DrawQuad, 68 # 0x3
|
|
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x4
|
|
|
|
RSPQ_DefineCommand GPUCmd_PushRDP, 12 # 0x5
|
|
RSPQ_EndOverlayHeader
|
|
|
|
.align 4
|
|
BANNER0: .ascii " RSP OpenGL T&L "
|
|
BANNER1: .ascii "Rasky & Snacchus"
|
|
|
|
RSPQ_BeginSavedState
|
|
|
|
GPU_MATRIX_MVP: .ds.b 128
|
|
GL_STATE:
|
|
# This is the GL state that is updated by CPU via GPUCmd_Set commands
|
|
GL_VIEWPORT_SCALE: .half 0,0,0,0
|
|
GL_VIEWPORT_OFFSET: .half 0,0,0,0
|
|
GL_STATE_TEX_SIZE: .half 0,0, 0,0, 0,0, 0,0
|
|
GL_STATE_TEX_OFFSET: .half 0,0, 0,0, 0,0, 0,0
|
|
GL_TRI_CMD: .half 0
|
|
GL_TRI_CULL: .half 0
|
|
|
|
RSPQ_EndSavedState
|
|
|
|
.align 4
|
|
CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
|
|
|
|
.bss
|
|
.align 3
|
|
|
|
VERTEX_CACHE: .ds.b SCREEN_VTX_SIZE * 4
|
|
|
|
.align 4
|
|
// Enough for all 10 triangle commands in the worst-case clipped quad scenario
|
|
TRI_CMD_BUFFER: .ds.b (MAX_TRI_CMD_SIZE * 10)
|
|
|
|
|
|
.text
|
|
|
|
// GPUCmd_SetShort
//
// Overwrites one 16-bit field of GL_STATE.
//
// In:  a0 = offset added to the address of GL_STATE
//      a1 = value; its low 16 bits are stored
.func GPUCmd_SetShort
GPUCmd_SetShort:
    jr ra
    sh a1, %lo(GL_STATE)(a0)      // delay slot: the store still executes before the return completes
.endfunc
|
|
|
|
// GPUCmd_SetTexWord
//
// Writes the same 32-bit word into four consecutive word slots of GL_STATE.
// The value is replicated so that it can later be applied to 4 vertices at once.
//
// In:  a0 = offset of the first slot, added to the address of GL_STATE
//      a1 = 32-bit word to replicate
.func GPUCmd_SetTexWord
GPUCmd_SetTexWord:
    sw a1, %lo(GL_STATE) + 0(a0)  // copy 1 of 4
    sw a1, %lo(GL_STATE) + 4(a0)  // copy 2 of 4
    sw a1, %lo(GL_STATE) + 8(a0)  // copy 3 of 4
    jr ra
    sw a1, %lo(GL_STATE) + 12(a0) // copy 4 of 4, issued from the delay slot
.endfunc
|
|
|
|
// GPUCmd_SetLong
//
// Overwrites 8 consecutive bytes of GL_STATE with two 32-bit words.
//
// In:  a0 = offset added to the address of GL_STATE
//      a1 = word stored at offset + 0
//      a2 = word stored at offset + 4
.func GPUCmd_SetLong
GPUCmd_SetLong:
    sw a2, %lo(GL_STATE) + 4(a0)  // second word
    jr ra
    sw a1, %lo(GL_STATE) + 0(a0)  // first word, issued from the delay slot
.endfunc
|
|
|
|
|
|
// GPUCmd_PushRDP
//
// Forwards the two words received in a1/a2 to RDPQ_Write8, then continues
// at RDPQ_Finalize (both come from the included rdpq code).
//
// In:  a1, a2 = the two words to forward
.func GPUCmd_PushRDP
GPUCmd_PushRDP:
    // The RDP command is expected in a0 and a1, so shift the arguments down
    // by one register. a0 must be written first: a1 is still its source.
    move a0, a1
    move a1, a2

    jal_and_j RDPQ_Write8, RDPQ_Finalize
.endfunc
|
|
|
|
|
|
// GPUCmd_MatrixLoad
//
// Copies a matrix out of the command parameters into GPU_MATRIX_MVP.
//
// Input layout  (64 bytes): four 8-byte rows read into the *_i registers,
//                           followed by four 8-byte rows read into the *_f registers.
// Output layout (128 bytes): each 8-byte row written twice back to back, so a
//                            16-byte load yields the row in both vector halves.
//
// Clobbers: t4, t5, $v02-$v09
.func GPUCmd_MatrixLoad
GPUCmd_MatrixLoad:
#define mtx_in  t4
#define mtx_out t5

// NOTE(review): the vmat* aliases below are never #undef'd in this function,
// so they stay defined for the rest of the file. Kept as-is.
#define vmat0_i $v02
#define vmat1_i $v03
#define vmat2_i $v04
#define vmat3_i $v05
#define vmat0_f $v06
#define vmat1_f $v07
#define vmat2_f $v08
#define vmat3_f $v09

    // The 64 bytes of matrix data end at the current command buffer pointer
    addi mtx_in, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
    addi mtx_out, zero, %lo(GPU_MATRIX_MVP)

    // Load the matrix from command parameters
    ldv vmat0_i, 0x00,mtx_in
    ldv vmat1_i, 0x08,mtx_in
    ldv vmat2_i, 0x10,mtx_in
    ldv vmat3_i, 0x18,mtx_in
    ldv vmat0_f, 0x20,mtx_in
    ldv vmat1_f, 0x28,mtx_in
    ldv vmat2_f, 0x30,mtx_in
    ldv vmat3_f, 0x38,mtx_in

    // Store the matrices, with each row stored twice
    // This is used by T&L to transform two vertices at once
    sdv vmat0_i, 0x00,mtx_out
    sdv vmat0_i, 0x08,mtx_out
    sdv vmat1_i, 0x10,mtx_out
    sdv vmat1_i, 0x18,mtx_out
    sdv vmat2_i, 0x20,mtx_out
    sdv vmat2_i, 0x28,mtx_out
    sdv vmat3_i, 0x30,mtx_out
    sdv vmat3_i, 0x38,mtx_out
    sdv vmat0_f, 0x40,mtx_out
    sdv vmat0_f, 0x48,mtx_out
    sdv vmat1_f, 0x50,mtx_out
    sdv vmat1_f, 0x58,mtx_out
    sdv vmat2_f, 0x60,mtx_out
    sdv vmat2_f, 0x68,mtx_out
    sdv vmat3_f, 0x70,mtx_out
    jr ra
    sdv vmat3_f, 0x78,mtx_out     // final store, issued from the delay slot

#undef mtx_in
#undef mtx_out
.endfunc
|
|
|
|
// these persist across more than one function
|
|
#define vviewscale $v18
|
|
#define vviewoff $v19
|
|
#define vguardscale $v20
|
|
|
|
// ################################################################
//  GL_CalcScreenSpace
//
//  Computes the screen-space attributes of ONE vertex from the
//  clip-space position already stored in that vertex.
//
//  Args:
//    a0 = address of the vertex in DMEM
//
//  Reads from the vertex:
//    SCREEN_VTX_CS_POSi / SCREEN_VTX_CS_POSf (clip-space X, Y, Z, W)
//
//  Writes to the vertex:
//    SCREEN_VTX_W, SCREEN_VTX_INVW, SCREEN_VTX_X/Y/Z, SCREEN_VTX_PADDING
//    (the byte at SCREEN_VTX_CLIP_CODE is also overwritten, see below)
//
//  Clobbers:
//    t0, $v02, $v03, $v23, $v24, $v27, $v28, $v29,
//    lanes e0-e3 of vviewscale and vviewoff
//
//  The clip-space position is loaded by this routine itself; $v02/$v03
//  are NOT inputs, whatever they hold on entry is discarded.
// ################################################################
.func GL_CalcScreenSpace
GL_CalcScreenSpace:
#define dst          a0
#define vcspos_f     $v02
#define vcspos_i     $v03
#define vinvw_f      $v23
#define vinvw_i      $v24
#define vscreenpos_i $v27
#define vscreenpos_f $v28
#define v___         $v29
#define w            e3

    // Fetch the clip-space position (integer and fractional halves)
    ldv vcspos_i, SCREEN_VTX_CS_POSi, dst
    ldv vcspos_f, SCREEN_VTX_CS_POSf, dst

    // Reload the viewport scale/offset from GL_STATE (first four lanes only)
    li t0, %lo(GL_VIEWPORT_SCALE)
    ldv vviewscale.e0, 0, t0
    ldv vviewoff.e0,   8, t0      // GL_VIEWPORT_OFFSET sits 8 bytes after GL_VIEWPORT_SCALE

    // Calculate 32-bit inverse W
    // TODO: NR? (presumably a Newton-Raphson refinement step; not implemented)
    vrcph vinvw_i.w, vcspos_i.w
    vrcpl vinvw_f.w, vcspos_f.w
    vrcph vinvw_i.w, vzero.e0

    // Perspective divide: 32-bit clip-space position times 32-bit 1/W
    vmudl v___,         vcspos_f, vinvw_f.w
    vmadm v___,         vcspos_i, vinvw_f.w
    vmadn vscreenpos_f, vcspos_f, vinvw_i.w
    vmadh vscreenpos_i, vcspos_i, vinvw_i.w

    // Value written to SCREEN_VTX_PADDING at the end
    // (its meaning is not visible in this file)
    li t0, 0x3F

    // Viewport transform: position * scale + offset
    vmudn v___,         vscreenpos_f, vviewscale
    vmadh v___,         vscreenpos_i, vviewscale
    vmadh vscreenpos_i, vviewoff,     K1

    // Store W and 1/W as 32-bit values (integer half first)
    ssv vcspos_i.w,   SCREEN_VTX_W+0   ,dst
    ssv vcspos_f.w,   SCREEN_VTX_W+2   ,dst
    ssv vinvw_i.w,    SCREEN_VTX_INVW+0,dst
    ssv vinvw_f.w,    SCREEN_VTX_INVW+2,dst

    // 8-byte store starting at SCREEN_VTX_X: writes X, Y, Z and also the two
    // bytes at SCREEN_VTX_CLIP_CODE / SCREEN_VTX_PADDING (offsets 22 and 23)
    sdv vscreenpos_i, SCREEN_VTX_X     ,dst

    jr ra
    sb t0, SCREEN_VTX_PADDING(dst)     // delay slot: runs after the sdv above, so 0x3F is the final value

#undef dst
#undef vcspos_f
#undef vcspos_i
#undef vinvw_f
#undef vinvw_i
#undef vscreenpos_i
#undef vscreenpos_f
#undef v___
#undef w

.endfunc
|
|
|
|
// ################################################################
//  GL_TnL
//
//  Computes the screen-space attributes and compressed clip codes
//  of TWO vertices at once. The first vertex occupies vector lanes
//  0-3, the second vertex lanes 4-7.
//
//  Args:
//    a2 = address of the first vertex in DMEM (usually within VERTEX_CACHE)
//    a3 = address of the second vertex in DMEM (usually within VERTEX_CACHE)
//
//  Expects (loaded by the caller, in both halves of each register):
//    vviewscale, vviewoff, vguardscale
//
//  Reads from each vertex:
//    SCREEN_VTX_CS_POSi / SCREEN_VTX_CS_POSf
//
//  Writes to each vertex:
//    SCREEN_VTX_W, SCREEN_VTX_INVW, SCREEN_VTX_X/Y/Z,
//    SCREEN_VTX_PADDING, SCREEN_VTX_CLIP_CODE
//
//  Clobbers:
//    t0, t1, t2, $v02, $v03, $v23-$v29
// ################################################################
.func GL_TnL
GL_TnL:
#define vtx1 a2
#define vtx2 a3
#define w    e3
// NOTE(review): W is not #undef'd at the end of this function (w is), so it
// stays defined for the rest of the file. Left unchanged; confirm that nothing
// in the files included later depends on it before removing it.
#define W    e7

#define v___     $v29
#define vcspos_f $v02
#define vcspos_i $v03

#define vinvw_f      $v23
#define vinvw_i      $v24
#define vguard_f     $v25
#define vguard_i     $v26
#define vscreenpos_i $v27
#define vscreenpos_f $v28

    //emux_trace_start

    // Clip-space positions: first vertex in lanes 0-3, second in lanes 4-7
    ldv vcspos_i.e0, SCREEN_VTX_CS_POSi,vtx1
    ldv vcspos_i.e4, SCREEN_VTX_CS_POSi,vtx2
    ldv vcspos_f.e0, SCREEN_VTX_CS_POSf,vtx1
    ldv vcspos_f.e4, SCREEN_VTX_CS_POSf,vtx2
    li t1, 0x3F                   // value stored to SCREEN_VTX_PADDING of both vertices below

    // Calculate 32-bit inverse W for vertex 1
    vrcph vinvw_i.w, vcspos_i.w
    vrcpl vinvw_f.w, vcspos_f.w
    vrcph vinvw_i.w, vzero.e0

    // Scale the clip-space position by the per-lane guard band factors
    vmudn vguard_f, vcspos_f, vguardscale
    vmadh vguard_i, vcspos_i, vguardscale

    // Calculate 32-bit inverse W for vertex 2
    vrcph vinvw_i.W, vcspos_i.W
    vrcpl vinvw_f.W, vcspos_f.W
    vrcph vinvw_i.W, vzero.e0

    // Store clip-space W of both vertices as 32-bit values
    ssv vcspos_i.w, SCREEN_VTX_W+0, vtx1
    ssv vcspos_f.w, SCREEN_VTX_W+2, vtx1
    ssv vcspos_i.W, SCREEN_VTX_W+0, vtx2
    ssv vcspos_f.W, SCREEN_VTX_W+2, vtx2

    // Perspective divide: each vertex is multiplied by its own 1/W
    vmudl v___,         vcspos_f, vinvw_f.wwwwWWWW
    vmadm v___,         vcspos_i, vinvw_f.wwwwWWWW
    vmadn vscreenpos_f, vcspos_f, vinvw_i.wwwwWWWW
    vmadh vscreenpos_i, vcspos_i, vinvw_i.wwwwWWWW

    // Compare the scaled position against its own scaled W.
    // The comparison result is read back from VCC further down.
    vch v___, vguard_i, vguard_i.wwwwWWWW
    vcl v___, vguard_f, vguard_f.wwwwWWWW

    // Viewport transform: position * scale + offset
    vmudn v___,         vscreenpos_f, vviewscale
    vmadh v___,         vscreenpos_i, vviewscale
    vmadh vscreenpos_i, vviewoff,     K1

    // Store 1/W of both vertices as 32-bit values
    ssv vinvw_i.w, SCREEN_VTX_INVW+0,vtx1
    ssv vinvw_f.w, SCREEN_VTX_INVW+2,vtx1
    ssv vinvw_i.W, SCREEN_VTX_INVW+0,vtx2
    ssv vinvw_f.W, SCREEN_VTX_INVW+2,vtx2
    cfc2 t0, COP2_CTRL_VCC        // t0 = comparison flags of both vertices

    // 8-byte stores starting at SCREEN_VTX_X: they also cover the bytes at
    // SCREEN_VTX_CLIP_CODE and SCREEN_VTX_PADDING, which are rewritten below
    sdv vscreenpos_i.e0, SCREEN_VTX_X      ,vtx1
    sdv vscreenpos_i.e4, SCREEN_VTX_X      ,vtx2
    sb t1, SCREEN_VTX_PADDING(vtx1)
    sb t1, SCREEN_VTX_PADDING(vtx2)

    // First vertex: compress its flags (t0 -> t2) and store them
    compressClipCodes             // TODO move to overlap with vector ops
    sb t2, SCREEN_VTX_CLIP_CODE(vtx1)

    // Second vertex: its flags sit 4 bits higher in t0
    srl t0, t0, 4
    compressClipCodes             // TODO move to overlap with vector ops
    //emux_trace_stop
    jr ra
    sb t2, SCREEN_VTX_CLIP_CODE(vtx2)  // delay slot


#undef vinvw_f
#undef vinvw_i
#undef vscreenpos_i
#undef vscreenpos_f

#undef vguard_i
#undef vguard_f
#undef vcspos_f
#undef vcspos_i

#undef vtx1
#undef vtx2
#undef v___
#undef w
.endfunc
|
|
|
|
|
|
// ################################################################
//  GPUCmd_DrawQuad
//
//  Transforms the 4 vertices of a quad by GPU_MATRIX_MVP, rejects the
//  quad if all vertices lie outside the same clip plane, then draws it
//  either directly (DrawQuadForRDP) or through the clipping path.
//
//  Command parameters (64 bytes, one 16-byte record per vertex):
//    +0  X, Y, Z   (an 8-byte load is used; only lanes X/Y/Z feed the transform)
//    +8  RGBA
//    +12 U, V
//
//  Vertices 0/1 are transformed together (lanes 0-3 / 4-7), then 2/3.
//
//  Register notes:
//    - $v18/$v19/$v20 hold matrix rows first and are reused as
//      vviewscale/vviewoff/vguardscale once the matrix is no longer needed.
//    - ra is overwritten by the jal instructions below, so every exit after
//      the trivial rejection test jumps to RSPQ_Loop instead of returning.
//    - NOTE(review): mtx_ptr, vcol and vtx4 are never #undef'd, and
//      vtx1/vtx2/vtx3 are only #undef'd after DrawQuadForRDP. Left unchanged.
// ################################################################
.align 3
.func GPUCmd_DrawQuad
GPUCmd_DrawQuad:
#define vtx_ptr a0
#define mtx_ptr v0
#define src_ptr v1

#define v___ $v01

#define vst_i      $v12
#define vst_f      $v13
#define vtexsize   $v14
#define vtexoffset $v15

#define vmtx0_i $v16 // m00 m01 m02 m03
#define vmtx0_f $v17
#define vmtx1_i $v18 // m10 m11 m12 m13
#define vmtx1_f $v19
#define vmtx2_i $v20 // m20 m21 m22 m23
#define vmtx2_f $v21
#define vmtx3_i $v22 // m30 m31 m32 m33
#define vmtx3_f $v23

#define vpos     $v24
#define vcol     $v25
#define vtex     $v26
#define vcspos_i $v28
#define vcspos_f $v29

#define tmp       t0
#define v0_cflags t1
#define v1_cflags t2
#define v2_cflags t3
#define v3_cflags t4
// t5 is used by GL_ClipTriangle

    // The 64 bytes of vertex data end at the current command buffer pointer
    addi src_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
    li vtx_ptr, %lo(VERTEX_CACHE)
    li mtx_ptr, %lo(GPU_MATRIX_MVP)

    ldv vpos.e0, 0,  src_ptr      // Load v0 X, Y, Z
    ldv vpos.e4, 16, src_ptr      // Load v1 X, Y, Z

    lqv vmtx0_i, 0x00,mtx_ptr     // [m00.I m01.I m02.I m03.I m00.I m01.I m02.I m03.I]
    lqv vmtx1_i, 0x10,mtx_ptr     // etc
    lqv vmtx2_i, 0x20,mtx_ptr
    lqv vmtx3_i, 0x30,mtx_ptr
    lqv vmtx0_f, 0x40,mtx_ptr
    lqv vmtx1_f, 0x50,mtx_ptr
    lqv vmtx2_f, 0x60,mtx_ptr
    lqv vmtx3_f, 0x70,mtx_ptr

    // ########################
    // Vertex 0 and 1 transform
    // ########################
    // matrix multiply: row0*X + row1*Y + row2*Z + row3*ONE_W
    vmudn v___,     vmtx0_f, vpos.xxxxXXXX
    vmadh v___,     vmtx0_i, vpos.xxxxXXXX
    vmadn v___,     vmtx1_f, vpos.yyyyYYYY
    vmadh v___,     vmtx1_i, vpos.yyyyYYYY
    vmadn v___,     vmtx2_f, vpos.zzzzZZZZ
    vmadh v___,     vmtx2_i, vpos.zzzzZZZZ
    vmadn v___,     vmtx3_f, ONE_W
    vmadh vcspos_i, vmtx3_i, ONE_W
    vmadn vcspos_f, vzero,   vzero

    llv vcol.e0, 8,  src_ptr      // Load v0 RGBA
    llv vtex.e0, 12, src_ptr      // Load v0 U, V
    llv vcol.e2, 24, src_ptr      // Load v1 RGBA
    llv vtex.e2, 28, src_ptr      // Load v1 U, V

    // 32-bit right shift by 5, to keep the clip space coordinates unscaled
    vmudm vcspos_i, vcspos_i, K2048
    vmadl vcspos_f, vcspos_f, K2048

    // Texture size/offset loads, interleaved with the colour stores of v0/v1
    li t6, %lo(GL_STATE_TEX_SIZE)
    lqv vtexsize,   0x00, t6
    slv vcol.e0, SCREEN_VTX_RGBA + V0_OFFSET, vtx_ptr
    lqv vtexoffset, 0x10, t6      // GL_STATE_TEX_OFFSET sits 16 bytes after GL_STATE_TEX_SIZE
    slv vcol.e2, SCREEN_VTX_RGBA + V1_OFFSET, vtx_ptr

    // Calculate and store clipping flags against CS.W.
    // These will be used for trivial rejections.
    vch v___, vcspos_i, vcspos_i.wwwwWWWW
    vcl v___, vcspos_f, vcspos_f.wwwwWWWW

    cfc2 tmp, COP2_CTRL_VCC       // tmp = clipping flags of v0 and v1
    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V0_OFFSET, vtx_ptr
    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V0_OFFSET, vtx_ptr
    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V1_OFFSET, vtx_ptr
    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V1_OFFSET, vtx_ptr

    // ########################
    // Vertex 2 and 3 transform
    // ########################
    ldv vpos.e0, 32, src_ptr      // Load v2 X, Y, Z
    ldv vpos.e4, 48, src_ptr      // Load v3 X, Y, Z

    // Split the flags read above: v0 as-is, v1 after shifting down by 4
    andi v0_cflags, tmp, XYZ_CLIP_FLAGS
    srl  tmp, tmp, 4
    andi v1_cflags, tmp, XYZ_CLIP_FLAGS

    // matrix multiply: row0*X + row1*Y + row2*Z + row3*ONE_W
    vmudn v___,     vmtx0_f, vpos.xxxxXXXX
    vmadh v___,     vmtx0_i, vpos.xxxxXXXX
    vmadn v___,     vmtx1_f, vpos.yyyyYYYY
    vmadh v___,     vmtx1_i, vpos.yyyyYYYY
    vmadn v___,     vmtx2_f, vpos.zzzzZZZZ
    vmadh v___,     vmtx2_i, vpos.zzzzZZZZ
    vmadn v___,     vmtx3_f, ONE_W
    vmadh vcspos_i, vmtx3_i, ONE_W
    vmadn vcspos_f, vzero,   vzero

    llv vcol.e4, 40, src_ptr      // Load v2 RGBA
    llv vtex.e4, 44, src_ptr      // Load v2 U, V
    llv vcol.e6, 56, src_ptr      // Load v3 RGBA
    llv vtex.e6, 60, src_ptr      // Load v3 U, V

    // 32-bit right shift by 5, to keep the clip space coordinates unscaled
    vmudm vcspos_i, vcspos_i, K2048
    vmadl vcspos_f, vcspos_f, K2048

    // Scale texcoord by texsize for all 4 vertices at once.
    // The offset step (to correct for bilinear sampling if active) is currently
    // commented out, so vtexoffset is loaded above but not applied here.
    vmudn vst_f, vtex,  vtexsize  // ACC  = vtex * vtexsize, VST_F = ACC & 0xFFFF
    // vmadn vst_f,vtexoffset, K1
    vmadh vst_i, vzero, vzero     // ACC += zero * zero, VST_I = ACC >> 16

    slv vcol.e4, SCREEN_VTX_RGBA + V2_OFFSET, vtx_ptr
    slv vcol.e6, SCREEN_VTX_RGBA + V3_OFFSET, vtx_ptr

    // Calculate and store clipping flags against CS.W.
    // These will be used for trivial rejections.
    vch v___, vcspos_i, vcspos_i.wwwwWWWW
    vcl v___, vcspos_f, vcspos_f.wwwwWWWW

    cfc2 tmp, COP2_CTRL_VCC       // tmp = clipping flags of v2 and v3
    sdv vcspos_i.e0, SCREEN_VTX_CS_POSi + V2_OFFSET, vtx_ptr
    sdv vcspos_f.e0, SCREEN_VTX_CS_POSf + V2_OFFSET, vtx_ptr
    sdv vcspos_i.e4, SCREEN_VTX_CS_POSi + V3_OFFSET, vtx_ptr
    sdv vcspos_f.e4, SCREEN_VTX_CS_POSf + V3_OFFSET, vtx_ptr

    // Shift texture coords right 5 bits
    vmudm v___, vst_i, K2048      // ACC  = (vst_i << 11)
    vmadl vtex, vst_f, K2048      // ACC += (vst_f << 11) >> 16, vtex = ACC & 0xFFFF

    // Split the flags read above: v2 as-is, v3 after shifting down by 4
    andi v2_cflags, tmp, XYZ_CLIP_FLAGS
    srl  tmp, tmp, 4
    andi v3_cflags, tmp, XYZ_CLIP_FLAGS

#undef src_ptr
#undef vst_i
#undef vst_f
#undef vtexsize
#undef vtexoffset

#undef vmtx0_i
#undef vmtx0_f
#undef vmtx1_i
#undef vmtx1_f
#undef vmtx2_i
#undef vmtx2_f
#undef vmtx3_i
#undef vmtx3_f

#undef vpos
#undef vcspos_i
#undef vcspos_f

    // ### Trivial rejection check ###
    // If for any plane, all 4 vertices are outside the plane,
    // then the quad is out of the viewport and can be trivially rejected
    and tmp, v0_cflags, v1_cflags
    and tmp, v2_cflags
    and tmp, v3_cflags
    bnez tmp, JrRa                // slv is delay slot

    // ### Perform rest of T&L ###
    // Store the scaled texture coordinates of all 4 vertices
    // (the first store executes in the delay slot of the branch above)
    slv vtex.e0, SCREEN_VTX_S_T + V0_OFFSET, vtx_ptr
    slv vtex.e2, SCREEN_VTX_S_T + V1_OFFSET, vtx_ptr
    slv vtex.e4, SCREEN_VTX_S_T + V2_OFFSET, vtx_ptr
    slv vtex.e6, SCREEN_VTX_S_T + V3_OFFSET, vtx_ptr

#undef vtx_ptr
#undef v___
#undef vtex

    // Load viewport factors into both halves, as GL_TnL handles two vertices per call
    li t0, %lo(GL_VIEWPORT_SCALE)
    ldv vviewscale.e0, 0, t0
    ldv vviewoff.e0,   8, t0
    ldv vviewscale.e4, 0, t0
    ldv vviewoff.e4,   8, t0

    // Same for the guard band factors
    li t0, %lo(CLIP_CODE_FACTORS)
    ldv vguardscale.e0, 0, t0
    ldv vguardscale.e4, 0, t0

    // Screen space + clip codes for v0/v1 (a3 is set in the delay slot)
    li a2, %lo(VERTEX_CACHE) + V0_OFFSET
    jal GL_TnL
    li a3, %lo(VERTEX_CACHE) + V1_OFFSET

    // Screen space + clip codes for v2/v3 (a3 is set in the delay slot)
    li a2, %lo(VERTEX_CACHE) + V2_OFFSET
    jal GL_TnL
    li a3, %lo(VERTEX_CACHE) + V3_OFFSET

    // ########################
    // Guardband check
    // ########################
    // Check if all vertices fit within guardband
    lbu v0_cflags, (%lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
    lbu v1_cflags, (%lo(VERTEX_CACHE) + V1_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
    lbu v2_cflags, (%lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)
    lbu v3_cflags, (%lo(VERTEX_CACHE) + V3_OFFSET + SCREEN_VTX_CLIP_CODE)(zero)

    // Union of the clip codes of all 4 vertices
    or tmp, v0_cflags, v1_cflags
    or tmp, v2_cflags
    or tmp, v3_cflags

#undef v0_cflags
#undef v1_cflags
#undef v2_cflags
#undef v3_cflags

    // If all 4 vertices are inside guardband, no need to clip
    beqz tmp, DrawQuadForRDP
    move t5, tmp                  // GL_ClipTriangle expects this in t5 instead
#undef tmp

    // ###########################
    // Slow clipped triangles path
    // ###########################
#define vtx1 a1
#define vtx2 a2
#define vtx3 a3
#define vtx4 a0

    li vtx1, %lo(VERTEX_CACHE) + V0_OFFSET
    li vtx2, %lo(VERTEX_CACHE) + V1_OFFSET
    li vtx3, %lo(VERTEX_CACHE) + V2_OFFSET
    li vtx4, %lo(VERTEX_CACHE) + V3_OFFSET

    // t5 = which guardband planes need to be clipped against
    // NOTE(review): GL_ClipTriangle is defined outside this file section. From
    // the code below it presumably leaves a list of 16-bit vertex addresses at
    // s1, its end in s2 and a count in v1; confirm against its definition.
    move s1, zero
    jal GL_ClipTriangle
    move s2, zero                 // delay slot

    li s3, %lo(TRI_CMD_BUFFER)
    beqz v1, gl_draw_triangle_end // nothing to draw
    addi s2, -6                   // delay slot: loop bound, 6 bytes (3 entries) before s2
    lhu s5, 0(s1)                 // s5 = first list entry, shared by every triangle drawn below

    // Screen space for the first two list entries (a0 is set in each delay slot)
    jal GL_CalcScreenSpace
    lhu a0, 0(s1)
    jal GL_CalcScreenSpace
    lhu a0, 2(s1)

gl_draw_clipped_triangles_loop:
    // Triangle = (first entry, entry at s1+2, entry at s1+4)
    move vtx1, s5
    lhu vtx2, 2(s1)
    lhu vtx3, 4(s1)

    // Only the third vertex still lacks screen-space attributes
    jal GL_CalcScreenSpace
    move a0, vtx3                 // delay slot

gl_draw_single_triangle:
    // Point each argument at the SCREEN_VTX_X field of its vertex,
    // matching what DrawQuadForRDP passes
    addi vtx1, SCREEN_VTX_X
    addi vtx2, SCREEN_VTX_X
    addi vtx3, SCREEN_VTX_X

    lhu a0, %lo(GL_TRI_CMD)
    jal RDPQ_Triangle_Send_Async
    lh v0, %lo(GL_TRI_CULL)       // delay slot

    // Advance by one list entry until the bound computed above is reached
    blt s1, s2, gl_draw_clipped_triangles_loop
    addi s1, 2                    // delay slot

gl_draw_triangle_end:
    jal RDPQ_Triangle_Send_End
    nop

    j RSPQ_Loop
    nop
.endfunc
|
|
|
|
// ################################################################
//  DrawQuadForRDP - Draws two triangles for a quad
//
//  Fast path, entered from GPUCmd_DrawQuad when no vertex of the
//  quad needs clipping. Uses the 4 vertices in VERTEX_CACHE and
//  sends the triangles (v0, v1, v2) and (v2, v3, v0).
//
//  Does not return to its caller: it ends by jumping to RSPQ_Loop.
// ################################################################
.func DrawQuadForRDP
DrawQuadForRDP:
    li s3, %lo(TRI_CMD_BUFFER)

    // First triangle: v0, v1, v2 (pointers to each SCREEN_VTX_X field)
    li a1, %lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_X
    li a2, %lo(VERTEX_CACHE) + V1_OFFSET + SCREEN_VTX_X
    li a3, %lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_X
    lh v0, %lo(GL_TRI_CULL)
    jal RDPQ_Triangle_Send_Async
    lhu a0, %lo(GL_TRI_CMD)       // delay slot

    // Second triangle: v2, v3, v0
    li a1, %lo(VERTEX_CACHE) + V2_OFFSET + SCREEN_VTX_X
    li a2, %lo(VERTEX_CACHE) + V3_OFFSET + SCREEN_VTX_X
    li a3, %lo(VERTEX_CACHE) + V0_OFFSET + SCREEN_VTX_X
    lh v0, %lo(GL_TRI_CULL)
    jal RDPQ_Triangle_Send_Async
    lhu a0, %lo(GL_TRI_CMD)       // delay slot

    jal RDPQ_Triangle_Send_End
    nop

// NOTE(review): these two labels are not referenced in this file section.
// They are presumably jump targets used by the included rdpq triangle code;
// both simply resume the command loop.
RDPQ_Triangle_Cull:
RDPQ_Triangle_Clip:
    j RSPQ_Loop
    nop
.endfunc
|
|
|
|
#undef vtx1
|
|
#undef vtx2
|
|
#undef vtx3
|
|
|
|
#include "rsp_gpu_clipping.inc"
|
|
#include <rsp_rdpq.inc>
|