ClassiCube/misc/n64/rsp_gpu.S
2025-04-30 20:54:35 +10:00

537 lines
13 KiB
ArmAsm

#include <rsp_queue.inc>
#include <rdpq_macros.h>
#include "gl_constants.h"
.data
# Command table of this overlay. Each entry names the handler label and the
# command size in bytes; the trailing comment is the command index.
RSPQ_BeginOverlayHeader
RSPQ_DefineCommand GPUCmd_SetByte, 8 # 0x0
RSPQ_DefineCommand GPUCmd_SetShort, 8 # 0x1
RSPQ_DefineCommand GPUCmd_SetWord, 8 # 0x2
RSPQ_DefineCommand GPUCmd_SetLong, 12 # 0x3
RSPQ_DefineCommand GPUCmd_DrawTriangle, 8 # 0x4
RSPQ_DefineCommand GPUCmd_UploadVertex, 24 # 0x5
RSPQ_DefineCommand GPUCmd_MatrixLoad, 68 # 0x6
RSPQ_EndOverlayHeader
.align 4
# Two 16-byte ASCII banners; not referenced by the code visible in this file.
BANNER0: .ascii " RSP OpenGL T&L "
BANNER1: .ascii "Rasky & Snacchus"
RSPQ_BeginSavedState
# Base label for the GPUCmd_Set* commands, which store at an offset from here.
GL_STATE:
# This is the GL state that is also used by the pipeline.
# GPUCmd_UploadVertex reads the matrix as four 8-byte integer rows at 0x00-0x18
# followed by four 8-byte fractional rows at 0x20-0x38.
GL_MATRIX_MVP: .ds.b MATRIX_SIZE
# Four halfwords each; loaded as one 8-byte vector apiece by GL_CalcScreenSpace.
GL_VIEWPORT_SCALE: .half 0,0,0,0
GL_VIEWPORT_OFFSET: .half 0,0,0,0
# Two halfwords each; loaded by GL_TnL (the offset is loaded but unused there).
GL_STATE_TEX_SIZE: .half 0,0
GL_STATE_TEX_OFFSET: .half 0,0
# Passed to RDPQ_Triangle in a0 and v0 respectively by GPUCmd_DrawTriangle.
GL_TRI_CMD: .half 0
GL_TRI_CULL: .half 0
.align 3
# Storage for uploaded vertices, indexed by byte offset from the commands.
VERTEX_CACHE: .dcb.b PRIM_VTX_SIZE * VERTEX_CACHE_SIZE
RSPQ_EndSavedState
.align 4
# Not referenced in this file; presumably used by rsp_gpu_clipping.inc - confirm.
CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
# Per-lane multipliers (x, y, z, w) applied by GL_CalcClipCodes before the
# comparison against the w lane.
CLIP_CODE_FACTORS: .half 1, 1, GUARD_BAND_FACTOR, GUARD_BAND_FACTOR
# Clipping constants; not referenced in this file, presumably consumed by
# rsp_gpu_clipping.inc - confirm.
#define CLIPPING_PLANE_COUNT 6
#define CLIPPING_CACHE_SIZE 9
#define CLIPPING_PLANE_SIZE 8
# Byte offsets of the fields of a screen space vertex record.
#define SCREEN_VTX_CS_POSi 0 // X, Y, Z, W (all 32-bit)
#define SCREEN_VTX_CS_POSf 8 // X, Y, Z, W (all 32-bit)
#define SCREEN_VTX_X 16
#define SCREEN_VTX_Y 18
#define SCREEN_VTX_Z 20
#define SCREEN_VTX_CLIP_CODE 22
#define SCREEN_VTX_PADDING 23
#define SCREEN_VTX_RGBA 24
#define SCREEN_VTX_S 28
#define SCREEN_VTX_T 30
#define SCREEN_VTX_W 32 // FIXME: this is duplicated in CS_POS
#define SCREEN_VTX_INVW 36 // 32-bit
#define SCREEN_VTX_SIZE 40
.text
########################################
# GPUCmd_SetByte
#
# Stores the low 8 bits of a1 at GL_STATE + a0.
#
# Arguments:
#   a0: offset from GL_STATE
#   a1: value to store
########################################
.func GPUCmd_SetByte
GPUCmd_SetByte:
jr ra
# Branch delay slot: the store still executes before control returns.
sb a1, %lo(GL_STATE)(a0)
.endfunc
########################################
# GPUCmd_SetShort
#
# Stores the low 16 bits of a1 at GL_STATE + a0.
#
# Arguments:
#   a0: offset from GL_STATE
#   a1: value to store
########################################
.func GPUCmd_SetShort
GPUCmd_SetShort:
jr ra
# Branch delay slot: the store still executes before control returns.
sh a1, %lo(GL_STATE)(a0)
.endfunc
########################################
# GPUCmd_SetWord
#
# Stores the 32-bit value in a1 at GL_STATE + a0.
#
# Arguments:
#   a0: offset from GL_STATE
#   a1: value to store
########################################
.func GPUCmd_SetWord
GPUCmd_SetWord:
jr ra
# Branch delay slot: the store still executes before control returns.
sw a1, %lo(GL_STATE) + 0(a0)
.endfunc
########################################
# GPUCmd_SetLong
#
# Writes two consecutive 32-bit words into the GL state.
#
# Arguments:
#   a0: offset from GL_STATE
#   a1: word stored at GL_STATE + a0 + 0
#   a2: word stored at GL_STATE + a0 + 4
########################################
    .func GPUCmd_SetLong
GPUCmd_SetLong:
    # The two stores cover disjoint words, so their relative order is free.
    sw      a1, %lo(GL_STATE) + 0(a0)
    jr      ra
    # Branch delay slot: second half of the pair.
    sw      a2, %lo(GL_STATE) + 4(a0)
    .endfunc
########################################
# GPUCmd_UploadVertex
#
# Copies one vertex from the command into VERTEX_CACHE, multiplies its
# position by GL_MATRIX_MVP, and stores the clip space position together
# with a trivial-reject code in the same cache entry.
#
# Arguments (byte offsets within the 24-byte command, as read by the code):
# * 0x00 (a0): offset within VERTEX_CACHE
# * 0x04: object space X, Y, Z, W (16-bit each)
# * 0x0C: R, G, B, A (8 bytes copied verbatim; presumably 16-bit each)
# * 0x14: S, T (16-bit each)
#
# Clobbers: a0, s0, s4, t0, t1, $v01, $v16-$v26, $v28, $v29
#
########################################
.align 3
.func GPUCmd_UploadVertex
GPUCmd_UploadVertex:
#define vtx a0
#define mtx_ptr s0
#define cmd_ptr s4
#define v___ $v01
#define vmtx0_i $v16 // m00 m01 m02 m03
#define vmtx0_f $v17
#define vmtx1_i $v18 // m10 m11 m12 m13
#define vmtx1_f $v19
#define vmtx2_i $v20 // m20 m21 m22 m23
#define vmtx2_f $v21
#define vmtx3_i $v22 // m30 m31 m32 m33
#define vmtx3_f $v23
#define vpos $v24
#define vcol $v25
#define vtex $v26
#define vcspos_i $v28
#define vcspos_f $v29
#define x e0
#define y e1
#define z e2
#define w e3
# cmd_ptr = buffer pointer - command size + 4, i.e. the word after the command
# header, assuming rspq_dmem_buf_ptr already points past this command - confirm.
addi cmd_ptr, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) + 4
sub cmd_ptr, rspq_cmd_size
ldv vpos, 0, cmd_ptr # Load X, Y, Z, W
ldv vcol, 8, cmd_ptr # Load R, G, B, A
llv vtex, 16, cmd_ptr # Load U, V
# Turn the cache offset into a DMEM address and copy the raw attributes over.
addi vtx, %lo(VERTEX_CACHE)
sdv vpos, PRIM_VTX_X ,vtx
sdv vcol, PRIM_VTX_R ,vtx
slv vtex, PRIM_VTX_TEX_S ,vtx
# == matrix multiply ==
# Integer parts of the four rows live at 0x00-0x18, fractional parts at
# 0x20-0x38. Only lanes 0-3 of each register are loaded.
li mtx_ptr, %lo(GL_MATRIX_MVP)
ldv vmtx0_i.e0, 0x00,mtx_ptr
ldv vmtx1_i.e0, 0x08,mtx_ptr
ldv vmtx2_i.e0, 0x10,mtx_ptr
ldv vmtx3_i.e0, 0x18,mtx_ptr
ldv vmtx0_f.e0, 0x20,mtx_ptr
ldv vmtx1_f.e0, 0x28,mtx_ptr
ldv vmtx2_f.e0, 0x30,mtx_ptr
ldv vmtx3_f.e0, 0x38,mtx_ptr
# Accumulate: result lane j = sum over k of vmtxk[j] * pos[k], with each
# position component broadcast across the lanes. The intermediate writes to
# v___ are discarded; only the accumulator matters until the last two ops.
vmudn v___, vmtx0_f, vpos.h0
vmadh v___, vmtx0_i, vpos.h0
vmadn v___, vmtx1_f, vpos.h1
vmadh v___, vmtx1_i, vpos.h1
vmadn v___, vmtx2_f, vpos.h2
vmadh v___, vmtx2_i, vpos.h2
vmadn v___, vmtx3_f, vpos.h3
vmadh vcspos_i, vmtx3_i, vpos.h3
# Adding 0*0 reads back the low (fractional) part of the accumulated result.
vmadn vcspos_f, vzero, vzero
# == end matrix multiply ==
# 32-bit right shift by 5, to keep the clip space coordinates unscaled
vmudm vcspos_i, vcspos_i, vshift8.e4
vmadl vcspos_f, vcspos_f, vshift8.e4
sdv vcspos_i, PRIM_VTX_CS_POSi,vtx
sdv vcspos_f, PRIM_VTX_CS_POSf,vtx
# Calculate and store clipping flags against CS.W.
# These will be used for trivial rejections.
vch v___, vcspos_i, vcspos_i.w
vcl v___, vcspos_f, vcspos_f.w
cfc2 t0, COP2_CTRL_VCC
# Keeps VCC bits 0-2 and 8-10, i.e. the x, y, z lanes of both flag halves.
andi t0, 0x707 # Isolate X/Y/Z flags
# Compress flags to 8 bit
# Bits 8-10 move down to bits 3-5 and are merged with bits 0-2.
srl t1, t0, 5
andi t0, 0x7
or t0, t1
jr ra
# Branch delay slot: store the 6-bit code. Bit 7 stays clear, which
# GPUCmd_DrawTriangle reads as T&L not yet applied.
sb t0, PRIM_VTX_TRCODE(vtx)
# NOTE(review): mtx_ptr, vcol and vtex are defined above but never undefined,
# so they stay visible to the code that follows. in_xy, in_zw, in_rgba and
# vtx_id are undefined below although nothing in this file defines them.
#undef cmd_ptr
#undef vtx
#undef in_xy
#undef in_zw
#undef in_rgba
#undef vtx_id
#undef x
#undef y
#undef z
#undef w
#undef v___
#undef vmtx0_i
#undef vmtx0_f
#undef vmtx1_i
#undef vmtx1_f
#undef vmtx2_i
#undef vmtx2_f
#undef vmtx3_i
#undef vmtx3_f
#undef vpos
#undef vcspos_i
#undef vcspos_f
.endfunc
################################################################
# GL_CalcScreenSpace
#
# Divides the clip space position by W, applies the viewport scale and
# offset, and stores the screen position, W and 1/W into the vertex.
#
# Args:
# s3 = Destination vertex address
# $v02 = Clip space position (fractional part)
# $v03 = Clip space position (integer part)
#
# Writes: SCREEN_VTX_X..SCREEN_VTX_PADDING, SCREEN_VTX_W, SCREEN_VTX_INVW
# Clobbers: t0, $v23-$v29
#
################################################################
.func GL_CalcScreenSpace
GL_CalcScreenSpace:
#define dst s3
#define vcspos_f $v02
#define vcspos_i $v03
#define vinvw_f $v23
#define vinvw_i $v24
#define vviewscale $v25
#define vviewoff $v26
#define vscreenpos_i $v27
#define vscreenpos_f $v28
#define v___ $v29
#define w e3
# Calculate 32-bit inverse W
# TODO: NR?
# The result lands in lane w of vinvw_i (high part) and vinvw_f (low part).
vrcph vinvw_i.w, vcspos_i.w
vrcpl vinvw_f.w, vcspos_f.w
vrcph vinvw_i.w, vzero.e0
# Calculate screenspace coords
li t0, %lo(GL_VIEWPORT_SCALE)
ldv vviewscale, 0,t0
# GL_VIEWPORT_OFFSET sits 8 bytes after GL_VIEWPORT_SCALE.
ldv vviewoff, 8,t0
# 32-bit by 32-bit multiply of every lane by 1/W, the perspective divide.
vmudl v___, vcspos_f, vinvw_f.w
vmadm v___, vcspos_i, vinvw_f.w
vmadn vscreenpos_f, vcspos_f, vinvw_i.w
vmadh vscreenpos_i, vcspos_i, vinvw_i.w
# Scale by the viewport scale, then add the viewport offset to the integer part.
vmudn vscreenpos_f, vscreenpos_f, vviewscale
vmadh vscreenpos_i, vscreenpos_i, vviewscale
vadd vscreenpos_i, vviewoff
# This 8-byte store starts at SCREEN_VTX_X (16), so it also covers the clip
# code and padding bytes at offsets 22 and 23.
sdv vscreenpos_i, SCREEN_VTX_X ,dst
ssv vcspos_i.w, SCREEN_VTX_W+0 ,dst
ssv vcspos_f.w, SCREEN_VTX_W+2 ,dst
ssv vinvw_i.w, SCREEN_VTX_INVW+0,dst
ssv vinvw_f.w, SCREEN_VTX_INVW+2,dst
jr ra
# Branch delay slot: clear the padding byte that the sdv above overwrote.
sb zero, SCREEN_VTX_PADDING(dst)
#undef dst
#undef vcspos_f
#undef vcspos_i
#undef vinvw_f
#undef vinvw_i
#undef vviewscale
#undef vviewoff
#undef vscreenpos_i
#undef vscreenpos_f
#undef v___
#undef w
.endfunc
################################################################
# GL_CalcClipCodes
#
# Computes a 6-bit clip code from the clip space position, after scaling
# its lanes by CLIP_CODE_FACTORS, and stores it in SCREEN_VTX_CLIP_CODE.
#
# Args:
# s3 = Destination vertex address
# $v02 = Clip space position (fractional part)
# $v03 = Clip space position (integer part)
#
# Clobbers: t0, t1, $v27-$v29
#
################################################################
.func GL_CalcClipCodes
GL_CalcClipCodes:
#define dst s3
#define vcspos_f $v02
#define vcspos_i $v03
#define vguard_f $v27
#define vguard_i $v28
#define v___ $v29
#define w e3
# Lanes x and y are multiplied by 1, lanes z and w by GUARD_BAND_FACTOR.
# The compare below is against the scaled w lane, so x and y are tested
# against w * GUARD_BAND_FACTOR while z is effectively tested against w.
li t0, %lo(CLIP_CODE_FACTORS)
ldv vguard_i, 0,t0
vmudn vguard_f, vcspos_f, vguard_i
vmadh vguard_i, vcspos_i, vguard_i
# Same compare and flag packing as in GPUCmd_UploadVertex: VCC bits 0-2 and
# 8-10 are kept and merged into bits 0-5.
vch v___, vguard_i, vguard_i.w
vcl v___, vguard_f, vguard_f.w
cfc2 t0, COP2_CTRL_VCC
andi t0, 0x707
srl t1, t0, 5
andi t0, 0x7
or t0, t1
jr ra
# Branch delay slot: store the packed code.
sb t0, SCREEN_VTX_CLIP_CODE(dst)
#undef dst
#undef vcspos_i
#undef vcspos_f
#undef vguard_i
#undef vguard_f
#undef v___
#undef w
.endfunc
################################################################
# GL_TnL
#
# Finishes a cached vertex for drawing: packs its colour, scales its texture
# coordinates by GL_STATE_TEX_SIZE, marks it as processed, then computes its
# screen space position and clip code.
#
# PRIM_VTX_* and SCREEN_VTX_* offsets are both applied to the same pointer.
#
# Args:
# s3 = address of the vertex in DMEM (usually within VERTEX_CACHE)
#
# Clobbers: t0, s1, ra2, $v01-$v04, $v06-$v08, $v28, $v29, plus everything
# clobbered by GL_CalcScreenSpace and GL_CalcClipCodes
#
################################################################
.func GL_TnL
GL_TnL:
# NOTE(review): tmp_ptr is defined but never used or undefined in this routine.
#define tmp_ptr s2
#define vtx s3
#define s e0
# Keep the caller return address; the jal below overwrites ra.
move ra2, ra
#define v___ $v01
#define vrgba $v04
# The same 8 bytes are loaded into both halves of the register.
ldv vrgba.e0, PRIM_VTX_R, vtx # R + G + B + A
ldv vrgba.e4, PRIM_VTX_R, vtx # R + G + B + A
#define vtexsize $v06
#define vtexoffset $v07
#define vst $v08
llv vst, PRIM_VTX_TEX_S,vtx # S + T
# Packs the eight lanes to bytes at SCREEN_VTX_RGBA (24). Presumably this
# writes 8 bytes and so runs into SCREEN_VTX_S/T (28-31), which the slv in
# the jal delay slot below rewrites - confirm.
suv vrgba, SCREEN_VTX_RGBA,vtx
li s1, %lo(GL_STATE_TEX_SIZE)
llv vtexsize.s, 0,s1
# NOTE(review): vtexoffset is loaded here but only the commented-out code
# below refers to it.
llv vtexoffset.s, 4,s1
#define vst_i $v28
#define vst_f $v29
# Scale texcoord by texsize and subtract offset (to correct for bilinear sampling if active)
#vmudn v___, vst, vtexsize
# vmadh vst, vtexoffset, K1
#vmudn v___, vst, vtexsize
#vmadh vst, vtexoffset, K1
#vmudl vst, vst, vtexsize
# Multiply S,T by the texture size and read the product back from the
# accumulator as a 32-bit value split over vst_i (high) and vst_f (middle).
vmudh v___, vst, vtexsize
vsar vst_i, COP2_ACC_HI
vsar vst_f, COP2_ACC_MD
# Rescale that 32-bit value by K8192 and keep the low result word.
# K8192 is presumably the constant 8192, making this a right shift by 3 - confirm.
vmudl vst_f, vst_f, K8192
vmadm vst_i, vst_i, K8192
vmadn vst, vzero, vzero
#undef vst_i
#undef vst_f
# NOTE(review): q is never defined in this file.
#undef q
lbu t0, PRIM_VTX_TRCODE(vtx)
#define vcspos_f $v02
#define vcspos_i $v03
# Arguments for GL_CalcScreenSpace and GL_CalcClipCodes.
ldv vcspos_f, PRIM_VTX_CS_POSf,vtx
ldv vcspos_i, PRIM_VTX_CS_POSi,vtx
# Mark this vertex as having T&L applied
ori t0, 0x80
sb t0, PRIM_VTX_TRCODE(vtx)
jal GL_CalcScreenSpace
# Branch delay slot: store the scaled S,T before the callee runs.
slv vst.s, SCREEN_VTX_S,vtx
# Tail call: ra is restored in the delay slot, so GL_CalcClipCodes returns
# straight to the caller of GL_TnL.
j GL_CalcClipCodes
move ra, ra2
#undef vcspos_f
#undef vcspos_i
#undef vtexsize
#undef vtexoffset
#undef vtx
#undef v___
#undef vrgba
#undef vst
#undef s
.endfunc
################################################################
# GPUCmd_DrawTriangle
#
# Draws one triangle from three VERTEX_CACHE entries. The triangle is dropped
# early when all three vertices share a set trivial-reject bit. Otherwise
# GL_TnL runs on every vertex not yet processed, GL_ClipTriangle runs when any
# clip code is non-zero, and each resulting triangle goes through
# RDPQ_Triangle and RDPQ_Send.
#
# Arguments:
# a0: Bit 31..24: Command id
# Bit 11..0: Offset into vertex cache of vtx1
# a1: Bit 27..16: Offset into vertex cache of vtx2
# Bit 11..0: Offset into vertex cache of vtx3
#
################################################################
.func GPUCmd_DrawTriangle
GPUCmd_DrawTriangle:
#define vtx1 a1
#define vtx2 a2
#define vtx3 a3
#define trcode1 t6
#define trcode2 t7
#define trcode3 t8
# vtx1 aliases a1, so vtx3 and vtx2 are derived from a1 before it is
# overwritten. The upper bits of a0 and a1 are not masked off here;
# presumably DMEM addressing ignores them - confirm.
addi vtx3, a1, %lo(VERTEX_CACHE)
srl vtx2, a1, 16
addi vtx2, %lo(VERTEX_CACHE)
addi vtx1, a0, %lo(VERTEX_CACHE)
# Trivial reject: if all the vertices are out of the same plane (at least one),
# the triangle is out of the viewport.
# NOTE: This deliberately uses lb instead of lbu so the sign bit is extended.
# The MSB of each TR-code is a bit flag that is set if the vertex has already
# had T&L applied once.
lb trcode1, PRIM_VTX_TRCODE(vtx1)
lb trcode2, PRIM_VTX_TRCODE(vtx2)
lb trcode3, PRIM_VTX_TRCODE(vtx3)
and t0, trcode1, trcode2
and t0, trcode3
# Keep only the six plane bits; bit 7 is the T&L flag.
andi t0, 0x3F
bnez t0, JrRa
nop
# Perform T&L for each vertex if we haven't already
# A non-negative TR-code means bit 7 is clear. Each delay slot passes the
# vertex address in s3.
bgezal trcode1, GL_TnL
move s3, vtx1
bgezal trcode2, GL_TnL
move s3, vtx2
bgezal trcode3, GL_TnL
move s3, vtx3
# Clipping is needed when any vertex has a non-zero clip code.
lbu t0, SCREEN_VTX_CLIP_CODE(vtx1)
lbu t1, SCREEN_VTX_CLIP_CODE(vtx2)
lbu t2, SCREEN_VTX_CLIP_CODE(vtx3)
or t5, t0, t1
or t5, t2
# With s1 = s2 = 0 the loop test at the bottom fails, so the unclipped path
# draws exactly one triangle.
move s1, zero
beqz t5, gl_draw_single_triangle
move s2, zero
# Presumably GL_ClipTriangle returns a list of 16-bit vertex addresses in
# s1..s2 and a count in v1 - confirm in rsp_gpu_clipping.inc.
jal GL_ClipTriangle
nop
beqz v1, gl_draw_triangle_end
# Delay slot: pull s2 back by three entries (6 bytes) so it serves as the
# loop bound.
addi s2, -6
# The first list entry is reused as vtx1 of every triangle (fan).
lhu s5, 0(s1)
gl_draw_clipped_triangles_loop:
move vtx1, s5
lhu vtx2, 2(s1)
lhu vtx3, 4(s1)
gl_draw_single_triangle:
# RDPQ_Triangle receives pointers to the screen space part of each vertex.
addi vtx1, SCREEN_VTX_X
addi vtx2, SCREEN_VTX_X
addi vtx3, SCREEN_VTX_X
lhu a0, %lo(GL_TRI_CMD)
lh v0, %lo(GL_TRI_CULL)
jal RDPQ_Triangle
li s3, %lo(RDPQ_CMD_STAGING)
jal RDPQ_Send
li s4, %lo(RDPQ_CMD_STAGING)
# The comparison uses s1 before the delay slot advances it by one entry.
blt s1, s2, gl_draw_clipped_triangles_loop
addi s1, 2
gl_draw_triangle_end:
# The calls above overwrote ra, so return by jumping to the queue loop.
j RSPQ_Loop
nop
# NOTE(review): trcode1, trcode2 and trcode3 are not undefined here.
#undef vtx1
#undef vtx2
#undef vtx3
.endfunc
################################################################
# GPUCmd_MatrixLoad
#
# Copies the 64 bytes of matrix data carried by the command into
# GL_MATRIX_MVP without modification.
#
# Arguments (byte offsets within the 68-byte command):
# * 0x00: command word
# * 0x04: 64 bytes of matrix data
#
# Clobbers: s6, s7, $v02-$v05
#
################################################################
    .func GPUCmd_MatrixLoad
GPUCmd_MatrixLoad:
    #define src s6
    #define dst s7
    #define vrhs01_i $v02
    #define vrhs01_f $v03
    #define vrhs23_i $v04
    #define vrhs23_f $v05
    # The matrix is the last 64 bytes of the command. This assumes that
    # rspq_dmem_buf_ptr already points past the command - confirm.
    addi src, rspq_dmem_buf_ptr, %lo(RSPQ_DMEM_BUFFER) - 64
    addi dst, zero, %lo(GL_MATRIX_MVP)
    # Load the matrix from command parameters (misaligned)
    # Each lqv/lrv pair assembles one 16-byte chunk that may straddle a
    # 16-byte boundary in the command buffer.
    lqv vrhs01_i, 0x00,src
    lrv vrhs01_i, 0x10,src
    lqv vrhs23_i, 0x10,src
    lrv vrhs23_i, 0x20,src
    lqv vrhs01_f, 0x20,src
    lrv vrhs01_f, 0x30,src
    lqv vrhs23_f, 0x30,src
    lrv vrhs23_f, 0x40,src
    # Plain sqv stores: this assumes GL_MATRIX_MVP is 16-byte aligned - confirm.
    sqv vrhs01_i, 0x00,dst
    sqv vrhs23_i, 0x10,dst
    sqv vrhs01_f, 0x20,dst
    jr ra
    # Branch delay slot: last quarter of the matrix.
    sqv vrhs23_f, 0x30,dst
    # Keep the local register aliases out of the files included below.
    #undef src
    #undef dst
    #undef vrhs01_i
    #undef vrhs01_f
    #undef vrhs23_i
    #undef vrhs23_f
    .endfunc
#include "rsp_gpu_clipping.inc"
#include <rsp_rdpq.inc>