ClassiCube/misc/n64/rsp_gpu_clipping.inc
2025-05-02 07:16:29 +10:00

381 lines
10 KiB
PHP

# Number of planes GL_ClipTriangle clips against (one clip-flag bit per plane)
#define CLIPPING_PLANE_COUNT 6
# Number of vertex slots in CLIP_CACHE, and of entries in each clip list
#define CLIPPING_CACHE_SIZE 9
# Size in bytes of one CLIP_PLANES entry (four halfword coefficients)
#define CLIPPING_PLANE_SIZE 8
.section .data.gl_clipping
.align 4
# Clip plane coefficients, four halfwords per plane. GL_ClipTriangle multiplies
# them lane by lane with a position laid out as x, y, z, w and sums the lanes.
# Entry N is the plane used while plane_flag == (1 << N): the plane pointer
# advances by CLIPPING_PLANE_SIZE each time plane_flag is shifted left.
CLIP_PLANES:
.half 1, 0, 0, GUARD_BAND_FACTOR
.half 0, 1, 0, GUARD_BAND_FACTOR
.half 0, 0, 1, 1
.half 1, 0, 0, -GUARD_BAND_FACTOR
.half 0, 1, 0, -GUARD_BAND_FACTOR
.half 0, 0, 1, -1
.align 4
# Per-slot offsets, (slot index + 1) * 2, for the nine clip cache slots.
# GL_ClipTriangle loads the first eight with lqv at offset 0 and the ninth
# with lsv at offset 16.
# NOTE(review): the .align above presumably exists to satisfy the alignment
# needs of that lqv load - confirm.
CACHE_OFFSETS: .half 2,4,6,8,10,12,14,16,18
.section .bss.gl_clipping
# Storage for vertices created at plane intersections, one SCREEN_VTX_SIZE
# record per slot. CLIP_CACHE_END must directly follow the cache because the
# finalize loop of GL_ClipTriangle uses it as its upper bound.
CLIP_CACHE: .dcb.b SCREEN_VTX_SIZE * CLIPPING_CACHE_SIZE
CLIP_CACHE_END:
# Two lists of halfword vertex pointers. GL_ClipTriangle swaps their roles
# (input / output) for every plane it clips against.
CLIP_LISTS:
CLIP_LIST0: .dcb.w CLIPPING_CACHE_SIZE
CLIP_LIST1: .dcb.w CLIPPING_CACHE_SIZE
.section .text.gl_clipping
################################################################
# GL_ClipTriangle
# Clip a triangle against the view-frustum by using the Sutherland-Hodgman algorithm
# https://en.wikipedia.org/wiki/Sutherland%E2%80%93Hodgman_algorithm
#
# The polygon is kept as a list of halfword vertex pointers. For every plane
# whose bit is set in t5, the previous output list becomes the input list and
# a new output list is built edge by edge. Vertices created at plane
# intersections are stored in CLIP_CACHE; a map of the used cache slots is
# kept temporarily in RDPQ_CMD_STAGING.
#
# Args:
# a1-a3 = Vertices (pointers, stored as halfwords in the clip lists)
# t5 = OR'd clip flags of the triangle's vertices
# Returns:
# s1 = Pointer to list of output vertices
# s2 = Pointer to end of list (equal to s1 when no vertex survived)
# Registers written by the code below: v1, t0, t1, t3, t4, t6-t8, s0-s7,
# k0, k1, ra, ra2, $v01-$v17 and $v29. The helpers GL_CalcClipCodes and
# GL_CalcScreenSpace are defined outside this file and may write more.
################################################################
.func GL_ClipTriangle
GL_ClipTriangle:
# Scalar register roles. out_count and in_count are byte counts
# (two bytes per list entry), not entry counts.
#define out_count v1
#define clip_flags t5
#define plane_flag t6
#define in_count t7
#define in_end t8
#define in_list s0
#define out_list s1
#define plane s2
#define intersection s3
#define cur_ptr s4
#define prev_ptr s5
#define cur_vtx s6
#define prev_vtx s7
#define p0 k0
#define p1 k1
#define vtx1 a1
#define vtx2 a2
#define vtx3 a3
# Vector register roles. The _i / _f pairs are loaded from the
# SCREEN_VTX_CS_POSi / SCREEN_VTX_CS_POSf fields or derived from them.
#define vplane $v01
#define vint_f $v02
#define vint_i $v03
#define vdot_i $v04
#define vdot_f $v05
#define vdiff_i $v06
#define vdiff_f $v07
#define va_i $v08
#define va_f $v09
#define vpos_i $v10
#define vpos_f $v11
#define vattr0 $v12
#define vattr1 $v13
#define voff0 $v14
#define voff1 $v15
#define vcache0 $v16
#define vcache1 $v17
#define v__ $v29
# Keep the return address in ra2: ra is overwritten by the jal and bltzal
# calls further down.
move ra2, ra
# Init in_list as empty
li in_list, %lo(CLIP_LIST0)
move in_count, zero
# Put three original vertices in the out_list
# (So after the initial swap they will be in the in_list)
li out_list, %lo(CLIP_LIST1)
sh vtx1, 0(out_list)
sh vtx2, 2(out_list)
sh vtx3, 4(out_list)
# Three entries of two bytes each
li out_count, 3*2
li plane, %lo(CLIP_PLANES)
li plane_flag, 1
# Load cache offsets
# The first eight offsets go into voff0; voff1 is cleared first and then
# receives the ninth offset, which is later read as vcache1.e0.
li t0, %lo(CACHE_OFFSETS)
vxor voff1, voff1
lqv voff0, 0,t0
lsv voff1, 16,t0
# Temporarily use the RDP staging area as a map of which cache slots are used
# Init to zero
li t0, %lo(RDPQ_CMD_STAGING)
sqv vzero, 0,t0
sqv vzero, 16,t0
# Iterate over the 6 clipping planes
gl_clip_plane_loop:
# Skip this plane when none of the original vertices has its flag set
and t0, clip_flags, plane_flag
beqz t0, gl_clip_plane_loop_end
# (delay slot) Remember the old in_list for the swap below
move t1, in_list
# Swap in and out lists
# If the out list is empty from the last iteration,
# the triangle has no visible points and we are done
beqz out_count, gl_clip_return
# (delay slot) Runs on both paths; on the early return only out_list and
# out_count are used afterwards, and neither has been changed yet.
move in_list, out_list
move out_list, t1
move in_count, out_count
move out_count, zero
# Iterate over the edges of the polygon in the input list
# The current edge is between cur_vtx and prev_vtx
move cur_ptr, in_list
add in_end, in_list, in_count
# Init the "previous" vertex to the last in the list for the wrap-around
addi prev_ptr, in_end, -2
gl_clip_edge_loop:
#define cur_flag t3
#define prev_flag t4
# Check which side of the plane the two vertices are on
lhu cur_vtx, 0(cur_ptr)
lhu prev_vtx, 0(prev_ptr)
lbu cur_flag, SCREEN_VTX_CLIP_CODE(cur_vtx)
lbu prev_flag, SCREEN_VTX_CLIP_CODE(prev_vtx)
# Non-zero flag = vertex is outside the current plane
and cur_flag, plane_flag
and prev_flag, plane_flag
# If they are on opposite sides, there is an intersection
xor t0, cur_flag, prev_flag
beqz t0, gl_clip_no_intersection
# (delay slot) Tentatively p0 = current vertex
move p0, cur_vtx
# Swap the two points if necessary to make intersection calculation consistent
# This will make sure p0 is always inside and p1 is always outside
bnez prev_flag, gl_clip_no_swap
# (delay slot) Tentatively p1 = previous vertex
move p1, prev_vtx
# Previous vertex is inside, so exchange p0 and p1 (XOR swap)
xor p0, p0, p1
xor p1, p0, p1
xor p0, p0, p1
#undef prev_flag
gl_clip_no_swap:
# Calculate intersection of the line segment and the plane
# Load the used-slot map; it is combined with the cache offsets below
li t0, %lo(RDPQ_CMD_STAGING)
lqv vcache0, 0,t0
lqv vcache1, 16,t0
# Repeat plane coefficients twice
ldv vplane.e0, 0,plane
ldv vplane.e4, 0,plane
# vpos: x0 y0 z0 w0 x1 y1 z1 w1
ldv vpos_i.e0, SCREEN_VTX_CS_POSi,p0
ldv vpos_f.e0, SCREEN_VTX_CS_POSf,p0
ldv vpos_i.e4, SCREEN_VTX_CS_POSi,p1
ldv vpos_f.e4, SCREEN_VTX_CS_POSf,p1
# vint: x1 y1 z1 w1
ldv vint_i.e0, SCREEN_VTX_CS_POSi,p1
ldv vint_f.e0, SCREEN_VTX_CS_POSf,p1
# vattr0: r0 g0 b0 a0 s0 t0
luv vattr0.e0, SCREEN_VTX_RGBA ,p0
llv vattr0.e4, SCREEN_VTX_S_T ,p0
# vattr1: r1 g1 b1 a1 s1 t1
luv vattr1.e0, SCREEN_VTX_RGBA ,p1
llv vattr1.e4, SCREEN_VTX_S_T ,p1
# Find first free slot in clip cache
# Add the values from the "used slots map" to the cache offsets
# After this, each lane will contain the offset of its corresponding cache slot,
# but only if the slot is not used. If it is used, it will contain some large value.
vaddc vcache0, voff0
vaddc vcache1, voff1
# Look for the smallest value, which will end up in vcache.e0
# Because used slots are marked as large values, they will never be found.
vlt vcache0, vcache0.q1
vlt vcache0, vcache0.h2
vlt vcache0, vcache0.e4
vlt vcache0, vcache1.e0
mfc2 t0, vcache0.e0
# Mark slot as used by storing some large value (careful of overflows!)
# The offsets start at 2, hence the -2 to reach the map entry of the slot
li t1, 0xFF
sh t1, %lo(RDPQ_CMD_STAGING)-2(t0)
# t0 is the cache offset from CACHE_OFFSETS, i.e. (slot index + 1) * 2
# intersection = t0 * 20 = t0 * 16 + t0 * 4
sll intersection, t0, 4
sll t1, t0, 2
add intersection, t1
# CAUTION: intersection might point to the same address as either p0 or p1,
# because one of them is the previous point, which could have been marked unused
# in the previous iteration. As long as we don't access p0 or p1 after writing to
# intersection, this is fine.
# The subtraction compensates for the offsets starting at 2 instead of 0.
# NOTE(review): t0 * 20 only lands on slot boundaries when SCREEN_VTX_SIZE
# is 40 bytes - confirm against the vertex layout.
addi intersection, %lo(CLIP_CACHE) - SCREEN_VTX_SIZE
# Store the cache offset in unused memory (used later when finding the cache slot to mark as unused)
# A non-zero padding byte also marks the vertex as living in the clip cache
sb t0, SCREEN_VTX_PADDING(intersection)
# Compute dot products of both positions with the clip plane
# vdot.e0: d0 = dot(p0, plane)
# vdot.e4: d1 = dot(p1, plane)
vmudn vdot_f, vpos_f, vplane
vmadh vdot_i, vpos_i, vplane
# Sum the four lanes of each half into lanes e0 and e4
vaddc vdot_f, vdot_f.q1
vadd vdot_i, vdot_i.q1
vaddc vdot_f, vdot_f.h2
vadd vdot_i, vdot_i.h2
# d0 - d1
vsubc vdiff_f, vdot_f, vdot_f.e4
vsub vdiff_i, vdot_i, vdot_i.e4
# 1 / (d0 - d1)
vrcph v__.e0, vdiff_i.e0
vrcpl va_f.e0, vdiff_f.e0
vrcph va_i.e0, vzero.e0
# a = d0 / (d0 - d1)
vmudl v__, va_f, vdot_f.e0
vmadm v__, va_i, vdot_f.e0
vmadn va_f, va_f, vdot_i.e0
# Prepare 0x7FFF in va_i.e0
vsubc va_i, vshift8, K1
# a = min(a, 1)
vge v__, va_f, vzero
vmrg va_f, va_f, va_i.e0
# Account for right shift introduced by vrcp
vmudn va_f, va_f, K2
# p1 - p0
vsubc vint_f, vpos_f
vsub vint_i, vpos_i
# attr1 - attr0
vsubc vattr1, vattr0
# Result of linear interpolation:
# p0 + a * (p1 - p0)
vmudl v__, vint_f, va_f.e0
vmadm v__, vint_i, va_f.e0
vmadn vint_f, vpos_f, K1
vmadh vint_i, vpos_i, K1
# a * (attr1 - attr0)
vmudm vattr1, vattr1, va_f.e0
# attr0 + a * (attr1 - attr0)
vaddc vattr0, vattr1
# Store results
# From here on p0 and p1 must no longer be read (see CAUTION above)
sdv vint_i.e0, SCREEN_VTX_CS_POSi,intersection
sdv vint_f.e0, SCREEN_VTX_CS_POSf,intersection
suv vattr0.e0, SCREEN_VTX_RGBA ,intersection
# NOTE(review): GL_CalcClipCodes is defined elsewhere; presumably it
# computes the clip code of the new vertex - confirm its inputs and clobbers.
jal GL_CalcClipCodes
# (delay slot) Last attribute store, executed before the callee runs
slv vattr0.e4, SCREEN_VTX_S_T ,intersection
# Add intersection to the output list
add t0, out_list, out_count
sh intersection, 0(t0)
addi out_count, 2
gl_clip_no_intersection:
# If cur_vtx is inside, add it to the output list
bnez cur_flag, gl_clip_no_current
# (delay slot) Address of the next free output entry; unused when branching
add t0, out_list, out_count
sh cur_vtx, 0(t0)
b gl_clip_edge_loop_end
# (delay slot) Account for the entry just written
addi out_count, 2
#undef cur_flag
gl_clip_no_current:
# cur_vtx is outside the plane and gets dropped from the polygon
# Check if the vertex is stored in the clip cache
lbu t0, SCREEN_VTX_PADDING(cur_vtx)
beqz t0, gl_clip_edge_loop_end
# (delay slot, runs on both paths; a no-op when the field is already zero)
# Reset the padding field to zero, so the screen space values won't be recalculated below
sb zero, SCREEN_VTX_PADDING(cur_vtx)
# If so, mark it as unused
sh zero, %lo(RDPQ_CMD_STAGING)-2(t0)
gl_clip_edge_loop_end:
# Advance to the next edge
addi cur_ptr, 2
blt cur_ptr, in_end, gl_clip_edge_loop
# (delay slot) The entry just processed becomes the previous vertex
addi prev_ptr, cur_ptr, -2
gl_clip_plane_loop_end:
# Advance to the next clipping plane
sll plane_flag, 1
blt plane_flag, (1<<CLIPPING_PLANE_COUNT), gl_clip_plane_loop
# (delay slot) Step to the coefficients of the next plane
addi plane, CLIPPING_PLANE_SIZE
# s3 and s5 are reused here; intersection and prev_ptr are no longer needed
#define cache_vtx s3
#define cache_end s5
# Calculate screen space values for new vertices (in the clip cache)
# TODO: maybe iterate over out_list instead
li cache_vtx, %lo(CLIP_CACHE)
# Address of the last slot, so the loop below visits every slot once
li cache_end, %lo(CLIP_CACHE_END) - SCREEN_VTX_SIZE
gl_clip_finalize_loop:
# t0 becomes negative exactly when the padding byte is non-zero
lbu t0, SCREEN_VTX_PADDING(cache_vtx)
neg t0
# Only calculate screen space values if the vertex is actually used
# NOTE(review): GL_CalcScreenSpace is defined elsewhere; presumably it takes
# the position loaded into vint_i / vint_f here - confirm.
ldv vint_i, SCREEN_VTX_CS_POSi,cache_vtx
bltzal t0, GL_CalcScreenSpace
# (delay slot) Second half of the position load
ldv vint_f, SCREEN_VTX_CS_POSf,cache_vtx
blt cache_vtx, cache_end, gl_clip_finalize_loop
# (delay slot) Next cache slot
addi cache_vtx, SCREEN_VTX_SIZE
gl_clip_return:
# Done!
# Also reached directly from the plane loop when the polygon became empty;
# the finalize loop is skipped in that case.
jr ra2
# (delay slot) Return value: end of the output list
add s2, out_list, out_count
# NOTE(review): va_i, voff0, voff1, vcache0 and vcache1 are defined above but
# not undefined below, so they stay visible to whatever follows this file.
#undef cache_vtx
#undef cache_end
#undef clip_flags
#undef plane_flag
#undef in_count
#undef out_count
#undef in_end
#undef intersection
#undef in_list
#undef out_list
#undef plane
#undef cur_ptr
#undef prev_ptr
#undef cur_vtx
#undef prev_vtx
#undef p0
#undef p1
#undef vtx1
#undef vtx2
#undef vtx3
#undef vplane
#undef vpos_i
#undef vpos_f
#undef vdot_i
#undef vdot_f
#undef vdiff_i
#undef vdiff_f
#undef va_f
#undef vint_i
#undef vint_f
#undef vattr0
#undef vattr1
#undef v__
.endfunc