ClassiCube/misc/ps2/DrawColoured.S
2025-05-08 20:08:30 +10:00

245 lines
5.6 KiB
ArmAsm

# NOTE: vclipw.xyz takes 4 cycles to produce result, which must be accounted for
# Declares an exported symbol of type function and opens its label,
# so that the body of the function can follow the macro invocation directly
.macro FUNC name
.type \name, %function
.globl \name
\name:
.endm
# MIPS ISA has explicit branch delay slots
# (i.e. the instruction immediately after a branch/jump is always executed)
# Tell the assembler to emit instructions exactly in the order written
.set noreorder
# Note that registers are numbered for N32 ABI, but when disassembling
# in ghidra or PCSX2, they are disassembled as O32 ABI ?
# https://github.com/ps2dev/binutils-gdb/blob/e9cf3691bfa140469d52815a2307b00eecf7917c/gas/config/tc-mips.c#L2786
# global registers
#define V0001 $vf0 // hardware coded to (0,0,0,1)
#define MVP1 $vf1 // mvp.row1
#define MVP2 $vf2 // mvp.row2
#define MVP3 $vf3 // mvp.row3
#define MVP4 $vf4 // mvp.row4
#define CL_F $vf5 // clipping scale adjustments to match guardbands
#define VP_O $vf6 // viewport origin
#define VP_S $vf7 // viewport scale
# transform temp registers
#define POSCL $vf10 // TRANSFORMED(POS_[1234]) * CLIP_PLANES_ADJUST
#define POS_1 $vf11 // vertex 1 position
#define POS_2 $vf12 // vertex 2 position
#define POS_3 $vf13 // vertex 3 position
#define POS_4 $vf14 // vertex 4 position
#define _one $vf0w // w component of vf0, i.e. the constant 1.0
#define POS1w $vf11w // w component of vertex 1 position
#define POS2w $vf12w // w component of vertex 2 position
#define POS3w $vf13w // w component of vertex 3 position
#define POS4w $vf14w // w component of vertex 4 position
#define SRC $a0 // address of src vertices (function argument)
#define DST $a1 // address of dst vertices (function argument)
#define TMP $a2 // address of tmp vertex memory (function argument)
#define COL1 $f12 // vertex 1 colour (word at offset 0x0C of the src vertex)
#define COL2 $f13 // vertex 2 colour
#define COL3 $f14 // vertex 3 colour
#define COL4 $f15 // vertex 4 colour
#define Z_1 $f0 // vertex 1 z after the viewport transform
#define Z_2 $f1 // vertex 2 z after the viewport transform
#define Z_3 $f3 // vertex 3 z after the viewport transform
#define Z_4 $f4 // vertex 4 z after the viewport transform
#define W_1 $f16 // vertex 1 w component (holds 1 / w at the point it is loaded)
#define W_2 $f17 // vertex 2 w component
#define W_3 $f18 // vertex 3 w component
#define W_4 $f19 // vertex 4 w component
#define XY_1 $t1 // vertex 1 x halfword, later combined into x | (y << 16)
#define XY_2 $t2 // vertex 2 x halfword, later combined into x | (y << 16)
#define XY_3 $t3 // vertex 3 x halfword, later combined into x | (y << 16)
#define XY_4 $a3 // vertex 4 x halfword, later combined into x | (y << 16)
#define Y_1 $a4 // vertex 1 y halfword, shifted left by 16 before merging into XY_1
#define Y_2 $a5 // vertex 2 y halfword, shifted left by 16 before merging into XY_2
#define Y_3 $a6 // vertex 3 y halfword, shifted left by 16 before merging into XY_3
#define Y_4 $a7 // vertex 4 y halfword, shifted left by 16 before merging into XY_4
# Multiplies a position by the matrix held in MVP1..MVP4, in place:
#   vpos = MVP4 + MVP1 * vpos.x + MVP2 * vpos.y + MVP3 * vpos.z
# Only the x,y,z components of vpos are read (the w input is taken as 1.0 from vf0.w)
# Overwrites the VU0 accumulator and all four components of vpos
.macro TransformVertex vpos
vmulaw $ACC, MVP4, V0001 # ACC[xyzw] = MVP4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, MVP1, \vpos # ACC[xyzw] = ACC[xyzw] + MVP1[xyzw] * VEC.x
vmadday $ACC, MVP2, \vpos # ACC[xyzw] = ACC[xyzw] + MVP2[xyzw] * VEC.y
vmaddz \vpos, MVP3, \vpos # VEC[xyzw] = ACC[xyzw] + MVP3[xyzw] * VEC.z
.endm
# Starts the clip flags calculation for an already transformed position
# Overwrites the shared POSCL register and appends a new entry to the VU0 clip flags
# The clip result is not available immediately (see the vclipw note at the top of the file)
.macro BeginClip vpos
vmul POSCL, \vpos, CL_F # POSCL = TRANSFORMED(VEC) * CLIP_PLANES_ADJUST
# begin clip flags calculation
vclipw.xyz POSCL, POSCL # CLIP_FLAGS.append(CLIP(POSCL.xyz, POSCL.w))
.endm
# Converts a transformed position to integer viewport coordinates, in place
# Expects the w component of vpos to already hold the inverse W (1 / w)
# Only x,y,z are written; the w component is left as it was
.macro VPTransform vpos
vmulw.xyz \vpos, \vpos, \vpos # VEC.xyz = VEC.xyz * VEC.w (inverse W)
vmul.xyz \vpos, \vpos, VP_S # VEC.xyz = VEC.xyz * viewport_scale
vadd.xyz \vpos, \vpos, VP_O # VEC.xyz = VEC.xyz + viewport_origin
vftoi0.xyz \vpos, \vpos # VEC.xyz = int(VEC.xyz)
.endm
# Fully transforms 4 vertices with size of 16 bytes, and when no vertex needs
# clipping, writes them out as 6 vertices in the order 1,2,3 3,4,1
#
# Inputs:
#   $a0 = address of src vertices (x,y,z at 0x00-0x0B and colour at 0x0C of each vertex)
#   $a1 = address of dst vertices (6 * 16 bytes are written)
#   $a2 = address of tmp vertex memory (4 * 16 bytes are written, using quadword accesses)
# Output:
#   $v0 = address of final vertices (return value)
#         This is $a1 + 96 when the quad was written, or the unchanged $a1
#         when any clip flag was set (in which case nothing is written to dst)
#
# Layout of each written vertex: 0x00 colour, 0x04 w (1 / w), 0x08 x | (y << 16), 0x0C z
# The global VU0 registers (MVP1..MVP4, CL_F, VP_O, VP_S) are only read here
# Uses $t0-$t3, $a3-$a7, $f0, $f1, $f3, $f4, $f12-$f19, $vf10-$vf14, ACC and Q as scratch
FUNC DrawColouredQuad
    ### VERTEX 1 ###
    # LOAD VERTEX 1 (copied through tmp memory so that it can be loaded as one quadword)
    ld      $t0, 0x00(SRC)          # t0 = src[0].x,y
    sd      $t0, 0x00(TMP)          # tmp.x,y = t0
    lw      $t0, 0x08(SRC)          # t0 = src[0].z
    sw      $t0, 0x08(TMP)          # tmp.z = t0
    lqc2    POS_1, 0x00(TMP)        # V1 = tmp (tmp.w is not set, TransformVertex does not read it)
    TransformVertex POS_1
    lwc1    COL1, 0x0C(SRC)         # colour 1 = src[0].colour
    vdiv    $Q, _one, POS1w         # Q = 1.0 / V1.w (collected by the vmulq.w further below)
    BeginClip POS_1

    ### VERTEX 2 ###
    # LOAD VERTEX 2
    ld      $t0, 0x10(SRC)          # t0 = src[1].x,y
    sd      $t0, 0x00(TMP)          # tmp.x,y = t0
    lw      $t0, 0x18(SRC)          # t0 = src[1].z
    sw      $t0, 0x08(TMP)          # tmp.z = t0
    lqc2    POS_2, 0x00(TMP)        # V2 = tmp
    TransformVertex POS_2
    lwc1    COL2, 0x1C(SRC)         # colour 2 = src[1].colour
    # NOTE(review): Q is read without a vwaitq here, presumably the instructions
    #  issued since the vdiv are enough to cover its latency - confirm
    vmulq.w POS_1, V0001, $Q        # V1.w = 1.0 * Q (inverse W of vertex 1)
    vdiv    $Q, _one, POS2w         # Q = 1.0 / V2.w
    BeginClip POS_2

    ### VERTEX 3 ###
    # LOAD VERTEX 3
    ld      $t0, 0x20(SRC)          # t0 = src[2].x,y
    sd      $t0, 0x00(TMP)          # tmp.x,y = t0
    lw      $t0, 0x28(SRC)          # t0 = src[2].z
    sw      $t0, 0x08(TMP)          # tmp.z = t0
    lqc2    POS_3, 0x00(TMP)        # V3 = tmp
    TransformVertex POS_3
    lwc1    COL3, 0x2C(SRC)         # colour 3 = src[2].colour
    vmulq.w POS_2, V0001, $Q        # V2.w = 1.0 * Q (inverse W of vertex 2)
    vdiv    $Q, _one, POS3w         # Q = 1.0 / V3.w
    BeginClip POS_3

    ### VERTEX 4 ###
    # LOAD VERTEX 4
    ld      $t0, 0x30(SRC)          # t0 = src[3].x,y
    sd      $t0, 0x00(TMP)          # tmp.x,y = t0
    lw      $t0, 0x38(SRC)          # t0 = src[3].z
    sw      $t0, 0x08(TMP)          # tmp.z = t0
    lqc2    POS_4, 0x00(TMP)        # V4 = tmp
    TransformVertex POS_4
    lwc1    COL4, 0x3C(SRC)         # colour 4 = src[3].colour
    vmulq.w POS_3, V0001, $Q        # V3.w = 1.0 * Q (inverse W of vertex 3)
    vdiv    $Q, _one, POS4w         # Q = 1.0 / V4.w
    BeginClip POS_4

    # The last vclipw needs time to produce its result before the flags are read
    vnop                            # adjust for delay
    vnop                            # adjust for delay
    vnop                            # adjust for delay
    # STORE CLIP FLAGS 4 RESULT
    vwaitq                          # wait until the last vdiv has produced Q
    vmulq.w POS_4, V0001, $Q        # V4.w = 1.0 * Q (inverse W of vertex 4)

    # check if any vertices would need clipping
    cfc2    $t0, $18                # t0 = VU0_REGS[CLIP_FLAGS]
    bnez    $t0, any_clipped_vertices
    nop                             # branch delay slot

    # output vertices
    VPTransform POS_1
    VPTransform POS_2
    VPTransform POS_3
    VPTransform POS_4

    # Convert to register format
    # (the vectors are spilled to tmp memory, then reloaded component by component)
    sqc2    POS_1, 0x00(TMP)
    sqc2    POS_2, 0x10(TMP)
    sqc2    POS_3, 0x20(TMP)
    sqc2    POS_4, 0x30(TMP)

    lhu     XY_1, 0x00(TMP)         # low 16 bits of integer x
    lhu     Y_1,  0x04(TMP)         # low 16 bits of integer y
    lwc1    Z_1,  0x08(TMP)         # integer z
    lwc1    W_1,  0x0C(TMP)         # inverse W
    lhu     XY_2, 0x10(TMP)
    lhu     Y_2,  0x14(TMP)
    lwc1    Z_2,  0x18(TMP)
    lwc1    W_2,  0x1C(TMP)
    lhu     XY_3, 0x20(TMP)
    lhu     Y_3,  0x24(TMP)
    lwc1    Z_3,  0x28(TMP)
    lwc1    W_3,  0x2C(TMP)
    lhu     XY_4, 0x30(TMP)
    lhu     Y_4,  0x34(TMP)
    lwc1    Z_4,  0x38(TMP)
    lwc1    W_4,  0x3C(TMP)

    # pack as x | (y << 16)
    sll     Y_1, Y_1, 16
    sll     Y_2, Y_2, 16
    sll     Y_3, Y_3, 16
    sll     Y_4, Y_4, 16
    or      XY_1, XY_1, Y_1
    or      XY_2, XY_2, Y_2
    or      XY_3, XY_3, Y_3
    or      XY_4, XY_4, Y_4

    # write 1,2,3 3,4,1
    swc1    COL1, 0x00(DST)         # vertex 1
    swc1    W_1,  0x04(DST)
    sw      XY_1, 0x08(DST)
    swc1    Z_1,  0x0C(DST)
    swc1    COL2, 0x10(DST)         # vertex 2
    swc1    W_2,  0x14(DST)
    sw      XY_2, 0x18(DST)
    swc1    Z_2,  0x1C(DST)
    swc1    COL3, 0x20(DST)         # vertex 3
    swc1    W_3,  0x24(DST)
    sw      XY_3, 0x28(DST)
    swc1    Z_3,  0x2C(DST)
    swc1    COL3, 0x30(DST)         # vertex 3 again
    swc1    W_3,  0x34(DST)
    sw      XY_3, 0x38(DST)
    swc1    Z_3,  0x3C(DST)
    swc1    COL4, 0x40(DST)         # vertex 4
    swc1    W_4,  0x44(DST)
    sw      XY_4, 0x48(DST)
    swc1    Z_4,  0x4C(DST)
    swc1    COL1, 0x50(DST)         # vertex 1 again
    swc1    W_1,  0x54(DST)
    sw      XY_1, 0x58(DST)
    swc1    Z_1,  0x5C(DST)

    # Advance past the 6 written vertices
    # addiu rather than addi, as pointer arithmetic must never raise an overflow trap
    addiu   DST, DST, 16*6

    # TODO clipping
    # For now a quad with any clip flag set is skipped, and DST is returned unchanged
any_clipped_vertices:
    jr      $ra
    move    $v0, DST                # branch delay slot: return value = DST