mirror of
https://github.com/ClassiCube/ClassiCube.git
synced 2025-08-03 18:57:27 -04:00
245 lines
5.6 KiB
ArmAsm
245 lines
5.6 KiB
ArmAsm
# NOTE: vclipw.xyz takes 4 cycles to produce its result, which must be accounted for

# Opens a function: exports the symbol, marks it as a function for the linker,
# and emits its label at the current location.
#   name = symbol to declare
.macro FUNC name
    .global \name
    .type   \name, %function
\name:
.endm

# The MIPS ISA has explicit delay slots
# (i.e. the instruction after a branch/jump is always unconditionally executed)
# noreorder keeps the assembler from rearranging instructions or filling the
# delay slots itself, so every delay slot below is written out by hand
.set noreorder

# Note that registers are named per the N32 ABI here, but disassemblers such as
# ghidra or PCSX2 show them with their O32 ABI names
# (e.g. $a4-$a7 here appear as $t0-$t3 there, and $t0-$t3 here appear as $t4-$t7)
# https://github.com/ps2dev/binutils-gdb/blob/e9cf3691bfa140469d52815a2307b00eecf7917c/gas/config/tc-mips.c#L2786

# global registers
# (VU0 vector registers; DrawColouredQuad only reads these, so they have to be
#  loaded before it is called)
#define V0001 $vf0  // hardware coded to (0,0,0,1)
#define MVP1  $vf1  // mvp.row1
#define MVP2  $vf2  // mvp.row2
#define MVP3  $vf3  // mvp.row3
#define MVP4  $vf4  // mvp.row4
#define CL_F  $vf5  // clipping scale adjustments to match guardbands
#define VP_O  $vf6  // viewport origin
#define VP_S  $vf7  // viewport scale

# transform temp registers (VU0 vector registers, overwritten per quad)
#define POSCL $vf10 // TRANSFORMED(POS_[1234]) * CLIP_PLANES_ADJUST
#define POS_1 $vf11 // vertex 1 position
#define POS_2 $vf12 // vertex 2 position
#define POS_3 $vf13 // vertex 3 position
#define POS_4 $vf14 // vertex 4 position

# w field aliases, used as scalar operands of vdiv
#define _one  $vf0w  // constant 1.0 (w field of V0001)
#define POS1w $vf11w // POS_1.w
#define POS2w $vf12w // POS_2.w
#define POS3w $vf13w // POS_3.w
#define POS4w $vf14w // POS_4.w

# integer argument registers
#define SRC $a0 // address of source vertices
#define DST $a1 // address of destination vertices
#define TMP $a2 // address of scratch memory

# FPU registers below are used purely as 32-bit carriers between memory
# locations (lwc1/swc1 only, no floating point arithmetic is done on them)

# colour word of each vertex
#define COL1 $f12
#define COL2 $f13
#define COL3 $f14
#define COL4 $f15

# converted z of each vertex ($f2 is skipped)
#define Z_1 $f0
#define Z_2 $f1
#define Z_3 $f3
#define Z_4 $f4

# w field (1/w) of each vertex
#define W_1 $f16
#define W_2 $f17
#define W_3 $f18
#define W_4 $f19

# low 16 bits of converted x, later combined with y into one packed word
#define XY_1 $t1
#define XY_2 $t2
#define XY_3 $t3
#define XY_4 $a3

# low 16 bits of converted y ($a4-$a7 are N32 ABI register names)
#define Y_1 $a4
#define Y_2 $a5
#define Y_3 $a6
#define Y_4 $a7


# Multiplies a position by the MVP matrix, in place:
#   vpos = MVP1 * vpos.x + MVP2 * vpos.y + MVP3 * vpos.z + MVP4 * 1.0
# vpos = VU0 vector register; only its x,y,z fields are read (the incoming w
#        field is ignored), and all four fields are overwritten with the result
# Clobbers: ACC
.macro TransformVertex vpos
    vmulaw  $ACC, MVP4, V0001   # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
    vmaddax $ACC, MVP1, \vpos   # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * VEC.x
    vmadday $ACC, MVP2, \vpos   # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * VEC.y
    vmaddz  \vpos, MVP3, \vpos  # VEC[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * VEC.z
.endm

# Starts the clip test for one transformed vertex.
# vpos = VU0 vector register holding the transformed position (left unmodified)
# Overwrites POSCL and appends a new result to the VU0 clip flags; per the note
# at the top of this file that result takes 4 cycles to become available, so
# the flags must not be read straight after this macro.
.macro BeginClip vpos
    vmul       POSCL, \vpos, CL_F   # TMP = TRANSFORMED(VEC) * CLIP_PLANES_ADJUST
    # begin clip flags calculation
    vclipw.xyz POSCL, POSCL         # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
.endm

# Perspective divide, viewport transform and float to integer conversion of the
# x,y,z fields, in place.
# vpos = VU0 vector register whose w field must already hold the inverse W
#        (1/w); the w field itself is left untouched
.macro VPTransform vpos
    vmulw.xyz  \vpos, \vpos, \vpos  # TMP.xyz = IN.xyz * IN.w (inverse W)
    vmul.xyz   \vpos, \vpos, VP_S   # TMP.xyz = TMP * viewport_scale
    vadd.xyz   \vpos, \vpos, VP_O   # TMP.xyz = TMP + viewport_origin
    vftoi0.xyz \vpos, \vpos         # TMP.xyz = int(TMP)
.endm

# Fully transforms the 4 vertices of a coloured quad (16 bytes per vertex) and,
# when no vertex needs clipping, writes them out as 6 vertices (order 1,2,3 3,4,1)
#
# Source vertex layout (16 bytes):
#   +0x00 x, +0x04 y, +0x08 z, +0x0C colour word
# Output vertex layout (16 bytes):
#   +0x00 colour word, +0x04 1/w, +0x08 packed (y << 16) | x, +0x0C z
#   (x,y,z are the integer results of the viewport transform; only the low
#    16 bits of x and y are kept)
#
# $a0 = address of src vertices (read with ld, so 8 byte alignment is needed)
# $a1 = address of dst vertices (6 * 16 = 96 bytes are written)
# $a2 = address of tmp scratch memory; 4 quadwords (0x40 bytes) are stored
#       there with sqc2, so it should be 16 byte aligned
# $v0 = address of final vertices (return value):
#         dst + 96 when the quad was written,
#         dst unchanged when any clip flag was set (nothing is written to dst,
#         clipping is still a TODO)
#
# Reads:    $vf0-$vf7 (see global registers)
# Clobbers: $t0-$t3, $a1, $a3-$a7, $f0, $f1, $f3, $f4, $f12-$f19,
#           $vf10-$vf14, ACC, Q, VU0 clip flags, memory at tmp
FUNC DrawColouredQuad

### VERTEX 1 ###
    # LOAD VERTEX 1
    # x,y,z are copied into tmp so the position can be fetched with one lqc2
    # (presumably because src is not guaranteed to be 16 byte aligned - TODO confirm)
    # tmp+0x0C is not written, so the loaded w field is stale; TransformVertex ignores it
    ld      $t0, 0x00(SRC)      # t0 = src[0].x,y
    sd      $t0, 0x00(TMP)      # tmp.x,y = t0
    lw      $t0, 0x08(SRC)      # t0 = src[0].z
    sw      $t0, 0x08(TMP)      # tmp.z = t0
    lqc2    POS_1, 0x00(TMP)    # V1 = tmp

    TransformVertex POS_1
    lwc1    COL1, 0x0C(SRC)     # COL1 = src[0].colour
    vdiv    $Q, _one, POS1w     # Q = 1.0 / V1.w (consumed during vertex 2)
    BeginClip POS_1

### VERTEX 2 ###
    # LOAD VERTEX 2
    ld      $t0, 0x10(SRC)      # t0 = src[1].x,y
    sd      $t0, 0x00(TMP)      # tmp.x,y = t0
    lw      $t0, 0x18(SRC)      # t0 = src[1].z
    sw      $t0, 0x08(TMP)      # tmp.z = t0
    lqc2    POS_2, 0x00(TMP)    # V2 = tmp

    TransformVertex POS_2
    lwc1    COL2, 0x1C(SRC)     # COL2 = src[1].colour
    # NOTE(review): no vwaitq before Q is used here; presumably the instructions
    # issued since the vdiv are relied on to cover its latency - confirm
    vmulq.w POS_1, V0001, $Q    # V1.w = 1.0 * Q = 1 / V1.w
    vdiv    $Q, _one, POS2w     # Q = 1.0 / V2.w (consumed during vertex 3)
    BeginClip POS_2

### VERTEX 3 ###
    # LOAD VERTEX 3
    ld      $t0, 0x20(SRC)      # t0 = src[2].x,y
    sd      $t0, 0x00(TMP)      # tmp.x,y = t0
    lw      $t0, 0x28(SRC)      # t0 = src[2].z
    sw      $t0, 0x08(TMP)      # tmp.z = t0
    lqc2    POS_3, 0x00(TMP)    # V3 = tmp

    TransformVertex POS_3
    lwc1    COL3, 0x2C(SRC)     # COL3 = src[2].colour
    vmulq.w POS_2, V0001, $Q    # V2.w = 1.0 * Q = 1 / V2.w
    vdiv    $Q, _one, POS3w     # Q = 1.0 / V3.w (consumed during vertex 4)
    BeginClip POS_3

### VERTEX 4 ###
    # LOAD VERTEX 4
    ld      $t0, 0x30(SRC)      # t0 = src[3].x,y
    sd      $t0, 0x00(TMP)      # tmp.x,y = t0
    lw      $t0, 0x38(SRC)      # t0 = src[3].z
    sw      $t0, 0x08(TMP)      # tmp.z = t0
    lqc2    POS_4, 0x00(TMP)    # V4 = tmp

    TransformVertex POS_4
    lwc1    COL4, 0x3C(SRC)     # COL4 = src[3].colour
    vmulq.w POS_3, V0001, $Q    # V3.w = 1.0 * Q = 1 / V3.w
    vdiv    $Q, _one, POS4w     # Q = 1.0 / V4.w
    BeginClip POS_4

    vnop                        # adjust for delay
    vnop                        # adjust for delay
    vnop                        # adjust for delay

    # STORE CLIP FLAGS 4 RESULT
    vwaitq                      # wait for the last vdiv to finish
    vmulq.w POS_4, V0001, $Q    # V4.w = 1.0 * Q = 1 / V4.w

    # check if any vertices would need clipping
    cfc2    $t0, $18            # t0 = VP0_REGS[CLIP_FLAGS]
    bnez    $t0, any_clipped_vertices
    nop                         # (branch delay slot)

    # output vertices
    VPTransform POS_1
    VPTransform POS_2
    VPTransform POS_3
    VPTransform POS_4

    # Convert to register format
    # (spill the 4 converted vertices to tmp, then reload the individual fields)
    sqc2    POS_1, 0x00(TMP)
    sqc2    POS_2, 0x10(TMP)
    sqc2    POS_3, 0x20(TMP)
    sqc2    POS_4, 0x30(TMP)

    lhu     XY_1, 0x00(TMP)     # low 16 bits of V1.x
    lhu     Y_1,  0x04(TMP)     # low 16 bits of V1.y
    lwc1    Z_1,  0x08(TMP)     # V1.z
    lwc1    W_1,  0x0C(TMP)     # V1.w (1/w)

    lhu     XY_2, 0x10(TMP)     # low 16 bits of V2.x
    lhu     Y_2,  0x14(TMP)     # low 16 bits of V2.y
    lwc1    Z_2,  0x18(TMP)     # V2.z
    lwc1    W_2,  0x1C(TMP)     # V2.w (1/w)

    lhu     XY_3, 0x20(TMP)     # low 16 bits of V3.x
    lhu     Y_3,  0x24(TMP)     # low 16 bits of V3.y
    lwc1    Z_3,  0x28(TMP)     # V3.z
    lwc1    W_3,  0x2C(TMP)     # V3.w (1/w)

    lhu     XY_4, 0x30(TMP)     # low 16 bits of V4.x
    lhu     Y_4,  0x34(TMP)     # low 16 bits of V4.y
    lwc1    Z_4,  0x38(TMP)     # V4.z
    lwc1    W_4,  0x3C(TMP)     # V4.w (1/w)

    # pack as (y << 16) | x
    sll     Y_1, Y_1, 16
    sll     Y_2, Y_2, 16
    sll     Y_3, Y_3, 16
    sll     Y_4, Y_4, 16

    or      XY_1, XY_1, Y_1
    or      XY_2, XY_2, Y_2
    or      XY_3, XY_3, Y_3
    or      XY_4, XY_4, Y_4

    # write 1,2,3 3,4,1
    swc1    COL1, 0x00(DST)
    swc1    W_1,  0x04(DST)
    sw      XY_1, 0x08(DST)
    swc1    Z_1,  0x0C(DST)

    swc1    COL2, 0x10(DST)
    swc1    W_2,  0x14(DST)
    sw      XY_2, 0x18(DST)
    swc1    Z_2,  0x1C(DST)

    swc1    COL3, 0x20(DST)
    swc1    W_3,  0x24(DST)
    sw      XY_3, 0x28(DST)
    swc1    Z_3,  0x2C(DST)

    swc1    COL3, 0x30(DST)
    swc1    W_3,  0x34(DST)
    sw      XY_3, 0x38(DST)
    swc1    Z_3,  0x3C(DST)

    swc1    COL4, 0x40(DST)
    swc1    W_4,  0x44(DST)
    sw      XY_4, 0x48(DST)
    swc1    Z_4,  0x4C(DST)

    swc1    COL1, 0x50(DST)
    swc1    W_1,  0x54(DST)
    sw      XY_1, 0x58(DST)
    swc1    Z_1,  0x5C(DST)

    # dst += 6 vertices * 16 bytes
    # (addiu rather than addi: address arithmetic must never raise an overflow trap)
    addiu   DST, DST, 16*6

# TODO clipping
any_clipped_vertices:
    jr      $ra
    move    $v0, DST            # (branch delay slot) return value = current dst