ClassiCube/misc/ps2/VertexTransform.S
2024-06-24 20:11:49 +10:00

247 lines
7.4 KiB
ArmAsm

# REGISTER USAGE
# vf0 = hardwired to (0,0,0,1)
# vf1 = mvp.row1
# vf2 = mvp.row2
# vf3 = mvp.row3
# vf4 = mvp.row4
# vf5 = clipping scale adjustments to match guardbands
# vf6 = viewport origin
# vf7 = viewport scale
# NOTE: vclipw.xyz takes 4 cycles to produce result, which must be accounted for
# Align the code that follows to a 2^4 = 16 byte boundary.
.p2align 4

# Declares \name as a global symbol of ELF type "function".
.macro EXPORT_FUNC name
        .globl  \name
        .type   \name, %function
.endm

# Entry points exported by this file.
        EXPORT_FUNC LoadMvpMatrix
        EXPORT_FUNC LoadClipScaleFactors
        EXPORT_FUNC LoadViewportOrigin
        EXPORT_FUNC LoadViewportScale
        EXPORT_FUNC TransformTexturedQuad
        EXPORT_FUNC TransformColouredQuad
        EXPORT_FUNC ViewportTransform
# LoadMvpMatrix: copies the four quadword rows of the MVP matrix from memory
# into VU0 registers vf1..vf4.
#   $a0 = address of mvp (rows at byte offsets 0, 16, 32 and 48)
LoadMvpMatrix:
        lqc2    $vf1, 0($a0)            # vf1 <- mvp.row1
        lqc2    $vf2, 16($a0)           # vf2 <- mvp.row2
        lqc2    $vf3, 32($a0)           # vf3 <- mvp.row3
        lqc2    $vf4, 48($a0)           # vf4 <- mvp.row4
        jr      $ra                     # return to caller
        nop
# LoadClipScaleFactors: copies one quadword of clipping scale factors from
# memory into VU0 register vf5.
#   $a0 = address of factors
LoadClipScaleFactors:
        lqc2    $vf5, 0($a0)            # vf5 <- factors
        jr      $ra                     # return to caller
        nop
# LoadViewportOrigin: copies one quadword holding the viewport origin from
# memory into VU0 register vf6.
#   $a0 = address of origin
LoadViewportOrigin:
        lqc2    $vf6, 0($a0)            # vf6 <- origin
        jr      $ra                     # return to caller
        nop
# LoadViewportScale: copies one quadword holding the viewport scale from
# memory into VU0 register vf7.
#   $a0 = address of scale
LoadViewportScale:
        lqc2    $vf7, 0($a0)            # vf7 <- scale
        jr      $ra                     # return to caller
        nop
# TransformVertex1: transforms the first vertex of a quad (V0) by the MVP
# matrix, stores it to dst[0] and starts its clip test.
#   Reads:  $a2 = address of tmp vertex (x,y,z taken from it)
#           $a1 = address of dst vertices
#           vf1..vf4 = mvp rows, vf5 = clip scale factors, vf0.w = 1
#   Writes: vf11 = transformed V0 (kept for TransformFinish), dst[0]
#           vf10 = scratch (input, then clip-scaled copy), ACC, clip flags
# The w term of the transform comes from vf0.w, not from the loaded vertex.
# The clip result is not read here; TransformVertex2 reads it later (see the
# vclipw latency note in the file header).
.macro TransformVertex1
# LOAD VERTEX 1
lqc2 $vf10, 0x00($a2) # IN = tmp
# TRANSFORM VERTEX 1
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf10 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf10 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf11, $vf3, $vf10 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
sqc2 $vf11, 0x00($a1) # dst[0] = TRANSFORMED(V0)
vmul $vf10, $vf11, $vf5 # TMP = TRANSFORMED(V0) * CLIP_PLANES_ADJUST (vf5)
# BEGIN CLIP FLAGS CALCULATION VERTEX 1
vclipw.xyz $vf10, $vf10 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
.endm
# TransformVertex2: transforms the second vertex of a quad (V1) by the MVP
# matrix, stores it to dst[1], saves the clip flags produced for vertex 1 and
# starts the clip test for vertex 2.
#   Reads:  $a2 = address of tmp vertex (x,y,z taken from it)
#           $a1 = address of dst vertices, $a3 = address of clip flags
#           vf1..vf4 = mvp rows, vf5 = clip scale factors, vf0.w = 1
#   Writes: vf13 = transformed V1, dst[1], clip_flags[0]
#           vf12 = scratch, ACC, $t0, clip flags
# The cfc2 sits after the transform instructions so that the previous
# vertex's vclipw has had time to complete (see the latency note in the file
# header). The word stored is the whole clip flag register as read then.
.macro TransformVertex2
# LOAD VERTEX 2
lqc2 $vf12, 0x00($a2) # IN = tmp
# TRANSFORM VERTEX 2
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf12 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf12 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf13, $vf3, $vf12 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
sqc2 $vf13, 0x10($a1) # dst[1] = TRANSFORMED(V1)
vmul $vf12, $vf13, $vf5 # TMP = TRANSFORMED(V1) * CLIP_PLANES_ADJUST (vf5)
# STORE CLIP FLAGS VERTEX 1 RESULT
cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] (COP2 control register 18)
sw $t0,0x00($a3) # clip_flags[0] = t0
# BEGIN CLIP FLAGS CALCULATION VERTEX 2
vclipw.xyz $vf12, $vf12 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
.endm
# TransformVertex3: transforms the third vertex of a quad (V2) by the MVP
# matrix, stores it to dst[2], saves the clip flags produced for vertex 2 and
# starts the clip test for vertex 3.
#   Reads:  $a2 = address of tmp vertex (x,y,z taken from it)
#           $a1 = address of dst vertices, $a3 = address of clip flags
#           vf1..vf4 = mvp rows, vf5 = clip scale factors, vf0.w = 1
#   Writes: vf15 = transformed V2 (kept for TransformFinish), dst[2],
#           clip_flags[1]; vf14 = scratch, ACC, $t0, clip flags
# The cfc2 sits after the transform instructions so that the previous
# vertex's vclipw has had time to complete (see the latency note in the file
# header). The word stored is the whole clip flag register as read then.
.macro TransformVertex3
# LOAD VERTEX 3
lqc2 $vf14, 0x00($a2) # IN = tmp
# TRANSFORM VERTEX 3
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf14 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf14 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf15, $vf3, $vf14 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
sqc2 $vf15, 0x20($a1) # dst[2] = TRANSFORMED(V2)
vmul $vf14, $vf15, $vf5 # TMP = TRANSFORMED(V2) * CLIP_PLANES_ADJUST (vf5)
# STORE CLIP FLAGS VERTEX 2 RESULT
cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] (COP2 control register 18)
sw $t0,0x04($a3) # clip_flags[1] = t0
# BEGIN CLIP FLAGS CALCULATION VERTEX 3
vclipw.xyz $vf14, $vf14 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
.endm
# TransformVertex4: transforms the fourth vertex of a quad (V3) by the MVP
# matrix, saves the clip flags produced for vertex 3 and starts the clip test
# for vertex 4. Unlike the other three macros it does not store the
# transformed vertex; TransformFinish writes vf17 to dst[4].
#   Reads:  $a2 = address of tmp vertex (x,y,z taken from it)
#           $a3 = address of clip flags
#           vf1..vf4 = mvp rows, vf5 = clip scale factors, vf0.w = 1
#   Writes: vf17 = transformed V3 (kept for TransformFinish), clip_flags[2]
#           vf16 = scratch, ACC, $t0, clip flags
# The cfc2 sits after the transform instructions so that the previous
# vertex's vclipw has had time to complete (see the latency note in the file
# header). The word stored is the whole clip flag register as read then.
.macro TransformVertex4
# LOAD VERTEX 4
lqc2 $vf16, 0x00($a2) # IN = tmp
# TRANSFORM VERTEX 4
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf16 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf16 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf17, $vf3, $vf16 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
vmul $vf16, $vf17, $vf5 # TMP = TRANSFORMED(V3) * CLIP_PLANES_ADJUST (vf5)
# STORE CLIP FLAGS VERTEX 3 RESULT
cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] (COP2 control register 18)
sw $t0,0x08($a3) # clip_flags[2] = t0
# BEGIN CLIP FLAGS CALCULATION VERTEX 4
vclipw.xyz $vf16, $vf16 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
.endm
# TransformFinish: writes the remaining three output vertices and the clip
# flags for vertex 4. Meant to follow TransformVertex4 directly, as both quad
# routines in this file do.
#   Reads:  vf11 = transformed V0, vf15 = transformed V2, vf17 = transformed V3
#           $a1 = address of dst vertices, $a3 = address of clip flags
#   Writes: dst[3], dst[4], dst[5], clip_flags[3], $t0
# The six output vertices end up as V0 V1 V2 / V2 V3 V0.
.macro TransformFinish
# Vertex output
# dst[0] = V0 (done by TransformVertex1)
# dst[1] = V1 (done by TransformVertex2)
# dst[2] = V2 (done by TransformVertex3)
# dst[3] = V2
# dst[4] = V3
# dst[5] = V0
sqc2 $vf15, 0x30($a1) # dst[3] = TRANSFORMED(V2)
sqc2 $vf17, 0x40($a1) # dst[4] = TRANSFORMED(V3)
sqc2 $vf11, 0x50($a1) # dst[5] = TRANSFORMED(V0)
vnop # adjust for delay: with the three stores above this puts four instructions between vertex 4's vclipw and the cfc2 below
# STORE CLIP FLAGS 4 RESULT
cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] (COP2 control register 18)
sw $t0,0x0C($a3) # clip_flags[3] = t0
.endm
# TransformTexturedQuad: transforms the 4 vertices of a quad whose source
# vertices are 24 bytes apart, and writes 6 transformed vertices plus 4 clip
# flag words.
#   $a0 = address of src vertices (x,y,z read from the first 12 bytes of each)
#   $a1 = address of dst vertices (6 quadwords written: V0 V1 V2 V2 V3 V0)
#   $a2 = address of tmp vertex   (1 quadword of scratch, rewritten per vertex)
#   $a3 = address of clip flags   (4 words written)
# Clobbers $t0 plus the VU0 state touched by the TransformVertex1..4 and
# TransformFinish macros (vf10..vf17, ACC, clip flags).
# NOTE(review): src is read with 64-bit ld/sd and tmp/dst with 128-bit
# lqc2/sqc2, so they are presumably expected to be 8 / 16 byte aligned -
# confirm against the callers.
TransformTexturedQuad:
        # tmp.w = 1.0f, written once and left in place for all four vertices.
        # 0x3F800000 is the IEEE-754 single precision encoding of 1.0, so the
        # value is built with one lui instead of being loaded from memory.
        # The transform macros take their w term from vf0.w and do not read
        # tmp.w; the store is kept so tmp still ends up holding (x, y, z, 1).
        lui     $t0, 0x3F80             # t0 = 0x3F800000 (1.0f)
        sw      $t0, 0x0C($a2)          # tmp.w = t0

        # VERTEX 1: copy src[0].xyz into tmp, then transform
        ld      $t0, 0x00($a0)          # t0 = src[0].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x08($a0)          # t0 = src[0].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex1

        # VERTEX 2: src[1] starts at byte offset 0x18 (1 * 24)
        ld      $t0, 0x18($a0)          # t0 = src[1].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x20($a0)          # t0 = src[1].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex2

        # VERTEX 3: src[2] starts at byte offset 0x30 (2 * 24)
        ld      $t0, 0x30($a0)          # t0 = src[2].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x38($a0)          # t0 = src[2].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex3

        # VERTEX 4: src[3] starts at byte offset 0x48 (3 * 24)
        ld      $t0, 0x48($a0)          # t0 = src[3].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x50($a0)          # t0 = src[3].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex4

        # Remaining vertex stores and the clip flags of vertex 4
        TransformFinish
        jr      $ra                     # return to caller
        nop
# TransformColouredQuad: transforms the 4 vertices of a quad whose source
# vertices are 16 bytes apart, and writes 6 transformed vertices plus 4 clip
# flag words.
#   $a0 = address of src vertices (x,y,z read from the first 12 bytes of each)
#   $a1 = address of dst vertices (6 quadwords written: V0 V1 V2 V2 V3 V0)
#   $a2 = address of tmp vertex   (1 quadword of scratch, rewritten per vertex)
#   $a3 = address of clip flags   (4 words written)
# Clobbers $t0 plus the VU0 state touched by the TransformVertex1..4 and
# TransformFinish macros (vf10..vf17, ACC, clip flags).
# NOTE(review): src is read with 64-bit ld/sd and tmp/dst with 128-bit
# lqc2/sqc2, so they are presumably expected to be 8 / 16 byte aligned -
# confirm against the callers.
TransformColouredQuad:
        # tmp.w = 1.0f, written once and left in place for all four vertices.
        # 0x3F800000 is the IEEE-754 single precision encoding of 1.0, so the
        # value is built with one lui instead of being loaded from memory.
        # The transform macros take their w term from vf0.w and do not read
        # tmp.w; the store is kept so tmp still ends up holding (x, y, z, 1).
        lui     $t0, 0x3F80             # t0 = 0x3F800000 (1.0f)
        sw      $t0, 0x0C($a2)          # tmp.w = t0

        # VERTEX 1: copy src[0].xyz into tmp, then transform
        ld      $t0, 0x00($a0)          # t0 = src[0].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x08($a0)          # t0 = src[0].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex1

        # VERTEX 2: src[1] starts at byte offset 0x10 (1 * 16)
        ld      $t0, 0x10($a0)          # t0 = src[1].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x18($a0)          # t0 = src[1].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex2

        # VERTEX 3: src[2] starts at byte offset 0x20 (2 * 16)
        ld      $t0, 0x20($a0)          # t0 = src[2].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x28($a0)          # t0 = src[2].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex3

        # VERTEX 4: src[3] starts at byte offset 0x30 (3 * 16)
        ld      $t0, 0x30($a0)          # t0 = src[3].x,y
        sd      $t0, 0x00($a2)          # tmp.x,y = t0
        lw      $t0, 0x38($a0)          # t0 = src[3].z
        sw      $t0, 0x08($a2)          # tmp.z = t0
        TransformVertex4

        # Remaining vertex stores and the clip flags of vertex 4
        TransformFinish
        jr      $ra                     # return to caller
        nop
# Single precision constant 1.0 (bit pattern 0x3F800000), exported under the
# global symbol ONE_VALUE. It is emitted inline at this point of the current
# section.
.globl  ONE_VALUE
ONE_VALUE:
        .float  1.0
# ViewportTransform: maps one vertex to integer viewport coordinates.
#   $a0 = address of src (one quadword: x, y, z, w)
#   $a1 = address of dst (one quadword written)
# Uses vf6 (viewport origin) and vf7 (viewport scale); overwrites vf16..vf19.
# All four components are multiplied by src.w. No reciprocal is computed
# here, so src.w is presumably already the inverse W - confirm with callers.
ViewportTransform:
        lqc2    $vf16, 0($a0)           # vf16 = src
        vmulw   $vf17, $vf16, $vf16     # vf17 = src.xyzw * src.w
        vmul    $vf18, $vf17, $vf7      # vf18 = vf17 * viewport scale
        vadd    $vf19, $vf18, $vf6      # vf19 = vf18 + viewport origin
        vftoi0  $vf19, $vf19            # vf19 = float -> integer conversion
        sqc2    $vf19, 0($a1)           # dst = vf19
        jr      $ra                     # return to caller
        nop