! ClassiCube/misc/dreamcast/VertexTransform.S
! Listing metadata: 2024-07-07 09:26:12 +10:00, 286 lines, 9.2 KiB
! (the listing labelled this file "ArmAsm"; the code is SH4 assembly)

! =========================================================
! ======================== PROCESSOR INFO =================
! =========================================================
! The SH4 can dual issue (i.e. parallel execution) two instructions
! as long as the groups of the two instructions are different:
! * LS - most ALU and FPU register load/stores
! * EX - most ALU arithmetic instructions
! * MT - TST, CMP, NOP, MOV Rm,Rn
! * FE - most FPU arithmetic instructions
! * CO - other instructions (NOTE: Cannot be executed in parallel)
! The following general aspects of instructions are important to note per the SH4 manual:
! * Issue rate: Interval between the issue of an instruction and that of the next instruction
! * Latency: Interval between the issue of an instruction and the generation of its result (completion)
! * Latency is also the interval between the execution of two instructions with an interdependent relationship.
! (although different cases may either increase or decrease Latency)
! =========================================================
! ======================== REGISTER USAGES ================
! =========================================================
! SH4 C ABI:
! - R0 to R3 are return values (can be overwritten)
! - R4 to R7 are input arguments (can be overwritten)
! - R8 to R13 are non-volatile (must be restored at end)
! - R14 is the frame pointer (must be restored at end)
! - R15 is the stack pointer (must be restored at end)
! - FR0 to FR3 are return values (can be overwritten)
! - FR4 to FR11 are input arguments (can be overwritten)
! - FR12 to FR15 are non-volatile (must be restored at end)
!r0 = clip flags
!r1 = GPU command
!r2 = temp
!r3 = prefetch address
!r4 = src pointer ARG
!r5 = dst pointer ARG
!r6 = quads count ARG
!r7 = unused
!r10 = PVR_CMD_VERTEX
!r11 = PVR_CMD_VERTEX_EOL
!fr0 = temp
!fr1 = u
!fr2 = v
!fr3 = c
!fr4 = x
!fr5 = y
!fr6 = z
!fr7 = w
!fr8 = VIEWPORT_HWIDTH
!fr9 = VIEWPORT_HHEIGHT
!fr10 = VIEWPORT_X_PLUS_HWIDTH
!fr11 = VIEWPORT_Y_PLUS_HHEIGHT
!fv4 = XYZW
! =========================================================
! ========================= TRANSFORM SETUP ===============
! =========================================================
! Common prologue shared by both quad transform routines.
! - copies the source pointer into r3; r3 is only used as the prefetch address
! - pushes r10 and r11 (TransformEnd pops them again)
! - biases r5 by -32; TransformEnd adds the 32 back before returning
! - builds the two command words: r10 = 0xE0000000, r11 = 0xF0000000
.macro TransformSetup
mov r4,r3 ! MT, r3 = src
mov.l r10, @-r15 ! LS, push(r10)
add #-32, r5 ! EX, r5 -= sizeof(VERTEX)
mov.l r11, @-r15 ! LS, push(r11)
mov #0xE0, r10 ! EX, r10 = 0xFF FF FF E0 (mov #imm sign-extends the 8-bit immediate)
pref @r3 ! LS, PREFETCH r3 (first vertex)
shll16 r10 ! EX, r10 = 0xFF E0 00 00
shll8 r10 ! EX, r10 = 0xE0 00 00 00 (PVR_CMD_VERTEX), sign-extension bits are shifted out
mov #0xF0, r11 ! EX, r11 = 0xFF FF FF F0 (sign-extended, as for r10)
shll16 r11 ! EX, r11 = 0xFF F0 00 00
shll8 r11 ! EX, r11 = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL)
nop ! MT, pads the macro to 12 instructions so the code after it starts on an even instruction boundary
.endm
! Common epilogue: pops r11 and r10 (reverse of the push order in TransformSetup),
! removes the -32 bias from r5 and returns the resulting pointer in r0.
.macro TransformEnd
mov.l @r15+, r11 ! LS, pop(r11)
mov.l @r15+, r10 ! LS, pop(r10)
add #32, r5 ! EX, r5 += sizeof(VERTEX)
rts ! CO, return after executing instruction in delay slot
mov r5,r0 ! MT, delay slot: r0 = r5 (return value)
.endm
! =========================================================
! ========================= VERTEX LOADING ================
! =========================================================
! Loads one coloured source vertex (x, y, z, colour = 16 bytes) through r4 and
! transforms (x, y, z, 1) by xmtrx.
! In:  r4 = current source vertex, r3 = prefetch address, r5 = output pointer
! Out: fv4 (fr4..fr7) = transformed X, Y, Z, W; fr3 = colour
!      r4 += 16, r3 += 16, r5 += 64
.macro LoadColouredVertex
! LOAD XYZ
fmov @r4+, fr4 ! LS, X = src->x
fmov @r4+, fr5 ! LS, Y = src->y
fmov @r4+, fr6 ! LS, Z = src->z
fldi1 fr7 ! LS, W = 1.0
! PREPARE NEXT VERTEX
add #16, r3 ! EX, r3 += VERTEX_STRIDE
pref @r3 ! LS, PREFETCH r3 (next vertex)
add #64, r5 ! EX, r5 += 2 * sizeof(VERTEX); the ProcessVertexN that follows stores 32 bytes with pre-decrement, so the net advance is one vertex
! TRANSFORM VERTEX
ftrv xmtrx, fv4 ! FE, TRANSFORM(XYZW)
! LOAD ATTRIBUTES
fmov @r4+,fr3 ! LS, C = src->color
.endm
! Loads one textured source vertex (x, y, z, colour, u, v = 24 bytes) through r4
! and transforms (x, y, z, 1) by xmtrx.
! In:  r4 = current source vertex, r3 = prefetch address, r5 = output pointer
! Out: fv4 (fr4..fr7) = transformed X, Y, Z, W; fr3 = colour; fr1 = u; fr2 = v
!      r4 += 24, r3 += 24, r5 += 64
.macro LoadTexturedVertex
! LOAD XYZ
fmov @r4+, fr4 ! LS, X = src->x
fmov @r4+, fr5 ! LS, Y = src->y
fmov @r4+, fr6 ! LS, Z = src->z
fldi1 fr7 ! LS, W = 1.0
! PREPARE NEXT VERTEX
add #24, r3 ! EX, r3 += VERTEX_STRIDE
pref @r3 ! LS, PREFETCH r3 (next vertex)
add #64, r5 ! EX, r5 += 2 * sizeof(VERTEX); the ProcessVertexN that follows stores 32 bytes with pre-decrement, so the net advance is one vertex
! TRANSFORM VERTEX
ftrv xmtrx, fv4 ! FE, TRANSFORM(XYZW)
! LOAD ATTRIBUTES
fmov @r4+,fr3 ! LS, C = src->color
fmov @r4+,fr1 ! LS, U = src->u
fmov @r4+,fr2 ! LS, V = src->v
.endm
! =========================================================
! ========================= VERTEX OUTPUT =================
! =========================================================
! To take advantage of SH4 dual instruction processing,
! clipflag calculation and vertex output are interleaved
! Writes the first vertex of a quad and starts a fresh clip flag set.
! The eight words are stored from the highest address downwards (pre-decrement),
! so in memory the vertex reads: flags, x, y, z, u, v, c, w (32 bytes), and r5
! ends up pointing at the flags word.
! Out: r0 = 1 if Z > 0, else 0 (bit 0 of CLIPFLAGS); fr0 = 0.0; r5 -= 32
.macro ProcessVertex1
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fldi0 fr0 ! LS, fr0 = 0.0
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr0,fr6 ! FE, T = Z > 0
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r0 ! EX, CLIPFLAGS = T (overwrites whatever r0 held from the previous quad)
fmov.s fr6,@-r5 ! LS, dst->z = Z
fmov.s fr5,@-r5 ! LS, dst->y = Y
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm
! Writes the second vertex of a quad (memory order: flags, x, y, z, u, v, c, w)
! and records its visibility in bit 1 of CLIPFLAGS.
! Out: r0 |= (Z > 0) << 1; r2 and fr0 are clobbered; r5 -= 32
.macro ProcessVertex2
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fldi0 fr0 ! LS, fr0 = 0.0
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr0,fr6 ! FE, T = Z > 0
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r2 ! EX, tmp = T
fmov.s fr6,@-r5 ! LS, dst->z = Z
add r2,r2 ! EX, tmp = tmp + tmp, i.e. T << 1
fmov.s fr5,@-r5 ! LS, dst->y = Y
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1)
fmov.s fr4,@-r5 ! LS, dst->x = X
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm
! Writes the third vertex of a quad (memory order: flags, x, y, z, u, v, c, w)
! and records its visibility in bit 2 of CLIPFLAGS.
! Out: r0 |= (Z > 0) << 2; r2 and fr0 are clobbered; r5 -= 32
.macro ProcessVertex3
fmov.s fr7,@-r5 ! LS, dst->w = W
fmov.s fr3,@-r5 ! LS, dst->c = C
fldi0 fr0 ! LS, fr0 = 0.0
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr0,fr6 ! FE, T = Z > 0
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r2 ! EX, tmp = T
fmov.s fr6,@-r5 ! LS, dst->z = Z
fmov.s fr5,@-r5 ! LS, dst->y = Y
shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr4,@-r5 ! LS, dst->x = X
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2)
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
.endm
! Writes the fourth (last) vertex of a quad (memory order: flags, x, y, z, u, v, c, w)
! and records its visibility in bit 3 of CLIPFLAGS.
! Unlike vertices 1-3, the flags word stored here is PVR_CMD_VERTEX_EOL combined
! with the four clip flag bits of the whole quad.
! Out: r0 = PVR_CMD_VERTEX_EOL | CLIPFLAGS (bits 0-3); r2 and fr0 are clobbered; r5 -= 32
.macro ProcessVertex4
fmov.s fr7,@-r5 ! LS, dst->w = W
or r11,r0 ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
fmov.s fr3,@-r5 ! LS, dst->c = C
fldi0 fr0 ! LS, fr0 = 0.0
fmov.s fr2,@-r5 ! LS, dst->v = V
fcmp/gt fr0,fr6 ! FE, T = Z > 0
fmov.s fr1,@-r5 ! LS, dst->u = U
movt r2 ! EX, tmp = T
fmov.s fr6,@-r5 ! LS, dst->z = Z
shll2 r2 ! EX, tmp = tmp << 2
fmov.s fr5,@-r5 ! LS, dst->y = Y
add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
fmov.s fr4,@-r5 ! LS, dst->x = X
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3)
mov.l r0,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS
.endm
! =========================================================
! ==================== TEXTURED VERTEX TRANSFORM ==========
! =========================================================
! ---------------------------------------------------------
! _DrawTexturedQuads
! Transforms textured quads by xmtrx and writes them as 32 byte output vertices.
! In:    r4 = source vertices, 24 bytes each (x, y, z, colour, u, v)
!        r5 = destination for the output vertices
!        r6 = number of quads (4 vertices each); 0 writes nothing
!        xmtrx = matrix used by ftrv (must already be loaded on entry)
! Out:   r0 = address just past the last output vertex that was kept
! Clobb: r2, r3, r4, r5, r6, fr0-fr7, T (r10 and r11 are saved and restored)
! A quad in which no vertex has Z > 0 after the transform is dropped: r5 is
! rewound so that the next quad overwrites it.
! ---------------------------------------------------------
.global _DrawTexturedQuads
.type _DrawTexturedQuads,%function
.align 4
_DrawTexturedQuads:
! Setup
    TransformSetup
! The loop below is bottom-tested with dt, so a count of 0 would wrap r6 around
! and run roughly 2^32 iterations. Skip the loop entirely in that case.
! (two instructions, so the loop still starts on an even instruction boundary)
    tst r6,r6 ! MT, T = (quads == 0)
    bt .T_DONE ! nothing to transform, go straight to the epilogue
.T_TRANSFORM_QUAD:
    LoadTexturedVertex
    ProcessVertex1
    LoadTexturedVertex
    ProcessVertex2
    LoadTexturedVertex
    ProcessVertex3
    LoadTexturedVertex
    ProcessVertex4
! CLIPFLAGS TESTING
! r0 also holds the PVR_CMD_VERTEX_EOL bits here, so only bits 0-3 are tested
    tst #15,r0 ! MT, T = ((r0 & 15) == 0) (all points invisible)
    bf .T_LOOP_END ! if !T then at least one point is visible, keep the quad
    add #-128, r5 ! EX, r5 -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration
.T_LOOP_END:
    dt r6 ! EX, r6--; T = r6 == 0
    bf .T_TRANSFORM_QUAD ! if !T then goto T_TRANSFORM_QUAD (bf has no delay slot)
.T_DONE:
    TransformEnd
! =========================================================
! ==================== COLOURED VERTEX TRANSFORM ==========
! =========================================================
! ---------------------------------------------------------
! _DrawColouredQuads
! Transforms untextured quads by xmtrx and writes them as 32 byte output vertices
! with u = v = 0.
! In:    r4 = source vertices, 16 bytes each (x, y, z, colour)
!        r5 = destination for the output vertices
!        r6 = number of quads (4 vertices each); 0 writes nothing
!        xmtrx = matrix used by ftrv (must already be loaded on entry)
! Out:   r0 = address just past the last output vertex that was kept
! Clobb: r2, r3, r4, r5, r6, fr0-fr7, T (r10 and r11 are saved and restored)
! A quad in which no vertex has Z > 0 after the transform is dropped: r5 is
! rewound so that the next quad overwrites it.
! ---------------------------------------------------------
.global _DrawColouredQuads
.type _DrawColouredQuads,%function
.align 4
_DrawColouredQuads:
! Setup
! fr1/fr2 are never reloaded by LoadColouredVertex, so every vertex gets u = v = 0
    fldi0 fr1 ! U = 0
    fldi0 fr2 ! V = 0
    TransformSetup
! The loop below is bottom-tested with dt, so a count of 0 would wrap r6 around
! and run roughly 2^32 iterations. Skip the loop entirely in that case.
! (two instructions, so the loop still starts on an even instruction boundary)
    tst r6,r6 ! MT, T = (quads == 0)
    bt .C_DONE ! nothing to transform, go straight to the epilogue
.C_TRANSFORM_QUAD:
    LoadColouredVertex
    ProcessVertex1
    LoadColouredVertex
    ProcessVertex2
    LoadColouredVertex
    ProcessVertex3
    LoadColouredVertex
    ProcessVertex4
! CLIPFLAGS TESTING
! r0 also holds the PVR_CMD_VERTEX_EOL bits here, so only bits 0-3 are tested
    tst #15,r0 ! MT, T = ((r0 & 15) == 0) (all points invisible)
    bf .C_LOOP_END ! if !T then at least one point is visible, keep the quad
    add #-128, r5 ! EX, r5 -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad
.C_LOOP_END:
    dt r6 ! EX, r6--; T = r6 == 0
    bf .C_TRANSFORM_QUAD ! if !T then goto C_TRANSFORM_QUAD (bf has no delay slot)
.C_DONE:
    TransformEnd