mirror of
https://github.com/ClassiCube/ClassiCube.git
synced 2025-09-11 16:45:48 -04:00
Dreamcast: Optimise vertex transform by a couple of cycles
This commit is contained in:
parent
74f3c424e8
commit
6153ff8c8a
@ -7,13 +7,10 @@ _DrawColouredQuads:
|
||||
! Setup
|
||||
fldi0 fr1 ! U = 0
|
||||
fldi0 fr2 ! V = 0
|
||||
mov r4,r3 ! r3 = src
|
||||
add #-32, r5 ! r5 -= sizeof(VERTEX)
|
||||
TransformSetup
|
||||
ViewportTransformSetup _VP_COL_HWIDTH
|
||||
|
||||
.TRANSFORM_QUAD:
|
||||
mov.l CMD_COL_VERT, r1 ! r1 = GPU VERT command
|
||||
|
||||
LoadColouredVertex
|
||||
ProcessVertex1
|
||||
|
||||
@ -24,7 +21,7 @@ _DrawColouredQuads:
|
||||
ProcessVertex3
|
||||
|
||||
LoadColouredVertex
|
||||
ProcessVertex4 CMD_COL_EOS
|
||||
ProcessVertex4
|
||||
|
||||
! CLIPFLAGS TESTING
|
||||
cmp/eq #0,r0 ! T = r0 == 0 (all points invisible)
|
||||
@ -44,13 +41,9 @@ _DrawColouredQuads:
|
||||
bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
|
||||
nop
|
||||
|
||||
add #32, r5 ! r5 += sizeof(VERTEX)
|
||||
rts ! return after executing instruction in delay slot
|
||||
mov r5,r0 ! r0 = r5
|
||||
TransformEnd
|
||||
|
||||
.align 2
|
||||
CMD_COL_VERT: .long 0xe0000000
|
||||
CMD_COL_EOS: .long 0xf0000000
|
||||
.align 4
|
||||
|
||||
.global _VP_COL_HWIDTH
|
||||
_VP_COL_HWIDTH: .long 0
|
||||
|
@ -5,13 +5,10 @@
|
||||
|
||||
_DrawTexturedQuads:
|
||||
! Setup
|
||||
mov r4,r3 ! r3 = src
|
||||
add #-32, r5 ! r5 -= sizeof(VERTEX)
|
||||
TransformSetup
|
||||
ViewportTransformSetup _VP_TEX_HWIDTH
|
||||
|
||||
.TRANSFORM_QUAD:
|
||||
mov.l CMD_TEX_VERT, r1 ! r1 = GPU VERT command
|
||||
|
||||
LoadTexturedVertex
|
||||
ProcessVertex1
|
||||
|
||||
@ -22,7 +19,7 @@ _DrawTexturedQuads:
|
||||
ProcessVertex3
|
||||
|
||||
LoadTexturedVertex
|
||||
ProcessVertex4 CMD_TEX_EOS
|
||||
ProcessVertex4
|
||||
|
||||
! CLIPFLAGS TESTING
|
||||
cmp/eq #0,r0 ! T = r0 == 0 (all points invisible)
|
||||
@ -42,13 +39,9 @@ _DrawTexturedQuads:
|
||||
bf .TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD
|
||||
nop
|
||||
|
||||
add #32, r5 ! r5 += sizeof(VERTEX)
|
||||
rts ! return after executing instruction in delay slot
|
||||
mov r5,r0 ! r0 = r5
|
||||
TransformEnd
|
||||
|
||||
.align 2
|
||||
CMD_TEX_VERT: .long 0xe0000000
|
||||
CMD_TEX_EOS: .long 0xf0000000
|
||||
.align 4
|
||||
|
||||
.global _VP_TEX_HWIDTH
|
||||
_VP_TEX_HWIDTH: .long 0
|
||||
|
@ -3,8 +3,8 @@
|
||||
! =========================================================
|
||||
! The SH4 can dual issue (i.e. parallel execution) two instructions
|
||||
! as long as the groups of the two instructions are different:
|
||||
! * LS - most APU and FPU register load/stores
|
||||
! * EX - most APU arithmetic instructions
|
||||
! * LS - most ALU and FPU register load/stores
|
||||
! * EX - most ALU arithmetic instructions
|
||||
! * MT - TST, CMP, NOP, MOV Rm,Rn
|
||||
! * FE - most FPU arithmetic instructions
|
||||
! * CO - other instructions (NOTE: Cannot be executed in parallel)
|
||||
@ -14,7 +14,6 @@
|
||||
! * Latency: Interval between the issue of an instruction and the generation of its result (completion)
|
||||
! * Latency is also the interval between the execution of two instructions with an interdependent relationship.
|
||||
! (although different cases may either increase or decrease Latency)
|
||||
!
|
||||
|
||||
|
||||
! =========================================================
|
||||
@ -38,6 +37,8 @@
|
||||
!r5 = dst pointer ARG
|
||||
!r6 = quads count ARG
|
||||
!r7 = ?
|
||||
!r10 = PVR_CMD_VERTEX
|
||||
!r11 = PVR_CMD_VERTEX_EOL
|
||||
|
||||
!fr0 = temp
|
||||
!fr1 = u
|
||||
@ -55,6 +56,34 @@
|
||||
!fv4 = XYZW
|
||||
|
||||
|
||||
! =========================================================
|
||||
! ========================= TRANSFORM SETUP ===============
|
||||
! =========================================================
|
||||
! Shared prologue for the quad transform routines.
!   - copies the src pointer (r4) into r3 and prefetches the first vertex
!   - biases the dst pointer (r5) by -32 (sizeof(VERTEX))
!   - pushes r10 and r11 on the stack (r15); TransformEnd pops them again
!   - builds the two command words in registers using an 8-bit immediate
!     plus shifts: r10 = 0xE0000000 (PVR_CMD_VERTEX),
!                  r11 = 0xF0000000 (PVR_CMD_VERTEX_EOL)
! The MT/LS/EX/... tag at the start of each comment is the instruction's issue group.
.macro TransformSetup
|
||||
mov r4,r3 ! MT, r3 = src
|
||||
mov.l r10, @-r15 ! LS, push(r10), popped again in TransformEnd
|
||||
add #-32, r5 ! EX, r5 -= sizeof(VERTEX)
|
||||
mov.l r11, @-r15 ! LS, push(r11), popped again in TransformEnd
|
||||
mov #0xE0, r10 ! EX, r10 = 0xFF FF FF E0 (8-bit immediate is sign-extended; upper bits are shifted out below)
|
||||
pref @r3 ! LS, PREFETCH r3 (first vertex)
|
||||
shll16 r10 ! EX, r10 = 0xFF E0 00 00
|
||||
shll8 r10 ! EX, r10 = 0xE0 00 00 00 (PVR_CMD_VERTEX)
|
||||
mov #0xF0, r11 ! EX, r11 = 0xFF FF FF F0 (8-bit immediate is sign-extended; upper bits are shifted out below)
|
||||
shll16 r11 ! EX, r11 = 0xFF F0 00 00
|
||||
shll8 r11 ! EX, r11 = 0xF0 00 00 00 (PVR_CMD_VERTEX_EOL)
|
||||
nop ! MT, align to even boundary (pads the macro to 12 instructions)
|
||||
.endm
|
||||
|
||||
! Shared epilogue for the quad transform routines (counterpart of TransformSetup).
!   - pops r11 then r10, i.e. the reverse of the order TransformSetup pushed them
!   - adds back the 32 bytes (sizeof(VERTEX)) that TransformSetup subtracted from r5
!   - returns with r0 = r5; the copy into r0 executes in the rts delay slot
.macro TransformEnd
|
||||
mov.l @r15+, r11 ! LS, pop(r11), pushed last by TransformSetup
|
||||
mov.l @r15+, r10 ! LS, pop(r10), pushed first by TransformSetup
|
||||
|
||||
add #32, r5 ! EX, r5 += sizeof(VERTEX), undoes the -32 bias from TransformSetup
|
||||
rts ! CO, return after executing instruction in delay slot
|
||||
mov r5,r0 ! MT, delay slot: r0 = r5
|
||||
.endm
|
||||
|
||||
|
||||
! =========================================================
|
||||
! ========================= VERTEX LOADING ================
|
||||
! =========================================================
|
||||
@ -108,7 +137,7 @@
|
||||
fmov.s fr6,@-r5 ! LS, dst->z = Z
|
||||
fmov.s fr5,@-r5 ! LS, dst->y = Y
|
||||
fmov.s fr4,@-r5 ! LS, dst->x = X
|
||||
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
|
||||
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
|
||||
.endm
|
||||
|
||||
.macro ProcessVertex2
|
||||
@ -124,7 +153,7 @@
|
||||
fmov.s fr5,@-r5 ! LS, dst->y = Y
|
||||
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 1)
|
||||
fmov.s fr4,@-r5 ! LS, dst->x = X
|
||||
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
|
||||
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
|
||||
.endm
|
||||
|
||||
.macro ProcessVertex3
|
||||
@ -140,11 +169,12 @@
|
||||
shll2 r2 ! EX, tmp = tmp << 2
|
||||
fmov.s fr4,@-r5 ! LS, dst->x = X
|
||||
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 2)
|
||||
mov.l r1,@-r5 ! LS, dst->flags = CMD_VERT
|
||||
mov.l r10,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX
|
||||
.endm
|
||||
|
||||
.macro ProcessVertex4 eos_addr
|
||||
.macro ProcessVertex4
|
||||
fmov.s fr7,@-r5 ! LS, dst->w = W
|
||||
or r11,r0 ! EX, CLIPFLAGS |= PVR_CMD_VERTEX_EOL
|
||||
fmov.s fr3,@-r5 ! LS, dst->c = C
|
||||
fneg fr7 ! LS, W = -W
|
||||
fmov.s fr2,@-r5 ! LS, dst->v = V
|
||||
@ -154,12 +184,10 @@
|
||||
fmov.s fr6,@-r5 ! LS, dst->z = Z
|
||||
shll2 r2 ! EX, tmp = tmp << 2
|
||||
fmov.s fr5,@-r5 ! LS, dst->y = Y
|
||||
add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2)
|
||||
add r2,r2 ! EX, tmp = (tmp << 2) + (tmp << 2) (T << 3)
|
||||
fmov.s fr4,@-r5 ! LS, dst->x = X
|
||||
mov.l \eos_addr, r1 ! LS, r1 = GPU EOS command
|
||||
or r2,r0 ! EX, CLIPFLAGS |= tmp (T << 3)
|
||||
or r0,r1 ! EX, r1 |= CLIPFLAGS
|
||||
mov.l r1,@-r5 ! LS, dst->flags = GPU EOS | CLIPFLAGS
|
||||
mov.l r0,@-r5 ! LS, dst->flags = PVR_CMD_VERTEX_EOL | CLIPFLAGS
|
||||
.endm
|
||||
|
||||
|
||||
|
@ -156,7 +156,7 @@ Although the regular Linux compilation flags will work fine, to take full advan
|
||||
|
||||
## Compiling - macOS
|
||||
|
||||
```cc -fno-math-errno *.c interop_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```
|
||||
```cc -fno-math-errno *.c Window_cocoa.m -o ClassiCube -framework Cocoa -framework OpenGL -framework IOKit -lobjc```
|
||||
|
||||
Note: You may need to install Xcode before you can compile ClassiCube
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user