From bd223eb457715d01a3f7a5f8498a1591ed462029 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sat, 22 Jun 2024 20:35:06 +1000 Subject: [PATCH] PS2: Slightly optimise vertex transform --- misc/dreamcast/DrawColouredQuads.S | 2 +- misc/dreamcast/DrawTexturedQuads.S | 2 +- misc/dreamcast/Makefile | 2 +- ...{ViewportTransform.S => VertexTransform.S} | 0 misc/ps2/Makefile | 35 +++-- misc/ps2/VertexTransform.S | 135 ++++++++++++++++++ src/Graphics_PS2.c | 58 +++++--- 7 files changed, 201 insertions(+), 33 deletions(-) rename misc/dreamcast/{ViewportTransform.S => VertexTransform.S} (100%) create mode 100644 misc/ps2/VertexTransform.S diff --git a/misc/dreamcast/DrawColouredQuads.S b/misc/dreamcast/DrawColouredQuads.S index 44c37be0e..63751a1c7 100644 --- a/misc/dreamcast/DrawColouredQuads.S +++ b/misc/dreamcast/DrawColouredQuads.S @@ -1,4 +1,4 @@ -#include "ViewportTransform.S" +#include "VertexTransform.S" .global _DrawColouredQuads .align 4 .type _DrawColouredQuads,%function diff --git a/misc/dreamcast/DrawTexturedQuads.S b/misc/dreamcast/DrawTexturedQuads.S index 6de802512..49a937305 100644 --- a/misc/dreamcast/DrawTexturedQuads.S +++ b/misc/dreamcast/DrawTexturedQuads.S @@ -1,4 +1,4 @@ -#include "ViewportTransform.S" +#include "VertexTransform.S" .global _DrawTexturedQuads .align 4 .type _DrawTexturedQuads,%function diff --git a/misc/dreamcast/Makefile b/misc/dreamcast/Makefile index 2f8602e57..e090bc58b 100644 --- a/misc/dreamcast/Makefile +++ b/misc/dreamcast/Makefile @@ -77,7 +77,7 @@ $(BUILD_DIR)/%.o: third_party/bearssl/src/%.c kos-cc $(CFLAGS) -c $< -o $@ $(BUILD_DIR)/%.o: misc/dreamcast/%.S - kos-cc -c $< -o $@ + kos-cc $(DEPFLAGS) -c $< -o $@ # Dependency tracking $(DEPFILES): diff --git a/misc/dreamcast/ViewportTransform.S b/misc/dreamcast/VertexTransform.S similarity index 100% rename from misc/dreamcast/ViewportTransform.S rename to misc/dreamcast/VertexTransform.S diff --git a/misc/ps2/Makefile b/misc/ps2/Makefile index 539ebd321..bb220f3c0 100644 
--- a/misc/ps2/Makefile +++ b/misc/ps2/Makefile @@ -1,10 +1,17 @@ ifeq ($(strip $(PS2SDK)),) $(error "PS2SDK must be set in your environment") endif +SOURCE_DIRS := src misc/ps2 BUILD_DIR = build-ps2 -CFILES := $(wildcard src/*.c) -OBJS := $(patsubst src/%.c, $(BUILD_DIR)/%.o, $(CFILES)) +S_FILES := $(foreach dir,$(SOURCE_DIRS),$(wildcard $(dir)/*.S)) +C_FILES := $(foreach dir,$(SOURCE_DIRS),$(wildcard $(dir)/*.c)) +OBJS := $(addprefix $(BUILD_DIR)/, $(notdir $(C_FILES:%.c=%.o) $(S_FILES:%.S=%.o))) + +# Dependency tracking +DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d +DEPFILES := $(OBJS:%.o=%.d) + IOP_MODS:= DEV9_irx.o NETMAN_irx.o SMAP_irx.o USBD_irx.o BDM_irx.o BDMFS_FATFS_irx.o USBMASS_BD_irx.o USBHDFSD_irx.o USBMOUSE_irx.o USBKBD_irx.o EE_BIN = ClassiCube-ps2.elf @@ -23,6 +30,8 @@ clean: $(BUILD_DIR): mkdir -p $@ + +include $(PS2SDK)/samples/Makefile.pref # Networking IRX modules $(BUILD_DIR)/DEV9_irx.c: $(PS2SDK)/iop/irx/ps2dev9.irx @@ -58,17 +67,27 @@ $(BUILD_DIR)/USBKBD_irx.c: $(PS2SDK)/iop/irx/ps2kbd.irx bin2c $< $@ USBKBD_irx -include $(PS2SDK)/samples/Makefile.pref +#--------------------------------------------------------------------------------- +# executable generation +#--------------------------------------------------------------------------------- +$(EE_BIN): $(EE_OBJS) + $(EE_CC) -T$(EE_LINKFILE) -O2 -o $(EE_BIN) $(EE_OBJS) $(EE_LDFLAGS) $(EE_LIBS) + +#--------------------------------------------------------------------------------- +# object generation +#--------------------------------------------------------------------------------- $(BUILD_DIR)/%.o: src/%.c - $(EE_CC) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@ + $(EE_CC) $(DEPFLAGS) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@ $(BUILD_DIR)/%.o: $(BUILD_DIR)/%.c # IOP modules $(EE_CC) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@ -$(BUILD_DIR)/%.o: %.S - $(EE_CC) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@ +$(BUILD_DIR)/%.o: misc/ps2/%.S + $(EE_CC) $(DEPFLAGS) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@ -$(EE_BIN): $(EE_OBJS) - 
$(EE_CC) -T$(EE_LINKFILE) -O2 -o $(EE_BIN) $(EE_OBJS) $(EE_LDFLAGS) $(EE_LIBS) +# Dependency tracking +$(DEPFILES): + +include $(wildcard $(DEPFILES)) diff --git a/misc/ps2/VertexTransform.S b/misc/ps2/VertexTransform.S new file mode 100644 index 000000000..ca0add2d1 --- /dev/null +++ b/misc/ps2/VertexTransform.S @@ -0,0 +1,135 @@ +# REGISTER USAGE +# vf0 = hardware coded to (0,0,0,1) +# vf1 = mvp.row1 (labelled row0 in the per-instruction comments below) +# vf2 = mvp.row2 (labelled row1 in the per-instruction comments below) +# vf3 = mvp.row3 (labelled row2 in the per-instruction comments below) +# vf4 = mvp.row4 (labelled row3 in the per-instruction comments below) +# vf5 = clipping scale adjustments to match guardbands +# NOTE: vclipw.xyz takes 4 cycles to produce result, which must be accounted for + +.align 4 + +.global LoadMvpMatrix +.type LoadMvpMatrix,%function +.global LoadClipScaleFactors +.type LoadClipScaleFactors,%function +.global TransformTexturedQuad +.type TransformTexturedQuad,%function + +# Loads matrix into VU0 registers +# $a0 = address of mvp +LoadMvpMatrix: + lqc2 $vf1, 0x00($a0) # vf1 = mvp.row1 + lqc2 $vf2, 0x10($a0) # vf2 = mvp.row2 + lqc2 $vf3, 0x20($a0) # vf3 = mvp.row3 + lqc2 $vf4, 0x30($a0) # vf4 = mvp.row4 + jr $ra + nop + + +# Loads clipping scaling factors into VU0 registers +# $a0 = address of factors +LoadClipScaleFactors: + lqc2 $vf5, 0x00($a0) # vf5 = factors + jr $ra + nop + +# Transforms 4 vertices with size of 24 bytes each into 6 output vectors of 16 bytes (V0,V1,V2,V2,V3,V0) +# $a0 = address of src vertices +# $a1 = address of dst vertices +# $a2 = address of tmp vertex +# $a3 = address of clip flags (currently unused: every store through $a3 below is commented out) +TransformTexturedQuad: + # LOAD 1.0 into W + lw $t0,ONE_VALUE # t0 = 1.0f + sw $t0,0x0C($a2) # tmp.w = t0 (1.0f) + + # LOAD VERTEX 1 + ld $t0,0x00($a0) # t0 = src[0].x,y + sd $t0,0x00($a2) # tmp.x,y = t0 + lw $t0,0x08($a0) # t0 = src[0].z + sw $t0,0x08($a2) # tmp.z = t0 + + # TRANSFORM VERTEX 1 + lqc2 $vf10, 0x00($a2) # IN = tmp + vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row3[xyzw] * 1.0f; (vf0.w is 1) + vmaddax $ACC, $vf1, $vf10 # ACC[xyzw] = ACC[xyzw] + mvp.row0[xyzw] * IN.x + vmadday $ACC, $vf2, $vf10 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.y + vmaddz $vf11, $vf3, $vf10 # OUT[xyzw] = 
ACC[xyzw] + mvp.row2[xyzw] * IN.z + sqc2 $vf11, 0x00($a1) # dst[0] = TRANSFORMED(V0) + #vmul $vf10, $vf11, $vf5 # TMP = TRANSFORMED(V0) * CLIP_PLANES_ADJUST + #vclipw.xyz $vf10, $vf10 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w)) + + # LOAD VERTEX 2 + ld $t0,0x18($a0) # t0 = src[1].x,y + sd $t0,0x00($a2) # tmp.x,y = t0 + lw $t0,0x20($a0) # t0 = src[1].z + sw $t0,0x08($a2) # tmp.z = t0 + #cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] + #sw $t0,0x00($a3) # clip_flags[0] = t0 + + # TRANSFORM VERTEX 2 + lqc2 $vf12, 0x00($a2) # IN = tmp + vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row3[xyzw] * 1.0f; (vf0.w is 1) + vmaddax $ACC, $vf1, $vf12 # ACC[xyzw] = ACC[xyzw] + mvp.row0[xyzw] * IN.x + vmadday $ACC, $vf2, $vf12 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.y + vmaddz $vf13, $vf3, $vf12 # OUT[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.z + sqc2 $vf13, 0x10($a1) # dst[1] = TRANSFORMED(V1) + #vmul $vf12, $vf13, $vf5 # TMP = TRANSFORMED(V1) * CLIP_PLANES_ADJUST + #vclipw.xyz $vf12, $vf12 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w)) + + # LOAD VERTEX 3 + ld $t0,0x30($a0) # t0 = src[2].x,y + sd $t0,0x00($a2) # tmp.x,y = t0 + lw $t0,0x38($a0) # t0 = src[2].z + sw $t0,0x08($a2) # tmp.z = t0 + #cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] + #sw $t0,0x04($a3) # clip_flags[1] = t0 + + # TRANSFORM VERTEX 3 + lqc2 $vf14, 0x00($a2) # IN = tmp + vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row3[xyzw] * 1.0f; (vf0.w is 1) + vmaddax $ACC, $vf1, $vf14 # ACC[xyzw] = ACC[xyzw] + mvp.row0[xyzw] * IN.x + vmadday $ACC, $vf2, $vf14 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.y + vmaddz $vf15, $vf3, $vf14 # OUT[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.z + sqc2 $vf15, 0x20($a1) # dst[2] = TRANSFORMED(V2) + #vmul $vf14, $vf15, $vf5 # TMP = TRANSFORMED(V2) * CLIP_PLANES_ADJUST + #vclipw.xyz $vf14, $vf14 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w)) + + # LOAD VERTEX 4 + ld $t0,0x48($a0) # t0 = src[3].x,y + sd $t0,0x00($a2) # tmp.x,y = t0 + lw $t0,0x50($a0) # t0 = src[3].z + sw $t0,0x08($a2) # tmp.z = t0 + #cfc2 $t0, 
$18 # t0 = VP0_REGS[CLIP_FLAGS] + #sw $t0,0x08($a3) # clip_flags[2] = t0 + + # TRANSFORM VERTEX 4 + lqc2 $vf16, 0x00($a2) # IN = tmp + vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row3[xyzw] * 1.0f; (vf0.w is 1) + vmaddax $ACC, $vf1, $vf16 # ACC[xyzw] = ACC[xyzw] + mvp.row0[xyzw] * IN.x + vmadday $ACC, $vf2, $vf16 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.y + vmaddz $vf17, $vf3, $vf16 # OUT[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.z + #vmul $vf16, $vf17, $vf5 # TMP = TRANSFORMED(V3) * CLIP_PLANES_ADJUST + #vclipw.xyz $vf16, $vf16 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w)) + + # Desired output + # dst[0] = V0 + # dst[1] = V1 + # dst[2] = V2 + # dst[3] = V2 + # dst[4] = V3 + # dst[5] = V0 + sqc2 $vf15, 0x30($a1) # dst[3] = TRANSFORMED(V2) + sqc2 $vf17, 0x40($a1) # dst[4] = TRANSFORMED(V3) + sqc2 $vf11, 0x50($a1) # dst[5] = TRANSFORMED(V0) + #vnop # adjust for delay + #cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS] + #sw $t0,0x0C($a3) # clip_flags[3] = t0 + jr $ra + nop + +.align 4 + +.global ONE_VALUE +ONE_VALUE: .float 1.0 diff --git a/src/Graphics_PS2.c b/src/Graphics_PS2.c index bbbe92485..b235340f5 100644 --- a/src/Graphics_PS2.c +++ b/src/Graphics_PS2.c @@ -435,6 +435,7 @@ typedef struct Matrix VU0_MATRIX __attribute__((aligned(16))); typedef struct Vec4 VU0_VECTOR __attribute__((aligned(16))); static VU0_MATRIX mvp; +extern void LoadMvpMatrix(VU0_MATRIX* matrix); void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) { if (type == MATRIX_VIEW) _view = *matrix; @@ -442,14 +443,7 @@ void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) { Matrix_Mul(&mvp, &_view, &_proj); // TODO - asm __volatile__( - "lqc2 $vf1, 0x00(%0) \n" // vf1 = mvp.row1 - "lqc2 $vf2, 0x10(%0) \n" // vf2 = mvp.row2 - "lqc2 $vf3, 0x20(%0) \n" // vf3 = mvp.row3 - "lqc2 $vf4, 0x30(%0) \n" // vf4 = mvp.row4 - : - : "r" (&mvp) - ); + LoadMvpMatrix(&mvp); } void Gfx_LoadIdentityMatrix(MatrixType type) { @@ -619,6 +613,8 @@ static u64* DrawTexturedTriangle(u64* dw, VU0_VECTOR* 
coords, return dw; } +extern void TransformTexturedQuad(void* src, VU0_VECTOR* dst, VU0_VECTOR* tmp, int* clip_flags); + static void DrawTexturedTriangles(int verticesCount, int startVertex) { struct VertexTextured* v = (struct VertexTextured*)gfx_vertices + startVertex; qword_t* base = q; @@ -626,29 +622,22 @@ static void DrawTexturedTriangles(int verticesCount, int startVertex) { u64* dw = (u64*)q; unsigned numVerts = 0; - VU0_VECTOR V[4]; + VU0_VECTOR V[6], tmp; + int clip[4]; for (int i = 0; i < verticesCount / 4; i++, v += 4) { - TransformVertex(v + 0, &V[0]); - TransformVertex(v + 1, &V[1]); - TransformVertex(v + 2, &V[2]); - TransformVertex(v + 3, &V[3]); + TransformTexturedQuad(v, V, &tmp, clip); - // V0, V1, V2 + //if (((clip[0] | clip[1] | clip[2]) & 0x3F) == 0) { if (NotClipped(V[0]) && NotClipped(V[1]) && NotClipped(V[2])) { dw = DrawTexturedTriangle(dw, V, v + 0, v + 1, v + 2); numVerts += 3; } - - VU0_VECTOR v0 = V[0]; - V[0] = V[2]; - V[1] = V[3]; - V[2] = v0; - // V2, V3, V0 - if (NotClipped(V[0]) && NotClipped(V[1]) && NotClipped(V[2])) { - dw = DrawTexturedTriangle(dw, V, v + 2, v + 3, v + 0); + //if (((clip[2] | clip[3] | clip[0]) & 0x3F) == 0) { + if (NotClipped(V[3]) && NotClipped(V[4]) && NotClipped(V[5])) { + dw = DrawTexturedTriangle(dw, V + 3, v + 2, v + 3, v + 0); numVerts += 3; } } @@ -821,11 +810,36 @@ void Gfx_OnWindowResize(void) { Gfx_SetScissor( 0, 0, Game.Width, Game.Height); } +extern void LoadClipScaleFactors(VU0_VECTOR* scale); void Gfx_SetViewport(int x, int y, int w, int h) { vp_hwidth = w / 2; vp_hheight = h / 2; vp_originX = ftoi4(2048 - (x / 2)); vp_originY = -ftoi4(2048 - (y / 2)); + + // The code below clips to the viewport clip planes + // For e.g. 
X this is [2048 - vp_width / 2, 2048 + vp_width / 2] + // However the guard band itself ranges from 0 to 4096 + // To reduce need to clip, clip against guard band on X/Y axes instead + /*return + xAdj >= -pos.w && xAdj <= pos.w && + yAdj >= -pos.w && yAdj <= pos.w && + pos.z >= -pos.w && pos.z <= pos.w;*/ + + // Rescale clip planes to guard band extent: + // X/W * vp_hwidth <= vp_hwidth -- clipping against viewport + // X/W <= 1 + // X <= W + // X/W * vp_hwidth <= 2048 -- clipping against guard band + // X/W <= 2048 / vp_hwidth + // X * vp_hwidth / 2048 <= W + VU0_VECTOR scale; + scale.x = vp_hwidth / 2048.0f; + scale.y = vp_hheight / 2048.0f; + scale.z = 1.0f; + scale.w = 1.0f; + + LoadClipScaleFactors(&scale); } void Gfx_SetScissor(int x, int y, int w, int h) {