PS2: Slightly optimise vertex transform

This commit is contained in:
UnknownShadow200 2024-06-22 20:35:06 +10:00
parent 90643e8077
commit bd223eb457
7 changed files with 201 additions and 33 deletions

View File

@ -1,4 +1,4 @@
#include "ViewportTransform.S"
#include "VertexTransform.S"
.global _DrawColouredQuads
.align 4
.type _DrawColouredQuads,%function

View File

@ -1,4 +1,4 @@
#include "ViewportTransform.S"
#include "VertexTransform.S"
.global _DrawTexturedQuads
.align 4
.type _DrawTexturedQuads,%function

View File

@ -77,7 +77,7 @@ $(BUILD_DIR)/%.o: third_party/bearssl/src/%.c
kos-cc $(CFLAGS) -c $< -o $@
$(BUILD_DIR)/%.o: misc/dreamcast/%.S
kos-cc -c $< -o $@
kos-cc $(DEPFLAGS) -c $< -o $@
# Dependency tracking
$(DEPFILES):

View File

@ -1,10 +1,17 @@
ifeq ($(strip $(PS2SDK)),)
$(error "PS2SDK must be set in your environment")
endif
SOURCE_DIRS := src misc/ps2
BUILD_DIR = build-ps2
CFILES := $(wildcard src/*.c)
OBJS := $(patsubst src/%.c, $(BUILD_DIR)/%.o, $(CFILES))
S_FILES := $(foreach dir,$(SOURCE_DIRS),$(wildcard $(dir)/*.S))
C_FILES := $(foreach dir,$(SOURCE_DIRS),$(wildcard $(dir)/*.c))
OBJS := $(addprefix $(BUILD_DIR)/, $(notdir $(C_FILES:%.c=%.o) $(S_FILES:%.S=%.o)))
# Dependency tracking
# Compiler flags that write a .d dependency file alongside each object:
#   -MT $@   use the object file being built as the target name inside the .d file
#   -MMD     record the user headers the source includes (system headers are left out)
#   -MP      add an empty phony rule per header, so a deleted header doesn't break the build
#   -MF ...  write the dependency output to $(BUILD_DIR)/<stem>.d
# Deliberately recursive (=) rather than simple (:=), because $@ and $* only
# have values while a recipe is being expanded
DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d
# One .d file per entry in OBJS
DEPFILES := $(OBJS:%.o=%.d)
IOP_MODS:= DEV9_irx.o NETMAN_irx.o SMAP_irx.o USBD_irx.o BDM_irx.o BDMFS_FATFS_irx.o USBMASS_BD_irx.o USBHDFSD_irx.o USBMOUSE_irx.o USBKBD_irx.o
EE_BIN = ClassiCube-ps2.elf
@ -23,6 +30,8 @@ clean:
$(BUILD_DIR):
mkdir -p $@
include $(PS2SDK)/samples/Makefile.pref
# Networking IRX modules
$(BUILD_DIR)/DEV9_irx.c: $(PS2SDK)/iop/irx/ps2dev9.irx
@ -58,17 +67,27 @@ $(BUILD_DIR)/USBKBD_irx.c: $(PS2SDK)/iop/irx/ps2kbd.irx
bin2c $< $@ USBKBD_irx
include $(PS2SDK)/samples/Makefile.pref
#---------------------------------------------------------------------------------
# executable generation
#---------------------------------------------------------------------------------
$(EE_BIN): $(EE_OBJS)
$(EE_CC) -T$(EE_LINKFILE) -O2 -o $(EE_BIN) $(EE_OBJS) $(EE_LDFLAGS) $(EE_LIBS)
#---------------------------------------------------------------------------------
# object generation
#---------------------------------------------------------------------------------
$(BUILD_DIR)/%.o: src/%.c
$(EE_CC) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@
$(EE_CC) $(DEPFLAGS) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@
$(BUILD_DIR)/%.o: $(BUILD_DIR)/%.c # IOP modules
$(EE_CC) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@
$(BUILD_DIR)/%.o: %.S
$(EE_CC) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@
$(BUILD_DIR)/%.o: misc/ps2/%.S
$(EE_CC) $(DEPFLAGS) $(EE_CFLAGS) $(EE_INCS) -c $< -o $@
$(EE_BIN): $(EE_OBJS)
$(EE_CC) -T$(EE_LINKFILE) -O2 -o $(EE_BIN) $(EE_OBJS) $(EE_LDFLAGS) $(EE_LIBS)
# Dependency tracking
# Declares every .d file as a target with no recipe and no prerequisites,
# so make never complains that it has no rule to create one
$(DEPFILES):
# Only include the .d files that already exist (none do on a clean build)
include $(wildcard $(DEPFILES))

135
misc/ps2/VertexTransform.S Normal file
View File

@ -0,0 +1,135 @@
# REGISTER USAGE
# vf0 = hardwired to (0,0,0,1)
# vf1 = mvp.row1
# vf2 = mvp.row2
# vf3 = mvp.row3
# vf4 = mvp.row4
# vf5 = clipping scale adjustments to match guardbands
# NOTE: vclipw.xyz takes 4 cycles to produce result, which must be accounted for
.align 4
.global LoadMvpMatrix
.type LoadMvpMatrix,%function
.global LoadClipScaleFactors
.type LoadClipScaleFactors,%function
.global TransformTexturedQuad
.type TransformTexturedQuad,%function
# Loads the 4 rows of the MVP matrix into VU0 registers vf1-vf4
# $a0 = address of mvp (4 rows, 16 bytes per row)
# NOTE(review): lqc2 transfers a whole quadword, so mvp should be 16 byte aligned
#  (the C side declares it with an aligned(16) type)
LoadMvpMatrix:
lqc2 $vf1, 0x00($a0) # vf1 = mvp.row1
lqc2 $vf2, 0x10($a0) # vf2 = mvp.row2
lqc2 $vf3, 0x20($a0) # vf3 = mvp.row3
lqc2 $vf4, 0x30($a0) # vf4 = mvp.row4
jr $ra # return to caller
nop # (presumably fills the jr delay slot)
# Loads clipping scaling factors into VU0 register vf5
# $a0 = address of factors (one 16 byte vector)
LoadClipScaleFactors:
lqc2 $vf5, 0x00($a0) # vf5 = factors
jr $ra # return to caller
nop # (presumably fills the jr delay slot)
# Transforms the 4 vertices of a quad by the MVP matrix held in vf1-vf4,
#  and writes the results out as 2 triangles (6 vertices of 16 bytes each)
# Source vertices are 24 bytes apart, and only x,y,z (the first 12 bytes) are read
# $a0 = address of src vertices
# $a1 = address of dst vertices (0x60 bytes are written)
# $a2 = address of tmp vertex (16 byte scratch used to build (x,y,z,1) for lqc2)
# $a3 = address of clip flags (currently unused, as the clip flag code is commented out)
# Clobbers $t0, vf10-vf17 and ACC
# NOTE(review): ld/sd assume src and tmp are at least 8 byte aligned - TODO confirm for src
TransformTexturedQuad:
# LOAD 1.0 into W
lw $t0,ONE_VALUE # t0 = 1.0f
sw $t0,0x0C($a2) # tmp.w = 1.0f (only x,y,z get rewritten per vertex, so this stays set)
# LOAD VERTEX 1
ld $t0,0x00($a0) # t0 = src[0].x,y
sd $t0,0x00($a2) # tmp.x,y = t0
lw $t0,0x08($a0) # t0 = src[0].z
sw $t0,0x08($a2) # tmp.z = t0
# TRANSFORM VERTEX 1
lqc2 $vf10, 0x00($a2) # IN = tmp
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf10 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf10 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf11, $vf3, $vf10 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
sqc2 $vf11, 0x00($a1) # dst[0] = TRANSFORMED(V0)
#vmul $vf10, $vf11, $vf5 # TMP = TRANSFORMED(V0) * CLIP_PLANES_ADJUST
#vclipw.xyz $vf10, $vf10 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
# LOAD VERTEX 2
ld $t0,0x18($a0) # t0 = src[1].x,y
sd $t0,0x00($a2) # tmp.x,y = t0
lw $t0,0x20($a0) # t0 = src[1].z
sw $t0,0x08($a2) # tmp.z = t0
#cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS]
#sw $t0,0x00($a3) # clip_flags[0] = t0
# TRANSFORM VERTEX 2
lqc2 $vf12, 0x00($a2) # IN = tmp
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf12 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf12 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf13, $vf3, $vf12 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
sqc2 $vf13, 0x10($a1) # dst[1] = TRANSFORMED(V1)
#vmul $vf12, $vf13, $vf5 # TMP = TRANSFORMED(V1) * CLIP_PLANES_ADJUST
#vclipw.xyz $vf12, $vf12 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
# LOAD VERTEX 3
ld $t0,0x30($a0) # t0 = src[2].x,y
sd $t0,0x00($a2) # tmp.x,y = t0
lw $t0,0x38($a0) # t0 = src[2].z
sw $t0,0x08($a2) # tmp.z = t0
#cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS]
#sw $t0,0x04($a3) # clip_flags[1] = t0
# TRANSFORM VERTEX 3
lqc2 $vf14, 0x00($a2) # IN = tmp
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf14 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf14 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf15, $vf3, $vf14 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
sqc2 $vf15, 0x20($a1) # dst[2] = TRANSFORMED(V2)
#vmul $vf14, $vf15, $vf5 # TMP = TRANSFORMED(V2) * CLIP_PLANES_ADJUST
#vclipw.xyz $vf14, $vf14 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
# LOAD VERTEX 4
ld $t0,0x48($a0) # t0 = src[3].x,y
sd $t0,0x00($a2) # tmp.x,y = t0
lw $t0,0x50($a0) # t0 = src[3].z
sw $t0,0x08($a2) # tmp.z = t0
#cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS]
#sw $t0,0x08($a3) # clip_flags[2] = t0
# TRANSFORM VERTEX 4
lqc2 $vf16, 0x00($a2) # IN = tmp
vmulaw $ACC, $vf4, $vf0 # ACC[xyzw] = mvp.row4[xyzw] * 1.0f; (vf0.w is 1)
vmaddax $ACC, $vf1, $vf16 # ACC[xyzw] = ACC[xyzw] + mvp.row1[xyzw] * IN.x
vmadday $ACC, $vf2, $vf16 # ACC[xyzw] = ACC[xyzw] + mvp.row2[xyzw] * IN.y
vmaddz $vf17, $vf3, $vf16 # OUT[xyzw] = ACC[xyzw] + mvp.row3[xyzw] * IN.z
#vmul $vf16, $vf17, $vf5 # TMP = TRANSFORMED(V3) * CLIP_PLANES_ADJUST
#vclipw.xyz $vf16, $vf16 # CLIP_FLAGS.append(CLIP(TMP.xyz, TMP.w))
# Desired output
# dst[0] = V0
# dst[1] = V1
# dst[2] = V2
# dst[3] = V2
# dst[4] = V3
# dst[5] = V0
# dst[0..2] were already stored above, so only the second triangle remains
sqc2 $vf15, 0x30($a1) # dst[3] = TRANSFORMED(V2)
sqc2 $vf17, 0x40($a1) # dst[4] = TRANSFORMED(V3)
sqc2 $vf11, 0x50($a1) # dst[5] = TRANSFORMED(V0)
#vnop # adjust for delay
#cfc2 $t0, $18 # t0 = VP0_REGS[CLIP_FLAGS]
#sw $t0,0x0C($a3) # clip_flags[3] = t0
jr $ra # return to caller
nop # (presumably fills the jr delay slot)
# Constant 1.0f, loaded by TransformTexturedQuad to fill in the W component of tmp
.align 4
.global ONE_VALUE
ONE_VALUE: .float 1.0

View File

@ -435,6 +435,7 @@ typedef struct Matrix VU0_MATRIX __attribute__((aligned(16)));
typedef struct Vec4 VU0_VECTOR __attribute__((aligned(16)));
static VU0_MATRIX mvp;
extern void LoadMvpMatrix(VU0_MATRIX* matrix);
void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) {
if (type == MATRIX_VIEW) _view = *matrix;
@ -442,14 +443,7 @@ void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) {
Matrix_Mul(&mvp, &_view, &_proj);
// TODO
asm __volatile__(
"lqc2 $vf1, 0x00(%0) \n" // vf1 = mvp.row1
"lqc2 $vf2, 0x10(%0) \n" // vf2 = mvp.row2
"lqc2 $vf3, 0x20(%0) \n" // vf3 = mvp.row3
"lqc2 $vf4, 0x30(%0) \n" // vf4 = mvp.row4
:
: "r" (&mvp)
);
LoadMvpMatrix(&mvp);
}
void Gfx_LoadIdentityMatrix(MatrixType type) {
@ -619,6 +613,8 @@ static u64* DrawTexturedTriangle(u64* dw, VU0_VECTOR* coords,
return dw;
}
extern void TransformTexturedQuad(void* src, VU0_VECTOR* dst, VU0_VECTOR* tmp, int* clip_flags);
static void DrawTexturedTriangles(int verticesCount, int startVertex) {
struct VertexTextured* v = (struct VertexTextured*)gfx_vertices + startVertex;
qword_t* base = q;
@ -626,29 +622,22 @@ static void DrawTexturedTriangles(int verticesCount, int startVertex) {
u64* dw = (u64*)q;
unsigned numVerts = 0;
VU0_VECTOR V[4];
VU0_VECTOR V[6], tmp;
int clip[4];
for (int i = 0; i < verticesCount / 4; i++, v += 4)
{
TransformVertex(v + 0, &V[0]);
TransformVertex(v + 1, &V[1]);
TransformVertex(v + 2, &V[2]);
TransformVertex(v + 3, &V[3]);
TransformTexturedQuad(v, V, &tmp, clip);
// V0, V1, V2
//if (((clip[0] | clip[1] | clip[2]) & 0x3F) == 0) {
if (NotClipped(V[0]) && NotClipped(V[1]) && NotClipped(V[2])) {
dw = DrawTexturedTriangle(dw, V, v + 0, v + 1, v + 2);
numVerts += 3;
}
VU0_VECTOR v0 = V[0];
V[0] = V[2];
V[1] = V[3];
V[2] = v0;
// V2, V3, V0
if (NotClipped(V[0]) && NotClipped(V[1]) && NotClipped(V[2])) {
dw = DrawTexturedTriangle(dw, V, v + 2, v + 3, v + 0);
//if (((clip[2] | clip[3] | clip[0]) & 0x3F) == 0) {
if (NotClipped(V[3]) && NotClipped(V[4]) && NotClipped(V[5])) {
dw = DrawTexturedTriangle(dw, V + 3, v + 2, v + 3, v + 0);
numVerts += 3;
}
}
@ -821,11 +810,36 @@ void Gfx_OnWindowResize(void) {
Gfx_SetScissor( 0, 0, Game.Width, Game.Height);
}
extern void LoadClipScaleFactors(VU0_VECTOR* scale);
void Gfx_SetViewport(int x, int y, int w, int h) {
	VU0_VECTOR guardBandScale;

	vp_hwidth  = w / 2;
	vp_hheight = h / 2;
	vp_originX =  ftoi4(2048 - (x / 2));
	vp_originY = -ftoi4(2048 - (y / 2));

	// A vertex lies inside the viewport clip planes when
	//   -W <= X <= W,  -W <= Y <= W,  -W <= Z <= W
	// On the X axis that corresponds to [2048 - vp_width / 2, 2048 + vp_width / 2],
	//  whereas the guard band itself spans 0 to 4096
	// So to cut down on how often clipping is required, the X/Y axes
	//  are tested against the guard band rather than the viewport
	//
	// Deriving the X factor (Y works the same way with vp_hheight):
	//   viewport:   X/W * vp_hwidth <= vp_hwidth
	//               X/W             <= 1
	//               X               <= W
	//   guard band: X/W * vp_hwidth <= 2048
	//               X/W             <= 2048 / vp_hwidth
	//               X * vp_hwidth / 2048 <= W
	// Z and W are left unscaled
	guardBandScale.w = 1.0f;
	guardBandScale.z = 1.0f;
	guardBandScale.y = vp_hheight / 2048.0f;
	guardBandScale.x = vp_hwidth  / 2048.0f;

	// Hand the factors over to the VU0 side
	LoadClipScaleFactors(&guardBandScale);
}
void Gfx_SetScissor(int x, int y, int w, int h) {