Dreamcast: Minorly optimise T&L to save a cycle

2025-09-15 02:25:32 -04:00 · 2025-05-31 12:06:47 +10:00 · 2025-05-31 12:06:47 +10:00 · 7bc1d6b70a
commit 7bc1d6b70a
parent 35747957b7
8 changed files with 61 additions and 177 deletions
--- a/misc/dreamcast/Makefile
+++ b/misc/dreamcast/Makefile
@ -11,9 +11,8 @@ CFLAGS	:= -g -DNDEBUG -O3 -fipa-pta -fno-pie -flto=auto -fomit-frame-pointer -fb
 DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d
 DEPFILES := $(OBJS:%.o=%.d)

-GLDC_LIB	= third_party/gldc/libGLdc.a
 LDFLAGS		= -g
-LIBS		= -lm $(GLDC_LIB) -lppp -lkosfat
+LIBS		= -lm -lppp -lkosfat

 ifeq ($(strip $(KOS_BASE)),)
 $(warning Please set KOS variables in your environment. For example:)
@ -37,7 +36,7 @@ $(BUILD_DIR):
 #---------------------------------------------------------------------------------
 # executable generation
 #---------------------------------------------------------------------------------
-$(TARGET).elf: $(OBJS) $(GLDC_LIB)
+$(TARGET).elf: $(OBJS)
 	kos-cc $(LDFLAGS) $^ -o $@ $(LIBS)
 	
 $(TARGET).bin: $(TARGET).elf
@ -66,10 +65,6 @@ $(TARGET).cdi: $(TARGET).iso
 #---------------------------------------------------------------------------------
 # object generation
 #---------------------------------------------------------------------------------
-$(GLDC_LIB): FORCE
-	$(MAKE) -C third_party/gldc
-FORCE: ;
-	
 $(BUILD_DIR)/%.o: src/%.c
 	kos-cc $(CFLAGS) $(DEPFLAGS) -c $< -o $@

--- a/misc/dreamcast/VertexClip2.S
+++ b/misc/dreamcast/VertexClip2.S
@ -11,13 +11,14 @@
 ! FR10 = invT
 ! FR11 = t

-#define TM1 r1 // temp register 1
-#define TM2 r3 // temp register 2
-
+! INPUT ARGUMENTS
 #define IN1 r4 // input vertex 1
 #define IN2 r5 // input vertex 2
 #define OUT r6 // output vertex
+#define TYP r7 // type/flags for output vertex

+#define TM1 r1 // temp register 1
+#define TM2 r3 // temp register 2
 #define CL1 r4 // input colour 1
 #define CL2 r5 // input colour 2
 #define CLO r7 // output colour
@ -60,6 +61,7 @@ _ClipEdge:
 	fmov.s  @TM1,fr11 ! LS, fr11 = v2->z
 	fsub     fr2,fr11 ! FE, fr11 = v2->z - v1->z
 	fldi0    fr8      ! LS, fr8  = 0
+	mov.l  TYP,@OUT   ! LS, OUT->flags = TYPE
 	fmul    fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z - v1->z)
 	fldi0    fr9      ! LS, fr9  = 0
 	fldi0    fr0      ! LS, fr0  = 0
@ -118,6 +120,7 @@ _ClipEdge:
 	cmp/eq  CL1,CL2   ! MT, T = ACOLOR == BCOLOR
 	bt.s    1f        ! BR, if (T) goto 1;
 	mov     CL1,CLO   ! MT, OUTCOLOR = ACOLOR (branch delay instruction)
+
 ! Interpolate B
 	extu.b  CL1,TM1   ! EX, val = ACOLOR.b
 	lds     TM1,fpul  ! CO, FPUL = val
@ -130,6 +133,7 @@ _ClipEdge:
 	ftrc    fr3,fpul  ! FE, FPUL = int(lerp)
 	shlr8   CL2       ! EX, BCOLOR >>= 8
 	sts     fpul,TM2  ! CO, tmp = FPUL
+
 ! Interpolate G
 	extu.b  CL1,TM1   ! EX, val = ACOLOR.g
 	lds     TM1,fpul  ! CO, FPUL = val
@ -144,6 +148,7 @@ _ClipEdge:
 	mov     TM2,CLO   ! MT, OUTCOLOR.b = tmp
 	shlr8   CL2       ! EX, BCOLOR >>= 8
 	sts     fpul,TM2  ! CO, tmp = FPUL
+
 ! Interpolate R
 	extu.b  CL1,TM1   ! EX, val = ACOLOR.r
 	lds     TM1,fpul  ! CO, FPUL = val
@ -160,6 +165,7 @@ _ClipEdge:
 	shlr8   CL2       ! EX, BCOLOR >>= 8
 	sts     fpul,TM2  ! CO, tmp = FPUL
 	
+! Interpolate A
 	extu.b  CL1,TM1   ! EX, val = ACOLOR.a
 	lds     TM1,fpul  ! CO, FPUL = val
 	float   fpul,fr2  ! EX, fr2 = float(FPUL)
--- a/src/Graphics_Dreamcast.c
+++ b/src/Graphics_Dreamcast.c
@ -9,8 +9,8 @@
 #include <kos.h>
 #include <dc/matrix.h>
 #include <dc/pvr.h>
-#include "../third_party/gldc/src/gldc.h"
-#include "../third_party/gldc/src/state.c"
+#include "../third_party/gldc/state.c"
+#include "../third_party/gldc/sh4.c"

 static cc_bool renderingDisabled;
 static cc_bool stateDirty;
--- a/third_party/gldc/Makefile
+++ b/third_party/gldc/Makefile
@ -1,20 +0,0 @@
-OBJS 	:= sh4.o
-
-C_FLAGS = -O3 -DNDEBUG -mfsrra -mfsca -fno-math-errno -ffp-contract=fast -ffast-math -O3 -mpretend-cmove -fexpensive-optimizations -fomit-frame-pointer -finline-functions -ml -m4-single-only -ffunction-sections -fdata-sections -std=gnu99
-
-C_DEFINES = -DDREAMCAST -DNDEBUG -D__DREAMCAST__ -D__arch_dreamcast -D_arch_dreamcast -D_arch_sub_pristine
-
-TARGET := libGLdc.a
-
-ifeq ($(strip $(KOS_BASE)),)
-$(error "Please set KOS variables in your environment.")
-endif
-
-default: $(TARGET)
-
-%.o: src/%.c
-	kos-cc $(C_DEFINES) $(C_FLAGS)  -c $< -o $@
-
-$(TARGET): $(OBJS)
-	kos-ar cr $@ $^
-	kos-ranlib $@
--- a/third_party/gldc/README.md
+++ b/third_party/gldc/README.md
@ -1,66 +0,0 @@
-This is a fork of GLdc optimised for the Dreamcast port of ClassiCube, and unfortunately is essentially useless for any other project
-
---
-
-# GLdc
-
-**Development of GLdc has moved to [Gitlab](https://gitlab.com/simulant/GLdc)**
-
-This is a partial implementation of OpenGL 1.2 for the SEGA Dreamcast for use
-with the KallistiOS SDK.
-
-It began as a fork of libGL by Josh Pearson but has undergone a large refactor
-which is essentially a rewrite.
-
-The aim is to implement as much of OpenGL 1.2 as possible, and to add additional
-features via extensions.
-
-Things left to (re)implement:
-
- - Spotlights (Trivial)
- - Framebuffer extension (Trivial)
- - Texture Matrix (Trivial)
- 
-Things I'd like to do:
-
- - Use a clean "gl.h"
- - Define an extension for modifier volumes
- - Add support for point sprites
- - Optimise, add unit tests for correctness
-
-# Compiling
-
-GLdc uses CMake for its build system, it currently ships with two "backends":
-
- - kospvr - This is the hardware-accelerated Dreamcast backend
- - software - This is a stub software rasterizer used for testing and debugging
- 
-To compile a Dreamcast debug build, you'll want to do something like the following:
-
-```
-mkdir dcbuild
-cd dcbuild
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" ..
-make
-```
-
-For a release build, replace the cmake line with the following:
-```
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
-```
-
-You will need KallistiOS compiled and configured (e.g. the KOS_BASE environment
-variable must be set)
-
-To compile for PC:
-
-```
-mkdir pcbuild
-cd pcbuild
-cmake -G "Unix Makefiles" ..
-make
-```
- 
-# Special Thanks!
-
- - Massive shout out to Hayden Kowalchuk for diagnosing and fixing a large number of bugs while porting GL Quake to the Dreamcast. Absolute hero!  
--- a/third_party/gldc/src/gldc.h
+++ b/third_party/gldc/src/gldc.h
@ -2,9 +2,6 @@
 #define PRIVATE_H
 #include <stdint.h>

-#define GLenum     unsigned int
-#define GLboolean  unsigned char
-
 #define GLDC_FORCE_INLINE __attribute__((always_inline)) inline
 #define GLDC_NO_INLINE    __attribute__((noinline))

@ -24,6 +21,6 @@ typedef struct {
    uint16_t height;
 } TextureObject;

-void SceneListSubmit(Vertex* v2, int n);
+void GLDC_NO_INLINE SceneListSubmit(Vertex* v2, int n);

 #endif // PRIVATE_H
--- a/third_party/gldc/src/sh4.c
+++ b/third_party/gldc/src/sh4.c
@ -15,24 +15,21 @@ static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
  return x;
 }

-static GLDC_FORCE_INLINE float _glFastInvert(float x) {
-    return sh4_fsrra(x * x);
-}
-
 static GLDC_FORCE_INLINE void PushVertex(Vertex* v) {
    volatile Vertex* dst = (Vertex*)(sq);
-    float f = _glFastInvert(v->w);
-    // Convert to NDC (viewport already applied)
-    float x = v->x * f;
-    float y = v->y * f;
-
+	float ww   = v->w * v->w;
    dst->flags = v->flags;
-    dst->x = x;
-    dst->y = y;
-    dst->z = f;
-    dst->u = v->u;
-    dst->v = v->v;
-    dst->bgra = v->bgra;
+    float f    = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
+    // Convert to NDC (viewport already applied)
+    float x    = v->x * f;
+    float y    = v->y * f;
+
+    dst->x     = x;
+    dst->y     = y;
+    dst->z     = f;
+    dst->u     = v->u;
+    dst->v     = v->v;
+    dst->bgra  = v->bgra;
    __asm__("pref @%0" : : "r"(dst));
    dst++;
 }
@ -51,7 +48,7 @@ static inline void PushCommand(Vertex* v)  {
    sq += 8;
 }

-extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout);
+extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout, int type);

 #define V0_VIS (1 << 0)
 #define V1_VIS (1 << 1)
@ -74,10 +71,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        // .....A....B...
        //    /      |
        //  v3--v2---v1
-        ClipEdge(v3, v0, a);
-        a->flags = PVR_CMD_VERTEX_EOL;
-        ClipEdge(v0, v1, b);
-        b->flags = PVR_CMD_VERTEX;
+        ClipEdge(v3, v0, a, PVR_CMD_VERTEX_EOL);
+        ClipEdge(v0, v1, b, PVR_CMD_VERTEX);

        PushVertex(v0);
        PushVertex(b);
@ -92,10 +87,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        // ....A.....B...
        //    /      |
        //  v0--v3---v2
-        ClipEdge(v0, v1, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v1, v2, b);
-        b->flags = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
+        ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL);

        PushVertex(a);
        PushVertex(v1);
@ -109,11 +102,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        // ....A.....B...
        //    /      |
        //  v1--v0---v3
-
-        ClipEdge(v1, v2, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v2, v3, b);
-        b->flags = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
+        ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL);

        PushVertex(a);
        PushVertex(v2);
@ -127,10 +117,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        // ....A.....B...
        //    /      |
        //  v2--v1---v0
-        ClipEdge(v2, v3, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v3, v0, b);
-        b->flags = PVR_CMD_VERTEX;
+        ClipEdge(v2, v3, a, PVR_CMD_VERTEX);
+        ClipEdge(v3, v0, b, PVR_CMD_VERTEX);

        PushVertex(b);
        PushVertex(a);
@ -144,10 +132,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //   ....B..........A...
        //         \        |
        //          v3-----v2
-        ClipEdge(v1, v2, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v3, v0, b);
-        b->flags = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
+        ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL);

        PushVertex(v1);
        PushVertex(a);
@ -162,10 +148,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //   ....B..........A...
        //         \        |
        //          v2-----v1
-        ClipEdge(v0, v1, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v2, v3, b);
-        b->flags = PVR_CMD_VERTEX;
+        ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
+        ClipEdge(v2, v3, b, PVR_CMD_VERTEX);

        PushVertex(a);
        PushVertex(b);
@ -179,10 +163,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //   ....B..........A...
        //         \        |
        //          v0-----v3
-        ClipEdge(v2, v3, a);
-        a->flags = PVR_CMD_VERTEX_EOL;
-        ClipEdge(v0, v1, b);
-        b->flags = PVR_CMD_VERTEX;
+        ClipEdge(v2, v3, a, PVR_CMD_VERTEX_EOL);
+        ClipEdge(v0, v1, b, PVR_CMD_VERTEX);

        PushVertex(v1);
        PushVertex(v2);
@ -197,10 +179,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //   ....B..........A...
        //         \        |
        //          v1-----v0
-        ClipEdge(v3, v0, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v1, v2, b);
-        b->flags = PVR_CMD_VERTEX;
+        ClipEdge(v3, v0, a, PVR_CMD_VERTEX);
+        ClipEdge(v1, v2, b, PVR_CMD_VERTEX);

        PushVertex(b);
        PushVertex(v2);
@ -216,10 +196,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //          \   |
        //            v3
        // v1,v2,v0  v2,v0,A  v0,A,B
-        ClipEdge(v2, v3, a);
-        a->flags = PVR_CMD_VERTEX;
-        ClipEdge(v3, v0, b);
-        b->flags = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v2, v3, a, PVR_CMD_VERTEX);
+        ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL);

        PushVertex(v1);
        PushVertex(v2);
@ -236,10 +214,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //          \   |
        //            v2
        // v0,v1,v3  v1,v3,A  v3,A,B
-        ClipEdge(v1, v2, a);
-        a->flags  = PVR_CMD_VERTEX;
-        ClipEdge(v2, v3, b);
-        b->flags  = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
+        ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL);
        v3->flags = PVR_CMD_VERTEX;

        PushVertex(v0);
@ -257,10 +233,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //          \   |
        //            v1
        // v3,v0,v2  v0,v2,A  v2,A,B
-        ClipEdge(v0, v1, a);
-        a->flags  = PVR_CMD_VERTEX;
-        ClipEdge(v1, v2, b);
-        b->flags  = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
+        ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL);
        v3->flags = PVR_CMD_VERTEX;

        PushVertex(v3);
@ -278,10 +252,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
        //          \   |
        //            v0
        // v2,v3,v1  v3,v1,A  v1,A,B
-        ClipEdge(v3, v0, a);
-        a->flags  = PVR_CMD_VERTEX;
-        ClipEdge(v0, v1, b);
-        b->flags  = PVR_CMD_VERTEX_EOL;
+        ClipEdge(v3, v0, a, PVR_CMD_VERTEX);
+        ClipEdge(v0, v1, b, PVR_CMD_VERTEX_EOL);
        v3->flags = PVR_CMD_VERTEX;

        PushVertex(v2);
--- a/third_party/gldc/src/state.c
+++ b/third_party/gldc/src/state.c
@ -3,21 +3,21 @@

 static TextureObject* TEXTURE_ACTIVE;

-static GLboolean DEPTH_TEST_ENABLED;
-static GLboolean DEPTH_MASK_ENABLED;
+static uint8_t  DEPTH_TEST_ENABLED;
+static uint8_t  DEPTH_MASK_ENABLED;

-static GLboolean CULLING_ENABLED;
+static uint8_t  CULLING_ENABLED;

-static GLboolean FOG_ENABLED;
-static GLboolean ALPHA_TEST_ENABLED;
+static uint8_t  FOG_ENABLED;
+static uint8_t  ALPHA_TEST_ENABLED;

-static GLboolean SCISSOR_TEST_ENABLED;
-static GLenum SHADE_MODEL = PVR_SHADE_GOURAUD;
+static uint8_t  SCISSOR_TEST_ENABLED;
+static uint32_t SHADE_MODEL = PVR_SHADE_GOURAUD;

-static GLboolean BLEND_ENABLED;
+static uint8_t  BLEND_ENABLED;

-static GLboolean TEXTURES_ENABLED;
-static GLboolean AUTOSORT_ENABLED;
+static uint8_t  TEXTURES_ENABLED;
+static uint8_t  AUTOSORT_ENABLED;

 static inline int DimensionFlag(int w) {
    switch(w) {