Dreamcast: Minorly optimise T&L to save a cycle

This commit is contained in:
UnknownShadow200 2025-05-31 12:06:47 +10:00
parent 35747957b7
commit 7bc1d6b70a
8 changed files with 61 additions and 177 deletions

View File

@ -11,9 +11,8 @@ CFLAGS := -g -DNDEBUG -O3 -fipa-pta -fno-pie -flto=auto -fomit-frame-pointer -fb
DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d
DEPFILES := $(OBJS:%.o=%.d) DEPFILES := $(OBJS:%.o=%.d)
GLDC_LIB = third_party/gldc/libGLdc.a
LDFLAGS = -g LDFLAGS = -g
LIBS = -lm $(GLDC_LIB) -lppp -lkosfat LIBS = -lm -lppp -lkosfat
ifeq ($(strip $(KOS_BASE)),) ifeq ($(strip $(KOS_BASE)),)
$(warning Please set KOS variables in your environment. For example:) $(warning Please set KOS variables in your environment. For example:)
@ -37,7 +36,7 @@ $(BUILD_DIR):
#--------------------------------------------------------------------------------- #---------------------------------------------------------------------------------
# executable generation # executable generation
#--------------------------------------------------------------------------------- #---------------------------------------------------------------------------------
$(TARGET).elf: $(OBJS) $(GLDC_LIB) $(TARGET).elf: $(OBJS)
kos-cc $(LDFLAGS) $^ -o $@ $(LIBS) kos-cc $(LDFLAGS) $^ -o $@ $(LIBS)
$(TARGET).bin: $(TARGET).elf $(TARGET).bin: $(TARGET).elf
@ -66,10 +65,6 @@ $(TARGET).cdi: $(TARGET).iso
#--------------------------------------------------------------------------------- #---------------------------------------------------------------------------------
# object generation # object generation
#--------------------------------------------------------------------------------- #---------------------------------------------------------------------------------
$(GLDC_LIB): FORCE
$(MAKE) -C third_party/gldc
FORCE: ;
$(BUILD_DIR)/%.o: src/%.c $(BUILD_DIR)/%.o: src/%.c
kos-cc $(CFLAGS) $(DEPFLAGS) -c $< -o $@ kos-cc $(CFLAGS) $(DEPFLAGS) -c $< -o $@

View File

@ -11,13 +11,14 @@
! FR10 = invT ! FR10 = invT
! FR11 = t ! FR11 = t
#define TM1 r1 // temp register 1 ! INPUT ARGUMENTS
#define TM2 r3 // temp register 2
#define IN1 r4 // input vertex 1 #define IN1 r4 // input vertex 1
#define IN2 r5 // input vertex 2 #define IN2 r5 // input vertex 2
#define OUT r6 // output vertex #define OUT r6 // output vertex
#define TYP r7 // type/flags for output vertex
#define TM1 r1 // temp register 1
#define TM2 r3 // temp register 2
#define CL1 r4 // input colour 1 #define CL1 r4 // input colour 1
#define CL2 r5 // input colour 2 #define CL2 r5 // input colour 2
#define CLO r7 // output colour #define CLO r7 // output colour
@ -60,6 +61,7 @@ _ClipEdge:
fmov.s @TM1,fr11 ! LS, fr11 = v2->z fmov.s @TM1,fr11 ! LS, fr11 = v2->z
fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z
fldi0 fr8 ! LS, fr8 = 0 fldi0 fr8 ! LS, fr8 = 0
mov.l TYP,@OUT ! LS, OUT->flags = TYPE
fmul fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z - v1->z) fmul fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z - v1->z)
fldi0 fr9 ! LS, fr9 = 0 fldi0 fr9 ! LS, fr9 = 0
fldi0 fr0 ! LS, fr0 = 0 fldi0 fr0 ! LS, fr0 = 0
@ -118,6 +120,7 @@ _ClipEdge:
cmp/eq CL1,CL2 ! MT, T = ACOLOR == BCOLOR cmp/eq CL1,CL2 ! MT, T = ACOLOR == BCOLOR
bt.s 1f ! BR, if (T) goto 1; bt.s 1f ! BR, if (T) goto 1;
mov CL1,CLO ! MT, OUTCOLOR = ACOLOR (branch delay instruction) mov CL1,CLO ! MT, OUTCOLOR = ACOLOR (branch delay instruction)
! Interpolate B ! Interpolate B
extu.b CL1,TM1 ! EX, val = ACOLOR.b extu.b CL1,TM1 ! EX, val = ACOLOR.b
lds TM1,fpul ! CO, FPUL = val lds TM1,fpul ! CO, FPUL = val
@ -130,6 +133,7 @@ _ClipEdge:
ftrc fr3,fpul ! FE, FPUL = int(lerp) ftrc fr3,fpul ! FE, FPUL = int(lerp)
shlr8 CL2 ! EX, BCOLOR >>= 8 shlr8 CL2 ! EX, BCOLOR >>= 8
sts fpul,TM2 ! CO, tmp = FPUL sts fpul,TM2 ! CO, tmp = FPUL
! Interpolate G ! Interpolate G
extu.b CL1,TM1 ! EX, val = ACOLOR.g extu.b CL1,TM1 ! EX, val = ACOLOR.g
lds TM1,fpul ! CO, FPUL = val lds TM1,fpul ! CO, FPUL = val
@ -144,6 +148,7 @@ _ClipEdge:
mov TM2,CLO ! MT, OUTCOLOR.b = tmp mov TM2,CLO ! MT, OUTCOLOR.b = tmp
shlr8 CL2 ! EX, BCOLOR >>= 8 shlr8 CL2 ! EX, BCOLOR >>= 8
sts fpul,TM2 ! CO, tmp = FPUL sts fpul,TM2 ! CO, tmp = FPUL
! Interpolate R ! Interpolate R
extu.b CL1,TM1 ! EX, val = ACOLOR.r extu.b CL1,TM1 ! EX, val = ACOLOR.r
lds TM1,fpul ! CO, FPUL = val lds TM1,fpul ! CO, FPUL = val
@ -160,6 +165,7 @@ _ClipEdge:
shlr8 CL2 ! EX, BCOLOR >>= 8 shlr8 CL2 ! EX, BCOLOR >>= 8
sts fpul,TM2 ! CO, tmp = FPUL sts fpul,TM2 ! CO, tmp = FPUL
! Interpolate A
extu.b CL1,TM1 ! EX, val = ACOLOR.a extu.b CL1,TM1 ! EX, val = ACOLOR.a
lds TM1,fpul ! CO, FPUL = val lds TM1,fpul ! CO, FPUL = val
float fpul,fr2 ! EX, fr2 = float(FPUL) float fpul,fr2 ! EX, fr2 = float(FPUL)

View File

@ -9,8 +9,8 @@
#include <kos.h> #include <kos.h>
#include <dc/matrix.h> #include <dc/matrix.h>
#include <dc/pvr.h> #include <dc/pvr.h>
#include "../third_party/gldc/src/gldc.h" #include "../third_party/gldc/state.c"
#include "../third_party/gldc/src/state.c" #include "../third_party/gldc/sh4.c"
static cc_bool renderingDisabled; static cc_bool renderingDisabled;
static cc_bool stateDirty; static cc_bool stateDirty;

View File

@ -1,20 +0,0 @@
OBJS := sh4.o
C_FLAGS = -O3 -DNDEBUG -mfsrra -mfsca -fno-math-errno -ffp-contract=fast -ffast-math -O3 -mpretend-cmove -fexpensive-optimizations -fomit-frame-pointer -finline-functions -ml -m4-single-only -ffunction-sections -fdata-sections -std=gnu99
C_DEFINES = -DDREAMCAST -DNDEBUG -D__DREAMCAST__ -D__arch_dreamcast -D_arch_dreamcast -D_arch_sub_pristine
TARGET := libGLdc.a
ifeq ($(strip $(KOS_BASE)),)
$(error "Please set KOS variables in your environment.")
endif
default: $(TARGET)
%.o: src/%.c
kos-cc $(C_DEFINES) $(C_FLAGS) -c $< -o $@
$(TARGET): $(OBJS)
kos-ar cr $@ $^
kos-ranlib $@

View File

@ -1,66 +0,0 @@
This is a fork of GLdc optimised for the Dreamcast port of ClassiCube, and unfortunately is essentially useless for any other project
---
# GLdc
**Development of GLdc has moved to [Gitlab](https://gitlab.com/simulant/GLdc)**
This is a partial implementation of OpenGL 1.2 for the SEGA Dreamcast for use
with the KallistiOS SDK.
It began as a fork of libGL by Josh Pearson but has undergone a large refactor
which is essentially a rewrite.
The aim is to implement as much of OpenGL 1.2 as possible, and to add additional
features via extensions.
Things left to (re)implement:
- Spotlights (Trivial)
- Framebuffer extension (Trivial)
- Texture Matrix (Trivial)
Things I'd like to do:
- Use a clean "gl.h"
- Define an extension for modifier volumes
- Add support for point sprites
- Optimise, add unit tests for correctness
# Compiling
GLdc uses CMake for its build system; it currently ships with two "backends":
- kospvr - This is the hardware-accelerated Dreamcast backend
- software - This is a stub software rasterizer used for testing and debugging
To compile a Dreamcast debug build, you'll want to do something like the following:
```
mkdir dcbuild
cd dcbuild
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" ..
make
```
For a release build, replace the cmake line with the following:
```
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
```
You will need KallistiOS compiled and configured (e.g. the KOS_BASE environment
variable must be set)
To compile for PC:
```
mkdir pcbuild
cd pcbuild
cmake -G "Unix Makefiles" ..
make
```
# Special Thanks!
- Massive shout out to Hayden Kowalchuk for diagnosing and fixing a large number of bugs while porting GL Quake to the Dreamcast. Absolute hero!

View File

@ -2,9 +2,6 @@
#define PRIVATE_H #define PRIVATE_H
#include <stdint.h> #include <stdint.h>
#define GLenum unsigned int
#define GLboolean unsigned char
#define GLDC_FORCE_INLINE __attribute__((always_inline)) inline #define GLDC_FORCE_INLINE __attribute__((always_inline)) inline
#define GLDC_NO_INLINE __attribute__((noinline)) #define GLDC_NO_INLINE __attribute__((noinline))
@ -24,6 +21,6 @@ typedef struct {
uint16_t height; uint16_t height;
} TextureObject; } TextureObject;
void SceneListSubmit(Vertex* v2, int n); void GLDC_NO_INLINE SceneListSubmit(Vertex* v2, int n);
#endif // PRIVATE_H #endif // PRIVATE_H

View File

@ -15,24 +15,21 @@ static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
return x; return x;
} }
static GLDC_FORCE_INLINE float _glFastInvert(float x) {
return sh4_fsrra(x * x);
}
static GLDC_FORCE_INLINE void PushVertex(Vertex* v) { static GLDC_FORCE_INLINE void PushVertex(Vertex* v) {
volatile Vertex* dst = (Vertex*)(sq); volatile Vertex* dst = (Vertex*)(sq);
float f = _glFastInvert(v->w); float ww = v->w * v->w;
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
dst->flags = v->flags; dst->flags = v->flags;
dst->x = x; float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
dst->y = y; // Convert to NDC (viewport already applied)
dst->z = f; float x = v->x * f;
dst->u = v->u; float y = v->y * f;
dst->v = v->v;
dst->bgra = v->bgra; dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst)); __asm__("pref @%0" : : "r"(dst));
dst++; dst++;
} }
@ -51,7 +48,7 @@ static inline void PushCommand(Vertex* v) {
sq += 8; sq += 8;
} }
extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout); extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout, int type);
#define V0_VIS (1 << 0) #define V0_VIS (1 << 0)
#define V1_VIS (1 << 1) #define V1_VIS (1 << 1)
@ -74,10 +71,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// .....A....B... // .....A....B...
// / | // / |
// v3--v2---v1 // v3--v2---v1
ClipEdge(v3, v0, a); ClipEdge(v3, v0, a, PVR_CMD_VERTEX_EOL);
a->flags = PVR_CMD_VERTEX_EOL; ClipEdge(v0, v1, b, PVR_CMD_VERTEX);
ClipEdge(v0, v1, b);
b->flags = PVR_CMD_VERTEX;
PushVertex(v0); PushVertex(v0);
PushVertex(b); PushVertex(b);
@ -92,10 +87,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....A.....B... // ....A.....B...
// / | // / |
// v0--v3---v2 // v0--v3---v2
ClipEdge(v0, v1, a); ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL);
ClipEdge(v1, v2, b);
b->flags = PVR_CMD_VERTEX_EOL;
PushVertex(a); PushVertex(a);
PushVertex(v1); PushVertex(v1);
@ -109,11 +102,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....A.....B... // ....A.....B...
// / | // / |
// v1--v0---v3 // v1--v0---v3
ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
ClipEdge(v1, v2, a); ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v2, v3, b);
b->flags = PVR_CMD_VERTEX_EOL;
PushVertex(a); PushVertex(a);
PushVertex(v2); PushVertex(v2);
@ -127,10 +117,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....A.....B... // ....A.....B...
// / | // / |
// v2--v1---v0 // v2--v1---v0
ClipEdge(v2, v3, a); ClipEdge(v2, v3, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v3, v0, b, PVR_CMD_VERTEX);
ClipEdge(v3, v0, b);
b->flags = PVR_CMD_VERTEX;
PushVertex(b); PushVertex(b);
PushVertex(a); PushVertex(a);
@ -144,10 +132,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A... // ....B..........A...
// \ | // \ |
// v3-----v2 // v3-----v2
ClipEdge(v1, v2, a); ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL);
ClipEdge(v3, v0, b);
b->flags = PVR_CMD_VERTEX_EOL;
PushVertex(v1); PushVertex(v1);
PushVertex(a); PushVertex(a);
@ -162,10 +148,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A... // ....B..........A...
// \ | // \ |
// v2-----v1 // v2-----v1
ClipEdge(v0, v1, a); ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v2, v3, b, PVR_CMD_VERTEX);
ClipEdge(v2, v3, b);
b->flags = PVR_CMD_VERTEX;
PushVertex(a); PushVertex(a);
PushVertex(b); PushVertex(b);
@ -179,10 +163,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A... // ....B..........A...
// \ | // \ |
// v0-----v3 // v0-----v3
ClipEdge(v2, v3, a); ClipEdge(v2, v3, a, PVR_CMD_VERTEX_EOL);
a->flags = PVR_CMD_VERTEX_EOL; ClipEdge(v0, v1, b, PVR_CMD_VERTEX);
ClipEdge(v0, v1, b);
b->flags = PVR_CMD_VERTEX;
PushVertex(v1); PushVertex(v1);
PushVertex(v2); PushVertex(v2);
@ -197,10 +179,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A... // ....B..........A...
// \ | // \ |
// v1-----v0 // v1-----v0
ClipEdge(v3, v0, a); ClipEdge(v3, v0, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v1, v2, b, PVR_CMD_VERTEX);
ClipEdge(v1, v2, b);
b->flags = PVR_CMD_VERTEX;
PushVertex(b); PushVertex(b);
PushVertex(v2); PushVertex(v2);
@ -216,10 +196,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ | // \ |
// v3 // v3
// v1,v2,v0 v2,v0,A v0,A,B // v1,v2,v0 v2,v0,A v0,A,B
ClipEdge(v2, v3, a); ClipEdge(v2, v3, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL);
ClipEdge(v3, v0, b);
b->flags = PVR_CMD_VERTEX_EOL;
PushVertex(v1); PushVertex(v1);
PushVertex(v2); PushVertex(v2);
@ -236,10 +214,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ | // \ |
// v2 // v2
// v0,v1,v3 v1,v3,A v3,A,B // v0,v1,v3 v1,v3,A v3,A,B
ClipEdge(v1, v2, a); ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL);
ClipEdge(v2, v3, b);
b->flags = PVR_CMD_VERTEX_EOL;
v3->flags = PVR_CMD_VERTEX; v3->flags = PVR_CMD_VERTEX;
PushVertex(v0); PushVertex(v0);
@ -257,10 +233,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ | // \ |
// v1 // v1
// v3,v0,v2 v0,v2,A v2,A,B // v3,v0,v2 v0,v2,A v2,A,B
ClipEdge(v0, v1, a); ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL);
ClipEdge(v1, v2, b);
b->flags = PVR_CMD_VERTEX_EOL;
v3->flags = PVR_CMD_VERTEX; v3->flags = PVR_CMD_VERTEX;
PushVertex(v3); PushVertex(v3);
@ -278,10 +252,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ | // \ |
// v0 // v0
// v2,v3,v1 v3,v1,A v1,A,B // v2,v3,v1 v3,v1,A v1,A,B
ClipEdge(v3, v0, a); ClipEdge(v3, v0, a, PVR_CMD_VERTEX);
a->flags = PVR_CMD_VERTEX; ClipEdge(v0, v1, b, PVR_CMD_VERTEX_EOL);
ClipEdge(v0, v1, b);
b->flags = PVR_CMD_VERTEX_EOL;
v3->flags = PVR_CMD_VERTEX; v3->flags = PVR_CMD_VERTEX;
PushVertex(v2); PushVertex(v2);

View File

@ -3,21 +3,21 @@
static TextureObject* TEXTURE_ACTIVE; static TextureObject* TEXTURE_ACTIVE;
static GLboolean DEPTH_TEST_ENABLED; static uint8_t DEPTH_TEST_ENABLED;
static GLboolean DEPTH_MASK_ENABLED; static uint8_t DEPTH_MASK_ENABLED;
static GLboolean CULLING_ENABLED; static uint8_t CULLING_ENABLED;
static GLboolean FOG_ENABLED; static uint8_t FOG_ENABLED;
static GLboolean ALPHA_TEST_ENABLED; static uint8_t ALPHA_TEST_ENABLED;
static GLboolean SCISSOR_TEST_ENABLED; static uint8_t SCISSOR_TEST_ENABLED;
static GLenum SHADE_MODEL = PVR_SHADE_GOURAUD; static uint32_t SHADE_MODEL = PVR_SHADE_GOURAUD;
static GLboolean BLEND_ENABLED; static uint8_t BLEND_ENABLED;
static GLboolean TEXTURES_ENABLED; static uint8_t TEXTURES_ENABLED;
static GLboolean AUTOSORT_ENABLED; static uint8_t AUTOSORT_ENABLED;
static inline int DimensionFlag(int w) { static inline int DimensionFlag(int w) {
switch(w) { switch(w) {