Dreamcast: Minorly optimise T&L to save a cycle

This commit is contained in:
UnknownShadow200 2025-05-31 12:06:47 +10:00
parent 35747957b7
commit 7bc1d6b70a
8 changed files with 61 additions and 177 deletions

View File

@ -11,9 +11,8 @@ CFLAGS := -g -DNDEBUG -O3 -fipa-pta -fno-pie -flto=auto -fomit-frame-pointer -fb
DEPFLAGS = -MT $@ -MMD -MP -MF $(BUILD_DIR)/$*.d
DEPFILES := $(OBJS:%.o=%.d)
GLDC_LIB = third_party/gldc/libGLdc.a
LDFLAGS = -g
LIBS = -lm $(GLDC_LIB) -lppp -lkosfat
LIBS = -lm -lppp -lkosfat
ifeq ($(strip $(KOS_BASE)),)
$(warning Please set KOS variables in your environment. For example:)
@ -37,7 +36,7 @@ $(BUILD_DIR):
#---------------------------------------------------------------------------------
# executable generation
#---------------------------------------------------------------------------------
$(TARGET).elf: $(OBJS) $(GLDC_LIB)
$(TARGET).elf: $(OBJS)
kos-cc $(LDFLAGS) $^ -o $@ $(LIBS)
$(TARGET).bin: $(TARGET).elf
@ -66,10 +65,6 @@ $(TARGET).cdi: $(TARGET).iso
#---------------------------------------------------------------------------------
# object generation
#---------------------------------------------------------------------------------
$(GLDC_LIB): FORCE
$(MAKE) -C third_party/gldc
FORCE: ;
$(BUILD_DIR)/%.o: src/%.c
kos-cc $(CFLAGS) $(DEPFLAGS) -c $< -o $@

View File

@ -11,13 +11,14 @@
! FR10 = invT
! FR11 = t
#define TM1 r1 // temp register 1
#define TM2 r3 // temp register 2
! INPUT ARGUMENTS
#define IN1 r4 // input vertex 1
#define IN2 r5 // input vertex 2
#define OUT r6 // output vertex
#define TYP r7 // type/flags for output vertex
#define TM1 r1 // temp register 1
#define TM2 r3 // temp register 2
#define CL1 r4 // input colour 1
#define CL2 r5 // input colour 2
#define CLO r7 // output colour
@ -60,6 +61,7 @@ _ClipEdge:
fmov.s @TM1,fr11 ! LS, fr11 = v2->z
fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z
fldi0 fr8 ! LS, fr8 = 0
mov.l TYP,@OUT ! LS, OUT->flags = TYPE
fmul fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z - v1->z)
fldi0 fr9 ! LS, fr9 = 0
fldi0 fr0 ! LS, fr0 = 0
@ -118,6 +120,7 @@ _ClipEdge:
cmp/eq CL1,CL2 ! MT, T = ACOLOR == BCOLOR
bt.s 1f ! BR, if (T) goto 1;
mov CL1,CLO ! MT, OUTCOLOR = ACOLOR (branch delay instruction)
! Interpolate B
extu.b CL1,TM1 ! EX, val = ACOLOR.b
lds TM1,fpul ! CO, FPUL = val
@ -130,6 +133,7 @@ _ClipEdge:
ftrc fr3,fpul ! FE, FPUL = int(lerp)
shlr8 CL2 ! EX, BCOLOR >>= 8
sts fpul,TM2 ! CO, tmp = FPUL
! Interpolate G
extu.b CL1,TM1 ! EX, val = ACOLOR.g
lds TM1,fpul ! CO, FPUL = val
@ -144,6 +148,7 @@ _ClipEdge:
mov TM2,CLO ! MT, OUTCOLOR.b = tmp
shlr8 CL2 ! EX, BCOLOR >>= 8
sts fpul,TM2 ! CO, tmp = FPUL
! Interpolate R
extu.b CL1,TM1 ! EX, val = ACOLOR.r
lds TM1,fpul ! CO, FPUL = val
@ -160,6 +165,7 @@ _ClipEdge:
shlr8 CL2 ! EX, BCOLOR >>= 8
sts fpul,TM2 ! CO, tmp = FPUL
! Interpolate A
extu.b CL1,TM1 ! EX, val = ACOLOR.a
lds TM1,fpul ! CO, FPUL = val
float fpul,fr2 ! EX, fr2 = float(FPUL)

View File

@ -9,8 +9,8 @@
#include <kos.h>
#include <dc/matrix.h>
#include <dc/pvr.h>
#include "../third_party/gldc/src/gldc.h"
#include "../third_party/gldc/src/state.c"
#include "../third_party/gldc/state.c"
#include "../third_party/gldc/sh4.c"
static cc_bool renderingDisabled;
static cc_bool stateDirty;

View File

@ -1,20 +0,0 @@
OBJS := sh4.o
C_FLAGS = -O3 -DNDEBUG -mfsrra -mfsca -fno-math-errno -ffp-contract=fast -ffast-math -O3 -mpretend-cmove -fexpensive-optimizations -fomit-frame-pointer -finline-functions -ml -m4-single-only -ffunction-sections -fdata-sections -std=gnu99
C_DEFINES = -DDREAMCAST -DNDEBUG -D__DREAMCAST__ -D__arch_dreamcast -D_arch_dreamcast -D_arch_sub_pristine
TARGET := libGLdc.a
ifeq ($(strip $(KOS_BASE)),)
$(error "Please set KOS variables in your environment.")
endif
default: $(TARGET)
%.o: src/%.c
kos-cc $(C_DEFINES) $(C_FLAGS) -c $< -o $@
$(TARGET): $(OBJS)
kos-ar cr $@ $^
kos-ranlib $@

View File

@ -1,66 +0,0 @@
This is a fork of GLdc optimised for the Dreamcast port of ClassiCube, and unfortunately is essentially useless for any other project
---
# GLdc
**Development of GLdc has moved to [Gitlab](https://gitlab.com/simulant/GLdc)**
This is a partial implementation of OpenGL 1.2 for the SEGA Dreamcast for use
with the KallistiOS SDK.
It began as a fork of libGL by Josh Pearson but has undergone a large refactor
which is essentially a rewrite.
The aim is to implement as much of OpenGL 1.2 as possible, and to add additional
features via extensions.
Things left to (re)implement:
- Spotlights (Trivial)
- Framebuffer extension (Trivial)
- Texture Matrix (Trivial)
Things I'd like to do:
- Use a clean "gl.h"
- Define an extension for modifier volumes
- Add support for point sprites
- Optimise, add unit tests for correctness
# Compiling
GLdc uses CMake for its build system, it currently ships with two "backends":
- kospvr - This is the hardware-accelerated Dreamcast backend
- software - This is a stub software rasterizer used for testing and debugging
To compile a Dreamcast debug build, you'll want to do something like the following:
```
mkdir dcbuild
cd dcbuild
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" ..
make
```
For a release build, replace the cmake line with the following:
```
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
```
You will need KallistiOS compiled and configured (e.g. the KOS_BASE environment
variable must be set)
To compile for PC:
```
mkdir pcbuild
cd pcbuild
cmake -G "Unix Makefiles" ..
make
```
# Special Thanks!
- Massive shout out to Hayden Kowalchuk for diagnosing and fixing a large number of bugs while porting GL Quake to the Dreamcast. Absolute hero!

View File

@ -2,9 +2,6 @@
#define PRIVATE_H
#include <stdint.h>
#define GLenum unsigned int
#define GLboolean unsigned char
#define GLDC_FORCE_INLINE __attribute__((always_inline)) inline
#define GLDC_NO_INLINE __attribute__((noinline))
@ -24,6 +21,6 @@ typedef struct {
uint16_t height;
} TextureObject;
void SceneListSubmit(Vertex* v2, int n);
void GLDC_NO_INLINE SceneListSubmit(Vertex* v2, int n);
#endif // PRIVATE_H

View File

@ -15,24 +15,21 @@ static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
return x;
}
static GLDC_FORCE_INLINE float _glFastInvert(float x) {
return sh4_fsrra(x * x);
}
static GLDC_FORCE_INLINE void PushVertex(Vertex* v) {
volatile Vertex* dst = (Vertex*)(sq);
float f = _glFastInvert(v->w);
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
float ww = v->w * v->w;
dst->flags = v->flags;
dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst));
dst++;
}
@ -51,7 +48,7 @@ static inline void PushCommand(Vertex* v) {
sq += 8;
}
extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout);
extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout, int type);
#define V0_VIS (1 << 0)
#define V1_VIS (1 << 1)
@ -74,10 +71,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// .....A....B...
// / |
// v3--v2---v1
ClipEdge(v3, v0, a);
a->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v0, v1, b);
b->flags = PVR_CMD_VERTEX;
ClipEdge(v3, v0, a, PVR_CMD_VERTEX_EOL);
ClipEdge(v0, v1, b, PVR_CMD_VERTEX);
PushVertex(v0);
PushVertex(b);
@ -92,10 +87,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....A.....B...
// / |
// v0--v3---v2
ClipEdge(v0, v1, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v1, v2, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL);
PushVertex(a);
PushVertex(v1);
@ -109,11 +102,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....A.....B...
// / |
// v1--v0---v3
ClipEdge(v1, v2, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v2, v3, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL);
PushVertex(a);
PushVertex(v2);
@ -127,10 +117,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....A.....B...
// / |
// v2--v1---v0
ClipEdge(v2, v3, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v3, v0, b);
b->flags = PVR_CMD_VERTEX;
ClipEdge(v2, v3, a, PVR_CMD_VERTEX);
ClipEdge(v3, v0, b, PVR_CMD_VERTEX);
PushVertex(b);
PushVertex(a);
@ -144,10 +132,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A...
// \ |
// v3-----v2
ClipEdge(v1, v2, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v3, v0, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL);
PushVertex(v1);
PushVertex(a);
@ -162,10 +148,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A...
// \ |
// v2-----v1
ClipEdge(v0, v1, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v2, v3, b);
b->flags = PVR_CMD_VERTEX;
ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
ClipEdge(v2, v3, b, PVR_CMD_VERTEX);
PushVertex(a);
PushVertex(b);
@ -179,10 +163,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A...
// \ |
// v0-----v3
ClipEdge(v2, v3, a);
a->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v0, v1, b);
b->flags = PVR_CMD_VERTEX;
ClipEdge(v2, v3, a, PVR_CMD_VERTEX_EOL);
ClipEdge(v0, v1, b, PVR_CMD_VERTEX);
PushVertex(v1);
PushVertex(v2);
@ -197,10 +179,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// ....B..........A...
// \ |
// v1-----v0
ClipEdge(v3, v0, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v1, v2, b);
b->flags = PVR_CMD_VERTEX;
ClipEdge(v3, v0, a, PVR_CMD_VERTEX);
ClipEdge(v1, v2, b, PVR_CMD_VERTEX);
PushVertex(b);
PushVertex(v2);
@ -216,10 +196,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ |
// v3
// v1,v2,v0 v2,v0,A v0,A,B
ClipEdge(v2, v3, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v3, v0, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v2, v3, a, PVR_CMD_VERTEX);
ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL);
PushVertex(v1);
PushVertex(v2);
@ -236,10 +214,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ |
// v2
// v0,v1,v3 v1,v3,A v3,A,B
ClipEdge(v1, v2, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v2, v3, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v1, v2, a, PVR_CMD_VERTEX);
ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL);
v3->flags = PVR_CMD_VERTEX;
PushVertex(v0);
@ -257,10 +233,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ |
// v1
// v3,v0,v2 v0,v2,A v2,A,B
ClipEdge(v0, v1, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v1, v2, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v0, v1, a, PVR_CMD_VERTEX);
ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL);
v3->flags = PVR_CMD_VERTEX;
PushVertex(v3);
@ -278,10 +252,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
// \ |
// v0
// v2,v3,v1 v3,v1,A v1,A,B
ClipEdge(v3, v0, a);
a->flags = PVR_CMD_VERTEX;
ClipEdge(v0, v1, b);
b->flags = PVR_CMD_VERTEX_EOL;
ClipEdge(v3, v0, a, PVR_CMD_VERTEX);
ClipEdge(v0, v1, b, PVR_CMD_VERTEX_EOL);
v3->flags = PVR_CMD_VERTEX;
PushVertex(v2);

View File

@ -3,21 +3,21 @@
static TextureObject* TEXTURE_ACTIVE;
static GLboolean DEPTH_TEST_ENABLED;
static GLboolean DEPTH_MASK_ENABLED;
static uint8_t DEPTH_TEST_ENABLED;
static uint8_t DEPTH_MASK_ENABLED;
static GLboolean CULLING_ENABLED;
static uint8_t CULLING_ENABLED;
static GLboolean FOG_ENABLED;
static GLboolean ALPHA_TEST_ENABLED;
static uint8_t FOG_ENABLED;
static uint8_t ALPHA_TEST_ENABLED;
static GLboolean SCISSOR_TEST_ENABLED;
static GLenum SHADE_MODEL = PVR_SHADE_GOURAUD;
static uint8_t SCISSOR_TEST_ENABLED;
static uint32_t SHADE_MODEL = PVR_SHADE_GOURAUD;
static GLboolean BLEND_ENABLED;
static uint8_t BLEND_ENABLED;
static GLboolean TEXTURES_ENABLED;
static GLboolean AUTOSORT_ENABLED;
static uint8_t TEXTURES_ENABLED;
static uint8_t AUTOSORT_ENABLED;
static inline int DimensionFlag(int w) {
switch(w) {