Dreamcast: Fix not using second store queue

This commit is contained in:
UnknownShadow200 2025-05-31 20:18:58 +10:00
parent ee2e521f5d
commit f082d17ee4
2 changed files with 259 additions and 264 deletions

View File

@ -23,17 +23,18 @@
#define CL2 r5 // input colour 2
#define CLO r7 // output colour
! Calculates the near plane intersection point between two points:
! Writes output vertex as the near plane intersection point between two points:
! float t = fabsf(v1->z) / fabsf(v2->z - v1->z)
! float invt = 1.0f - t;
! // note: w = invt * v1->w + t * v2->w;, always ends up being zNear
!
! out->x = invt * v1->x + t * v2->x;
! out->y = invt * v1->y + t * v2->y;
! out->z = 0.0f; // clipped against near plane anyways (I.e Z/W = 0 --> Z = 0)
! out->c = type << 24
! out->x = (invt * v1->x + t * v2->x) * 1/zNear
! out->y = (invt * v1->y + t * v2->y) * 1/zNear
! out->w = 1/zNear
!
! out->u = invt * v1->u + t * v2->u;
! out->v = invt * v1->v + t * v2->v;
! out->w = zNear // invt * v1->w + t * v2->w;, always ends up being zNear
!
! out->b = invt * v1->b + t * v2->b;
! out->g = invt * v1->g + t * v2->g;
@ -41,7 +42,7 @@
! out->a = invt * v1->a + t * v2->a;
! To optimise these calculations, FIPR is used:
! FIPR = FVm.x*FVn.x + FVm.y*FVn.y + FVm.z*FVn.z + FVm.w*FVn.w --> FVn.w
! FIPR can be used to accomplish "vout->Q invt * v1->Q + t * v2->Q" by:
! FIPR can be used to accomplish "vout->Q = invt * v1->Q + t * v2->Q" by:
! - assigning x/y components to 0 for both vectors
! - assigning invT and t to z/w of FVm vector
! - assigning v1 and v2 to z/w of FVn vector
@ -51,74 +52,74 @@
.global _ClipEdge
.align 4
_ClipEdge:
mov IN1, TM1 ! MT, tmp = &v1
add #12, IN1 ! EX, IN1 = &v1->z
fldi0 fr4 ! LS, fr4 = 0
add #12, TM1 ! EX, tmp = &v1->z
fmov.s @TM1, fr2 ! LS, fr2 = v1->z
mov IN2, TM1 ! MT, tmp = &v2
fmov.s @IN1, fr2 ! LS, fr2 = v1->z
add #12, IN2 ! EX, IN2 = &v2->z
fldi0 fr5 ! LS, fr5 = 0
add #12, TM1 ! EX, tmp = &v2->z
fmov.s @TM1,fr11 ! LS, fr11 = v2->z
fmov.s @IN2,fr11 ! LS, fr11 = v2->z
fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z
fldi0 fr8 ! LS, fr8 = 0
mov.l TYP,@OUT ! LS, OUT->cmd = TYPE
shll16 TYP ! EX, TYP <<= 16
fmul fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z - v1->z)
fldi0 fr9 ! LS, fr9 = 0
fldi0 fr0 ! LS, fr0 = 0
fldi0 fr1 ! LS, fr1 = 0
fsrra fr11 ! FE, fr11 = 1 / abs(v2->z - v1->z)
shll8 TYP ! EX, TYP <<= 8
fabs fr2 ! LS, fr2 = abs(v1->z)
mov.l TYP,@OUT ! LS, dst->cmd = TYPE
fmul fr2,fr11 ! FE, fr11 = abs(v1->Z) / abs(v2->z - v1->z) --> t
add #4, IN1 ! EX, v1 += 4
add #-8, IN1 ! EX, IN1 = &v1->x
fldi1 fr10 ! LS, fr10 = 1
add #4, IN2 ! EX, v2 += 4
add #4, OUT ! EX, OUT += 4
fsub fr11,fr10 ! FE, fr10 = 1.0 - t --> invT
add #-8, IN2 ! EX, IN2 = &v2->x
add #4, OUT ! EX, OUT = &dst->x
fsub fr11,fr10 ! FE, fr10 = 1.0 - t --> invT
! Load X components
fmov.s @IN1+, fr2 ! LS, A1 = v1->x, v1 += 4
fmov.s @IN2+, fr3 ! LS, B1 = v2->x, v2 += 4
fmov.s @IN1+, fr2 ! LS, A1 = v1->x, IN1 = &v1->y
fmov.s @IN2+, fr3 ! LS, B1 = v2->x, IN2 = &v2->y
! Start interpolating X
fipr fv8, fv0 ! FE, LERP(A1, B1)
! Load Y components
fmov.s @IN1+, fr6 ! LS, A2 = v1->y, v1 += 4
fmov.s @IN2+, fr7 ! LS, B2 = v2->y, v2 += 4
fmov.s @IN1, fr6 ! LS, A2 = v1->y
fmov.s @IN2, fr7 ! LS, B2 = v2->y
! Load W
mov.l _NEAR_CLIP_W,TM1 ! tmp = zNear
lds TM1,fpul ! LS, FPUL = zNear
fsts fpul,fr2 ! LS, fr2 = FPUL
! Store interpolated X
fmul fr2,fr3 ! FE, fr3 = LERP * invW
fmov.s fr3,@OUT ! LS, OUT->x = LERP * invW
add #4, OUT ! EX, OUT += 4
fmov.s fr3,@OUT ! LS, dst->x = LERP * invW
add #4, OUT ! EX, OUT = &dst->y
! Start interpolating Y
fipr fv8, fv4 ! FE, LERP(A2, B2)
! Skip Z of input vertices
add #4, IN1 ! EX, v1 += 4
add #4, IN2 ! EX, v2 += 4
add #8, IN1 ! EX, IN1 = &v1->u
add #8, IN2 ! EX, IN2 = &v2->u
! Store interpolated Y
fmul fr2,fr7 ! FE, fr7 = LERP * invW
fmov.s fr7,@OUT ! LS, OUT->y = LERP * invW
add #4, OUT ! EX, OUT += 4
add #4, OUT ! EX, OUT = &dst->w
! Store W
fmov.s fr2,@OUT ! LS, OUT->w = 1/zNear
add #4, OUT ! EX, OUT += 4
add #4, OUT ! EX, OUT = &dst->u
! Load U components
fmov.s @IN1+, fr2 ! LS, A1 = v1->u, v1 += 4
fmov.s @IN2+, fr3 ! LS, B1 = v2->u, v2 += 4
fmov.s @IN1+, fr2 ! LS, A1 = v1->u, IN1 = &v1->v
fmov.s @IN2+, fr3 ! LS, B1 = v2->u, IN2 = &v2->v
! Start interpolating U
fipr fv8, fv0 ! FE, LERP(A1, B1)
! Load V components
fmov.s @IN1+, fr6 ! LS, A2 = v1->v, v1 += 4
fmov.s @IN2+, fr7 ! LS, B2 = v2->v, v2 += 4
fmov.s @IN1+, fr6 ! LS, A2 = v1->v, IN1 = &v1->bgra
fmov.s @IN2+, fr7 ! LS, B2 = v2->v, IN2 = &v2->bgra
! Store interpolated U
fmov.s fr3,@OUT ! LS, OUT->u = LERP
add #4, OUT ! EX, OUT += 4
fmov.s fr3,@OUT ! LS, dst->u = LERP
add #4, OUT ! EX, OUT = &dst->v
! Start interpolating V
fipr fv8, fv4 ! FE, LERP(A2, B2)
@ -127,8 +128,8 @@ _ClipEdge:
mov.l @IN2,CL2 ! LS, BCOLOR = v2->bgra
cmp/eq CL1,CL2 ! MT, T = ACOLOR == BCOLOR
! Store V
fmov.s fr7,@OUT ! LS, OUT->v = LERP
add #4, OUT ! EX, OUT += 4
fmov.s fr7,@OUT ! LS, dst->v = LERP
add #4, OUT ! EX, OUT = &dst->bgra
! Bypass RGBA interpolation if unnecessary
bt.s 1f ! BR, if (T) goto 1;

452
third_party/gldc/sh4.c vendored
View File

@ -2,9 +2,6 @@
#include <dc/pvr.h>
#include "gldc.h"
#define PREFETCH(addr) __builtin_prefetch((addr))
static volatile uint32_t* sq;
// calculates 1/sqrt(x)
static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
asm volatile ("fsrra %[value]\n"
@ -15,251 +12,248 @@ static GLDC_FORCE_INLINE float sh4_fsrra(float x) {
return x;
}
static GLDC_FORCE_INLINE void PushVertex(Vertex* v) {
volatile Vertex* dst = (Vertex*)(sq);
static GLDC_FORCE_INLINE void PushVertex(Vertex* v, volatile Vertex* dst) {
float ww = v->w * v->w;
dst->flags = v->flags;
float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
dst->flags = v->flags;
float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst));
dst++;
dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst));
}
static inline void PushCommand(Vertex* v) {
uint32_t* s = (uint32_t*)v;
sq[0] = *(s++);
sq[1] = *(s++);
sq[2] = *(s++);
sq[3] = *(s++);
sq[4] = *(s++);
sq[5] = *(s++);
sq[6] = *(s++);
sq[7] = *(s++);
__asm__("pref @%0" : : "r"(sq));
sq += 8;
static inline void PushCommand(Vertex* v, volatile Vertex* dst) {
uint32_t* s = (uint32_t*)v;
volatile uint32_t* sq = (volatile uint32_t*)dst;
sq[0] = *(s++);
sq[1] = *(s++);
sq[2] = *(s++);
sq[3] = *(s++);
sq[4] = *(s++);
sq[5] = *(s++);
sq[6] = *(s++);
sq[7] = *(s++);
__asm__("pref @%0" : : "r"(sq));
}
extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, volatile void* vout, int type);
extern void ClipEdge(Vertex* const v1, Vertex* const v2, volatile Vertex* vout, char type);
#define V0_VIS (1 << 0)
#define V1_VIS (1 << 1)
#define V2_VIS (1 << 2)
#define V3_VIS (1 << 3)
// https://casual-effects.com/research/McGuire2011Clipping/clip.glsl
static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_t visible_mask) {
switch(visible_mask) {
case V0_VIS:
// v0
// / |
// / |
// .....A....B...
// / |
// v3--v2---v1
PushVertex(v0);
ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // B
ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // A
break;
case V1_VIS:
// v1
// / |
// / |
// ....A.....B...
// / |
// v0--v3---v2
ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A
PushVertex(v1); // v1
ClipEdge(v1, v2, sq, PVR_CMD_VERTEX_EOL); // B
break;
case V2_VIS:
// v2
// / |
// / |
// ....A.....B...
// / |
// v1--v0---v3
ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A
PushVertex(v2); // v2
ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // B
break;
case V3_VIS:
// v3
// / |
// / |
// ....A.....B...
// / |
// v2--v1---v0
ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // B
ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // A
PushVertex(v3); // v3
break;
case V0_VIS | V1_VIS:
// v0-----------v1
// \ |
// ....B..........A...
// \ |
// v3-----v2
PushVertex(v1); // v1
ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A
PushVertex(v0); // v0
ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // B
break;
// case V0_VIS | V2_VIS: degenerate case that should never happen
case V0_VIS | V3_VIS:
// v3-----------v0
// \ |
// ....B..........A...
// \ |
// v2-----v1
ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A
ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // B
PushVertex(v0); // v0
PushVertex(v3); // v3
break;
case V1_VIS | V2_VIS:
// v1-----------v2
// \ |
// ....B..........A...
// \ |
// v0-----v3
PushVertex(v1); // v1
PushVertex(v2); // v2
ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // B
ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // A
break;
// case V1_VIS | V3_VIS: degenerate case that should never happen
case V2_VIS | V3_VIS:
// v2-----------v3
// \ |
// ....B..........A...
// \ |
// v1-----v0
ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // B
PushVertex(v2); // v2
ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // A
PushVertex(v3); // v3
break;
case V0_VIS | V1_VIS | V2_VIS:
// --v1--
// v0-- --v2
// \ |
// .....B.....A...
// \ |
// v3
// v1,v2,v0 v2,v0,A v0,A,B
PushVertex(v1); // v1
PushVertex(v2); // v2
PushVertex(v0); // v0
ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // A
ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // B
break;
case V0_VIS | V1_VIS | V3_VIS:
// --v0--
// v3-- --v1
// \ |
// .....B.....A...
// \ |
// v2
// v0,v1,v3 v1,v3,A v3,A,B
v3->flags = PVR_CMD_VERTEX;
PushVertex(v0); // v0
PushVertex(v1); // v1
PushVertex(v3); // v3
ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A
ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // B
break;
case V0_VIS | V2_VIS | V3_VIS:
// --v3--
// v2-- --v0
// \ |
// .....B.....A...
// \ |
// v1
// v3,v0,v2 v0,v2,A v2,A,B
v3->flags = PVR_CMD_VERTEX;
PushVertex(v3); // v3
PushVertex(v0); // v0
PushVertex(v2); // v2
ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A
ClipEdge(v1, v2, sq, PVR_CMD_VERTEX_EOL); // B
break;
case V1_VIS | V2_VIS | V3_VIS:
// --v2--
// v1-- --v3
// \ |
// .....B.....A...
// \ |
// v0
// v2,v3,v1 v3,v1,A v1,A,B
v3->flags = PVR_CMD_VERTEX;
PushVertex(v2); // v2
PushVertex(v3); // v3
PushVertex(v1); // v1
ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // A
ClipEdge(v0, v1, sq, PVR_CMD_VERTEX_EOL); // B
break;
}
}
#define TYPE_VTX 0xE0 // PVR vertex, data
#define TYPE_EOS 0xF0 // PVR vertex, end of strip
extern void ProcessVertexList(Vertex* v3, int n, void* sq_addr);
void SceneListSubmit(Vertex* v3, int n) {
sq = (uint32_t*)MEM_AREA_SQ_BASE;
volatile Vertex* dst = (volatile Vertex*)MEM_AREA_SQ_BASE;
for (int i = 0; i < n; i++, v3++)
for (int i = 0; i < n; i++, v3++)
{
PREFETCH(v3 + 1);
switch(v3->flags & 0xFF000000) {
case PVR_CMD_VERTEX_EOL:
break;
case PVR_CMD_VERTEX:
continue;
default:
PushCommand(v3);
continue;
};
// Prefetch next vertex into the cache
__builtin_prefetch(v3 + 1);
switch(v3->flags & 0xFF000000) {
case PVR_CMD_VERTEX_EOL:
break;
case PVR_CMD_VERTEX:
continue;
default:
PushCommand(v3, dst++);
continue;
};
// Quads [0, 1, 2, 3] -> Triangles [{0, 1, 2} {2, 3, 0}]
Vertex* const v0 = v3 - 3;
Vertex* const v1 = v3 - 2;
Vertex* const v2 = v3 - 1;
uint8_t visible_mask = v3->flags & 0xFF;
Vertex* const v0 = v3 - 3;
Vertex* const v1 = v3 - 2;
Vertex* const v2 = v3 - 1;
uint8_t mask = v3->flags & 0xFF;
switch(visible_mask) {
case V0_VIS | V1_VIS | V2_VIS | V3_VIS: // All vertices visible
{
// Triangle strip: {1,2,0} {2,0,3}
PushVertex(v1);
PushVertex(v2);
PushVertex(v0);
PushVertex(v3);
}
break;
default: // Some vertices visible
SubmitClipped(v0, v1, v2, v3, visible_mask);
break;
}
}
// Check if all vertices visible
if (__builtin_expect(mask == (V0_VIS | V1_VIS | V2_VIS | V3_VIS), 1)) {
// Triangle strip: {1,2,0} {2,0,3}
PushVertex(v1, dst++);
PushVertex(v2, dst++);
PushVertex(v0, dst++);
PushVertex(v3, dst++);
continue;
}
// Only some vertices visible
// https://casual-effects.com/research/McGuire2011Clipping/clip.glsl
switch(mask) {
case V0_VIS:
// v0
// / |
// / |
// .....A....B...
// / |
// v3--v2---v1
PushVertex(v0, dst++); // v0
ClipEdge(v0, v1, dst++, TYPE_VTX); // B
ClipEdge(v3, v0, dst++, TYPE_EOS); // A
break;
case V1_VIS:
// v1
// / |
// / |
// ....A.....B...
// / |
// v0--v3---v2
ClipEdge(v0, v1, dst++, TYPE_VTX); // A
PushVertex(v1, dst++); // v1
ClipEdge(v1, v2, dst++, TYPE_EOS); // B
break;
case V2_VIS:
// v2
// / |
// / |
// ....A.....B...
// / |
// v1--v0---v3
ClipEdge(v1, v2, dst++, TYPE_VTX); // A
PushVertex(v2, dst++); // v2
ClipEdge(v2, v3, dst++, TYPE_EOS); // B
break;
case V3_VIS:
// v3
// / |
// / |
// ....A.....B...
// / |
// v2--v1---v0
ClipEdge(v3, v0, dst++, TYPE_VTX); // B
ClipEdge(v2, v3, dst++, TYPE_EOS); // A
PushVertex(v3, dst++); // v3
break;
case V0_VIS | V1_VIS:
// v0-----------v1
// \ |
// ....B..........A...
// \ |
// v3-----v2
PushVertex(v1, dst++); // v1
ClipEdge(v1, v2, dst++, TYPE_VTX); // A
PushVertex(v0, dst++); // v0
ClipEdge(v3, v0, dst++, TYPE_EOS); // B
break;
// case V0_VIS | V2_VIS: degenerate case that should never happen
case V0_VIS | V3_VIS:
// v3-----------v0
// \ |
// ....B..........A...
// \ |
// v2-----v1
ClipEdge(v0, v1, dst++, TYPE_VTX); // A
ClipEdge(v2, v3, dst++, TYPE_VTX); // B
PushVertex(v0, dst++); // v0
PushVertex(v3, dst++); // v3
break;
case V1_VIS | V2_VIS:
// v1-----------v2
// \ |
// ....B..........A...
// \ |
// v0-----v3
PushVertex(v1, dst++); // v1
PushVertex(v2, dst++); // v2
ClipEdge(v0, v1, dst++, TYPE_VTX); // B
ClipEdge(v2, v3, dst++, TYPE_EOS); // A
break;
// case V1_VIS | V3_VIS: degenerate case that should never happen
case V2_VIS | V3_VIS:
// v2-----------v3
// \ |
// ....B..........A...
// \ |
// v1-----v0
ClipEdge(v1, v2, dst++, TYPE_VTX); // B
PushVertex(v2, dst++); // v2
ClipEdge(v3, v0, dst++, TYPE_VTX); // A
PushVertex(v3, dst++); // v3
break;
case V0_VIS | V1_VIS | V2_VIS:
// --v1--
// v0-- --v2
// \ |
// .....B.....A...
// \ |
// v3
// v1,v2,v0 v2,v0,A v0,A,B
PushVertex(v1, dst++); // v1
PushVertex(v2, dst++); // v2
PushVertex(v0, dst++); // v0
ClipEdge(v2, v3, dst++, TYPE_VTX); // A
ClipEdge(v3, v0, dst++, TYPE_EOS); // B
break;
case V0_VIS | V1_VIS | V3_VIS:
// --v0--
// v3-- --v1
// \ |
// .....B.....A...
// \ |
// v2
// v0,v1,v3 v1,v3,A v3,A,B
v3->flags = PVR_CMD_VERTEX;
PushVertex(v0, dst++); // v0
PushVertex(v1, dst++); // v1
PushVertex(v3, dst++); // v3
ClipEdge(v1, v2, dst++, TYPE_VTX); // A
ClipEdge(v2, v3, dst++, TYPE_EOS); // B
break;
case V0_VIS | V2_VIS | V3_VIS:
// --v3--
// v2-- --v0
// \ |
// .....B.....A...
// \ |
// v1
// v3,v0,v2 v0,v2,A v2,A,B
v3->flags = PVR_CMD_VERTEX;
PushVertex(v3, dst++); // v3
PushVertex(v0, dst++); // v0
PushVertex(v2, dst++); // v2
ClipEdge(v0, v1, dst++, TYPE_VTX); // A
ClipEdge(v1, v2, dst++, TYPE_EOS); // B
break;
case V1_VIS | V2_VIS | V3_VIS:
// --v2--
// v1-- --v3
// \ |
// .....B.....A...
// \ |
// v0
// v2,v3,v1 v3,v1,A v1,A,B
v3->flags = PVR_CMD_VERTEX;
PushVertex(v2, dst++); // v2
PushVertex(v3, dst++); // v3
PushVertex(v1, dst++); // v1
ClipEdge(v3, v0, dst++, TYPE_VTX); // A
ClipEdge(v0, v1, dst++, TYPE_EOS); // B
break;
}
}
}