diff --git a/misc/dreamcast/VertexClip2.S b/misc/dreamcast/VertexClip2.S index 4728d67b8..6d693a28a 100644 --- a/misc/dreamcast/VertexClip2.S +++ b/misc/dreamcast/VertexClip2.S @@ -33,7 +33,7 @@ ! ! out->u = invt * v1->u + t * v2->u; ! out->v = invt * v1->v + t * v2->v; -! out->w = invt * v1->w + t * v2->w; +! out->w = 1 / zNear // invt * v1->w + t * v2->w always ends up being zNear, so its reciprocal is stored directly ! ! out->b = invt * v1->b + t * v2->b; ! out->g = invt * v1->g + t * v2->g; @@ -61,7 +61,7 @@ _ClipEdge: fmov.s @TM1,fr11 ! LS, fr11 = v2->z fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z fldi0 fr8 ! LS, fr8 = 0 - mov.l TYP,@OUT ! LS, OUT->flags = TYPE + mov.l TYP,@OUT ! LS, OUT->cmd = TYPE fmul fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z * v1->z) fldi0 fr9 ! LS, fr9 = 0 fldi0 fr0 ! LS, fr0 = 0 @@ -74,117 +74,137 @@ _ClipEdge: add #4, IN2 ! EX, v2 += 4 add #4, OUT ! EX, OUT += 4 fsub fr11,fr10 ! FE, fr10 = 1.0 - t --> invT - + +! Load X components fmov.s @IN1+, fr2 ! LS, A1 = v1->x, v1 += 4 fmov.s @IN2+, fr3 ! LS, B1 = v2->x, v2 += 4 + +! Start interpolating X fipr fv8, fv0 ! FE, LERP(A1, B1) +! Load Y components fmov.s @IN1+, fr6 ! LS, A2 = v1->y, v1 += 4 fmov.s @IN2+, fr7 ! LS, B2 = v2->y, v2 += 4 - - fmov.s fr3,@OUT ! LS, OUT->x = LERP +! Load W + mov.l _NEAR_CLIP_W,TM1 ! LS, tmp = 1/zNear (NEAR_CLIP_W) + lds TM1,fpul ! LS, FPUL = 1/zNear + fsts fpul,fr2 ! LS, fr2 = FPUL +! Store interpolated X + fmul fr2,fr3 ! FE, fr3 = LERP * invW + fmov.s fr3,@OUT ! LS, OUT->x = LERP * invW add #4, OUT ! EX, OUT += 4 + +! Start interpolating Y fipr fv8, fv4 ! FE, LERP(A2, B2) +! Skip Z of input vertices add #4, IN1 ! EX, v1 += 4 add #4, IN2 ! EX, v2 += 4 - fmov.s fr7,@OUT ! LS, OUT->y = LERP +! Store interpolated Y + fmul fr2,fr7 ! FE, fr7 = LERP * invW + fmov.s fr7,@OUT ! LS, OUT->y = LERP * invW add #4, OUT ! EX, OUT += 4 - fmov.s fr1,@OUT ! LS, OUT->z = 0 +! Store W + fmov.s fr2,@OUT ! LS, OUT->w = 1/zNear add #4, OUT ! EX, OUT += 4 +! Load U components fmov.s @IN1+, fr2 ! LS, A1 = v1->u, v1 += 4 fmov.s @IN2+, fr3 !
LS, B1 = v2->u, v2 += 4 + +! Start interpolating U fipr fv8, fv0 ! FE, LERP(A1, B1) +! Load V components fmov.s @IN1+, fr6 ! LS, A2 = v1->v, v1 += 4 fmov.s @IN2+, fr7 ! LS, B2 = v2->v, v2 += 4 - +! Store interpolated U fmov.s fr3,@OUT ! LS, OUT->u = LERP add #4, OUT ! EX, OUT += 4 + +! Start interpolating V fipr fv8, fv4 ! FE, LERP(A2, B2) - add #4, IN1 ! EX, v1 += 4 - add #4, IN2 ! EX, v2 += 4 - fmov.s @IN1,fr2 ! LS, A1 = v1->w - fmov.s @IN2,fr3 ! LS, B1 = v2->w - fmov.s fr7,@OUT ! LS, OUT->v = LERP - add #8, OUT ! EX, OUT += 8 - - fipr fv8, fv0 ! FE, LERP(A1, B1) - add #-4, IN1 ! EX, v1 -= 4 - add #-4, IN2 ! EX, v2 -= 4 - fmov.s fr3,@OUT ! LS, OUT->w = lerp - add #-4, OUT ! EX, OUT -= 4 - +! Load colours and check if equal mov.l @IN1,CL1 ! LS, ACOLOR = v1->bgra mov.l @IN2,CL2 ! LS, BCOLOR = v2->bgra -! Bypass RGBA interpolation if unnecessary cmp/eq CL1,CL2 ! MT, T = ACOLOR == BCOLOR +! Store V + fmov.s fr7,@OUT ! LS, OUT->v = LERP + add #4, OUT ! EX, OUT += 4 + +! Bypass RGBA interpolation if unnecessary bt.s 1f ! BR, if (T) goto 1; mov CL1,CLO ! MT, OUTCOLOR = ACOLOR (branch delay instruction) ! Interpolate B extu.b CL1,TM1 ! EX, val = ACOLOR.b - lds TM1,fpul ! CO, FPUL = val - float fpul,fr2 ! EX, fr2 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr2 ! FE, fr2 = float(FPUL) extu.b CL2,TM1 ! EX, val = BCOLOR.b - lds TM1,fpul ! CO, FPUL = val - float fpul,fr3 ! EX, fr3 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr3 ! FE, fr3 = float(FPUL) fipr fv8, fv0 ! FE, LERP(A1, B1) shlr8 CL1 ! EX, ACOLOR >>= 8 - ftrc fr3,fpul ! FE, FPUL = int(lerp) shlr8 CL2 ! EX, BCOLOR >>= 8 - sts fpul,TM2 ! CO, tmp = FPUL + ftrc fr3,fpul ! FE, FPUL = int(lerp) + sts fpul,TM2 ! LS, tmp = FPUL ! Interpolate G extu.b CL1,TM1 ! EX, val = ACOLOR.g - lds TM1,fpul ! CO, FPUL = val - float fpul,fr2 ! EX, fr2 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr2 ! FE, fr2 = float(FPUL) extu.b CL2,TM1 ! EX, val = BCOLOR.g - lds TM1,fpul !
CO, FPUL = val - float fpul,fr3 ! EX, fr3 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr3 ! FE, fr3 = float(FPUL) fipr fv8, fv0 ! FE, LERP(A1, B1) shlr8 CL1 ! EX, ACOLOR >>= 8 - ftrc fr3,fpul ! FE, FPUL = int(lerp) extu.b TM2,TM2 ! EX, tmp = (uint8)tmp mov TM2,CLO ! MT, OUTCOLOR.b = tmp shlr8 CL2 ! EX, BCOLOR >>= 8 - sts fpul,TM2 ! CO, tmp = FPUL + ftrc fr3,fpul ! FE, FPUL = int(lerp) + sts fpul,TM2 ! LS, tmp = FPUL ! Interpolate R extu.b CL1,TM1 ! EX, val = ACOLOR.r - lds TM1,fpul ! CO, FPUL = val - float fpul,fr2 ! EX, fr2 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr2 ! FE, fr2 = float(FPUL) extu.b CL2,TM1 ! EX, val = BCOLOR.r - lds TM1,fpul ! CO, FPUL = val - float fpul,fr3 ! EX, fr3 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr3 ! FE, fr3 = float(FPUL) fipr fv8, fv0 ! FE, LERP(A1, B1) shlr8 CL1 ! EX, ACOLOR >>= 8 - ftrc fr3,fpul ! FE, FPUL = int(lerp) extu.b TM2,TM2 ! EX, tmp = (uint8)tmp shll8 TM2 ! EX, tmp <<= 8 or TM2,CLO ! EX, OUTCOLOR.g |= tmp shlr8 CL2 ! EX, BCOLOR >>= 8 - sts fpul,TM2 ! CO, tmp = FPUL + ftrc fr3,fpul ! FE, FPUL = int(lerp) + sts fpul,TM2 ! LS, tmp = FPUL ! Interpolate A extu.b CL1,TM1 ! EX, val = ACOLOR.a - lds TM1,fpul ! CO, FPUL = val - float fpul,fr2 ! EX, fr2 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr2 ! FE, fr2 = float(FPUL) extu.b CL2,TM1 ! EX, val = BCOLOR.a - lds TM1,fpul ! CO, FPUL = val - float fpul,fr3 ! EX, fr3 = float(FPUL) + lds TM1,fpul ! LS, FPUL = val + float fpul,fr3 ! FE, fr3 = float(FPUL) fipr fv8, fv0 ! FE, LERP(A1, B1) - ftrc fr3,fpul ! FE, FPUL = int(lerp) extu.b TM2,TM2 ! EX, tmp = (uint8)tmp shll16 TM2 ! EX, tmp <<= 16 or TM2,CLO ! EX, OUTCOLOR.r |= tmp - sts fpul,TM2 ! CO, tmp = FPUL + ftrc fr3,fpul ! FE, FPUL = int(lerp) + sts fpul,TM2 ! LS, tmp = FPUL extu.b TM2,TM2 ! EX, tmp = (uint8)tmp shll16 TM2 ! EX, tmp <<= 16 shll8 TM2 ! EX, tmp <<= 8 or TM2,CLO ! EX, OUTCOLOR.a |= tmp 1: - rts !
CO, return after executing instruction in delay slot mov.l CLO,@OUT ! LS, OUT->color = OUTCOLOR + add #-24, OUT ! EX, OUT -= 24 + rts ! CO, return after executing instruction in delay slot + pref @OUT ! LS, trigger store queue flush .size _ClipEdge, .-_ClipEdge .type _ClipEdge, %function + +.align 4 +_NEAR_CLIP_W: + .float 0 +.global _NEAR_CLIP_W diff --git a/src/Graphics_Dreamcast.c b/src/Graphics_Dreamcast.c index 319196a81..ac8629234 100644 --- a/src/Graphics_Dreamcast.c +++ b/src/Graphics_Dreamcast.c @@ -361,8 +361,11 @@ void Gfx_CalcOrthoMatrix(struct Matrix* matrix, float width, float height, float } static float Cotangent(float x) { return Math_CosF(x) / Math_SinF(x); } +extern float NEAR_CLIP_W; + void Gfx_CalcPerspectiveMatrix(struct Matrix* matrix, float fov, float aspect, float zFar) { float zNear = 0.1f; + NEAR_CLIP_W = 1.0f / zNear; /* Source https://learn.microsoft.com/en-us/windows/win32/direct3d9/d3dxmatrixperspectivefovrh */ float c = Cotangent(0.5f * fov); diff --git a/src/Graphics_PS1.c b/src/Graphics_PS1.c index a5a424441..3b99524e2 100644 --- a/src/Graphics_PS1.c +++ b/src/Graphics_PS1.c @@ -926,14 +926,14 @@ static void DrawColouredQuads3D(int verticesCount, int startVertex) { int p; GTE_Get_OTZ(p); if (p == 0 || (p >> 2) > OT_LENGTH) continue; - GTE_Store_XY0(&poly->x0, 0); - GTE_Store_XY1(&poly->x1, 0); - GTE_Store_XY2(&poly->x2, 0); + GTE_Store_XY0(poly, 8); // &poly->x0 + GTE_Store_XY1(poly, 12); // &poly->x1 + GTE_Store_XY2(poly, 16); // &poly->x2 GTE_Load_XYZ0(&v2->vx); GTE_Exec_RTPS(); // 15 cycles addPrim(&ot[p >> 2], poly); - GTE_Store_XY2(&poly->x3, 0); + GTE_Store_XY2(poly, 20); // &poly->x3 poly++; } @@ -980,14 +980,14 @@ static void DrawTexturedQuads3D(int verticesCount, int startVertex) { int p; GTE_Get_OTZ(p); if (p == 0 || (p >> 2) > OT_LENGTH) continue; - GTE_Store_XY0(&poly->x0, 0); - GTE_Store_XY1(&poly->x1, 0); - GTE_Store_XY2(&poly->x2, 0); - GTE_Load_XYZ0(&v2->vx); + GTE_Store_XY0(poly, 8); // &poly->x0 +
GTE_Store_XY1(poly, 16); // &poly->x1 + GTE_Store_XY2(poly, 24); // &poly->x2 + GTE_Load_XYZ0(&v2->vx); GTE_Exec_RTPS(); // 15 cycles addPrim(&ot[p >> 2], poly); - GTE_Store_XY2(&poly->x3, 0); + GTE_Store_XY2(poly, 32); // &poly->x3 poly->u0 = (v1->u >> uShift) + uOffset; poly->v0 = (v1->v >> vShift) + vOffset; diff --git a/third_party/gldc/sh4.c b/third_party/gldc/sh4.c index 937fe6b0e..9481cb37b 100644 --- a/third_party/gldc/sh4.c +++ b/third_party/gldc/sh4.c @@ -48,7 +48,7 @@ static inline void PushCommand(Vertex* v) { sq += 8; } -extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout, int type); +extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, volatile void* vout, int type); #define V0_VIS (1 << 0) #define V1_VIS (1 << 1) @@ -58,137 +58,107 @@ extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vou // https://casual-effects.com/research/McGuire2011Clipping/clip.glsl static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_t visible_mask) { - Vertex __attribute__((aligned(32))) scratch[2]; - Vertex* a = &scratch[0]; - Vertex* b = &scratch[1]; - switch(visible_mask) { case V0_VIS: - { // v0 // / | // / | // .....A....B... // / | // v3--v2---v1 - ClipEdge(v3, v0, a, PVR_CMD_VERTEX_EOL); - ClipEdge(v0, v1, b, PVR_CMD_VERTEX); - PushVertex(v0); - PushVertex(b); - PushVertex(a); - } - break; + PushVertex(v0); // v0 (visible corner must still be submitted first) + ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // B + ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // A + break; + case V1_VIS: - { // v1 // / | // / | // ....A.....B... // / | // v0--v3---v2 - ClipEdge(v0, v1, a, PVR_CMD_VERTEX); - ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL); + ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A + PushVertex(v1); // v1 + ClipEdge(v1, v2, sq, PVR_CMD_VERTEX_EOL); // B + break; - PushVertex(a); - PushVertex(v1); - PushVertex(b); - } break; case V2_VIS: - { // v2 // / | // / | // ....A.....B...
// / | // v1--v0---v3 - ClipEdge(v1, v2, a, PVR_CMD_VERTEX); - ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL); + ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A + PushVertex(v2); // v2 + ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // B + break; - PushVertex(a); - PushVertex(v2); - PushVertex(b); - } break; case V3_VIS: - { // v3 // / | // / | // ....A.....B... // / | // v2--v1---v0 - ClipEdge(v2, v3, a, PVR_CMD_VERTEX); - ClipEdge(v3, v0, b, PVR_CMD_VERTEX); + ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // B + ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // A (not EOL: v3 below ends the strip) + PushVertex(v3); // v3 + break; - PushVertex(b); - PushVertex(a); - PushVertex(v3); - } - break; case V0_VIS | V1_VIS: - { // v0-----------v1 // \ | // ....B..........A... // \ | // v3-----v2 - ClipEdge(v1, v2, a, PVR_CMD_VERTEX); - ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL); + PushVertex(v1); // v1 + ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A + PushVertex(v0); // v0 + ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // B + break; - PushVertex(v1); - PushVertex(a); - PushVertex(v0); - PushVertex(b); - } break; // case V0_VIS | V2_VIS: degenerate case that should never happen case V0_VIS | V3_VIS: - { // v3-----------v0 // \ | // ....B..........A... // \ | // v2-----v1 - ClipEdge(v0, v1, a, PVR_CMD_VERTEX); - ClipEdge(v2, v3, b, PVR_CMD_VERTEX); + ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A + ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // B + PushVertex(v0); // v0 + PushVertex(v3); // v3 + break; - PushVertex(a); - PushVertex(b); - PushVertex(v0); - PushVertex(v3); - } break; case V1_VIS | V2_VIS: - { // v1-----------v2 // \ | // ....B..........A...
// \ | // v0-----v3 - ClipEdge(v2, v3, a, PVR_CMD_VERTEX_EOL); - ClipEdge(v0, v1, b, PVR_CMD_VERTEX); + PushVertex(v1); // v1 + PushVertex(v2); // v2 + ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // B + ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // A + break; - PushVertex(v1); - PushVertex(v2); - PushVertex(b); - PushVertex(a); - } break; // case V1_VIS | V3_VIS: degenerate case that should never happen case V2_VIS | V3_VIS: - { // v2-----------v3 // \ | // ....B..........A... // \ | // v1-----v0 - ClipEdge(v3, v0, a, PVR_CMD_VERTEX); - ClipEdge(v1, v2, b, PVR_CMD_VERTEX); + ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // B + PushVertex(v2); // v2 + ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // A + PushVertex(v3); // v3 + break; - PushVertex(b); - PushVertex(v2); - PushVertex(a); - PushVertex(v3); - } break; case V0_VIS | V1_VIS | V2_VIS: - { // --v1-- // v0-- --v2 // \ | @@ -196,17 +166,14 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_ // \ | // v3 // v1,v2,v0 v2,v0,A v0,A,B - ClipEdge(v2, v3, a, PVR_CMD_VERTEX); - ClipEdge(v3, v0, b, PVR_CMD_VERTEX_EOL); + PushVertex(v1); // v1 + PushVertex(v2); // v2 + PushVertex(v0); // v0 + ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // A + ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // B + break; - PushVertex(v1); - PushVertex(v2); - PushVertex(v0); - PushVertex(a); - PushVertex(b); - } break; case V0_VIS | V1_VIS | V3_VIS: - { // --v0-- // v3-- --v1 // \ | @@ -214,18 +181,15 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_ // \ | // v2 // v0,v1,v3 v1,v3,A v3,A,B - ClipEdge(v1, v2, a, PVR_CMD_VERTEX); - ClipEdge(v2, v3, b, PVR_CMD_VERTEX_EOL); v3->flags = PVR_CMD_VERTEX; + PushVertex(v0); // v0 + PushVertex(v1); // v1 + PushVertex(v3); // v3 + ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A + ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // B + break; - PushVertex(v0); - PushVertex(v1); - PushVertex(v3); - PushVertex(a); - PushVertex(b); - } break; case V0_VIS | V2_VIS | V3_VIS: - { //
--v3-- // v2-- --v0 // \ | @@ -233,18 +197,15 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_ // \ | // v1 // v3,v0,v2 v0,v2,A v2,A,B - ClipEdge(v0, v1, a, PVR_CMD_VERTEX); - ClipEdge(v1, v2, b, PVR_CMD_VERTEX_EOL); v3->flags = PVR_CMD_VERTEX; + PushVertex(v3); // v3 + PushVertex(v0); // v0 + PushVertex(v2); // v2 + ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A + ClipEdge(v1, v2, sq, PVR_CMD_VERTEX_EOL); // B + break; - PushVertex(v3); - PushVertex(v0); - PushVertex(v2); - PushVertex(a); - PushVertex(b); - } break; case V1_VIS | V2_VIS | V3_VIS: - { // --v2-- // v1-- --v3 // \ | @@ -252,16 +213,13 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_ // \ | // v0 // v2,v3,v1 v3,v1,A v1,A,B - ClipEdge(v3, v0, a, PVR_CMD_VERTEX); - ClipEdge(v0, v1, b, PVR_CMD_VERTEX_EOL); v3->flags = PVR_CMD_VERTEX; - - PushVertex(v2); - PushVertex(v3); - PushVertex(v1); - PushVertex(a); - PushVertex(b); - } break; + PushVertex(v2); // v2 + PushVertex(v3); // v3 + PushVertex(v1); // v1 + ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // A + ClipEdge(v0, v1, sq, PVR_CMD_VERTEX_EOL); // B + break; } }