diff --git a/misc/dreamcast/VertexClip2.S b/misc/dreamcast/VertexClip2.S index 6d693a28a..c40ce93f3 100644 --- a/misc/dreamcast/VertexClip2.S +++ b/misc/dreamcast/VertexClip2.S @@ -23,17 +23,18 @@ #define CL2 r5 // input colour 2 #define CLO r7 // output colour -! Calculates the near plane intersection point between two points: +! Writes output vertex as the near plane intersection point between two points: ! float t = fabsf(v1->z) / fabsf(v2->z - v1->z) ! float invt = 1.0f - t; +! // note: w = invt * v1->w + t * v2->w;, always ends up being zNear ! -! out->x = invt * v1->x + t * v2->x; -! out->y = invt * v1->y + t * v2->y; -! out->z = 0.0f; // clipped against near plane anyways (I.e Z/W = 0 --> Z = 0) +! out->c = type << 24 +! out->x = (invt * v1->x + t * v2->x) * 1/zNear +! out->y = (invt * v1->y + t * v2->y) * 1/zNear +! out->w = 1/zNear ! ! out->u = invt * v1->u + t * v2->u; ! out->v = invt * v1->v + t * v2->v; -! out->w = zNear // invt * v1->w + t * v2->w;, always ends up being zNear ! ! out->b = invt * v1->b + t * v2->b; ! out->g = invt * v1->g + t * v2->g; @@ -41,7 +42,7 @@ ! out->a = invt * v1->a + t * v2->a; ! To optimise these calculations, FIPR is used: ! FIPR = FVm.x*FVn.x + FVm.y*FVn.x + FVm.z*FVn.z + FVm.w*FVn.w --> FVn.w -! FIPR can be used to accomplish "vout->Q invt * v1->Q + t * v2->Q" by: +! FIPR can be used to accomplish "vout->Q = invt * v1->Q + t * v2->Q" by: ! - assigning x/y components to 0 for both vectors ! - assigning t and invT to z/w of FVm vector ! - assigning v1 and v2 to z/w of FVn vector @@ -51,74 +52,74 @@ .global _ClipEdge .align 4 _ClipEdge: - mov IN1, TM1 ! MT, tmp = &v1 + add #12, IN1 ! EX, IN1 = &v1->z fldi0 fr4 ! LS, fr4 = 0 - add #12, TM1 ! EX, tmp = &v1->z - fmov.s @TM1, fr2 ! LS, fr2 = v1->z - mov IN2, TM1 ! MT, tmp = &v2 + fmov.s @IN1, fr2 ! LS, fr2 = v1->z + add #12, IN2 ! EX, IN2 = &v2->z fldi0 fr5 ! LS, fr5 = 0 - add #12, TM1 ! EX, tmp = &v2->z - fmov.s @TM1,fr11 ! LS, fr11 = v2->z + fmov.s @IN2,fr11 ! 
LS, fr11 = v2->z fsub fr2,fr11 ! FE, fr11 = v2->z - v1->z fldi0 fr8 ! LS, fr8 = 0 - mov.l TYP,@OUT ! LS, OUT->cmd = TYPE + shll16 TYP ! EX, TYP <<= 16 fmul fr11,fr11 ! FE, fr11 = (v2->z - v1->z) * (v2->z * v1->z) fldi0 fr9 ! LS, fr9 = 0 fldi0 fr0 ! LS, fr0 = 0 fldi0 fr1 ! LS, fr1 = 0 fsrra fr11 ! FE, fr11 = 1 / abs(v2->z - v1->z) + shll8 TYP ! EX, TYP <<= 8 fabs fr2 ! LS, fr2 = abs(v1->z) + mov.l TYP,@OUT ! LS, dst->cmd = TYPE fmul fr2,fr11 ! FE, fr11 = abs(v1->Z) / abs(v2->z - v1->z) --> t - add #4, IN1 ! EX, v1 += 4 + add #-8, IN1 ! EX, IN1 = &v1->x fldi1 fr10 ! LS, fr10 = 1 - add #4, IN2 ! EX, v2 += 4 - add #4, OUT ! EX, OUT += 4 - fsub fr11,fr10 ! FE, fr10 = 1.0 - t --> invT + add #-8, IN2 ! EX, IN2 = &v2->x + add #4, OUT ! EX, OUT = &dst->x + fsub fr11,fr10 ! FE, invT = 1.0 - t --> invT ! Load X components - fmov.s @IN1+, fr2 ! LS, A1 = v1->x, v1 += 4 - fmov.s @IN2+, fr3 ! LS, B1 = v2->x, v2 += 4 + fmov.s @IN1+, fr2 ! LS, A1 = v1->x, IN1 = &v1->y + fmov.s @IN2+, fr3 ! LS, B1 = v2->x, IN2 = &v2->y ! Start interpolating X fipr fv8, fv0 ! FE, LERP(A1, B1) ! Load Y components - fmov.s @IN1+, fr6 ! LS, A2 = v1->y, v1 += 4 - fmov.s @IN2+, fr7 ! LS, B2 = v2->y, v2 += 4 + fmov.s @IN1, fr6 ! LS, A2 = v1->y + fmov.s @IN2, fr7 ! LS, B2 = v2->y ! Load W mov.l _NEAR_CLIP_W,TM1 ! tmp = zNear lds TM1,fpul ! LS, FPUL = zNear fsts fpul,fr2 ! LS, fr2 = FPUL ! Store interpolated X fmul fr2,fr3 ! EX, fr7 = LERP * invW - fmov.s fr3,@OUT ! LS, OUT->x = LERP * invW - add #4, OUT ! EX, OUT += 4 + fmov.s fr3,@OUT ! LS, dst->x = LERP * invW + add #4, OUT ! EX, OUT = &dst->y ! Start interpolating Y fipr fv8, fv4 ! FE, LERP(A2, B2) ! Skip Z of input vertices - add #4, IN1 ! EX, v1 += 4 - add #4, IN2 ! EX, v2 += 4 + add #8, IN1 ! EX, IN1 = &v1->u + add #8, IN2 ! EX, IN2 = &v2->u ! Store interpolated Y fmul fr2,fr7 ! EX, fr7 = LERP * invW fmov.s fr7,@OUT ! LS, OUT->y = LERP * invW - add #4, OUT ! EX, OUT += 4 + add #4, OUT ! EX, OUT = &dst->w ! Store W fmov.s fr2,@OUT ! 
LS, OUT->w = 1/zNear - add #4, OUT ! EX, OUT += 4 + add #4, OUT ! EX, OUT = &dst->u ! Load U components - fmov.s @IN1+, fr2 ! LS, A1 = v1->u, v1 += 4 - fmov.s @IN2+, fr3 ! LS, B1 = v2->u, v2 += 4 + fmov.s @IN1+, fr2 ! LS, A1 = v1->u, IN1 = &v1->v + fmov.s @IN2+, fr3 ! LS, B1 = v2->u, IN2 = &v2->v ! Start interpolating U fipr fv8, fv0 ! FE, LERP(A1, B1) ! Load V components - fmov.s @IN1+, fr6 ! LS, A2 = v1->v, v1 += 4 - fmov.s @IN2+, fr7 ! LS, B2 = v2->v, v2 += 4 + fmov.s @IN1+, fr6 ! LS, A2 = v1->v, IN1 = &v1->bgra + fmov.s @IN2+, fr7 ! LS, B2 = v2->v, IN2 = &v2->bgra ! Store interpolated U - fmov.s fr3,@OUT ! LS, OUT->u = LERP - add #4, OUT ! EX, OUT += 4 + fmov.s fr3,@OUT ! LS, dst->u = LERP + add #4, OUT ! EX, OUT = &dst->v ! Start interpolating V fipr fv8, fv4 ! FE, LERP(A2, B2) @@ -127,8 +128,8 @@ _ClipEdge: mov.l @IN2,CL2 ! LS, BCOLOR = v2->bgra cmp/eq CL1,CL2 ! MT, T = ACOLOR == BCOLOR ! Store V - fmov.s fr7,@OUT ! LS, OUT->v = LERP - add #4, OUT ! EX, OUT += 4 + fmov.s fr7,@OUT ! LS, dst->v = LERP + add #4, OUT ! EX, OUT = &dst->bgra ! Bypass RGBA interpolation if unnecessary bt.s 1f ! 
BR, if (T) goto 1; diff --git a/third_party/gldc/sh4.c b/third_party/gldc/sh4.c index 9481cb37b..d508777ac 100644 --- a/third_party/gldc/sh4.c +++ b/third_party/gldc/sh4.c @@ -2,9 +2,6 @@ #include #include "gldc.h" -#define PREFETCH(addr) __builtin_prefetch((addr)) -static volatile uint32_t* sq; - // calculates 1/sqrt(x) static GLDC_FORCE_INLINE float sh4_fsrra(float x) { asm volatile ("fsrra %[value]\n" @@ -15,251 +12,248 @@ static GLDC_FORCE_INLINE float sh4_fsrra(float x) { return x; } -static GLDC_FORCE_INLINE void PushVertex(Vertex* v) { - volatile Vertex* dst = (Vertex*)(sq); +static GLDC_FORCE_INLINE void PushVertex(Vertex* v, volatile Vertex* dst) { float ww = v->w * v->w; - dst->flags = v->flags; - float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w - // Convert to NDC (viewport already applied) - float x = v->x * f; - float y = v->y * f; + dst->flags = v->flags; + float f = sh4_fsrra(ww); // 1/sqrt(w^2) ~ 1/w + // Convert to NDC (viewport already applied) + float x = v->x * f; + float y = v->y * f; - dst->x = x; - dst->y = y; - dst->z = f; - dst->u = v->u; - dst->v = v->v; - dst->bgra = v->bgra; - __asm__("pref @%0" : : "r"(dst)); - dst++; + dst->x = x; + dst->y = y; + dst->z = f; + dst->u = v->u; + dst->v = v->v; + dst->bgra = v->bgra; + __asm__("pref @%0" : : "r"(dst)); } -static inline void PushCommand(Vertex* v) { - uint32_t* s = (uint32_t*)v; - sq[0] = *(s++); - sq[1] = *(s++); - sq[2] = *(s++); - sq[3] = *(s++); - sq[4] = *(s++); - sq[5] = *(s++); - sq[6] = *(s++); - sq[7] = *(s++); - __asm__("pref @%0" : : "r"(sq)); - sq += 8; +static inline void PushCommand(Vertex* v, volatile Vertex* dst) { + uint32_t* s = (uint32_t*)v; + volatile uint32_t* sq = (volatile uint32_t*)dst; + + sq[0] = *(s++); + sq[1] = *(s++); + sq[2] = *(s++); + sq[3] = *(s++); + sq[4] = *(s++); + sq[5] = *(s++); + sq[6] = *(s++); + sq[7] = *(s++); + __asm__("pref @%0" : : "r"(sq)); } -extern void ClipEdge(const Vertex* const v1, const Vertex* const v2, volatile void* vout, int type); 
+extern void ClipEdge(Vertex* const v1, Vertex* const v2, volatile Vertex* vout, char type); #define V0_VIS (1 << 0) #define V1_VIS (1 << 1) #define V2_VIS (1 << 2) #define V3_VIS (1 << 3) - -// https://casual-effects.com/research/McGuire2011Clipping/clip.glsl -static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_t visible_mask) { - switch(visible_mask) { - case V0_VIS: - // v0 - // / | - // / | - // .....A....B... - // / | - // v3--v2---v1 - PushVertex(v0); - ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // B - ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // A - break; - - case V1_VIS: - // v1 - // / | - // / | - // ....A.....B... - // / | - // v0--v3---v2 - ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A - PushVertex(v1); // v1 - ClipEdge(v1, v2, sq, PVR_CMD_VERTEX_EOL); // B - break; - - case V2_VIS: - // v2 - // / | - // / | - // ....A.....B... - // / | - // v1--v0---v3 - ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A - PushVertex(v2); // v2 - ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // B - break; - - case V3_VIS: - // v3 - // / | - // / | - // ....A.....B... - // / | - // v2--v1---v0 - ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // B - ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // A - PushVertex(v3); // v3 - break; - - case V0_VIS | V1_VIS: - // v0-----------v1 - // \ | - // ....B..........A... - // \ | - // v3-----v2 - PushVertex(v1); // v1 - ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A - PushVertex(v0); // v0 - ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // B - break; - - // case V0_VIS | V2_VIS: degenerate case that should never happen - case V0_VIS | V3_VIS: - // v3-----------v0 - // \ | - // ....B..........A... - // \ | - // v2-----v1 - ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A - ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // B - PushVertex(v0); // v0 - PushVertex(v3); // v3 - break; - - case V1_VIS | V2_VIS: - // v1-----------v2 - // \ | - // ....B..........A... 
- // \ | - // v0-----v3 - PushVertex(v1); // v1 - PushVertex(v2); // v2 - ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // B - ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // A - break; - - // case V1_VIS | V3_VIS: degenerate case that should never happen - case V2_VIS | V3_VIS: - // v2-----------v3 - // \ | - // ....B..........A... - // \ | - // v1-----v0 - ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // B - PushVertex(v2); // v2 - ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // A - PushVertex(v3); // v3 - break; - - case V0_VIS | V1_VIS | V2_VIS: - // --v1-- - // v0-- --v2 - // \ | - // .....B.....A... - // \ | - // v3 - // v1,v2,v0 v2,v0,A v0,A,B - PushVertex(v1); // v1 - PushVertex(v2); // v2 - PushVertex(v0); // v0 - ClipEdge(v2, v3, sq, PVR_CMD_VERTEX); // A - ClipEdge(v3, v0, sq, PVR_CMD_VERTEX_EOL); // B - break; - - case V0_VIS | V1_VIS | V3_VIS: - // --v0-- - // v3-- --v1 - // \ | - // .....B.....A... - // \ | - // v2 - // v0,v1,v3 v1,v3,A v3,A,B - v3->flags = PVR_CMD_VERTEX; - PushVertex(v0); // v0 - PushVertex(v1); // v1 - PushVertex(v3); // v3 - ClipEdge(v1, v2, sq, PVR_CMD_VERTEX); // A - ClipEdge(v2, v3, sq, PVR_CMD_VERTEX_EOL); // B - break; - - case V0_VIS | V2_VIS | V3_VIS: - // --v3-- - // v2-- --v0 - // \ | - // .....B.....A... - // \ | - // v1 - // v3,v0,v2 v0,v2,A v2,A,B - v3->flags = PVR_CMD_VERTEX; - PushVertex(v3); // v3 - PushVertex(v0); // v0 - PushVertex(v2); // v2 - ClipEdge(v0, v1, sq, PVR_CMD_VERTEX); // A - ClipEdge(v1, v2, sq, PVR_CMD_VERTEX_EOL); // B - break; - - case V1_VIS | V2_VIS | V3_VIS: - // --v2-- - // v1-- --v3 - // \ | - // .....B.....A... 
- // \ | - // v0 - // v2,v3,v1 v3,v1,A v1,A,B - v3->flags = PVR_CMD_VERTEX; - PushVertex(v2); // v2 - PushVertex(v3); // v3 - PushVertex(v1); // v1 - ClipEdge(v3, v0, sq, PVR_CMD_VERTEX); // A - ClipEdge(v0, v1, sq, PVR_CMD_VERTEX_EOL); // B - break; - } -} +#define TYPE_VTX 0xE0 // PVR vertex, data +#define TYPE_EOS 0xF0 // PVR vertex, end of strip extern void ProcessVertexList(Vertex* v3, int n, void* sq_addr); void SceneListSubmit(Vertex* v3, int n) { - sq = (uint32_t*)MEM_AREA_SQ_BASE; + volatile Vertex* dst = (volatile Vertex*)MEM_AREA_SQ_BASE; - for (int i = 0; i < n; i++, v3++) + for (int i = 0; i < n; i++, v3++) { - PREFETCH(v3 + 1); - switch(v3->flags & 0xFF000000) { - case PVR_CMD_VERTEX_EOL: - break; - case PVR_CMD_VERTEX: - continue; - default: - PushCommand(v3); - continue; - }; + // Preload next vertex into memory + __builtin_prefetch(v3 + 1); + + switch(v3->flags & 0xFF000000) { + case PVR_CMD_VERTEX_EOL: + break; + case PVR_CMD_VERTEX: + continue; + default: + PushCommand(v3, dst++); + continue; + }; // Quads [0, 1, 2, 3] -> Triangles [{0, 1, 2} {2, 3, 0}] - Vertex* const v0 = v3 - 3; - Vertex* const v1 = v3 - 2; - Vertex* const v2 = v3 - 1; - uint8_t visible_mask = v3->flags & 0xFF; + Vertex* const v0 = v3 - 3; + Vertex* const v1 = v3 - 2; + Vertex* const v2 = v3 - 1; + uint8_t mask = v3->flags & 0xFF; - switch(visible_mask) { - case V0_VIS | V1_VIS | V2_VIS | V3_VIS: // All vertices visible - { - // Triangle strip: {1,2,0} {2,0,3} - PushVertex(v1); - PushVertex(v2); - PushVertex(v0); - PushVertex(v3); - } - break; - - default: // Some vertices visible - SubmitClipped(v0, v1, v2, v3, visible_mask); - break; - } - } + // Check if all vertices visible + if (__builtin_expect(mask == (V0_VIS | V1_VIS | V2_VIS | V3_VIS), 1)) { + // Triangle strip: {1,2,0} {2,0,3} + PushVertex(v1, dst++); + PushVertex(v2, dst++); + PushVertex(v0, dst++); + PushVertex(v3, dst++); + continue; + } + + + // Only some vertices visible + // 
https://casual-effects.com/research/McGuire2011Clipping/clip.glsl + switch(mask) { + case V0_VIS: + // v0 + // / | + // / | + // .....A....B... + // / | + // v3--v2---v1 + PushVertex(v0, dst++); // v0 + ClipEdge(v0, v1, dst++, TYPE_VTX); // B + ClipEdge(v3, v0, dst++, TYPE_EOS); // A + break; + + case V1_VIS: + // v1 + // / | + // / | + // ....A.....B... + // / | + // v0--v3---v2 + ClipEdge(v0, v1, dst++, TYPE_VTX); // A + PushVertex(v1, dst++); // v1 + ClipEdge(v1, v2, dst++, TYPE_EOS); // B + break; + + case V2_VIS: + // v2 + // / | + // / | + // ....A.....B... + // / | + // v1--v0---v3 + ClipEdge(v1, v2, dst++, TYPE_VTX); // A + PushVertex(v2, dst++); // v2 + ClipEdge(v2, v3, dst++, TYPE_EOS); // B + break; + + case V3_VIS: + // v3 + // / | + // / | + // ....A.....B... + // / | + // v2--v1---v0 + ClipEdge(v3, v0, dst++, TYPE_VTX); // B + ClipEdge(v2, v3, dst++, TYPE_EOS); // A + PushVertex(v3, dst++); // v3 + break; + + case V0_VIS | V1_VIS: + // v0-----------v1 + // \ | + // ....B..........A... + // \ | + // v3-----v2 + PushVertex(v1, dst++); // v1 + ClipEdge(v1, v2, dst++, TYPE_VTX); // A + PushVertex(v0, dst++); // v0 + ClipEdge(v3, v0, dst++, TYPE_EOS); // B + break; + + // case V0_VIS | V2_VIS: degenerate case that should never happen + case V0_VIS | V3_VIS: + // v3-----------v0 + // \ | + // ....B..........A... + // \ | + // v2-----v1 + ClipEdge(v0, v1, dst++, TYPE_VTX); // A + ClipEdge(v2, v3, dst++, TYPE_VTX); // B + PushVertex(v0, dst++); // v0 + PushVertex(v3, dst++); // v3 + break; + + case V1_VIS | V2_VIS: + // v1-----------v2 + // \ | + // ....B..........A... + // \ | + // v0-----v3 + PushVertex(v1, dst++); // v1 + PushVertex(v2, dst++); // v2 + ClipEdge(v0, v1, dst++, TYPE_VTX); // B + ClipEdge(v2, v3, dst++, TYPE_EOS); // A + break; + + // case V1_VIS | V3_VIS: degenerate case that should never happen + case V2_VIS | V3_VIS: + // v2-----------v3 + // \ | + // ....B..........A... 
+ // \ | + // v1-----v0 + ClipEdge(v1, v2, dst++, TYPE_VTX); // B + PushVertex(v2, dst++); // v2 + ClipEdge(v3, v0, dst++, TYPE_VTX); // A + PushVertex(v3, dst++); // v3 + break; + + case V0_VIS | V1_VIS | V2_VIS: + // --v1-- + // v0-- --v2 + // \ | + // .....B.....A... + // \ | + // v3 + // v1,v2,v0 v2,v0,A v0,A,B + PushVertex(v1, dst++); // v1 + PushVertex(v2, dst++); // v2 + PushVertex(v0, dst++); // v0 + ClipEdge(v2, v3, dst++, TYPE_VTX); // A + ClipEdge(v3, v0, dst++, TYPE_EOS); // B + break; + + case V0_VIS | V1_VIS | V3_VIS: + // --v0-- + // v3-- --v1 + // \ | + // .....B.....A... + // \ | + // v2 + // v0,v1,v3 v1,v3,A v3,A,B + v3->flags = PVR_CMD_VERTEX; + PushVertex(v0, dst++); // v0 + PushVertex(v1, dst++); // v1 + PushVertex(v3, dst++); // v3 + ClipEdge(v1, v2, dst++, TYPE_VTX); // A + ClipEdge(v2, v3, dst++, TYPE_EOS); // B + break; + + case V0_VIS | V2_VIS | V3_VIS: + // --v3-- + // v2-- --v0 + // \ | + // .....B.....A... + // \ | + // v1 + // v3,v0,v2 v0,v2,A v2,A,B + v3->flags = PVR_CMD_VERTEX; + PushVertex(v3, dst++); // v3 + PushVertex(v0, dst++); // v0 + PushVertex(v2, dst++); // v2 + ClipEdge(v0, v1, dst++, TYPE_VTX); // A + ClipEdge(v1, v2, dst++, TYPE_EOS); // B + break; + + case V1_VIS | V2_VIS | V3_VIS: + // --v2-- + // v1-- --v3 + // \ | + // .....B.....A... + // \ | + // v0 + // v2,v3,v1 v3,v1,A v1,A,B + v3->flags = PVR_CMD_VERTEX; + PushVertex(v2, dst++); // v2 + PushVertex(v3, dst++); // v3 + PushVertex(v1, dst++); // v1 + ClipEdge(v3, v0, dst++, TYPE_VTX); // A + ClipEdge(v0, v1, dst++, TYPE_EOS); // B + break; + } + } }