Dreamcast: Slightly boost performance by attempting to perform polygon perspective division and clipping as a quad instead of 2 triangles when possible

This commit is contained in:
UnknownShadow200 2023-11-24 08:00:55 +11:00
parent 064be092e8
commit 48f0cb7b1a
2 changed files with 248 additions and 186 deletions

View File

@ -75,22 +75,13 @@ static void generateQuads(SubmissionTarget* target, const GLsizei first, const G
*((Float2*)it->uv) = F2ZERO;
}
src += stride;
src += stride;
it->flags = GPU_CMD_VERTEX;
it++;
}
// Quads [0, 1, 2, 3] -> Triangles [{0, 1, 2} {2, 3, 0}]
PREFETCH(dst); // TODO: more prefetching?
memcpy_vertex(dst + 5, dst + 0); dst[5].flags = GPU_CMD_VERTEX_EOL;
memcpy_vertex(dst + 4, dst + 3); dst[4].flags = GPU_CMD_VERTEX;
memcpy_vertex(dst + 3, dst + 2); dst[3].flags = GPU_CMD_VERTEX;
dst[2].flags = GPU_CMD_VERTEX_EOL;
dst[1].flags = GPU_CMD_VERTEX;
dst[0].flags = GPU_CMD_VERTEX;
// TODO copy straight to dst??
dst += 6;
dst[3].flags = GPU_CMD_VERTEX_EOL;
dst += 4;
}
}
@ -131,7 +122,7 @@ void APIENTRY glDrawArrays(GLenum mode, GLint first, GLsizei count) {
TRACE();
if (!count) return;
submitVertices(count * 6 / 4); // quads -> triangles
submitVertices(count);
generateQuads(&SUBMISSION_TARGET, first, count);
}

View File

@ -39,13 +39,13 @@ GL_FORCE_INLINE float _glFastInvert(float x) {
return MATH_fsrra(x * x);
}
GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex) {
TRACE();
const float f = _glFastInvert(vertex->w);
/* Convert to NDC and apply viewport */
vertex->xyz[0] = (vertex->xyz[0] * f * 320) + 320;
vertex->xyz[0] = (vertex->xyz[0] * f * 320) + 320;
vertex->xyz[1] = (vertex->xyz[1] * f * -240) + 240;
/* Orthographic projections need to use invZ otherwise we lose
@ -120,15 +120,191 @@ static volatile uint32_t* QACR = (uint32_t*) 0xFF000038;
#define V0_VIS (1 << 0)
#define V1_VIS (1 << 1)
#define V2_VIS (1 << 2)
#define V3_VIS (1 << 3)
void SceneListSubmit(Vertex* v2, int n) {
/*
 * Submits a single triangle (v0, v1, v2), clipping it first when only some of
 * its vertices are marked visible.
 *
 * visible_mask is a combination of V0_VIS / V1_VIS / V2_VIS, one bit per vertex.
 * The call sites in SceneListSubmit set a bit when that vertex satisfies
 * xyz[2] > -w. A mask of 0 (or any value containing other bits) matches no
 * case below, so nothing is submitted.
 *
 * For every vertex that is emitted, _glPerspectiveDivideVertex is applied and
 * the vertex is then handed to _glPushHeaderOrVertex. Where the original
 * v0/v1/v2 are emitted directly they are perspective-divided IN PLACE, so a
 * caller that needs the untransformed values afterwards must keep its own copy.
 *
 * Flags: the flags of v0/v1/v2 are never written here; they are pushed exactly
 * as supplied (the visible call sites set the third vertex to
 * GPU_CMD_VERTEX_EOL before calling). Only the scratch vertices get their
 * flags assigned in this function.
 *
 * NOTE(review): _glClipEdge(x, y, out) is defined elsewhere; from its use here
 * it fills `out` with a new vertex on the edge x-y, presumably the point where
 * that edge crosses the clip boundary. Confirm against its definition.
 */
static void SubmitTriangle(Vertex* v0, Vertex* v1, Vertex* v2, uint8_t visible_mask) {
// Temporary storage for clipped/copied vertices. Only scratch[0..2] are used below.
Vertex __attribute__((aligned(32))) scratch[4];
switch(visible_mask) {
case V0_VIS | V1_VIS | V2_VIS: // All vertices visible
{
// No clipping needed: divide and submit the three input vertices as they are.
_glPerspectiveDivideVertex(v0);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(v1);
_glPushHeaderOrVertex(v1);
_glPerspectiveDivideVertex(v2);
_glPushHeaderOrVertex(v2);
}
break;
case V0_VIS: // First vertex was visible
{
// a = new vertex on edge v0-v1, b = new vertex on edge v2-v0.
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
// Both clips run before v0 is divided, so they see the undivided v0.
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX_EOL;
// Submitted order: v0, a, b (b carries GPU_CMD_VERTEX_EOL).
_glPerspectiveDivideVertex(v0);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b);
_glPushHeaderOrVertex(b);
}
break;
case V1_VIS: // Second vertex was visible
{
/* Only the second vertex is visible. A triangle is built from a copy of v1
and two new vertices: one on edge v0-v1 and one on edge v1-v2. */
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
// c is a copy of v1 (flags included), so v1 itself is left undivided.
memcpy_vertex(c, v1);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX_EOL;
// Submitted order: a, c, b (b carries GPU_CMD_VERTEX_EOL).
_glPerspectiveDivideVertex(a);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b);
_glPushHeaderOrVertex(b);
}
break;
case V0_VIS | V1_VIS: // First and second vertex were visible
{
// a = new vertex on edge v1-v2, b = new vertex on edge v2-v0, c = copy of v1.
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
// The v2-v0 clip must happen before v0 is divided in place just below.
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0);
_glPushHeaderOrVertex(v0);
// This clip reads v1 and v2 only, neither of which has been divided.
_glClipEdge(v1, v2, a);
a->flags = GPU_CMD_VERTEX_EOL;
// Submitted order: v0, c, b, c, a. c is divided once and pushed twice.
// NOTE(review): the repeated c presumably forms a degenerate triangle so the
// two real triangles can share one strip - confirm.
_glPerspectiveDivideVertex(c);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(a);
_glPushHeaderOrVertex(c);
_glPushHeaderOrVertex(a);
}
break;
case V2_VIS: // Third vertex was visible
{
// a = new vertex on edge v2-v0, b = new vertex on edge v1-v2, c = copy of v2.
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
// c keeps whatever flags the caller put on v2; they are not changed here.
memcpy_vertex(c, v2);
_glClipEdge(v2, v0, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
// Submitted order: a, b, c.
_glPerspectiveDivideVertex(a);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(c);
_glPushHeaderOrVertex(c);
}
break;
case V0_VIS | V2_VIS: // First and third vertex were visible
{
// a = new vertex on edge v0-v1, b = new vertex on edge v1-v2, c = copy of v2.
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
// c is pushed twice: first as an ordinary vertex, then again as the final one.
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
// Submitted order: v0, a, c, b, c.
_glPerspectiveDivideVertex(v0);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b);
_glPushHeaderOrVertex(b);
// Re-flag the already divided c as GPU_CMD_VERTEX_EOL and push it once more.
c->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(c);
}
break;
case V1_VIS | V2_VIS: // Second and third vertex were visible
{
// a = new vertex on edge v0-v1, b = new vertex on edge v2-v0, c = copy of v1.
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
// Submitted order: a, c, b, c, v2. c is divided once and pushed twice.
_glPerspectiveDivideVertex(a);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
// v2 is divided in place and pushed last with the flags the caller gave it.
_glPerspectiveDivideVertex(v2);
_glPushHeaderOrVertex(v2);
}
break;
}
}
extern int PASSED, CLIPPED, SKIPPED;
void SceneListSubmit(Vertex* v3, int n) {
TRACE();
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) return;
const float h = vid_mode->height;
PVR_SET(SPAN_SORT_CFG, 0x0);
//Set PVR DMA registers
@ -150,198 +326,93 @@ void SceneListSubmit(Vertex* v2, int n) {
sq = SQ_BASE_ADDRESS;
for(int i = 0; i < n; ++i, ++v2) {
PREFETCH(v2 + 1);
switch(v2->flags) {
for(int i = 0; i < n; ++i, ++v3) {
PREFETCH(v3 + 1);
switch(v3->flags) {
case GPU_CMD_VERTEX_EOL:
break;
case GPU_CMD_VERTEX:
continue;
default:
_glPushHeaderOrVertex(v2);
_glPushHeaderOrVertex(v3);
continue;
};
Vertex* const v0 = v2 - 2;
Vertex* const v1 = v2 - 1;
// Quads [0, 1, 2, 3] -> Triangles [{0, 1, 2} {2, 3, 0}]
Vertex* const v0 = v3 - 3;
Vertex* const v1 = v3 - 2;
Vertex* const v2 = v3 - 1;
visible_mask = (
(v0->xyz[2] > -v0->w) << 0 |
(v1->xyz[2] > -v1->w) << 1 |
(v2->xyz[2] > -v2->w) << 2
(v2->xyz[2] > -v2->w) << 2 |
(v3->xyz[2] > -v3->w) << 3
);
Vertex __attribute__((aligned(32))) scratch[4];
// Stats gathering found that when testing a 64x64x64 sized world, at most
// ~400-500 triangles needed clipping
// ~13% of the triangles in a frame needed clipping (percentage increased when less triangles overall)
// Based on this, the decision was made to optimise for rendering quads that
// were either entirely visible or entirely culled, at the expense of making
// partially visible quads a bit slower due to needing to be split into two triangles first
// Performance measuring indicated that overall FPS still improved from
// switching to processing 1 quad instead of 2 triangles
if (visible_mask == 15) PASSED += 2;
else if (visible_mask == 0) SKIPPED += 2;
else CLIPPED += 2;
switch(visible_mask) {
case V0_VIS | V1_VIS | V2_VIS: /* All vertices visible */
case V0_VIS | V1_VIS | V2_VIS | V3_VIS: // All vertices visible
{
_glPerspectiveDivideVertex(v0, h);
_glPerspectiveDivideVertex(v0);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(v1, h);
_glPerspectiveDivideVertex(v1);
_glPushHeaderOrVertex(v1);
_glPerspectiveDivideVertex(v2, h);
v2->flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(v2);
_glPushHeaderOrVertex(v2);
v2->flags = GPU_CMD_VERTEX;
_glPushHeaderOrVertex(v2);
}
break;
case V0_VIS: /* First vertex was visible */
{
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(v0, h);
v3->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v3);
_glPushHeaderOrVertex(v3);
v0->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case V1_VIS: /* Second vertex was visible */
{
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
case 0: // No vertices visible
break;
case V0_VIS | V1_VIS: /* First and second vertex were visible */
default: // Some vertices visible
{
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(v1, v2, a);
a->flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(c);
_glPushHeaderOrVertex(a);
}
break;
case V2_VIS: /* Third vertex was visible */
{
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
_glClipEdge(v2, v0, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
}
break;
case V0_VIS | V2_VIS: /* First and third vertex were visible */
{
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
c->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(c);
}
break;
case V1_VIS | V2_VIS: /* Second and third vertex were visible */
{
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
Vertex* d = &scratch[3];
memcpy_vertex(c, v1);
memcpy_vertex(d, v2);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(d, h);
_glPushHeaderOrVertex(d);
// vertices are modified in SubmitTriangle, so need to copy them
Vertex __attribute__((aligned(32))) scratch[4];
Vertex* a0 = &scratch[0];
Vertex* a2 = &scratch[1];
memcpy_vertex(a0, v0);
memcpy_vertex(a2, v2);
visible_mask &= (V0_VIS | V1_VIS | V2_VIS);
v2->flags = GPU_CMD_VERTEX_EOL;
SubmitTriangle(v0, v1, v2, visible_mask);
visible_mask = (
(a2->xyz[2] > -v2->w) << 0 |
(v3->xyz[2] > -v3->w) << 1 |
(a0->xyz[2] > -a0->w) << 2
);
v3->flags = GPU_CMD_VERTEX;
a0->flags = GPU_CMD_VERTEX_EOL;
SubmitTriangle(a2, v3, a0, visible_mask);
}
break;
}