Implement pre-vertex-shader backface culling on the GPU.

Decreases frametime by ~16% on my hardware.
Not as much as I hoped, but still worth a lot.
This commit is contained in:
IntegratedQuantum 2024-06-06 16:44:51 +02:00
parent 50076d172d
commit c225efa1de
4 changed files with 85 additions and 27 deletions

View File

@ -42,7 +42,7 @@ struct ChunkData {
int visibilityMask;
int voxelSize;
uint vertexStartOpaque;
uint vertexCountOpaque;
uint faceCountsByNormalOpaque[7];
uint vertexStartTransparent;
uint vertexCountTransparent;
};

View File

@ -12,7 +12,7 @@ struct ChunkData {
int visibilityMask;
int voxelSize;
uint vertexStartOpaque;
uint vertexCountOpaque;
uint faceCountsByNormalOpaque[7];
uint vertexStartTransparent;
uint vertexCountTransparent;
};
@ -40,26 +40,62 @@ uniform uint chunkIDIndex;
// Index of the first slot in commands[] that this dispatch writes to.
uniform uint commandIndexStart;
// Number of chunk IDs to process; main() returns early for invocation indices >= size.
uniform uint size;
// When true, main() takes the transparent path (one command per chunk) instead of the opaque path.
uniform bool isTransparent;
// Player position as integers; main() subtracts the chunk position from it before the visibility test.
uniform ivec3 playerPositionInteger;
// Decides whether the faces of one normal direction can face the player.
//   dir               - normal direction index: 0 up, 1 down, 2 +x, 3 -x, 4 +y, 5 -y.
//   relativePlayerPos - player position minus the chunk position (see the call in main()).
//   voxelSize         - voxel size of the chunk.
// Positive directions are visible once the player coordinate is >= 0 on that axis,
// negative directions while it is below 32*voxelSize.
// NOTE(review): 32*voxelSize is presumably the chunk's edge length - confirm.
// Any other direction index (main() also passes 6) is always reported as visible.
bool isVisible(int dir, ivec3 relativePlayerPos, int voxelSize) {
	// Shared upper bound for the three negative-facing directions.
	int upperBound = 32*voxelSize;
	if(dir == 0) return relativePlayerPos.z >= 0; // dirUp
	if(dir == 1) return relativePlayerPos.z < upperBound; // dirDown
	if(dir == 2) return relativePlayerPos.x >= 0; // dirPosX
	if(dir == 3) return relativePlayerPos.x < upperBound; // dirNegX
	if(dir == 4) return relativePlayerPos.y >= 0; // dirPosY
	if(dir == 5) return relativePlayerPos.y < upperBound; // dirNegY
	return true;
}
// Builds one indirect draw command for a chunk.
//   indices      - passed through as the first field of the command.
//   vertexOffset - converted to int and passed as the fourth field.
//   chunkID      - passed through as the fifth field.
// The second and third fields are fixed at 1 and 0.
// NOTE(review): the field order presumably follows OpenGL's DrawElementsIndirectCommand
// (count, instanceCount, firstIndex, baseVertex, baseInstance) - confirm against the struct declaration.
DrawElementsIndirectCommand addCommand(uint indices, uint vertexOffset, uint chunkID) {
	int signedVertexOffset = int(vertexOffset);
	DrawElementsIndirectCommand command = DrawElementsIndirectCommand(indices, 1, 0, signedVertexOffset, chunkID);
	return command;
}
// Compute entry point: each invocation converts one chunk ID into indirect draw commands.
// Transparent pass: one command per chunk.
// Opaque pass: four command slots per chunk, filled with merged ranges of faces whose
// normal direction passes isVisible(); unused slots are written as all-zero commands.
// NOTE(review): this span appears to contain both the pre-change and the post-change
// statements of the commit (the direct DrawElementsIndirectCommand(...) writes and the
// function-scope commandIndex look like the replaced lines) - confirm against the real file.
void main() {
// NOTE(review): chunkIDs is indexed before the bounds check below.
uint chunkID = chunkIDs[chunkIDIndex + gl_GlobalInvocationID.x];
uint commandIndex = commandIndexStart + gl_GlobalInvocationID.x;
// Invocations past the end of the chunk ID list have nothing to do.
if(gl_GlobalInvocationID.x >= size) return;
if(isTransparent) {
commands[commandIndex] = DrawElementsIndirectCommand(
chunks[chunkID].vertexCountTransparent,
1,
0,
int(chunks[chunkID].vertexStartTransparent),
chunkID
);
// One command slot per chunk in the transparent pass.
uint commandIndex = commandIndexStart + gl_GlobalInvocationID.x;
commands[commandIndex] = addCommand(chunks[chunkID].vertexCountTransparent, chunks[chunkID].vertexStartTransparent, chunkID);
} else {
commands[commandIndex] = DrawElementsIndirectCommand(
chunks[chunkID].vertexCountOpaque,
1,
0,
int(chunks[chunkID].vertexStartOpaque),
chunkID
);
// Four consecutive command slots are reserved per chunk in the opaque pass.
uint commandIndex = commandIndexStart + gl_GlobalInvocationID.x*4;
uint commandIndexEnd = commandIndex + 4;
// groupFaceOffset: number of faces in front of the current group.
// groupFaceCount: number of faces collected in the current group so far.
uint groupFaceOffset = 0;
uint groupFaceCount = 0;
// Walk the 7 per-normal face counts in order.
for(int i = 0; i < 7; i++) {
uint faceCount = chunks[chunkID].faceCountsByNormalOpaque[i];
// Visible directions extend the current group; empty directions never split a group.
if(isVisible(i, playerPositionInteger - chunks[chunkID].position.xyz, chunks[chunkID].voxelSize) || faceCount == 0) {
groupFaceCount += faceCount;
} else {
// A culled, non-empty direction ends the current group: emit it, then skip past it.
if(groupFaceCount != 0) {
// 6*faces is passed as the index count, 4*faces is added to the vertex start.
commands[commandIndex] = addCommand(6*groupFaceCount, chunks[chunkID].vertexStartOpaque + 4*groupFaceOffset, chunkID);
commandIndex += 1;
groupFaceOffset += groupFaceCount;
groupFaceCount = 0;
}
// Skip the culled faces themselves.
groupFaceOffset += faceCount;
}
}
// Emit the group that was still open when the loop finished.
if(groupFaceCount != 0) {
commands[commandIndex] = addCommand(6*groupFaceCount, chunks[chunkID].vertexStartOpaque + 4*groupFaceOffset, chunkID);
commandIndex += 1;
}
// Zero out the remaining reserved slots.
for(; commandIndex < commandIndexEnd; commandIndex++) {
commands[commandIndex] = DrawElementsIndirectCommand(0, 0, 0, 0, 0);
}
}
}

View File

@ -2015,7 +2015,7 @@ pub fn generateBlockTexture(blockType: u16) Texture {
.visibilityMask = 255,
.voxelSize = 1,
.vertexStartOpaque = undefined,
.vertexCountOpaque = undefined,
.faceCountsByNormalOpaque = undefined,
.vertexStartTransparent = undefined,
.vertexCountTransparent = undefined,
}}, &chunkAllocation);

View File

@ -51,6 +51,7 @@ pub var commandUniforms: struct {
commandIndexStart: c_int,
size: c_int,
isTransparent: c_int,
playerPositionInteger: c_int,
} = undefined;
var vao: c_uint = undefined;
var vbo: c_uint = undefined;
@ -158,16 +159,18 @@ pub fn bindTransparentShaderAndUniforms(projMatrix: Mat4f, ambient: Vec3f, playe
}
pub fn drawChunksIndirect(chunkIDs: []const u32, projMatrix: Mat4f, ambient: Vec3f, playerPos: Vec3d, transparent: bool) void {
const drawCallsEstimate: u31 = @intCast(if(transparent) chunkIDs.len else chunkIDs.len*4);
var chunkIDAllocation: main.graphics.SubAllocation = .{.start = 0, .len = 0};
chunkIDBuffer.uploadData(chunkIDs, &chunkIDAllocation);
defer chunkIDBuffer.free(chunkIDAllocation);
const allocation = commandBuffer.rawAlloc(@intCast(chunkIDs.len));
const allocation = commandBuffer.rawAlloc(drawCallsEstimate);
defer commandBuffer.free(allocation);
commandShader.bind();
c.glUniform1ui(commandUniforms.chunkIDIndex, chunkIDAllocation.start);
c.glUniform1ui(commandUniforms.commandIndexStart, allocation.start);
c.glUniform1ui(commandUniforms.size, @intCast(chunkIDs.len));
c.glUniform1i(commandUniforms.isTransparent, @intFromBool(transparent));
c.glUniform3i(commandUniforms.playerPositionInteger, @intFromFloat(playerPos[0]), @intFromFloat(playerPos[1]), @intFromFloat(playerPos[2]));
c.glDispatchCompute(@intCast(@divFloor(chunkIDs.len + 63, 64)), 1, 1); // TODO: Replace with @divCeil once available
c.glMemoryBarrier(c.GL_SHADER_STORAGE_BARRIER_BIT);
@ -177,7 +180,7 @@ pub fn drawChunksIndirect(chunkIDs: []const u32, projMatrix: Mat4f, ambient: Vec
bindShaderAndUniforms(projMatrix, ambient, playerPos);
}
c.glBindBuffer(c.GL_DRAW_INDIRECT_BUFFER, commandBuffer.ssbo.bufferID);
c.glMultiDrawElementsIndirect(c.GL_TRIANGLES, c.GL_UNSIGNED_INT, @ptrFromInt(allocation.start*@sizeOf(IndirectData)), @intCast(chunkIDs.len), 0);
c.glMultiDrawElementsIndirect(c.GL_TRIANGLES, c.GL_UNSIGNED_INT, @ptrFromInt(allocation.start*@sizeOf(IndirectData)), drawCallsEstimate, 0);
}
pub const FaceData = extern struct {
@ -208,7 +211,7 @@ pub const ChunkData = extern struct {
visibilityMask: i32,
voxelSize: i32,
vertexStartOpaque: u32,
vertexCountOpaque: u32,
faceCountsByNormalOpaque: [7]u32,
vertexStartTransparent: u32,
vertexCountTransparent: u32,
};
@ -232,6 +235,7 @@ const PrimitiveMesh = struct {
mutex: std.Thread.Mutex = .{},
bufferAllocation: graphics.SubAllocation = .{.start = 0, .len = 0},
vertexCount: u31 = 0,
byNormalCount: [7]u32 = .{0} ** 7,
wasChanged: bool = false,
fn deinit(self: *PrimitiveMesh) void {
@ -497,12 +501,30 @@ const PrimitiveMesh = struct {
}
const fullBuffer = faceBuffer.allocateAndMapRange(len, &self.bufferAllocation);
defer faceBuffer.unmapRange(fullBuffer);
@memcpy(fullBuffer[0..self.coreLen], self.completeList[0..self.coreLen]);
var i: usize = self.coreLen;
for(0..6) |n| {
@memcpy(fullBuffer[i..][0..list[n].len], list[n]);
i += list[n].len;
// Sort the faces by normal to allow for backface culling on the GPU:
var i: u32 = 0;
var iStart = i;
const coreList = self.completeList[0..self.coreLen];
for(0..7) |normal| {
for(coreList) |face| {
if(main.models.extraQuadInfos.items[face.blockAndQuad.quadIndex].alignedNormalDirection) |normalDir| {
if(normalDir == normal) {
fullBuffer[i] = face;
i += 1;
}
} else if(normal == 6) {
fullBuffer[i] = face;
i += 1;
}
}
if(normal < 6) {
@memcpy(fullBuffer[i..][0..list[normal ^ 1].len], list[normal ^ 1]);
i += @intCast(list[normal ^ 1].len);
}
self.byNormalCount[normal] = i - iStart;
iStart = i;
}
std.debug.assert(i == fullBuffer.len);
self.vertexCount = @intCast(6*fullBuffer.len);
self.wasChanged = true;
}
@ -1102,7 +1124,7 @@ pub const ChunkMesh = struct {
.voxelSize = self.pos.voxelSize,
.visibilityMask = self.visibilityMask,
.vertexStartOpaque = self.opaqueMesh.bufferAllocation.start*4,
.vertexCountOpaque = self.opaqueMesh.vertexCount,
.faceCountsByNormalOpaque = self.opaqueMesh.byNormalCount,
.vertexStartTransparent = self.transparentMesh.bufferAllocation.start*4,
.vertexCountTransparent = self.transparentMesh.bufferAllocation.len*6,
}}, &self.chunkAllocation);