From 86e08b9f8a3c1c243897ff5eef3d446d1a218a94 Mon Sep 17 00:00:00 2001
From: IntegratedQuantum
Date: Sun, 14 Apr 2024 15:39:41 +0200
Subject: [PATCH] =?UTF-8?q?Use=20palette=20compression=20for=20light=20dat?=
 =?UTF-8?q?a,=20reducing=20memory=20usage=20(of=20light=20data)=20by=20up?=
 =?UTF-8?q?=20to=20100=C3=97.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Progress towards #156
Makes #277 worse
---
 src/renderer/lighting.zig | 138 ++++++++++++++++++++++++++++++--------
 src/utils.zig             |  73 ++++++++++++++++++++
 2 files changed, 184 insertions(+), 27 deletions(-)

diff --git a/src/renderer/lighting.zig b/src/renderer/lighting.zig
index 033d4f35..1ad51532 100644
--- a/src/renderer/lighting.zig
+++ b/src/renderer/lighting.zig
@@ -27,7 +27,11 @@ fn extractColor(in: u32) [3]u8 {
 }
 
 pub const ChannelChunk = struct {
-	data: [chunk.chunkVolume][3]Atomic(u8),
+	data: main.utils.DynamicPackedIntArray(chunk.chunkVolume),
+	palette: [][3]u8,
+	paletteOccupancy: []u32,
+	paletteLength: u32,
+	activePaletteEntries: u32,
 	mutex: std.Thread.Mutex,
 	ch: *chunk.Chunk,
 	isSun: bool,
@@ -39,11 +43,20 @@ pub const ChannelChunk = struct {
 		self.mutex = .{};
 		self.ch = ch;
 		self.isSun = isSun;
-		@memset(&self.data, .{Atomic(u8).init(0)} ** 3);
+		self.data = .{};
+		self.palette = main.globalAllocator.alloc([3]u8, 1);
+		self.palette[0] = .{0, 0, 0};
+		self.paletteOccupancy = main.globalAllocator.alloc(u32, 1);
+		self.paletteOccupancy[0] = chunk.chunkVolume;
+		self.paletteLength = 1;
+		self.activePaletteEntries = 1;
 		return self;
 	}
 
 	pub fn deinit(self: *ChannelChunk) void {
+		self.data.deinit(main.globalAllocator);
+		main.globalAllocator.free(self.palette);
+		main.globalAllocator.free(self.paletteOccupancy);
 		memoryPoolMutex.lock();
 		memoryPool.destroy(self);
 		memoryPoolMutex.unlock();
@@ -69,9 +82,84 @@ pub const ChannelChunk = struct {
 		entries: main.ListUnmanaged(PositionEntry),
 	};
 
-	pub fn getValue(self: *const ChannelChunk, x: i32, y: i32, z: i32) [3]u8 {
+	fn getValueInternal(self: *ChannelChunk, i: usize) [3]u8 {
+		std.debug.assert(!self.mutex.tryLock());
+		return self.palette[self.data.getValue(i)];
+	}
+
+	pub fn getValue(self: *ChannelChunk, x: i32, y: i32, z: i32) [3]u8 {
 		const index = chunk.getIndex(x, y, z);
-		return .{self.data[index][0].load(.unordered), self.data[index][1].load(.unordered), self.data[index][2].load(.unordered)};
+		self.mutex.lock(); // TODO: Locking a mutex this often can be quite slow.
+		defer self.mutex.unlock();
+		return self.getValueInternal(index);
+	}
+
+	fn setValueInternal(self: *ChannelChunk, i: usize, val: [3]u8) void {
+		std.debug.assert(self.paletteLength <= self.palette.len);
+		std.debug.assert(!self.mutex.tryLock());
+		var paletteIndex: u32 = 0;
+		while(paletteIndex < self.paletteLength) : (paletteIndex += 1) { // TODO: There's got to be a faster way to do this. Either using SIMD or a cache or hashmap.
+			if(std.meta.eql(self.palette[paletteIndex], val)) {
+				break;
+			}
+		}
+		if(paletteIndex == self.paletteLength) {
+			if(self.paletteLength == self.palette.len) {
+				self.data.resize(main.globalAllocator, self.data.bitSize + 1);
+				self.palette = main.globalAllocator.realloc(self.palette, @as(usize, 1) << self.data.bitSize);
+				const oldLen = self.paletteOccupancy.len;
+				self.paletteOccupancy = main.globalAllocator.realloc(self.paletteOccupancy, @as(usize, 1) << self.data.bitSize);
+				@memset(self.paletteOccupancy[oldLen..], 0);
+			}
+			self.palette[paletteIndex] = val;
+			self.paletteLength += 1;
+			std.debug.assert(self.paletteLength <= self.palette.len);
+		}
+
+		const previousPaletteIndex = self.data.setAndGetValue(i, paletteIndex);
+		if(self.paletteOccupancy[paletteIndex] == 0) {
+			self.activePaletteEntries += 1;
+		}
+		self.paletteOccupancy[paletteIndex] += 1;
+		self.paletteOccupancy[previousPaletteIndex] -= 1;
+		if(self.paletteOccupancy[previousPaletteIndex] == 0) {
+			self.activePaletteEntries -= 1;
+		}
+	}
+
+	fn optimizeLayout(self: *ChannelChunk) void {
+		std.debug.assert(!self.mutex.tryLock());
+		if(std.math.log2_int_ceil(usize, self.palette.len) == std.math.log2_int_ceil(usize, self.activePaletteEntries)) return;
+
+		var newData = main.utils.DynamicPackedIntArray(chunk.chunkVolume).initCapacity(main.globalAllocator, @intCast(std.math.log2_int_ceil(u32, self.activePaletteEntries)));
+		const paletteMap: []u32 = main.stackAllocator.alloc(u32, self.paletteLength);
+		defer main.stackAllocator.free(paletteMap);
+		{
+			var i: u32 = 0;
+			var len: u32 = self.paletteLength;
+			while(i < len) : (i += 1) outer: {
+				paletteMap[i] = i;
+				if(self.paletteOccupancy[i] == 0) {
+					while(true) {
+						len -= 1;
+						if(self.paletteOccupancy[len] != 0) break;
+						if(len == i) break :outer;
+					}
+					paletteMap[len] = i;
+					self.palette[i] = self.palette[len];
+					self.paletteOccupancy[i] = self.paletteOccupancy[len];
+					self.paletteOccupancy[len] = 0;
+				}
+			}
+		}
+		for(0..chunk.chunkVolume) |i| {
+			newData.setValue(i, paletteMap[self.data.getValue(i)]);
+		}
+		self.data.deinit(main.globalAllocator);
+		self.data = newData;
+		self.paletteLength = self.activePaletteEntries;
+		self.palette = main.globalAllocator.realloc(self.palette, @as(usize, 1) << self.data.bitSize);
+		self.paletteOccupancy = main.globalAllocator.realloc(self.paletteOccupancy, @as(usize, 1) << self.data.bitSize);
 	}
 
 	fn calculateIncomingOcclusion(result: *[3]u8, block: blocks.Block, voxelSize: u31, neighbor: usize) void {
@@ -112,20 +200,14 @@ pub const ChannelChunk = struct {
 		self.mutex.lock();
 		while(lightQueue.dequeue()) |entry| {
 			const index = chunk.getIndex(entry.x, entry.y, entry.z);
-			const oldValue: [3]u8 = .{
-				self.data[index][0].load(.unordered),
-				self.data[index][1].load(.unordered),
-				self.data[index][2].load(.unordered),
-			};
+			const oldValue: [3]u8 = self.getValueInternal(index);
 			const newValue: [3]u8 = .{
 				@max(entry.value[0], oldValue[0]),
 				@max(entry.value[1], oldValue[1]),
 				@max(entry.value[2], oldValue[2]),
 			};
 			if(newValue[0] == oldValue[0] and newValue[1] == oldValue[1] and newValue[2] == oldValue[2]) continue;
-			self.data[index][0].store(newValue[0], .unordered);
-			self.data[index][1].store(newValue[1], .unordered);
-			self.data[index][2].store(newValue[2], .unordered);
+			self.setValueInternal(index, newValue);
 			for(chunk.Neighbors.iterable) |neighbor| {
 				if(neighbor == entry.sourceDir) continue;
 				const nx = entry.x + chunk.Neighbors.relX[neighbor];
@@ -148,6 +230,7 @@ pub const ChannelChunk = struct {
 				if(result.value[0] != 0 or result.value[1] != 0 or result.value[2] != 0) lightQueue.enqueue(result);
 			}
 		}
+		self.optimizeLayout();
 		self.mutex.unlock();
 		if(mesh_storage.getMeshAndIncreaseRefCount(self.ch.pos)) |mesh| {
 			mesh.scheduleLightRefreshAndDecreaseRefCount();
@@ -174,11 +257,7 @@ pub const ChannelChunk = struct {
 		self.mutex.lock();
 		while(lightQueue.dequeue()) |entry| {
 			const index = chunk.getIndex(entry.x, entry.y, entry.z);
-			const oldValue: [3]u8 = .{
-				self.data[index][0].load(.unordered),
-				self.data[index][1].load(.unordered),
-				self.data[index][2].load(.unordered),
-			};
+			const oldValue: [3]u8 = self.getValueInternal(index);
 			var activeValue: @Vector(3, bool) = @bitCast(entry.activeValue);
 			var append: bool = false;
 			if(activeValue[0] and entry.value[0] != oldValue[0]) {
@@ -204,9 +283,11 @@ pub const ChannelChunk = struct {
 				continue;
 			}
 			isFirstIteration = false;
-			if(activeValue[0]) self.data[index][0].store(0, .unordered);
-			if(activeValue[1]) self.data[index][1].store(0, .unordered);
-			if(activeValue[2]) self.data[index][2].store(0, .unordered);
+			var insertValue: [3]u8 = oldValue;
+			if(activeValue[0]) insertValue[0] = 0;
+			if(activeValue[1]) insertValue[1] = 0;
+			if(activeValue[2]) insertValue[2] = 0;
+			self.setValueInternal(index, insertValue);
 			for(chunk.Neighbors.iterable) |neighbor| {
 				if(neighbor == entry.sourceDir) continue;
 				const nx = entry.x + chunk.Neighbors.relX[neighbor];
@@ -307,10 +388,11 @@ pub const ChannelChunk = struct {
 			const neighborMesh = mesh_storage.getNeighborAndIncreaseRefCount(self.ch.pos, self.ch.pos.voxelSize, @intCast(neighbor)) orelse continue;
 			defer neighborMesh.decreaseRefCount();
 			const neighborLightChunk = neighborMesh.lightingData[@intFromBool(self.isSun)];
+			neighborLightChunk.mutex.lock();
+			defer neighborLightChunk.mutex.unlock();
 			const index = chunk.getIndex(x, y, z);
 			const neighborIndex = chunk.getIndex(otherX, otherY, otherZ);
-			var value: [3]u8 = .{neighborLightChunk.data[neighborIndex][0].load(.unordered), neighborLightChunk.data[neighborIndex][1].load(.unordered), neighborLightChunk.data[neighborIndex][2].load(.unordered)};
-
+			var value: [3]u8 = neighborLightChunk.getValueInternal(neighborIndex);
 			if(!self.isSun or neighbor != chunk.Neighbors.dirUp or value[0] != 255 or value[1] != 255 or value[2] != 255) {
 				value[0] -|= 8*|@as(u8, @intCast(self.ch.pos.voxelSize));
 				value[1] -|= 8*|@as(u8, @intCast(self.ch.pos.voxelSize));
@@ -330,10 +412,12 @@ pub const ChannelChunk = struct {
 	pub fn propagateLightsDestructive(self: *ChannelChunk, lights: []const [3]u8) void {
 		var lightQueue = main.utils.CircularBufferQueue(Entry).init(main.stackAllocator, 1 << 12);
 		defer lightQueue.deinit();
+		self.mutex.lock();
 		for(lights) |pos| {
 			const index = chunk.getIndex(pos[0], pos[1], pos[2]);
-			lightQueue.enqueue(.{.x = @intCast(pos[0]), .y = @intCast(pos[1]), .z = @intCast(pos[2]), .value = .{self.data[index][0].load(.unordered), self.data[index][1].load(.unordered), self.data[index][2].load(.unordered)}, .sourceDir = 6, .activeValue = 0b111});
+			lightQueue.enqueue(.{.x = @intCast(pos[0]), .y = @intCast(pos[1]), .z = @intCast(pos[2]), .value = self.getValueInternal(index), .sourceDir = 6, .activeValue = 0b111});
 		}
+		self.mutex.unlock();
 		var constructiveEntries: main.ListUnmanaged(ChunkEntries) = .{};
 		defer constructiveEntries.deinit(main.stackAllocator);
 		constructiveEntries.append(main.stackAllocator, .{
@@ -346,15 +430,15 @@ pub const ChannelChunk = struct {
 			var entryList = entries.entries;
 			defer entryList.deinit(main.stackAllocator);
 			const channelChunk = if(mesh) |_mesh| _mesh.lightingData[@intFromBool(self.isSun)] else self;
+			channelChunk.mutex.lock();
 			for(entryList.items) |entry| {
 				const index = chunk.getIndex(entry.x, entry.y, entry.z);
-				const value = .{channelChunk.data[index][0].load(.unordered), channelChunk.data[index][1].load(.unordered), channelChunk.data[index][2].load(.unordered)};
+				const value = channelChunk.getValueInternal(index);
 				if(value[0] == 0 and value[1] == 0 and value[2] == 0) continue;
-				channelChunk.data[index][0].store(0, .unordered);
-				channelChunk.data[index][1].store(0, .unordered);
-				channelChunk.data[index][2].store(0, .unordered);
+				channelChunk.setValueInternal(index, .{0, 0, 0});
 				lightQueue.enqueue(.{.x = entry.x, .y = entry.y, .z = entry.z, .value = value, .sourceDir = 6, .activeValue = 0b111});
 			}
+			channelChunk.mutex.unlock();
 			channelChunk.propagateDirect(&lightQueue);
 		}
 	}
diff --git a/src/utils.zig b/src/utils.zig
index f095ab52..4364dd3a 100644
--- a/src/utils.zig
+++ b/src/utils.zig
@@ -1130,6 +1130,79 @@ pub const ThreadPool = struct {
 	}
 };
 
+/// A packed array of integers with dynamic bit size.
+/// The bit size can be changed using the `resize` function.
+pub fn DynamicPackedIntArray(size: comptime_int) type {
+	return struct {
+		data: []u8 = &.{},
+		bitSize: u5 = 0,
+
+		const Self = @This();
+
+		pub fn initCapacity(allocator: main.utils.NeverFailingAllocator, bitSize: u5) Self {
+			return .{
+				.data = allocator.alloc(u8, @as(usize, @divFloor(size + 7, 8))*bitSize + @sizeOf(u32)),
+				.bitSize = bitSize,
+			};
+		}
+
+		pub fn deinit(self: *Self, allocator: main.utils.NeverFailingAllocator) void {
+			allocator.free(self.data);
+			self.* = .{};
+		}
+
+		pub fn resize(self: *Self, allocator: main.utils.NeverFailingAllocator, newBitSize: u5) void {
+			if(newBitSize == self.bitSize) return;
+			var newSelf: Self = Self.initCapacity(allocator, newBitSize);
+
+			for(0..size) |i| {
+				newSelf.setValue(i, self.getValue(i));
+			}
+			allocator.free(self.data);
+			self.* = newSelf;
+		}
+
+		pub fn getValue(self: *const Self, i: usize) u32 {
+			std.debug.assert(i < size);
+			if(self.bitSize == 0) return 0;
+			const bitIndex = i*self.bitSize;
+			const byteIndex = bitIndex >> 3;
+			const bitOffset: u5 = @intCast(bitIndex & 7);
+			const bitMask = (@as(u32, 1) << self.bitSize) - 1;
+			const ptr: *align(1) u32 = @ptrCast(&self.data[byteIndex]);
+			return ptr.* >> bitOffset & bitMask;
+		}
+
+		pub fn setValue(self: *Self, i: usize, value: u32) void {
+			std.debug.assert(i < size);
+			if(self.bitSize == 0) return;
+			const bitIndex = i*self.bitSize;
+			const byteIndex = bitIndex >> 3;
+			const bitOffset: u5 = @intCast(bitIndex & 7);
+			const bitMask = (@as(u32, 1) << self.bitSize) - 1;
+			std.debug.assert(value <= bitMask);
+			const ptr: *align(1) u32 = @ptrCast(&self.data[byteIndex]);
+			ptr.* &= ~(bitMask << bitOffset);
+			ptr.* |= value << bitOffset;
+		}
+
+		pub fn setAndGetValue(self: *Self, i: usize, value: u32) u32 {
+			std.debug.assert(i < size);
+			if(self.bitSize == 0) return 0;
+			const bitIndex = i*self.bitSize;
+			const byteIndex = bitIndex >> 3;
+			const bitOffset: u5 = @intCast(bitIndex & 7);
+			const bitMask = (@as(u32, 1) << self.bitSize) - 1;
+			std.debug.assert(value <= bitMask);
+			const ptr: *align(1) u32 = @ptrCast(&self.data[byteIndex]);
+			const result = ptr.* >> bitOffset & bitMask;
+			ptr.* &= ~(bitMask << bitOffset);
+			ptr.* |= value << bitOffset;
+			return result;
+		}
+	};
+}
+
 /// Implements a simple set associative cache with LRU replacement strategy.
 pub fn Cache(comptime T: type, comptime numberOfBuckets: u32, comptime bucketSize: u32, comptime deinitFunction: fn(*T) void) type {
 	const hashMask = numberOfBuckets-1;
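
Editor's note (not part of the patch): some context on the numbers and the data layout. Before this change, every voxel stored three atomic bytes, so light data cost a fixed chunk.chunkVolume*3 bytes per channel chunk (96 KiB, assuming the usual 32^3 chunk volume). With palette compression, a chunk with few distinct light values needs only a few bits per voxel, and none at all while the palette has a single entry, which is where the "up to 100×" in the subject comes from. The sketch below is a simplified, fixed-bit-width version of the same bit arithmetic used by DynamicPackedIntArray; the PackedIntArray/get/set names are invented here for illustration.

// Hypothetical sketch, not part of the patch.
const std = @import("std");

fn PackedIntArray(comptime size: usize, comptime bitSize: u5) type {
	return struct {
		// @sizeOf(u32) spare bytes at the end, so the unaligned 32-bit
		// window used below never reads or writes past the buffer.
		data: [(size*bitSize + 7)/8 + @sizeOf(u32)]u8 = [_]u8{0} ** ((size*bitSize + 7)/8 + @sizeOf(u32)),

		const bitMask: u32 = (@as(u32, 1) << bitSize) - 1;

		pub fn get(self: *const @This(), i: usize) u32 {
			const bitIndex = i*bitSize;
			const bitOffset: u5 = @intCast(bitIndex & 7);
			// Load 32 unaligned bits and shift/mask the entry out of them.
			const ptr: *align(1) const u32 = @ptrCast(&self.data[bitIndex >> 3]);
			return ptr.* >> bitOffset & bitMask;
		}

		pub fn set(self: *@This(), i: usize, value: u32) void {
			std.debug.assert(value <= bitMask);
			const bitIndex = i*bitSize;
			const bitOffset: u5 = @intCast(bitIndex & 7);
			const ptr: *align(1) u32 = @ptrCast(&self.data[bitIndex >> 3]);
			ptr.* &= ~(bitMask << bitOffset); // clear the old entry
			ptr.* |= value << bitOffset; // write the new one
		}
	};
}

test "pack and unpack round-trips" {
	var arr: PackedIntArray(100, 5) = .{};
	arr.set(0, 31);
	arr.set(1, 7);
	arr.set(99, 20);
	try std.testing.expectEqual(@as(u32, 31), arr.get(0));
	try std.testing.expectEqual(@as(u32, 7), arr.get(1));
	try std.testing.expectEqual(@as(u32, 20), arr.get(99));
}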
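
Editor's note (not part of the patch): the TODO in setValueInternal asks for something faster than the linear palette scan, naming SIMD or a cache/hashmap. A rough sketch of the hashmap direction follows; the PaletteIndexMap type and its methods are invented here, and a real version would also have to be rebuilt (or remapped) whenever optimizeLayout moves palette entries around.

// Hypothetical sketch, not part of the patch: O(1) lookup from a light
// value to its palette index.
const std = @import("std");

const PaletteIndexMap = struct {
	map: std.AutoHashMapUnmanaged([3]u8, u32) = .{},

	// Returns the existing index for `val`, or records `nextIndex` as its
	// index if the value was not in the palette yet.
	pub fn getOrPutIndex(self: *PaletteIndexMap, allocator: std.mem.Allocator, val: [3]u8, nextIndex: u32) !u32 {
		const entry = try self.map.getOrPut(allocator, val);
		if(!entry.found_existing) entry.value_ptr.* = nextIndex;
		return entry.value_ptr.*;
	}

	pub fn deinit(self: *PaletteIndexMap, allocator: std.mem.Allocator) void {
		self.map.deinit(allocator);
	}
};

test "palette index lookup" {
	var lookup: PaletteIndexMap = .{};
	defer lookup.deinit(std.testing.allocator);
	// The first sighting of a value claims the next free palette slot...
	try std.testing.expectEqual(@as(u32, 1), try lookup.getOrPutIndex(std.testing.allocator, .{255, 0, 0}, 1));
	// ...and later sightings return that same slot.
	try std.testing.expectEqual(@as(u32, 1), try lookup.getOrPutIndex(std.testing.allocator, .{255, 0, 0}, 2));
}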
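
Editor's note (not part of the patch): optimizeLayout reclaims unused palette slots by moving live entries from the back of the palette into the holes, recording the old-to-new index mapping in paletteMap, and then re-encoding all voxel indices at the smaller bit width. Below is a simplified standalone version of just the compaction step, with invented names; it returns the number of live entries instead of reading activePaletteEntries.

// Hypothetical sketch, not part of the patch.
const std = @import("std");

// Moves live palette entries from the end into unused slots at the front and
// records where every old index ended up. Returns the new palette length.
fn compactPalette(palette: [][3]u8, occupancy: []u32, paletteMap: []u32) u32 {
	var i: u32 = 0;
	var len: u32 = @intCast(palette.len);
	while(i < len) : (i += 1) {
		paletteMap[i] = i;
		if(occupancy[i] == 0) {
			// Skip unused entries at the end, then move the last live one here.
			while(len > i + 1 and occupancy[len - 1] == 0) len -= 1;
			if(len == i + 1) return i; // everything from i onwards is unused
			len -= 1;
			paletteMap[len] = i;
			palette[i] = palette[len];
			occupancy[i] = occupancy[len];
			occupancy[len] = 0;
		}
	}
	return len;
}

test "compaction keeps live entries and remaps indices" {
	var palette = [_][3]u8{.{0, 0, 0}, .{1, 1, 1}, .{2, 2, 2}, .{3, 3, 3}};
	var occupancy = [_]u32{5, 0, 0, 7};
	var paletteMap = [_]u32{0, 0, 0, 0};
	const newLen = compactPalette(&palette, &occupancy, &paletteMap);
	try std.testing.expectEqual(@as(u32, 2), newLen);
	try std.testing.expectEqual([3]u8{3, 3, 3}, palette[1]); // moved from slot 3
	try std.testing.expectEqual(@as(u32, 1), paletteMap[3]); // old index 3 -> new index 1
}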