From 588e12fcdba4d50c38bff854b7ce85e1b40e7a1d Mon Sep 17 00:00:00 2001 From: IntegratedQuantum Date: Sat, 8 Mar 2025 14:01:15 +0100 Subject: [PATCH] Restrict palette compression to power of 2 bitSizes and improve the addressing to use aligned reads. This makes it slightly faster, and because the padding bytes are no longer needed it also slightly reduces memory usage. Fixes #443. Would make #322 easier. --- src/utils.zig | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/utils.zig b/src/utils.zig index e05abbce..db76522c 100644 --- a/src/utils.zig +++ b/src/utils.zig @@ -1316,14 +1316,15 @@ pub const ThreadPool = struct { // MARK: ThreadPool /// The bit size can be changed using the `resize` function. pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIntArray return struct { - data: []u8 = &.{}, + data: []align(64)u32 = &.{}, bitSize: u5 = 0, const Self = @This(); pub fn initCapacity(allocator: main.utils.NeverFailingAllocator, bitSize: u5) Self { + std.debug.assert(bitSize == 0 or bitSize & bitSize - 1 == 0); // Must be a power of 2 return .{ - .data = allocator.alloc(u8, @as(usize, @divFloor(size + 7, 8))*bitSize + @sizeOf(u32)), + .data = allocator.alignedAlloc(u32, 64, @as(usize, @divExact(size, @bitSizeOf(u32)))*bitSize), .bitSize = bitSize, }; } @@ -1334,6 +1335,7 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn } pub fn resize(self: *Self, allocator: main.utils.NeverFailingAllocator, newBitSize: u5) void { + std.debug.assert(newBitSize == 0 or newBitSize & newBitSize - 1 == 0); // Must be a power of 2 if(newBitSize == self.bitSize) return; var newSelf = Self.initCapacity(allocator, newBitSize); @@ -1348,22 +1350,21 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn std.debug.assert(i < size); if(self.bitSize == 0) return 0; const bitIndex = i*self.bitSize; - const byteIndex = bitIndex >> 3; - const bitOffset: 
u5 = @intCast(bitIndex & 7); + const intIndex = bitIndex >> 5; + const bitOffset: u5 = @intCast(bitIndex & 31); const bitMask = (@as(u32, 1) << self.bitSize) - 1; - const ptr: *align(1) u32 = @ptrCast(&self.data[byteIndex]); - return ptr.* >> bitOffset & bitMask; + return self.data[intIndex] >> bitOffset & bitMask; } pub fn setValue(self: *Self, i: usize, value: u32) void { std.debug.assert(i < size); if(self.bitSize == 0) return; const bitIndex = i*self.bitSize; - const byteIndex = bitIndex >> 3; - const bitOffset: u5 = @intCast(bitIndex & 7); + const intIndex = bitIndex >> 5; + const bitOffset: u5 = @intCast(bitIndex & 31); const bitMask = (@as(u32, 1) << self.bitSize) - 1; std.debug.assert(value <= bitMask); - const ptr: *align(1) u32 = @ptrCast(&self.data[byteIndex]); + const ptr: *u32 = &self.data[intIndex]; ptr.* &= ~(bitMask << bitOffset); ptr.* |= value << bitOffset; } @@ -1372,11 +1373,11 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn std.debug.assert(i < size); if(self.bitSize == 0) return 0; const bitIndex = i*self.bitSize; - const byteIndex = bitIndex >> 3; - const bitOffset: u5 = @intCast(bitIndex & 7); + const intIndex = bitIndex >> 5; + const bitOffset: u5 = @intCast(bitIndex & 31); const bitMask = (@as(u32, 1) << self.bitSize) - 1; std.debug.assert(value <= bitMask); - const ptr: *align(1) u32 = @ptrCast(&self.data[byteIndex]); + const ptr: *u32 = &self.data[intIndex]; const result = ptr.* >> bitOffset & bitMask; ptr.* &= ~(bitMask << bitOffset); ptr.* |= value << bitOffset; @@ -1407,9 +1408,11 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale } pub fn initCopy(self: *Self, template: *const Self) void { + const dataDupe = main.globalAllocator.alignedAlloc(u32, 64, template.data.data.len); + @memcpy(dataDupe, template.data.data); self.* = .{ .data = .{ - .data = main.globalAllocator.dupe(u8, template.data.data), + .data = dataDupe, .bitSize = template.data.bitSize, }, .palette = 
main.globalAllocator.dupe(T, template.palette), @@ -1421,7 +1424,7 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale pub fn initCapacity(self: *Self, paletteLength: u32) void { std.debug.assert(paletteLength < 0x80000000 and paletteLength > 0); - const bitSize: u5 = @intCast(std.math.log2_int_ceil(u32, paletteLength)); + const bitSize: u5 = getTargetBitSize(paletteLength); const bufferLength = @as(u32, 1) << bitSize; self.* = .{ .data = DynamicPackedIntArray(size).initCapacity(main.globalAllocator, bitSize), @@ -1440,6 +1443,13 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale main.globalAllocator.free(self.paletteOccupancy); } + fn getTargetBitSize(paletteLength: u32) u5 { + const base: u5 = @intCast(std.math.log2_int_ceil(u32, paletteLength)); + if(base == 0) return 0; + const logLog = std.math.log2_int_ceil(u5, base); + return @as(u5, 1) << logLog; + } + pub fn getValue(self: *const Self, i: usize) T { return self.palette[self.data.getValue(i)]; } @@ -1454,7 +1464,7 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale } if(paletteIndex == self.paletteLength) { if(self.paletteLength == self.palette.len) { - self.data.resize(main.globalAllocator, self.data.bitSize + 1); + self.data.resize(main.globalAllocator, getTargetBitSize(self.paletteLength + 1)); self.palette = main.globalAllocator.realloc(self.palette, @as(usize, 1) << self.data.bitSize); const oldLen = self.paletteOccupancy.len; self.paletteOccupancy = main.globalAllocator.realloc(self.paletteOccupancy, @as(usize, 1) << self.data.bitSize); @@ -1513,9 +1523,10 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale } pub fn optimizeLayout(self: *Self) void { - if(std.math.log2_int_ceil(usize, self.palette.len) == std.math.log2_int_ceil(usize, self.activePaletteEntries)) return; + const newBitSize = getTargetBitSize(@intCast(self.activePaletteEntries)); + if(self.data.bitSize == 
newBitSize) return; - var newData = main.utils.DynamicPackedIntArray(size).initCapacity(main.globalAllocator, @intCast(std.math.log2_int_ceil(u32, self.activePaletteEntries))); + var newData = main.utils.DynamicPackedIntArray(size).initCapacity(main.globalAllocator, newBitSize); const paletteMap: []u32 = main.stackAllocator.alloc(u32, self.paletteLength); defer main.stackAllocator.free(paletteMap); {