Make reading light data thread safe without mutexes (#1727)

After failing in #1725, I decided to use a different approach to making the palette-compressed data atomic: I added a new indirection which can be used to swap the entire content in one atomic operation. This should make the process itself cheaper than what I had implemented before. Related: https://github.com/PixelGuys/Cubyz/issues/1471 https://github.com/PixelGuys/Cubyz/issues/1413 Improves: https://github.com/PixelGuys/Cubyz/issues/277 Remaining work: - [x] Double-check the implementation - [x] Fully remove the ReadWriteLock - [x] Check if this improved meshing performance → yes, it did, by 10-20% - [x] Check if this improved block update speed → yes, it did, by ~25%
2025-08-03 11:17:05 -04:00 · 2025-08-02 14:42:34 +02:00 · 2025-08-02 14:42:34 +02:00 · 90375b871a
commit 90375b871a
parent 047e29fe72
9 changed files with 217 additions and 175 deletions
--- a/src/block_entity.zig
+++ b/src/block_entity.zig
@ -475,10 +475,6 @@ pub const BlockEntityTypes = struct {
 				c.glUniform1i(uniforms.quadIndex, @intFromEnum(quad));
 				const mesh = main.renderer.mesh_storage.getMesh(main.chunk.ChunkPosition.initFromWorldPos(signData.blockPos, 1)) orelse continue :outer;
 				mesh.lightingData[0].lock.lockRead();
 				defer mesh.lightingData[0].lock.unlockRead();
 				mesh.lightingData[1].lock.lockRead();
 				defer mesh.lightingData[1].lock.unlockRead();
 				const light: [4]u32 = main.renderer.chunk_meshing.PrimitiveMesh.getLight(mesh, signData.blockPos -% Vec3i{mesh.pos.wx, mesh.pos.wy, mesh.pos.wz}, 0, quad);
 				c.glUniform4ui(uniforms.lightData, light[0], light[1], light[2], light[3]);
 				c.glUniform3i(uniforms.chunkPos, signData.blockPos[0] & ~main.chunk.chunkMask, signData.blockPos[1] & ~main.chunk.chunkMask, signData.blockPos[2] & ~main.chunk.chunkMask);
--- a/src/chunk.zig
+++ b/src/chunk.zig
@ -286,7 +286,7 @@ pub const Chunk = struct { // MARK: Chunk
 	fn deinitContent(self: *Chunk) void {
 		std.debug.assert(self.blockPosToEntityDataMap.count() == 0);
 		self.blockPosToEntityDataMap.deinit(main.globalAllocator.allocator);
-		self.data.deinit();
+		self.data.deferredDeinit();
 	}
 	pub fn unloadBlockEntities(self: *Chunk, comptime side: main.utils.Side) void {
--- a/src/renderer/chunk_meshing.zig
+++ b/src/renderer/chunk_meshing.zig
@ -381,8 +381,6 @@ pub const PrimitiveMesh = struct { // MARK: PrimitiveMesh
 		self.max = @splat(-std.math.floatMax(f32));
 		self.lock.lockRead();
 		parent.lightingData[0].lock.lockRead();
 		parent.lightingData[1].lock.lockRead();
 		for(self.completeList.getEverything()) |*face| {
 			const light = getLight(parent, .{face.position.x, face.position.y, face.position.z}, face.blockAndQuad.texture, face.blockAndQuad.quadIndex);
 			const result = lightMap.getOrPut(light) catch unreachable;
@ -401,8 +399,6 @@ pub const PrimitiveMesh = struct { // MARK: PrimitiveMesh
 				self.max = @max(self.max, basePos + cornerPos);
 			}
 		}
 		parent.lightingData[0].lock.unlockRead();
 		parent.lightingData[1].lock.unlockRead();
 		self.lock.unlockRead();
 	}
@ -421,10 +417,6 @@ pub const PrimitiveMesh = struct { // MARK: PrimitiveMesh
 			return getValues(parent, wx, wy, wz);
 		}
 		const neighborMesh = mesh_storage.getMesh(.{.wx = wx, .wy = wy, .wz = wz, .voxelSize = parent.pos.voxelSize}) orelse return .{0, 0, 0, 0, 0, 0};
 		neighborMesh.lightingData[0].lock.lockRead();
 		neighborMesh.lightingData[1].lock.lockRead();
 		defer neighborMesh.lightingData[0].lock.unlockRead();
 		defer neighborMesh.lightingData[1].lock.unlockRead();
 		return getValues(neighborMesh, wx, wy, wz);
 	}
@ -807,7 +799,7 @@ pub const ChunkMesh = struct { // MARK: ChunkMesh
 		self.mutex.unlock();
 		self.lightingData[0].propagateLights(lightEmittingBlocks.items, true, lightRefreshList);
 		sunLight: {
-			var allSun: bool = self.chunk.data.paletteLength == 1 and self.chunk.data.palette[0].typ == 0;
+			var allSun: bool = self.chunk.data.palette().len == 1 and self.chunk.data.palette()[0].load(.unordered).typ == 0;
 			var sunStarters: [chunk.chunkSize*chunk.chunkSize][3]u8 = undefined;
 			var index: usize = 0;
 			const lightStartMap = mesh_storage.getLightMapPiece(self.pos.wx, self.pos.wy, self.pos.voxelSize) orelse break :sunLight;
@ -915,10 +907,10 @@ pub const ChunkMesh = struct { // MARK: ChunkMesh
 			hasInternalQuads: bool = false,
 			alwaysViewThrough: bool = false,
 		};
-		var paletteCache = main.stackAllocator.alloc(OcclusionInfo, self.chunk.data.paletteLength);
+		var paletteCache = main.stackAllocator.alloc(OcclusionInfo, self.chunk.data.palette().len);
 		defer main.stackAllocator.free(paletteCache);
-		for(0..self.chunk.data.paletteLength) |i| {
+		for(0..self.chunk.data.palette().len) |i| {
-			const block = self.chunk.data.palette[i];
+			const block = self.chunk.data.palette()[i].load(.unordered);
 			const model = blocks.meshes.model(block).model();
 			var result: OcclusionInfo = .{};
 			if(model.noNeighborsOccluded or block.viewThrough()) {
@ -946,7 +938,7 @@ pub const ChunkMesh = struct { // MARK: ChunkMesh
 				const y: u5 = @intCast(_y);
 				for(0..chunk.chunkSize) |_z| {
 					const z: u5 = @intCast(_z);
-					const paletteId = self.chunk.data.data.getValue(chunk.getIndex(x, y, z));
+					const paletteId = self.chunk.data.impl.raw.data.getValue(chunk.getIndex(x, y, z));
 					const occlusionInfo = paletteCache[paletteId];
 					const setBit = @as(u32, 1) << z;
 					if(occlusionInfo.alwaysViewThrough or (!occlusionInfo.canSeeAllNeighbors and occlusionInfo.canSeeNeighbor == 0)) {
@ -986,7 +978,7 @@ pub const ChunkMesh = struct { // MARK: ChunkMesh
 				const y: u5 = @intCast(_y);
 				for(0..chunk.chunkSize) |_z| {
 					const z: u5 = @intCast(_z);
-					const paletteId = self.chunk.data.data.getValue(chunk.getIndex(x, y, z));
+					const paletteId = self.chunk.data.impl.raw.data.getValue(chunk.getIndex(x, y, z));
 					const occlusionInfo = paletteCache[paletteId];
 					const setBit = @as(u32, 1) << z;
 					if(depthFilteredViewThroughMask[x][y] & setBit != 0) {} else if(occlusionInfo.canSeeAllNeighbors) {
@ -1002,7 +994,7 @@ pub const ChunkMesh = struct { // MARK: ChunkMesh
 						hasFaces[x][y] |= setBit;
 					}
 					if(occlusionInfo.hasInternalQuads) {
-						const block = self.chunk.data.palette[paletteId];
+						const block = self.chunk.data.palette()[paletteId].load(.unordered);
 						if(block.transparent()) {
 							appendInternalQuads(block, x, y, z, false, &transparentCore, main.stackAllocator);
 						} else {
--- a/src/renderer/lighting.zig
+++ b/src/renderer/lighting.zig
@ -17,6 +17,21 @@ pub fn deinit() void {
 	memoryPool.deinit();
 }
 const LightValue = packed struct(u32) {
 	r: u8,
 	g: u8,
 	b: u8,
 	pad: u8 = undefined,
 	fn fromArray(arr: [3]u8) LightValue {
 		return .{.r = arr[0], .g = arr[1], .b = arr[2]};
 	}
 	fn toArray(self: LightValue) [3]u8 {
 		return .{self.r, self.g, self.b};
 	}
 };
 fn extractColor(in: u32) [3]u8 {
 	return .{
 		@truncate(in >> 16),
@ -26,14 +41,14 @@ fn extractColor(in: u32) [3]u8 {
 }
 pub const ChannelChunk = struct {
-	data: main.utils.PaletteCompressedRegion([3]u8, chunk.chunkVolume),
+	data: main.utils.PaletteCompressedRegion(LightValue, chunk.chunkVolume),
-	lock: main.utils.ReadWriteLock,
+	mutex: std.Thread.Mutex,
 	ch: *chunk.Chunk,
 	isSun: bool,
 	pub fn init(ch: *chunk.Chunk, isSun: bool) *ChannelChunk {
 		const self = memoryPool.create();
-		self.lock = .{};
+		self.mutex = .{};
 		self.ch = ch;
 		self.isSun = isSun;
 		self.data.init();
@ -41,7 +56,7 @@ pub const ChannelChunk = struct {
 	}
 	pub fn deinit(self: *ChannelChunk) void {
-		self.data.deinit();
+		self.data.deferredDeinit();
 		memoryPool.destroy(self);
 	}
@ -66,9 +81,8 @@ pub const ChannelChunk = struct {
 	};
 	pub fn getValue(self: *ChannelChunk, x: i32, y: i32, z: i32) [3]u8 {
 		self.lock.assertLockedRead();
 		const index = chunk.getIndex(x, y, z);
-		return self.data.getValue(index);
+		return self.data.getValue(index).toArray();
 	}
 	fn calculateIncomingOcclusion(result: *[3]u8, block: blocks.Block, voxelSize: u31, neighbor: chunk.Neighbor) void {
@ -106,17 +120,17 @@ pub const ChannelChunk = struct {
 			}
 		}
-		self.lock.lockWrite();
+		self.mutex.lock();
 		while(lightQueue.popFront()) |entry| {
 			const index = chunk.getIndex(entry.x, entry.y, entry.z);
-			const oldValue: [3]u8 = self.data.getValue(index);
+			const oldValue: [3]u8 = self.data.getValue(index).toArray();
 			const newValue: [3]u8 = .{
 				@max(entry.value[0], oldValue[0]),
 				@max(entry.value[1], oldValue[1]),
 				@max(entry.value[2], oldValue[2]),
 			};
 			if(newValue[0] == oldValue[0] and newValue[1] == oldValue[1] and newValue[2] == oldValue[2]) continue;
-			self.data.setValue(index, newValue);
+			self.data.setValue(index, .fromArray(newValue));
 			for(chunk.Neighbor.iterable) |neighbor| {
 				if(neighbor.toInt() == entry.sourceDir) continue;
 				const nx = entry.x + neighbor.relX();
@ -140,7 +154,7 @@ pub const ChannelChunk = struct {
 			}
 		}
 		self.data.optimizeLayout();
-		self.lock.unlockWrite();
+		self.mutex.unlock();
 		self.addSelfToLightRefreshList(lightRefreshList);
 		for(chunk.Neighbor.iterable) |neighbor| {
@ -172,10 +186,10 @@ pub const ChannelChunk = struct {
 		}
 		var isFirstIteration: bool = isFirstBlock;
-		self.lock.lockWrite();
+		self.mutex.lock();
 		while(lightQueue.popFront()) |entry| {
 			const index = chunk.getIndex(entry.x, entry.y, entry.z);
-			const oldValue: [3]u8 = self.data.getValue(index);
+			const oldValue: [3]u8 = self.data.getValue(index).toArray();
 			var activeValue: @Vector(3, bool) = @bitCast(entry.activeValue);
 			var append: bool = false;
 			if(activeValue[0] and entry.value[0] != oldValue[0]) {
@ -209,7 +223,7 @@ pub const ChannelChunk = struct {
 			if(activeValue[0]) insertValue[0] = 0;
 			if(activeValue[1]) insertValue[1] = 0;
 			if(activeValue[2]) insertValue[2] = 0;
-			self.data.setValue(index, insertValue);
+			self.data.setValue(index, .fromArray(insertValue));
 			for(chunk.Neighbor.iterable) |neighbor| {
 				if(neighbor.toInt() == entry.sourceDir) continue;
 				const nx = entry.x + neighbor.relX();
@ -231,7 +245,7 @@ pub const ChannelChunk = struct {
 				lightQueue.pushBack(result);
 			}
 		}
-		self.lock.unlockWrite();
+		self.mutex.unlock();
 		self.addSelfToLightRefreshList(lightRefreshList);
 		for(chunk.Neighbor.iterable) |neighbor| {
@ -307,11 +321,9 @@ pub const ChannelChunk = struct {
 						const otherZ = z +% neighbor.relZ() & chunk.chunkMask;
 						const neighborMesh = mesh_storage.getNeighbor(self.ch.pos, self.ch.pos.voxelSize, neighbor) orelse continue;
 						const neighborLightChunk = neighborMesh.lightingData[@intFromBool(self.isSun)];
 						neighborLightChunk.lock.lockRead();
 						defer neighborLightChunk.lock.unlockRead();
 						const index = chunk.getIndex(x, y, z);
 						const neighborIndex = chunk.getIndex(otherX, otherY, otherZ);
-						var value: [3]u8 = neighborLightChunk.data.getValue(neighborIndex);
+						var value: [3]u8 = neighborLightChunk.data.getValue(neighborIndex).toArray();
 						if(!self.isSun or neighbor != .dirUp or value[0] != 255 or value[1] != 255 or value[2] != 255) {
 							value[0] -|= 8*|@as(u8, @intCast(self.ch.pos.voxelSize));
 							value[1] -|= 8*|@as(u8, @intCast(self.ch.pos.voxelSize));
@ -330,13 +342,9 @@ pub const ChannelChunk = struct {
 	pub fn propagateUniformSun(self: *ChannelChunk, lightRefreshList: *main.List(chunk.ChunkPosition)) void {
 		std.debug.assert(self.isSun);
-		self.lock.lockWrite();
+		self.mutex.lock();
-		if(self.data.paletteLength != 1) {
+		self.data.fillUniform(.fromArray(.{255, 255, 255}));
-			self.data.deinit();
+		self.mutex.unlock();
 			self.data.init();
 		}
 		self.data.palette[0] = .{255, 255, 255};
 		self.lock.unlockWrite();
 		const val = 255 -| 8*|@as(u8, @intCast(self.ch.pos.voxelSize));
 		var lightQueue = main.utils.CircularBufferQueue(Entry).init(main.stackAllocator, 1 << 12);
 		defer lightQueue.deinit();
@ -378,12 +386,10 @@ pub const ChannelChunk = struct {
 	pub fn propagateLightsDestructive(self: *ChannelChunk, lights: []const [3]u8, lightRefreshList: *main.List(chunk.ChunkPosition)) void {
 		var lightQueue = main.utils.CircularBufferQueue(Entry).init(main.stackAllocator, 1 << 12);
 		defer lightQueue.deinit();
 		self.lock.lockRead();
 		for(lights) |pos| {
 			const index = chunk.getIndex(pos[0], pos[1], pos[2]);
-			lightQueue.pushBack(.{.x = @intCast(pos[0]), .y = @intCast(pos[1]), .z = @intCast(pos[2]), .value = self.data.getValue(index), .sourceDir = 6, .activeValue = 0b111});
+			lightQueue.pushBack(.{.x = @intCast(pos[0]), .y = @intCast(pos[1]), .z = @intCast(pos[2]), .value = self.data.getValue(index).toArray(), .sourceDir = 6, .activeValue = 0b111});
 		}
 		self.lock.unlockRead();
 		var constructiveEntries: main.ListUnmanaged(ChunkEntries) = .{};
 		defer constructiveEntries.deinit(main.stackAllocator);
 		constructiveEntries.append(main.stackAllocator, .{
@ -395,10 +401,10 @@ pub const ChannelChunk = struct {
 			var entryList = entries.entries;
 			defer entryList.deinit(main.stackAllocator);
 			const channelChunk = if(mesh) |_mesh| _mesh.lightingData[@intFromBool(self.isSun)] else self;
-			channelChunk.lock.lockWrite();
+			channelChunk.mutex.lock();
 			for(entryList.items) |entry| {
 				const index = chunk.getIndex(entry.x, entry.y, entry.z);
-				var value = channelChunk.data.getValue(index);
+				var value = channelChunk.data.getValue(index).toArray();
 				const light = if(self.isSun) .{0, 0, 0} else extractColor(channelChunk.ch.data.getValue(index).light());
 				value = .{
 					@max(value[0], light[0]),
@ -406,10 +412,10 @@ pub const ChannelChunk = struct {
 					@max(value[2], light[2]),
 				};
 				if(value[0] == 0 and value[1] == 0 and value[2] == 0) continue;
-				channelChunk.data.setValue(index, .{0, 0, 0});
+				channelChunk.data.setValue(index, .fromArray(.{0, 0, 0}));
 				lightQueue.pushBack(.{.x = entry.x, .y = entry.y, .z = entry.z, .value = value, .sourceDir = 6, .activeValue = 0b111});
 			}
-			channelChunk.lock.unlockWrite();
+			channelChunk.mutex.unlock();
 			channelChunk.propagateDirect(&lightQueue, lightRefreshList);
 		}
 	}
--- a/src/renderer/mesh_storage.zig
+++ b/src/renderer/mesh_storage.zig
@ -197,10 +197,6 @@ pub fn getLight(wx: i32, wy: i32, wz: i32) ?[6]u8 {
 	const x = (wx >> mesh.chunk.voxelSizeShift) & chunk.chunkMask;
 	const y = (wy >> mesh.chunk.voxelSizeShift) & chunk.chunkMask;
 	const z = (wz >> mesh.chunk.voxelSizeShift) & chunk.chunkMask;
 	mesh.lightingData[0].lock.lockRead();
 	defer mesh.lightingData[0].lock.unlockRead();
 	mesh.lightingData[1].lock.lockRead();
 	defer mesh.lightingData[1].lock.unlockRead();
 	return mesh.lightingData[1].getValue(x, y, z) ++ mesh.lightingData[0].getValue(x, y, z);
 }
--- a/src/server/storage.zig
+++ b/src/server/storage.zig
@ -282,18 +282,18 @@ pub const ChunkCompression = struct { // MARK: ChunkCompression
 	}
 	fn compressBlockData(ch: *chunk.Chunk, allowLossy: bool, writer: *BinaryWriter) void {
-		if(ch.data.paletteLength == 1) {
+		if(ch.data.palette().len == 1) {
 			writer.writeEnum(ChunkCompressionAlgo, .uniform);
-			writer.writeInt(u32, ch.data.palette[0].toInt());
+			writer.writeInt(u32, ch.data.palette()[0].load(.unordered).toInt());
 			return;
 		}
-		if(ch.data.paletteLength < 256) {
+		if(ch.data.palette().len < 256) {
 			var uncompressedData: [chunk.chunkVolume]u8 = undefined;
 			var solidMask: [chunk.chunkSize*chunk.chunkSize]u32 = undefined;
 			for(0..chunk.chunkVolume) |i| {
-				uncompressedData[i] = @intCast(ch.data.data.getValue(i));
+				uncompressedData[i] = @intCast(ch.data.impl.raw.data.getValue(i));
 				if(allowLossy) {
-					const block = ch.data.palette[uncompressedData[i]];
+					const block = ch.data.palette()[uncompressedData[i]].load(.unordered);
 					const model = main.blocks.meshes.model(block).model();
 					const occluder = model.allNeighborsOccluded and !block.viewThrough();
 					if(occluder) {
@ -323,10 +323,10 @@ pub const ChunkCompression = struct { // MARK: ChunkCompression
 			defer main.stackAllocator.free(compressedData);
 			writer.writeEnum(ChunkCompressionAlgo, .deflate_with_8bit_palette);
-			writer.writeInt(u8, @intCast(ch.data.paletteLength));
+			writer.writeInt(u8, @intCast(ch.data.palette().len));
-			for(0..ch.data.paletteLength) |i| {
+			for(0..ch.data.palette().len) |i| {
-				writer.writeInt(u32, ch.data.palette[i].toInt());
+				writer.writeInt(u32, ch.data.palette()[i].load(.unordered).toInt());
 			}
 			writer.writeVarInt(usize, compressedData.len);
 			writer.writeSlice(compressedData);
@ -347,7 +347,7 @@ pub const ChunkCompression = struct { // MARK: ChunkCompression
 	}
 	fn decompressBlockData(ch: *chunk.Chunk, reader: *BinaryReader) !void {
-		std.debug.assert(ch.data.paletteLength == 1);
+		std.debug.assert(ch.data.palette().len == 1);
 		const compressionAlgorithm = try reader.readEnum(ChunkCompressionAlgo);
@ -371,11 +371,11 @@ pub const ChunkCompression = struct { // MARK: ChunkCompression
 			.deflate_with_8bit_palette, .deflate_with_8bit_palette_no_block_entities => {
 				const paletteLength = try reader.readInt(u8);
-				ch.data.deinit();
+				ch.data.deferredDeinit();
 				ch.data.initCapacity(paletteLength);
 				for(0..paletteLength) |i| {
-					ch.data.palette[i] = main.blocks.Block.fromInt(try reader.readInt(u32));
+					ch.data.palette()[i] = .init(main.blocks.Block.fromInt(try reader.readInt(u32)));
 				}
 				const decompressedData = main.stackAllocator.alloc(u8, chunk.chunkVolume);
@ -392,7 +392,7 @@ pub const ChunkCompression = struct { // MARK: ChunkCompression
 				}
 			},
 			.uniform => {
-				ch.data.palette[0] = main.blocks.Block.fromInt(try reader.readInt(u32));
+				ch.data.palette()[0] = .init(main.blocks.Block.fromInt(try reader.readInt(u32)));
 			},
 		}
 	}
--- a/src/server/terrain/chunkgen/TerrainGenerator.zig
+++ b/src/server/terrain/chunkgen/TerrainGenerator.zig
@ -45,15 +45,11 @@ pub fn generate(worldSeed: u64, chunk: *main.chunk.ServerChunk, caveMap: CaveMap
 			}
 		}
 		if(minHeight > chunk.super.pos.wz +| chunk.super.width) {
-			chunk.super.data.deinit();
+			chunk.super.data.fillUniform(stone);
 			chunk.super.data.init();
 			chunk.super.data.palette[0] = stone;
 			return;
 		}
 		if(maxHeight < chunk.super.pos.wz) {
-			chunk.super.data.deinit();
+			chunk.super.data.fillUniform(air);
 			chunk.super.data.init();
 			chunk.super.data.palette[0] = air;
 			return;
 		}
 	}
--- a/src/server/world.zig
+++ b/src/server/world.zig
@ -328,8 +328,8 @@ const ChunkManager = struct { // MARK: ChunkManager
 			generator.generate(server.world.?.seed ^ generator.generatorSeed, ch, caveMap, biomeMap);
 		}
 		if(pos.voxelSize != 1) { // Generate LOD replacements
-			for(ch.super.data.palette[0..ch.super.data.paletteLength]) |*block| {
+			for(ch.super.data.palette()) |*block| {
-				block.typ = block.lodReplacement();
+				block.store(.{.typ = block.load(.unordered).lodReplacement(), .data = block.load(.unordered).data}, .unordered);
 			}
 		}
 		return ch;
--- a/src/utils.zig
+++ b/src/utils.zig
@ -989,7 +989,7 @@ pub fn deinitDynamicIntArrayStorage() void {
 pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIntArray
 	std.debug.assert(std.math.isPowerOfTwo(size));
 	return struct {
-		data: []align(64) u32 = &.{},
+		data: []align(64) Atomic(u32) = &.{},
 		bitSize: u5 = 0,
 		const Self = @This();
@ -997,12 +997,12 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn
 		pub fn initCapacity(bitSize: u5) Self {
 			std.debug.assert(bitSize == 0 or bitSize & bitSize - 1 == 0); // Must be a power of 2
 			return .{
-				.data = dynamicIntArrayAllocator.allocator().alignedAlloc(u32, .@"64", @as(usize, @divExact(size, @bitSizeOf(u32)))*bitSize),
+				.data = dynamicIntArrayAllocator.allocator().alignedAlloc(Atomic(u32), .@"64", @as(usize, @divExact(size, @bitSizeOf(u32)))*bitSize),
 				.bitSize = bitSize,
 			};
 		}
-		pub fn deinit(self: *Self) void {
+		fn deinit(self: *Self) void {
 			dynamicIntArrayAllocator.allocator().free(self.data);
 			self.* = .{};
 		}
@ -1016,23 +1016,21 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn
 			return result;
 		}
-		pub fn resizeOnce(self: *Self) void {
+		pub fn resizeOnceFrom(self: *Self, other: *const Self) void {
-			const newBitSize = if(self.bitSize != 0) self.bitSize*2 else 1;
+			const newBitSize = if(other.bitSize != 0) other.bitSize*2 else 1;
-			var newSelf = Self.initCapacity(newBitSize);
+			std.debug.assert(self.bitSize == newBitSize);
-			switch(self.bitSize) {
+			switch(other.bitSize) {
-				0 => @memset(newSelf.data, 0),
+				0 => @memset(self.data, .init(0)),
 				inline 1, 2, 4, 8 => |bits| {
-					for(0..self.data.len) |i| {
+					for(0..other.data.len) |i| {
-						const oldVal = self.data[i];
+						const oldVal = other.data[i].load(.unordered);
-						newSelf.data[2*i] = bitInterleave(bits, oldVal & 0xffff);
+						self.data[2*i].store(bitInterleave(bits, oldVal & 0xffff), .unordered);
-						newSelf.data[2*i + 1] = bitInterleave(bits, oldVal >> 16);
+						self.data[2*i + 1].store(bitInterleave(bits, oldVal >> 16), .unordered);
 					}
 				},
 				else => unreachable,
 			}
 			dynamicIntArrayAllocator.allocator().free(self.data);
 			self.* = newSelf;
 		}
 		pub fn getValue(self: *const Self, i: usize) u32 {
@ -1042,7 +1040,7 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn
 			const intIndex = bitIndex >> 5;
 			const bitOffset: u5 = @intCast(bitIndex & 31);
 			const bitMask = (@as(u32, 1) << self.bitSize) - 1;
-			return self.data[intIndex] >> bitOffset & bitMask;
+			return self.data[intIndex].load(.unordered) >> bitOffset & bitMask;
 		}
 		pub fn setValue(self: *Self, i: usize, value: u32) void {
@ -1053,9 +1051,9 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn
 			const bitOffset: u5 = @intCast(bitIndex & 31);
 			const bitMask = (@as(u32, 1) << self.bitSize) - 1;
 			std.debug.assert(value <= bitMask);
-			const ptr: *u32 = &self.data[intIndex];
+			const ptr: *Atomic(u32) = &self.data[intIndex];
-			ptr.* &= ~(bitMask << bitOffset);
+			const newValue = (ptr.load(.unordered) & ~(bitMask << bitOffset)) | value << bitOffset;
-			ptr.* |= value << bitOffset;
+			ptr.store(newValue, .unordered);
 		}
 		pub fn setAndGetValue(self: *Self, i: usize, value: u32) u32 {
@ -1066,45 +1064,57 @@ pub fn DynamicPackedIntArray(size: comptime_int) type { // MARK: DynamicPackedIn
 			const bitOffset: u5 = @intCast(bitIndex & 31);
 			const bitMask = (@as(u32, 1) << self.bitSize) - 1;
 			std.debug.assert(value <= bitMask);
-			const ptr: *u32 = &self.data[intIndex];
+			const ptr: *Atomic(u32) = &self.data[intIndex];
-			const result = ptr.* >> bitOffset & bitMask;
+			const oldValue = ptr.load(.unordered);
-			ptr.* &= ~(bitMask << bitOffset);
+			const result = oldValue >> bitOffset & bitMask;
-			ptr.* |= value << bitOffset;
+			const newValue = (oldValue & ~(bitMask << bitOffset)) | value << bitOffset;
 			ptr.store(newValue, .unordered);
 			return result;
 		}
 	};
 }
 pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: PaletteCompressedRegion
-	return struct {
+	const Impl = struct {
 		data: DynamicPackedIntArray(size) = .{},
-		palette: []T,
+		palette: []Atomic(T),
 		paletteOccupancy: []u32,
 		paletteLength: u32,
 		activePaletteEntries: u32,
-
+	};
 	return struct {
 		impl: Atomic(*Impl),
 		const Self = @This();
 		pub fn init(self: *Self) void {
 			const impl = main.globalAllocator.create(Impl);
 			self.* = .{
-				.palette = main.globalAllocator.alloc(T, 1),
+				.impl = .init(impl),
 			};
 			impl.* = .{
 				.palette = main.globalAllocator.alloc(Atomic(T), 1),
 				.paletteOccupancy = main.globalAllocator.alloc(u32, 1),
 				.paletteLength = 1,
 				.activePaletteEntries = 1,
 			};
-			self.palette[0] = std.mem.zeroes(T);
+			impl.palette[0] = .init(std.mem.zeroes(T));
-			self.paletteOccupancy[0] = size;
+			impl.paletteOccupancy[0] = size;
 		}
 		pub fn initCopy(self: *Self, template: *const Self) void {
-			const dataDupe = DynamicPackedIntArray(size).initCapacity(template.data.bitSize);
+			const impl = main.globalAllocator.create(Impl);
-			@memcpy(dataDupe.data, template.data.data);
+			const templateImpl = template.impl.load(.acquire);
 			const dataDupe = DynamicPackedIntArray(size).initCapacity(templateImpl.data.bitSize);
 			@memcpy(dataDupe.data, templateImpl.data.data);
 			self.* = .{
 				.impl = .init(impl),
 			};
 			impl.* = .{
 				.data = dataDupe,
-				.palette = main.globalAllocator.dupe(T, template.palette),
+				.palette = main.globalAllocator.dupe(Atomic(T), templateImpl.palette),
-				.paletteOccupancy = main.globalAllocator.dupe(u32, template.paletteOccupancy),
+				.paletteOccupancy = main.globalAllocator.dupe(u32, templateImpl.paletteOccupancy),
-				.paletteLength = template.paletteLength,
+				.paletteLength = templateImpl.paletteLength,
-				.activePaletteEntries = template.activePaletteEntries,
+				.activePaletteEntries = templateImpl.activePaletteEntries,
 			};
 		}
@ -1112,21 +1122,32 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale
 			std.debug.assert(paletteLength < 0x80000000 and paletteLength > 0);
 			const bitSize: u5 = getTargetBitSize(paletteLength);
 			const bufferLength = @as(u32, 1) << bitSize;
 			const impl = main.globalAllocator.create(Impl);
 			self.* = .{
 				.impl = .init(impl),
 			};
 			impl.* = .{
 				.data = DynamicPackedIntArray(size).initCapacity(bitSize),
-				.palette = main.globalAllocator.alloc(T, bufferLength),
+				.palette = main.globalAllocator.alloc(Atomic(T), bufferLength),
 				.paletteOccupancy = main.globalAllocator.alloc(u32, bufferLength),
 				.paletteLength = paletteLength,
 				.activePaletteEntries = 1,
 			};
-			self.palette[0] = std.mem.zeroes(T);
+			impl.palette[0] = .init(std.mem.zeroes(T));
-			self.paletteOccupancy[0] = size;
+			impl.paletteOccupancy[0] = size;
 			@memset(impl.paletteOccupancy[1..], 0);
 			@memset(impl.data.data, .init(0));
 		}
-		pub fn deinit(self: *Self) void {
+		fn privateDeinit(impl: *Impl, _: usize) void {
-			self.data.deinit();
+			impl.data.deinit();
-			main.globalAllocator.free(self.palette);
+			main.globalAllocator.free(impl.palette);
-			main.globalAllocator.free(self.paletteOccupancy);
+			main.globalAllocator.free(impl.paletteOccupancy);
 			main.globalAllocator.destroy(impl);
 		}
 		pub fn deferredDeinit(self: *Self) void {
 			main.heap.GarbageCollection.deferredFree(.{.ptr = self.impl.raw, .freeFunction = main.utils.castFunctionSelfToAnyopaque(privateDeinit)});
 		}
 		fn getTargetBitSize(paletteLength: u32) u5 {
@ -1137,57 +1158,87 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale
 		}
 		pub fn getValue(self: *const Self, i: usize) T {
-			return self.palette[self.data.getValue(i)];
+			const impl = self.impl.load(.acquire);
 			return impl.palette[impl.data.getValue(i)].load(.unordered);
 		}
 		pub fn palette(self: *const Self) []Atomic(T) {
 			const impl = self.impl.raw;
 			return impl.palette[0..impl.paletteLength];
 		}
 		pub fn fillUniform(self: *Self, value: T) void {
 			const impl = self.impl.raw;
 			if(impl.paletteLength == 1) {
 				impl.palette[0].store(value, .unordered);
 				return;
 			}
 			var newSelf: Self = undefined;
 			newSelf.init();
 			newSelf.impl.raw.palette[0] = .init(value);
 			newSelf.impl.raw = self.impl.swap(newSelf.impl.raw, .release);
 			newSelf.deferredDeinit();
 		}
 		fn getOrInsertPaletteIndex(noalias self: *Self, val: T) u32 {
-			std.debug.assert(self.paletteLength <= self.palette.len);
+			var impl = self.impl.raw;
 			std.debug.assert(impl.paletteLength <= impl.palette.len);
 			var paletteIndex: u32 = 0;
-			while(paletteIndex < self.paletteLength) : (paletteIndex += 1) { // TODO: There got to be a faster way to do this. Either using SIMD or using a cache or hashmap.
+			while(paletteIndex < impl.paletteLength) : (paletteIndex += 1) {
-				if(std.meta.eql(self.palette[paletteIndex], val)) {
+				if(std.meta.eql(impl.palette[paletteIndex].load(.unordered), val)) {
 					break;
 				}
 			}
-			if(paletteIndex == self.paletteLength) {
+			if(paletteIndex == impl.paletteLength) {
-				if(self.paletteLength == self.palette.len) {
+				if(impl.paletteLength == impl.palette.len) {
-					self.data.resizeOnce();
+					var newSelf: Self = undefined;
-					self.palette = main.globalAllocator.realloc(self.palette, @as(usize, 1) << self.data.bitSize);
+					newSelf.initCapacity(impl.paletteLength*2);
-					const oldLen = self.paletteOccupancy.len;
+					const newImpl = newSelf.impl.raw;
-					self.paletteOccupancy = main.globalAllocator.realloc(self.paletteOccupancy, @as(usize, 1) << self.data.bitSize);
+					// TODO: Resize stuff
-					@memset(self.paletteOccupancy[oldLen..], 0);
+					newImpl.data.resizeOnceFrom(&impl.data);
 					@memcpy(newImpl.palette[0..impl.palette.len], impl.palette);
 					@memcpy(newImpl.paletteOccupancy[0..impl.paletteOccupancy.len], impl.paletteOccupancy);
 					@memset(newImpl.paletteOccupancy[impl.paletteOccupancy.len..], 0);
 					newImpl.activePaletteEntries = impl.activePaletteEntries;
 					newImpl.paletteLength = impl.paletteLength;
 					newSelf.impl.raw = self.impl.swap(newImpl, .release);
 					newSelf.deferredDeinit();
 					impl = newImpl;
 				}
-				self.palette[paletteIndex] = val;
+				impl.palette[paletteIndex].store(val, .unordered);
-				self.paletteLength += 1;
+				impl.paletteLength += 1;
-				std.debug.assert(self.paletteLength <= self.palette.len);
+				std.debug.assert(impl.paletteLength <= impl.palette.len);
 			}
 			return paletteIndex;
 		}
 		pub fn setRawValue(noalias self: *Self, i: usize, paletteIndex: u32) void {
-			const previousPaletteIndex = self.data.setAndGetValue(i, paletteIndex);
+			const impl = self.impl.raw;
 			const previousPaletteIndex = impl.data.setAndGetValue(i, paletteIndex);
 			if(previousPaletteIndex != paletteIndex) {
-				if(self.paletteOccupancy[paletteIndex] == 0) {
+				if(impl.paletteOccupancy[paletteIndex] == 0) {
-					self.activePaletteEntries += 1;
+					impl.activePaletteEntries += 1;
 				}
-				self.paletteOccupancy[paletteIndex] += 1;
+				impl.paletteOccupancy[paletteIndex] += 1;
-				self.paletteOccupancy[previousPaletteIndex] -= 1;
+				impl.paletteOccupancy[previousPaletteIndex] -= 1;
-				if(self.paletteOccupancy[previousPaletteIndex] == 0) {
+				if(impl.paletteOccupancy[previousPaletteIndex] == 0) {
-					self.activePaletteEntries -= 1;
+					impl.activePaletteEntries -= 1;
 				}
 			}
 		}
 		pub fn setValue(noalias self: *Self, i: usize, val: T) void {
 			const paletteIndex = self.getOrInsertPaletteIndex(val);
-			const previousPaletteIndex = self.data.setAndGetValue(i, paletteIndex);
+			const impl = self.impl.raw;
 			const previousPaletteIndex = impl.data.setAndGetValue(i, paletteIndex);
 			if(previousPaletteIndex != paletteIndex) {
-				if(self.paletteOccupancy[paletteIndex] == 0) {
+				if(impl.paletteOccupancy[paletteIndex] == 0) {
-					self.activePaletteEntries += 1;
+					impl.activePaletteEntries += 1;
 				}
-				self.paletteOccupancy[paletteIndex] += 1;
+				impl.paletteOccupancy[paletteIndex] += 1;
-				self.paletteOccupancy[previousPaletteIndex] -= 1;
+				impl.paletteOccupancy[previousPaletteIndex] -= 1;
-				if(self.paletteOccupancy[previousPaletteIndex] == 0) {
+				if(impl.paletteOccupancy[previousPaletteIndex] == 0) {
-					self.activePaletteEntries -= 1;
+					impl.activePaletteEntries -= 1;
 				}
 			}
 		}
@ -1195,52 +1246,57 @@ pub fn PaletteCompressedRegion(T: type, size: comptime_int) type { // MARK: Pale
 		pub fn setValueInColumn(noalias self: *Self, startIndex: usize, endIndex: usize, val: T) void {
 			std.debug.assert(startIndex < endIndex);
 			const paletteIndex = self.getOrInsertPaletteIndex(val);
 			const impl = self.impl.raw;
 			for(startIndex..endIndex) |i| {
-				const previousPaletteIndex = self.data.setAndGetValue(i, paletteIndex);
+				const previousPaletteIndex = impl.data.setAndGetValue(i, paletteIndex);
-				self.paletteOccupancy[previousPaletteIndex] -= 1;
+				impl.paletteOccupancy[previousPaletteIndex] -= 1;
-				if(self.paletteOccupancy[previousPaletteIndex] == 0) {
+				if(impl.paletteOccupancy[previousPaletteIndex] == 0) {
-					self.activePaletteEntries -= 1;
+					impl.activePaletteEntries -= 1;
 				}
 			}
-			if(self.paletteOccupancy[paletteIndex] == 0) {
+			if(impl.paletteOccupancy[paletteIndex] == 0) {
-				self.activePaletteEntries += 1;
+				impl.activePaletteEntries += 1;
 			}
-			self.paletteOccupancy[paletteIndex] += @intCast(endIndex - startIndex);
+			impl.paletteOccupancy[paletteIndex] += @intCast(endIndex - startIndex);
 		}
 		pub fn optimizeLayout(self: *Self) void {
-			const newBitSize = getTargetBitSize(@intCast(self.activePaletteEntries));
+			const impl = self.impl.raw;
-			if(self.data.bitSize == newBitSize) return;
+			const newBitSize = getTargetBitSize(@intCast(impl.activePaletteEntries));
 			if(impl.data.bitSize == newBitSize) return;
-			var newData = main.utils.DynamicPackedIntArray(size).initCapacity(newBitSize);
+			var newSelf: Self = undefined;
-			const paletteMap: []u32 = main.stackAllocator.alloc(u32, self.paletteLength);
+			newSelf.initCapacity(impl.activePaletteEntries);
 			const newImpl = newSelf.impl.raw;
 			const paletteMap: []u32 = main.stackAllocator.alloc(u32, impl.paletteLength);
 			defer main.stackAllocator.free(paletteMap);
 			{
-				var i: u32 = 0;
+				var iNew: u32 = 0;
-				var len: u32 = self.paletteLength;
+				var iOld: u32 = 0;
-				while(i < len) : (i += 1) outer: {
+				const len: u32 = impl.paletteLength;
-					paletteMap[i] = i;
+				while(iOld < len) : ({
-					if(self.paletteOccupancy[i] == 0) {
+					iNew += 1;
-						while(true) {
+					iOld += 1;
-							len -= 1;
+				}) outer: {
-							if(self.paletteOccupancy[len] != 0) break;
+					while(impl.paletteOccupancy[iOld] == 0) {
-							if(len == i) break :outer;
+						iOld += 1;
-						}
+						if(iOld >= len) break :outer;
 						paletteMap[len] = i;
 						self.palette[i] = self.palette[len];
 						self.paletteOccupancy[i] = self.paletteOccupancy[len];
 						self.paletteOccupancy[len] = 0;
 					}
 					if(iNew >= impl.activePaletteEntries) std.log.err("{} {}", .{iNew, impl.activePaletteEntries});
 					std.debug.assert(iNew < impl.activePaletteEntries);
 					std.debug.assert(iOld < impl.paletteLength);
 					paletteMap[iOld] = iNew;
 					newImpl.palette[iNew] = .init(impl.palette[iOld].load(.unordered));
 					newImpl.paletteOccupancy[iNew] = impl.paletteOccupancy[iOld];
 				}
 			}
 			for(0..size) |i| {
-				newData.setValue(i, paletteMap[self.data.getValue(i)]);
+				newImpl.data.setValue(i, paletteMap[impl.data.getValue(i)]);
 			}
-			self.data.deinit();
+			newImpl.paletteLength = impl.activePaletteEntries;
-			self.data = newData;
+			newImpl.activePaletteEntries = impl.activePaletteEntries;
-			self.paletteLength = self.activePaletteEntries;
+			newSelf.impl.raw = self.impl.swap(newSelf.impl.raw, .release);
-			self.palette = main.globalAllocator.realloc(self.palette, @as(usize, 1) << self.data.bitSize);
+			newSelf.deferredDeinit();
 			self.paletteOccupancy = main.globalAllocator.realloc(self.paletteOccupancy, @as(usize, 1) << self.data.bitSize);
 		}
 	};
 }