std.heap.ArenaAllocator: optimize aligned index calculation

The `alignedIndex` function is very hot (literally every single `alloc`
call invokes it at least once) and `std.mem.alignPointerOffset` seems to
be very slow, so this commit replaces it with a custom implementation
that skips the unnecessary validation and, as a result, contains no
branches. The validation `std.mem.alignPointerOffset` does isn't
necessary here anyway: we're not calculating an offset that we plan to
apply to a pointer directly, but an offset into a valid buffer, and we
only apply it to the pointer if the result lies inside that buffer.

This leads to a ~4% speedup in a synthetic benchmark that just puts a lot
of concurrent load on an `ArenaAllocator`.
Justus Klausecker 2026-03-04 15:27:54 +01:00
parent 46658257f4
commit f09386cce9

@@ -315,8 +315,10 @@ fn pushFreeList(arena: *ArenaAllocator, first: *Node, last: *Node) void {
 }
 
 fn alignedIndex(buf_ptr: [*]u8, end_index: usize, alignment: Alignment) usize {
-    return end_index +
-        mem.alignPointerOffset(buf_ptr + end_index, alignment.toByteUnits()).?;
+    // Wrapping arithmetic to avoid overflows since `end_index` isn't bounded by
+    // `size`. This is always ok since the max alignment in byte units is also
+    // the max value of `usize` so wrapped values are correctly aligned anyway.
+    return alignment.forward(@intFromPtr(buf_ptr) +% end_index) -% @intFromPtr(buf_ptr);
 }
 
 fn alloc(ctx: *anyopaque, n: usize, alignment: Alignment, ret_addr: usize) ?[*]u8 {