diff --git a/lib/std/math/atan.zig b/lib/std/math/atan.zig index 8b783bf347..26d8e5f83b 100644 --- a/lib/std/math/atan.zig +++ b/lib/std/math/atan.zig @@ -4,6 +4,12 @@ // https://git.musl-libc.org/cgit/musl/tree/src/math/atanf.c // https://git.musl-libc.org/cgit/musl/tree/src/math/atan.c // https://git.musl-libc.org/cgit/musl/tree/src/math/atanl.c +// +// Ported from ARM-software, which is licensed under the MIT license: +// https://github.com/ARM-software/optimized-routines/blob/master/LICENSE +// +// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/atanf.c +// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/atan.c const std = @import("../std.zig"); const math = std.math; @@ -17,14 +23,22 @@ const testing = std.testing; /// - atan(+-inf) = +-pi/2 pub fn atan(x: anytype) @TypeOf(x) { const T = @TypeOf(x); - return switch (T) { - f16 => atanBinary16(x), - f32 => atanBinary32(x), - f64 => atanBinary64(x), - f80 => atanExtended80(x), - f128 => atanBinary128(x), - else => @compileError("atan not implemented for " ++ @typeName(T)), - }; + switch (@typeInfo(T)) { + .float => |info| switch (info.bits) { + 16 => return atanBinary16(x), + 32 => return atanBinary32(x), + 64 => return atanBinary64(x), + 80 => return atanExtended80(x), + 128 => return atanBinary128(x), + else => comptime unreachable, + }, + .vector => |info| switch (info.child) { + f32 => return atanBinary32Vec(info.len, x), + f64 => return atanBinary64Vec(info.len, x), + else => @compileError("unimplemented"), + }, + else => comptime unreachable, + } } fn atanBinary16(x: f16) f16 { @@ -579,3 +593,186 @@ test "atanBinary128" { try testing.expectApproxEqAbs(atanBinary128(0x1.299d54ac7d6afc5154643b601519p1), 0x1.2a24e22d861debfd6f974500567fp0, math.floatEpsAt(f128, 0x1.2a24e22d861debfd6f974500567fp0)); try testing.expectApproxEqAbs(atanBinary128(-0x1.0264fb9f3d50e4f0f966f0686064p1), -0x1.1c617825f97512b7f38656ab12cdp0, math.floatEpsAt(f128, -0x1.1c617825f97512b7f38656ab12cdp0)); } + +fn atanBinary32Vec(comptime vec_len: comptime_int, x: @Vector(vec_len, f32)) @TypeOf(x) { + const sign_mask: @Vector(vec_len, u32) = @splat(0x80000000); + const neg_one: @Vector(vec_len, f32) = @splat(-1.0); + const pi_over_2: @Vector(vec_len, u32) = @splat(0x3fc90fdb); + const zero: @Vector(vec_len, u32) = @splat(0); + const c0: @Vector(vec_len, f32) = @splat(-0x1.5554dcp-2); + const c1: @Vector(vec_len, f32) = @splat(0x1.9978ecp-3); + const c2: @Vector(vec_len, f32) = @splat(-0x1.230a94p-3); + const c3: @Vector(vec_len, f32) = @splat(0x1.b4debp-4); + const c4: @Vector(vec_len, f32) = @splat(-0x1.3550dap-4); + const c5: @Vector(vec_len, f32) = @splat(0x1.61eebp-5); + const c6: @Vector(vec_len, f32) = @splat(-0x1.0c17d4p-6); + const c7: @Vector(vec_len, f32) = @splat(0x1.7ea694p-9); + + const ix: @Vector(vec_len, u32) = @bitCast(x); + const sign = ix & sign_mask; + const pred = @abs(x) > @abs(neg_one); + const z = @select(f32, pred, neg_one / x, x); + const shift: @Vector(vec_len, f32) = @bitCast(@select(u32, pred, pi_over_2 ^ sign, zero)); + const z2 = z * z; + const z3 = z * z2; + const z4 = z2 * z2; + const z8 = z4 * z4; + const p0_1 = @mulAdd(@Vector(vec_len, f32), z2, c1, c0); + const p2_3 = @mulAdd(@Vector(vec_len, f32), z2, c3, c2); + const p4_5 = @mulAdd(@Vector(vec_len, f32), z2, c5, c4); + const p6_7 = @mulAdd(@Vector(vec_len, f32), z2, c7, c6); + const p0_3 = @mulAdd(@Vector(vec_len, f32), z4, p2_3, p0_1); + const p4_7 = @mulAdd(@Vector(vec_len, f32), z4, p6_7, p4_5); + const p0_7 = @mulAdd(@Vector(vec_len, f32), z8, p4_7, p0_3); + return @mulAdd(@Vector(vec_len, f32), z3, p0_7, shift + z); +} + +fn atanBinary64Vec(comptime vec_len: comptime_int, x: @Vector(vec_len, f64)) @TypeOf(x) { + const sign_mask: @Vector(vec_len, u64) = @splat(0x8000000000000000); + const neg_one: @Vector(vec_len, f64) = @splat(-1.0); + const pi_over_2: @Vector(vec_len, u64) = @splat(0x3ff921fb54442d18); + const zero: @Vector(vec_len, u64) = @splat(0); + const c0: @Vector(vec_len, f64) = @splat(-0x1.555555555552ap-2); + const c1: @Vector(vec_len, f64) = @splat(0x1.9999999995aebp-3); + const c2: @Vector(vec_len, f64) = @splat(-0x1.24924923923f6p-3); + const c3: @Vector(vec_len, f64) = @splat(0x1.c71c7184288a2p-4); + const c4: @Vector(vec_len, f64) = @splat(-0x1.745d11fb3d32bp-4); + const c5: @Vector(vec_len, f64) = @splat(0x1.3b136a18051b9p-4); + const c6: @Vector(vec_len, f64) = @splat(-0x1.110e6d985f496p-4); + const c7: @Vector(vec_len, f64) = @splat(0x1.e1bcf7f08801dp-5); + const c8: @Vector(vec_len, f64) = @splat(-0x1.ae644e28058c3p-5); + const c9: @Vector(vec_len, f64) = @splat(0x1.82eeb1fed85c6p-5); + const c10: @Vector(vec_len, f64) = @splat(-0x1.59d7f901566cbp-5); + const c11: @Vector(vec_len, f64) = @splat(0x1.2c982855ab069p-5); + const c12: @Vector(vec_len, f64) = @splat(-0x1.eb49592998177p-6); + const c13: @Vector(vec_len, f64) = @splat(0x1.69d8b396e3d38p-6); + const c14: @Vector(vec_len, f64) = @splat(-0x1.ca980345c4204p-7); + const c15: @Vector(vec_len, f64) = @splat(0x1.dc050eafde0b3p-8); + const c16: @Vector(vec_len, f64) = @splat(-0x1.7ea70755b8eccp-9); + const c17: @Vector(vec_len, f64) = @splat(0x1.ba3da3de903e8p-11); + const c18: @Vector(vec_len, f64) = @splat(-0x1.44a4b059b6f67p-13); + const c19: @Vector(vec_len, f64) = @splat(0x1.c4a45029e5a91p-17); + + const ix: @Vector(vec_len, u64) = @bitCast(x); + const sign = ix & sign_mask; + const pred = @abs(x) > @abs(neg_one); + const shift: @Vector(vec_len, f64) = @bitCast(@select(u64, pred, pi_over_2 ^ sign, zero)); + const z = @select(f64, pred, neg_one / x, x); + const z2 = z * z; + const z3 = z * z2; + const z4 = z2 * z2; + const z8 = z4 * z4; + const z16 = z8 * z8; + const p0_1 = @mulAdd(@Vector(vec_len, f64), z2, c1, c0); + const p2_3 = @mulAdd(@Vector(vec_len, f64), z2, c3, c2); + const p0_3 = @mulAdd(@Vector(vec_len, f64), z4, p2_3, p0_1); + const p4_5 = @mulAdd(@Vector(vec_len, f64), z2, c5, c4); + const p6_7 = @mulAdd(@Vector(vec_len, f64), z2, c7, c6); + const p4_7 = @mulAdd(@Vector(vec_len, f64), z4, p6_7, p4_5); + const p0_7 = @mulAdd(@Vector(vec_len, f64), z8, p4_7, p0_3); + const p8_9 = @mulAdd(@Vector(vec_len, f64), z2, c9, c8); + const p10_11 = @mulAdd(@Vector(vec_len, f64), z2, c11, c10); + const p8_11 = @mulAdd(@Vector(vec_len, f64), z4, p10_11, p8_9); + const p12_13 = @mulAdd(@Vector(vec_len, f64), z2, c13, c12); + const p14_15 = @mulAdd(@Vector(vec_len, f64), z2, c15, c14); + const p12_15 = @mulAdd(@Vector(vec_len, f64), z4, p14_15, p12_13); + const p16_17 = @mulAdd(@Vector(vec_len, f64), z2, c17, c16); + const p18_19 = @mulAdd(@Vector(vec_len, f64), z2, c19, c18); + const p16_19 = @mulAdd(@Vector(vec_len, f64), z4, p18_19, p16_17); + const p8_15 = @mulAdd(@Vector(vec_len, f64), z8, p12_15, p8_11); + const p8_19 = @mulAdd(@Vector(vec_len, f64), z16, p16_19, p8_15); + const p0_19 = @mulAdd(@Vector(vec_len, f64), p8_19, z16, p0_7); + return @mulAdd(@Vector(vec_len, f64), z3, p0_19, shift + z); +} + +test "atanBinary32Vec.special" { + const input: @Vector(7, f32) = .{ + 0x0p+0, + -0x0p+0, + 0x1p+0, + -0x1p+0, + math.inf(f32), + -math.inf(f32), + math.nan(f32), + }; + const output = atanBinary32Vec(7, input); + try testing.expectEqual(output[0], 0x0p+0); + try testing.expectEqual(output[1], -0x0p+0); + try testing.expectApproxEqAbs(output[2], 0x1.921fb6p-1, math.floatEpsAt(f32, 0x1.921fb6p-1)); + try testing.expectApproxEqAbs(output[3], -0x1.921fb6p-1, math.floatEpsAt(f32, -0x1.921fb6p-1)); + try testing.expectApproxEqAbs(output[4], 0x1.921fb6p+0, math.floatEpsAt(f32, 0x1.921fb6p+0)); + try testing.expectApproxEqAbs(output[5], -0x1.921fb6p+0, math.floatEpsAt(f32, -0x1.921fb6p+0)); + try testing.expect(math.isNan(output[6])); +} + +test "atanBinary32Vec" { + const input: @Vector(10, f32) = .{ + -0x1.8629dp-2, + -0x1.59d42ep1, + -0x1.d2dbe2p0, + -0x1.5f314ep-1, + 0x1.5869bp1, + -0x1.b13a06p-2, + 0x1.3cb0f2p1, + -0x1.0ed746p-2, + 0x1.299d54p1, + -0x1.0264fcp1, + }; + const output = atanBinary32Vec(10, input); + try testing.expectApproxEqAbs(output[0], -0x1.74c62p-2, math.floatEpsAt(f32, -0x1.74c62p-2)); + try testing.expectApproxEqAbs(output[1], -0x1.375fd8p0, math.floatEpsAt(f32, -0x1.375fd8p0)); + try testing.expectApproxEqAbs(output[2], -0x1.11b8aep0, math.floatEpsAt(f32, -0x1.11b8aep0)); + try testing.expectApproxEqAbs(output[3], -0x1.33d28cp-1, math.floatEpsAt(f32, -0x1.33d28cp-1)); + try testing.expectApproxEqAbs(output[4], 0x1.37082ep0, math.floatEpsAt(f32, 0x1.37082ep0)); + try testing.expectApproxEqAbs(output[5], -0x1.99d7cap-2, math.floatEpsAt(f32, -0x1.99d7cap-2)); + try testing.expectApproxEqAbs(output[6], 0x1.2fcb12p0, math.floatEpsAt(f32, 0x1.2fcb12p0)); + try testing.expectApproxEqAbs(output[7], -0x1.08c71ap-2, math.floatEpsAt(f32, -0x1.08c71ap-2)); + try testing.expectApproxEqAbs(output[8], 0x1.2a24e2p0, math.floatEpsAt(f32, 0x1.2a24e2p0)); + try testing.expectApproxEqAbs(output[9], -0x1.1c6178p0, math.floatEpsAt(f32, -0x1.1c6178p0)); +} + +test "atanBinary64Vec.special" { + const input: @Vector(7, f64) = .{ + 0x0p+0, + -0x0p+0, + 0x1p+0, + -0x1p+0, + math.inf(f64), + -math.inf(f64), + math.nan(f64), + }; + const output = atanBinary64Vec(7, input); + try testing.expectEqual(output[0], 0x0p+0); + try testing.expectEqual(output[1], -0x0p+0); + try testing.expectApproxEqAbs(output[2], 0x1.921fb54442d18p-1, math.floatEpsAt(f64, 0x1.921fb54442d18p-1)); + try testing.expectApproxEqAbs(output[3], -0x1.921fb54442d18p-1, math.floatEpsAt(f64, -0x1.921fb54442d18p-1)); + try testing.expectApproxEqAbs(output[4], 0x1.921fb54442d18p+0, math.floatEpsAt(f64, 0x1.921fb54442d18p+0)); + try testing.expectApproxEqAbs(output[5], -0x1.921fb54442d18p+0, math.floatEpsAt(f64, -0x1.921fb54442d18p+0)); + try testing.expect(math.isNan(output[6])); +} + +test "atanBinary64Vec" { + const input: @Vector(10, f64) = .{ + -0x1.8629d0244cdccp-2, + -0x1.59d42d4659937p1, + -0x1.d2dbe23d04f06p0, + -0x1.5f314e72398e8p-1, + 0x1.5869af37b7d08p1, + -0x1.b13a05a662618p-2, + 0x1.3cb0f12f39d8ap1, + -0x1.0ed746b39cbb7p-2, + 0x1.299d54ac7d6bp1, + -0x1.0264fb9f3d50ep1, + }; + const output = atanBinary64Vec(10, input); + try testing.expectApproxEqAbs(output[0], -0x1.74c61f4377016p-2, math.floatEpsAt(f64, -0x1.74c61f4377016p-2)); + try testing.expectApproxEqAbs(output[1], -0x1.375fd7987cc2p0, math.floatEpsAt(f64, -0x1.375fd7987cc2p0)); + try testing.expectApproxEqAbs(output[2], -0x1.11b8adeba5616p0, math.floatEpsAt(f64, -0x1.11b8adeba5616p0)); + try testing.expectApproxEqAbs(output[3], -0x1.33d28ca762539p-1, math.floatEpsAt(f64, -0x1.33d28ca762539p-1)); + try testing.expectApproxEqAbs(output[4], 0x1.37082ce2dd03p0, math.floatEpsAt(f64, 0x1.37082ce2dd03p0)); + try testing.expectApproxEqAbs(output[5], -0x1.99d7cac66dd44p-2, math.floatEpsAt(f64, -0x1.99d7cac66dd44p-2)); + try testing.expectApproxEqAbs(output[6], 0x1.2fcb120468e8ep0, math.floatEpsAt(f64, 0x1.2fcb120468e8ep0)); + try testing.expectApproxEqAbs(output[7], -0x1.08c71aa0e509p-2, math.floatEpsAt(f64, -0x1.08c71aa0e509p-2)); + try testing.expectApproxEqAbs(output[8], 0x1.2a24e22d861dfp0, math.floatEpsAt(f64, 0x1.2a24e22d861dfp0)); + try testing.expectApproxEqAbs(output[9], -0x1.1c617825f9751p0, math.floatEpsAt(f64, -0x1.1c617825f9751p0)); +}