zig icon indicating copy to clipboard operation
zig copied to clipboard

std.math: add lerp

Open wooster0 opened this issue 3 years ago • 26 comments
trafficstars

I think it'd be cool if we could have a basic lerp function in the std.

wooster0 avatar Sep 28 '22 18:09 wooster0

This is a drive-by comment, so I apologize for that. I was just curious why you picked the algorithm you picked? Wikipedia [1] for example, lists two versions, an approximate and a precise version.

[1] https://en.wikipedia.org/wiki/Linear_interpolation

steveno avatar Sep 29 '22 03:09 steveno

Mostly for performance. I compared the two and the imprecise method (unsurprisingly) is faster. You are definitely making a good point that the user may want precision over speed. Not sure how to go about that. Maybe impreciseLerp and preciseLerp? It seems C++ has a lerp as well: https://github.com/ziglang/zig/blob/716d9237cb757c15217b21964fde8e755aabe853/lib/libcxx/include/cmath#L621 and it looks like it chooses the precise but slower variant by default? However down here it seems to have the imprecise variant as well: https://github.com/ziglang/zig/blob/716d9237cb757c15217b21964fde8e755aabe853/lib/libcxx/include/cmath#L624 Hmmm... I will investigate this a bit more.

wooster0 avatar Sep 29 '22 06:09 wooster0

FYI here you can see the differences in codegen: https://zig.godbolt.org/z/GPG8YG9qd Just go to the exported main and change the lerp for the one you want to test. C++'s way is the worst as you can see. So I think the way of having two lerps impreciseLerp and preciseLerp is probably best.

For longevity:

const std = @import("std");
const assert = std.debug.assert;

/// Lossy linear interpolation: a + (b - a) * t.
/// Monotonic, but not guaranteed to return exactly b when t == 1.
pub fn impreciseLerp(comptime T: type, a: T, b: T, t: T) T {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    const delta = b - a;
    return delta * t + a;
}

/// Precise linear interpolation: (1 - t) * a + t * b.
/// Guarantees the endpoints are hit exactly: t == 0 yields a, t == 1 yields b.
pub fn preciseLerp(comptime T: type, a: T, b: T, t: T) T {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    const u = 1 - t;
    return u * a + t * b;
}

// Port of libc++'s std::lerp. Slower than the plain forms, but it is
// monotonic and returns b exactly at t == 1 for all inputs.
pub fn cppLerp(comptime T: type, a: T, b: T, t: T) T {
    // When a and b have opposite signs (or either is zero), the exact-at-
    // endpoints form t * b + (1 - t) * a is safe to use directly
    // (presumably because (b - a) could overflow in that case — the libc++
    // source gives this rationale; TODO confirm).
    if ((a <= 0 and b >= 0) or (a >= 0 and b <= 0))
        return t * b + (1 - t) * a;

    // t == 1 must produce b exactly; the fallback form below only
    // approximates it due to rounding.
    if (t == 1) return b;

    // a + t * (b - a) is monotonic; clamp against b so the result never
    // overshoots the endpoint, choosing the comparison by travel direction.
    const x = a + t * (b - a);
    if ((t > 1) == (b > a)) {
        return if (b < x) x else b;
    } else {
        return if (x < b) x else b;
    }
}

// Exported entry point so the codegen of the chosen lerp variant can be
// inspected on godbolt; swap the callee to compare implementations.
export fn main(a: f32, b: f32, t: f32) f32 {
    return impreciseLerp(f32, a, b, t);
}

wooster0 avatar Sep 29 '22 08:09 wooster0

Maybe fastLerp and preciseLerp? That way each version advertises why you'd use it in the name.

bndbsh avatar Sep 30 '22 08:09 bndbsh

That's not a bad idea either. But I think that name focuses a bit too much on speed over the fact that it's simply just a bit less precise. And it might add a bit of an "unsafe" notion to it ("fast" as in "no safety checks") even though it's perfectly safe. I still wouldn't be against using that name if others prefer it.

Maybe lerp and preciseLerp because in most cases I believe it's perfectly fine to use the slightly less precise lerp until you actually need so much precision. But it's less descriptive and makes the two go less hand in hand than before so I'm not sure.

wooster0 avatar Sep 30 '22 13:09 wooster0

Does using @mulAdd affect performance/precision significantly in either of these?

InKryption avatar Sep 30 '22 14:09 InKryption

Wow, thanks for bringing that up! That is one very interesting builtin that I haven't considered here. Have a look: https://zig.godbolt.org/z/qcMehj4jo mulAddImpreciseLerp is indeed the fastest here (I did some research; not just going by the LOC here). So now we actually have 4 ways to do lerp. I think it's time to wrap it all in a single std.math.lerp and make it accept an enum that allows you to specify the implementation. Perfect.

Because we use @mulAdd, a function that the average user will probably just not think of for this, we now have a very good reason to add this to std.math.

For longevity:

const std = @import("std");
const assert = std.debug.assert;

// 4 assembly LOC
// Lossy form: a + (b - a) * t. Monotonic; may not return exactly b at t == 1.
pub fn impreciseLerp(comptime T: type, a: T, b: T, t: T) T {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    return a + (b - a) * t;
}

// 7 assembly LOC
// Precise form: (1 - t) * a + t * b. Hits both endpoints exactly.
pub fn preciseLerp(comptime T: type, a: T, b: T, t: T) T {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    return (1 - t) * a + t * b;
}

// 3 assembly LOC
// Lossy form fused into a single @mulAdd: fma(b - a, t, a).
// Fastest variant on targets with a native FMA instruction.
pub fn mulAddImpreciseLerp(comptime T: type, a: T, b: T, t: T) T {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    return @mulAdd(T, (b - a), t, a);
}

// 6 assembly LOC
// Precise form with the a * (1 - t) product fused: fma(a, 1 - t, t * b).
pub fn mulAddPreciseLerp(comptime T: type, a: T, b: T, t: T) T {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    return @mulAdd(T, a, (1 - t), t * b);
}

// too many assembly LOC
// Port of libc++'s std::lerp: monotonic and exact at t == 1, at the cost
// of branches that bloat the generated code.
pub fn cppLerp(comptime T: type, a: T, b: T, t: T) T {
    // Opposite signs (or a zero endpoint): the exact-at-endpoints form is
    // safe to use directly.
    if ((a <= 0 and b >= 0) or (a >= 0 and b <= 0))
        return t * b + (1 - t) * a;

    // t == 1 must produce b exactly; the form below only approximates it.
    if (t == 1) return b;

    // Monotonic fallback, clamped against b by direction of travel so the
    // result never overshoots the endpoint.
    const x = a + t * (b - a);
    if ((t > 1) == (b > a)) {
        return if (b < x) x else b;
    } else {
        return if (x < b) x else b;
    }
}

// Exported entry point for inspecting the codegen of the variant under test;
// swap the callee to compare implementations.
export fn main(a: f32, b: f32, t: f32) f32 {
    return mulAddPreciseLerp(f32, a, b, t);
}

wooster0 avatar Sep 30 '22 14:09 wooster0

Now the lerp looks like this:

    std.math.lerp(f32, .{}, 0, 100, 0.5); // 50

And in .{} you are free to alter the way it calculates.

wooster0 avatar Sep 30 '22 18:09 wooster0

Hmm, if that is the case, is there any reason to not always use @mulAdd?

InKryption avatar Sep 30 '22 18:09 InKryption

Well I thought it'd be nice to offer more options for all kinds of cases but I guess it would be a ton simpler if there were only 2 options.

wooster0 avatar Sep 30 '22 18:09 wooster0

Or maybe most_precise: bool or precisest: bool or something?

wooster0 avatar Sep 30 '22 19:09 wooster0

I think this is starting to look pretty good now. We're using a builtin function that most users wouldn't think of to provide speed and accuracy. I think this is very well worth merging.

wooster0 avatar Oct 01 '22 08:10 wooster0

Also try to avoid the word "fast". "precise" and "lossy" are okay words. I suggest that the generally useful one has no suffix at all.

andrewrk avatar Oct 01 '22 08:10 andrewrk

Alright. I too think it's perfectly fine to just go with the slightly "less precise" option even though it's still really accurate. In practical usage I can't really think of a situation where you really need the slightly more precise option. Also, if you need more precision I think you could also just turn up the float bits by e.g. using f64 instead of f32, maybe only temporarily for one lerp and then go back to f32, or something like that.

And you're right; "lossy" and "precise" is a much better combination of words. Couldn't think of it in that moment.

wooster0 avatar Oct 01 '22 08:10 wooster0

Perfect.

wooster0 avatar Oct 01 '22 09:10 wooster0

Alright so the first thing I can tell from the CI failures is that there are some problems with f16 for Wasm here... This line in the "lerp" test in particular:

    try testing.expectEqual(@as(f16, 0.0), lerp(f16, 1.0e4, 1.0, 1.0));

fails like this:

$ zig test x.zig -target wasm32-wasi --test-cmd wasmtime --test-cmd-bin
expected 0.0e+00, found 1.0e+00
0 passed; 0 skipped; 1 failed.
test failureError: failed to run main module `test.wasm`

Caused by:
    0: failed to invoke command default
    1: wasm trap: wasm `unreachable` instruction executed
       wasm backtrace:
           0: 0x4205 - <unknown>!os.abort
           1: 0x2806 - <unknown>!builtin.default_panic
           2: 0x1e63 - <unknown>!test_runner.main
           3: 0x1de7 - <unknown>!_start
       note: using the `WASMTIME_BACKTRACE_DETAILS=1` environment variable to may show more debugging information

error: the following test command failed with exit code 134:
wasmtime test.wasm

...while it works with zig test x.zig on Linux. So am I supposed to not use f16? Is it known to be broken on Wasm? Found some possibly related things: #4952 #9900

Apart from that, I also found #7702 which I think is also talking about how @mulAdd doesn't behave the same on all platforms? I mean it is expected of course that a single instruction doesn't exist for it everywhere but I expect it to be substituted with multiple instructions doing the same thing.

Considering that there are still all these problems I think I will just replace the odd f16 and f128 sizes in the test with common ones considering that they're usually not natively supported by the hardware and harder to support.

wooster0 avatar Oct 01 '22 14:10 wooster0

2 questions for you @r00ster91:

  • if an additional function for lerp for integers would be added, what might be its fully qualified name in the std lib, and what might its function signature be?

For example, in a side project I have this function (for better or worse, avoiding float operations):

// Integer lerp using a rational parameter num/den instead of a float t:
// start + (num / den) * (end - start), with @divTrunc rounding toward zero.
fn lerp(start: i32, end: i32, num: isize, den: isize) i32 {
    // NOTE(review): num * (end - start) is computed in isize — large
    // operands could overflow; presumably callers keep values small
    // enough. TODO confirm.
    return @intCast(i32, start + @divTrunc(num * (end - start), den));
}
  • have you considered this function signature for the float version? fn lerp(a: anytype, b: anytype, t: anytype) @TypeOf(a,b,t)

andrewrk avatar Oct 04 '22 02:10 andrewrk

  • if an additional function for lerp for integers would be added, what might be its fully qualified name in the std lib, and what might its function signature be?

At first I was gonna say let's overload the existing lerp but seeing that you have another parameter for what I suppose is the denominator, that's an interesting question.

I can definitely understand when someone might not want to depend on floating-point numbers for linear interpolation. I'm not sure how exactly your integer version of lerp is used but if we can do it with 3 parameters, maybe we can overload this same lerp further with integers in the future because we can just check if the args are integers.

However, if that doesn't work out, lerpInt seems fine too? I do think that lerp functions are generally associated with floats; the float version is the main version and the integer version is a bit more "niche" and for cases when you really need that extra performance and or can't afford floating-point calculations (maybe all your floats are implemented in software).

So std.math.lerp will stay the primary lerp function and all other possible additions will have prefixes or suffixes to "lerp" meaning more specialized versions of the standard lerp function.

I'm also interested in something like fixed-point numbers for this but we don't even know if that will be added to the language yet so it seems a bit premature to spend a lot of thought on that just yet. Maybe it will be possible to just make them compatible with the existing lerp (and other functions using floats).

  • have you considered this function signature for the float version? fn lerp(a: anytype, b: anytype, t: anytype) @TypeOf(a,b,t)

Sounds good. Seems like in most cases this will mean having to type less types, and I feel like this could be done for a lot more of the std.

wooster0 avatar Oct 04 '22 17:10 wooster0

I should also note that although the codegen for this looks great on x86, this may codegen poorly on platforms that don't have a native FMA:

const std = @import("std");
const assert = std.debug.assert;

// @mulAdd-based lerp; on targets without a native FMA (e.g. wasm32) this
// lowers to a call into compiler_rt's fmaf — see the wat dump below.
export fn lerp1(a: f32, b: f32, t: f32) f32 {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    return @mulAdd(f32, b - a, t, a);
}

// Plain multiply-then-add lerp; compiles to straight-line code on every
// target, with no compiler_rt call — see the wat dump below.
export fn lerp2(a: f32, b: f32, t: f32) f32 {
    // The interpolation parameter must stay inside [0, 1].
    assert(t >= 0 and t <= 1);
    return ((b - a) * t) + a;
}
$ zig build-lib x.zig -target wasm32-freestanding -O ReleaseSmall -dynamic --strip
$ wasm2wat x.wasm
(module
  (type (;0;) (func (param f32 f32 f32) (result f32)))
  (func (;0;) (type 0) (param f32 f32 f32) (result f32)
    local.get 1
    local.get 0
    f32.sub
    local.get 2
    local.get 0
    call 2)
  (func (;1;) (type 0) (param f32 f32 f32) (result f32)
    local.get 1
    local.get 0
    f32.sub
    local.get 2
    f32.mul
    local.get 0
    f32.add)
  (func (;2;) (type 0) (param f32 f32 f32) (result f32)
    local.get 0
    f64.promote_f32
    local.get 1
    f64.promote_f32
    f64.mul
    local.get 2
    f64.promote_f32
    f64.add
    f32.demote_f64)
  (memory (;0;) 16)
  (global (;0;) (mut i32) (i32.const 1048576))
  (export "memory" (memory 0))
  (export "lerp1" (func 0))
  (export "lerp2" (func 1)))

Notice call 2 in func (;0;): it calls to func (;2;) (fmaf) which is another bunch of instructions whereas the non-@mulAdd lerp2 is func (;1;) and is only that code. So on Wasm this performs worse with @mulAdd.

  • Should @mulAdd emit a handleable error if the target does not have a native FMA so that we can use return ((b - a) * t) + a instead?
  • Should we fiddle around with @import("builtin").cpu to try to detect if the target has a native FMA and otherwise use return ((b - a) * t) + a?
  • Should we always use return ((b - a) * t) + a and then just hope that it gets turned into an FMA if @setFloatMode(.Optimized)? https://ziglang.org/documentation/master/#setFloatMode

    Perform floating-point contraction (e.g. fusing a multiply followed by an addition into a fused multiply-and-add).

wooster0 avatar Oct 07 '22 07:10 wooster0

Alright so the first thing I can tell from the CI failures is that there are some problems with f16 for Wasm here...

Just a heads up, this will be fixed by #13100

topolarity avatar Oct 13 '22 18:10 topolarity

Or actually, could we possibly make use of the currently set float mode here? There is enum std.builtin.FloatMode with values Strict and Optimized, and we have @setFloatMode to set the mode. So if it's Strict, we could use some other even more precise version and in Optimized we use the fastest one (like the one we have now).

However, I'm not sure how to actually find out what the currently set float mode is at runtime. Maybe that's not possible (indeed; it's not).

wooster0 avatar Oct 13 '22 20:10 wooster0

I often interpolate between vectors with a scalar value. In your opinion, would it makes sense to handle this use case here as well?

Opioid avatar Oct 26 '22 11:10 Opioid

Vectors with a scalar value? So 2 vectors each with 1 scalar? In that case you can do something like this right?

    _ = lerp(f32, (@Vector(1, f32){0.5})[0], (@Vector(1, f32){1.0})[0], 0.5);

I'm not sure 2 vectors with 1 scalar each is common enough to justify shortening this, but I don't work often enough with vectors to have a sophisticated-enough opinion on this, either, so I'm not sure.

wooster0 avatar Oct 26 '22 13:10 wooster0

No, I mean more or less like this:

    // Interpolates two floats or two float vectors by a scalar t using the
    // precise form (1 - t) * a + t * b; dispatches on the type of a.
    fn lerp(a: anytype, b: anytype, t: f32) @TypeOf(a, b) {
        switch (@typeInfo(@TypeOf(a))) {
            .Float => {
                const u = 1.0 - t;
                return u * a + t * b;
            },
            .Vector => |v| {
                // Broadcast the scalar weights across all vector lanes.
                const u = 1.0 - t;
                return @splat(v.len, u) * a + @splat(v.len, t) * b;
            },
            // Any other element type (ints, etc.) is a caller error.
            else => unreachable,
        }
    }

It is useful for interpolating colors for example.

EDIT:

I thought about it, and it probably is more zig-like style to simply call 'lerp(vector_a, vector_b, @splat(4, scalar_t))' or something like that. It is also more flexible in theory, so probably a solid choice to just keep it simple as a general purpose std-function. But I don't love the ergonomics so I would just keep using my own implementation.

FWIW, I'm also in favor of using the more precise algorithm for a std-function.

Opioid avatar Oct 26 '22 14:10 Opioid

I would just keep using my own implementation

I'm also in favor of using the more precise algorithm

Clearly we're all so split on the implementation on this because there are so many different ways of doing interpolation that it probably isn't incredibly important what the implementation will end up being (just waiting for Andrew to decide); in all more sophisticated situations we'll probably all use our own implementation, anyway, so std.math.lerp will be good as a general solution for when its implementation is not too important, or for prototyping, testing, etc.

I tried to add support for vectors but I don't think it will work out unless we change the API and add a comptime Type: type parameter because of @TypeOf(a, b, t). And, as you said, maybe it is better to keep it simple (so, as it is).

wooster0 avatar Oct 27 '22 14:10 wooster0

I would focus on the codegen and performance (same thing?) of the implementation rather than on being absolutely robust, because we might end up with an unoptimized std piece that is used across different projects and becomes the bottleneck everywhere. That would be disastrous. Reminds me of C++ STL in some cases.

eLeCtrOssSnake avatar Oct 27 '22 14:10 eLeCtrOssSnake

Can anyone explain the hard requirement for t being in the range [0,1]? Coming from a gamedev background, I've never seen a lerp implementation that enforces the expected range of t. lerp(10.0, 20.0, 2.0) // = 30 is very useful behavior to have and mathematically it works out as well.

Manuzor avatar Oct 14 '23 09:10 Manuzor

You're free to open an issue or a PR for that.

wooster0 avatar Oct 14 '23 10:10 wooster0

It looks like one of these tests relies on t being a constant. If you apply this diff:

-try testing.expectEqual(@as(f64, 0.0), lerp(@as(f64, 1.0e16), 1.0, 1.0));
+try testing.expectEqual(@as(f64, 0.0), lerp(@as(f64, 1.0e16), @as(f64, 1.0), @as(f64, 1.0)));

AIR when using @as:

# Begin Function AIR: math.lerp__anon_931:
# Total AIR+Liveness bytes: 524B
# AIR Instructions:         28 (252B)
# AIR Extra Data:           32 (128B)
# Liveness tomb_bits:       16B
# Liveness Extra Data:      4 (16B)
# Liveness special table:   2 (16B)
  %0 = arg(f64, 0)
  %1 = arg(f64, 1)
  %2 = arg(f64, 2)
  %3!= save_err_return_trace_index()
  %5!= dbg_block_begin()
  %6!= dbg_stmt(4:13)
  %9!= dbg_block_begin()
  %10!= dbg_stmt(5:41)
  %11 = cmp_gte(%2, <f64, 0>)
  %12 = block(bool, {
    %16!= cond_br(%11!, {
      %14 = cmp_lte(%2, <f64, 1>)
      %15!= br(%12, %14!)
    }, {
      %13!= br(%12, @Air.Inst.Ref.bool_false)
    })
  } %11!)
  %17!= dbg_stmt(5:41)
  %18!= call(<fn (bool) void, (function 'assert')>, [%12!])
  %19!= dbg_block_end()
  %21!= dbg_stmt(14:28)
  %23 = sub(%1!, %0)
  %24 = mul_add(%23!, %2!, %0!)
  %25!= dbg_stmt(14:5)
  %27!= dbg_block_end()
  %26!= ret(%24!)
# End Function AIR: math.lerp__anon_931

AIR when not using @as

# Begin Function AIR: math.lerp__anon_931:
# Total AIR+Liveness bytes: 363B
# AIR Instructions:         19 (171B)
# AIR Extra Data:           20 (80B)
# Liveness tomb_bits:       16B
# Liveness Extra Data:      0 (0B)
# Liveness special table:   0 (0B)
  %0 = arg(f64, 0)
  %1!= save_err_return_trace_index()
  %3!= dbg_block_begin()
  %4!= dbg_stmt(4:13)
  %7!= dbg_block_begin()
  %8!= dbg_stmt(5:41)
  %9!= call(<fn (bool) void, (function 'assert')>, [@Air.Inst.Ref.bool_true])
  %10!= dbg_block_end()
  %12!= dbg_stmt(14:28)
  %14 = sub(<f64, 1>, %0)
  %15 = mul_add(%14!, <f64, 1>, %0!)
  %16!= dbg_stmt(14:5)
  %18!= dbg_block_end()
  %17!= ret(%15!)
# End Function AIR: math.lerp__anon_931

This ends up getting optimized to not even do the multiply in the @mulAdd, so the result comes out as zero. The problem is that if this optimization doesn't happen, we get 0x20000000000000 (4.450147717014403e-308).

I discovered this during some CBE changes that resulted in this optimization not happening, I'm not actually sure what the resolution should be. It could actually be a bug in the fma implementation in our compiler_rt, assuming the result should be zero?

kcbanner avatar Nov 06 '23 01:11 kcbanner