
conv_transpose produces shifted and wrong output for fp16

Open · kasper0406 opened this issue 1 year ago • 0 comments

🐞Describing the bug

Consider the following test case:

# Assumed imports for the snippet below (not shown in the original test file):
#   import copy
#   import numpy as np
#   import coremltools as ct
#   from coremltools.converters.mil.mil import Builder as mb, types
def test_failing_program(self):
    batch_size = 1
    input_features = 1
    output_features = 2
    conv_weights = np.array(np.random.rand(input_features, output_features, 3), dtype=np.float32)

    @mb.program(input_specs=[mb.TensorSpec(shape=(batch_size, input_features, 4), dtype=types.fp32)])
    def prog(x):
        nonlocal conv_weights
        conv_transpose = mb.conv_transpose(x=x, weight=conv_weights, pad=[1, 0], output_shape=[batch_size, output_features, 8], pad_type="custom", strides=[2])
        return conv_transpose

    @mb.program(input_specs=[mb.TensorSpec(shape=(batch_size, input_features, 4), dtype=types.fp32)])
    def prog_self_cast(x):
        nonlocal conv_weights
        x = mb.cast(x=x, dtype="fp16")
        conv_weights = mb.cast(x=conv_weights, dtype="fp16")
        conv_transpose = mb.conv_transpose(x=x, weight=conv_weights, pad=[1, 0], output_shape=[batch_size, output_features, 8], pad_type="custom", strides=[2])
        return conv_transpose

    # Two pipelines: the default (which inserts fp16 casts) and the default
    # with the fp16-cast pass removed.
    pass_pipeline_full = ct.PassPipeline.DEFAULT
    pass_pipeline_no_fp16_cast = ct.PassPipeline.DEFAULT
    pass_pipeline_no_fp16_cast.remove_passes(["common::add_fp16_cast"])

    model_with_fp16_cast = ct.convert(copy.deepcopy(prog), pass_pipeline=pass_pipeline_full)
    model_no_fp16_cast = ct.convert(copy.deepcopy(prog), pass_pipeline=pass_pipeline_no_fp16_cast)
    model_self_fp16_cast = ct.convert(copy.deepcopy(prog_self_cast), pass_pipeline=pass_pipeline_full)

    x = np.array(10 * np.random.rand(batch_size, input_features, 4), dtype=np.float32)
    output_fp16_cast = model_with_fp16_cast.predict({"x": x})[prog.functions["main"].outputs[0].name]
    output_no_fp16_cast = model_no_fp16_cast.predict({"x": x})[prog.functions["main"].outputs[0].name]
    output_self_fp16_cast = model_self_fp16_cast.predict({"x": x})[prog_self_cast.functions["main"].outputs[0].name]
    np.testing.assert_allclose(
        output_fp16_cast,
        output_no_fp16_cast,
        atol=1e-02,
        rtol=1e-03,
    )
    np.testing.assert_allclose(
        output_fp16_cast,
        output_self_fp16_cast,
        atol=1e-02,
        rtol=1e-03,
    )

My expectation is that output_fp16_cast, output_no_fp16_cast and output_self_fp16_cast will all produce the same output up to numerical precision. The actual result is:

E            FP16 cast: array([[[7.609375, 7.300781, 4.597656, 4.304688, 0.771484, 0.807129,
E                    1.739258, 1.612305],
E                   [2.751953, 5.839844, 1.662109, 3.398438, 0.278809, 0.674316,
E                    0.628418, 1.266602]]], dtype=float32)
E            No FP16 cast: array([[[0.402334, 7.611111, 7.301125, 4.597435, 4.30416 , 0.771244,
E                    0.807122, 1.738807],
E                   [0.491837, 2.751568, 5.843967, 1.662064, 3.400388, 0.27882 ,
E                    0.674435, 0.628613]]], dtype=float32)
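
The FP16 output looks like the no-FP16 output shifted left by one position (the leading value is dropped and an extra value appears at the end). A minimal check of that observation, reusing the variables from the test above, would be something like:

# Hypothetical shift check (my addition): dropping the last element of the
# fp16 result and the first element of the fp32 result lines them up, which
# would be consistent with the custom pad [1, 0] being applied at the wrong
# end of the output in the fp16 path.
np.testing.assert_allclose(
    output_fp16_cast[..., :-1],
    output_no_fp16_cast[..., 1:],
    atol=1e-02,
    rtol=1e-03,
)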

According to a corresponding JAX program, the expected output seems to be the one without FP16 casting.
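
For reference, here is a rough NumPy sketch of how I understand the conv_transpose semantics (weight layout (C_in, C_out, K), custom pad [front, back] cropped from the full transposed-convolution output). The layout and pad interpretation are my assumptions rather than something taken verbatim from the coremltools documentation:

import numpy as np

def conv_transpose_1d_reference(x, w, stride, pad):
    # x: (N, C_in, W_in), w: (C_in, C_out, K), pad: (front, back).
    # Scatter each input element into the full-size output, then crop the pad.
    n, c_in, w_in = x.shape
    _, c_out, k = w.shape
    full = (w_in - 1) * stride + k
    out = np.zeros((n, c_out, full), dtype=np.float64)
    for b in range(n):
        for o in range(c_out):
            for i in range(c_in):
                for t in range(w_in):
                    out[b, o, t * stride : t * stride + k] += x[b, i, t] * w[i, o, :]
    return out[:, :, pad[0] : full - pad[1]]

If the semantics are as assumed here, evaluating conv_transpose_1d_reference(x, conv_weights, stride=2, pad=(1, 0)) with the test's x and fp32 conv_weights should reproduce the no-FP16-cast output above.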

The two CoreML programs look like the following:

print(model_with_fp16_cast._mil_program)

main[CoreML3](%x: (1, 1, 4, fp32)(Tensor)) {
  block0() {
    %x_to_fp16: (1, 1, 4, fp16)(Tensor) = cast(x=%x, dtype="fp16", name="cast_3")
    %conv_transpose_0_cast_fp16: (1, 2, 8, fp16)(Tensor) = conv_transpose(x=%x_to_fp16, weight=[[[0.042572021484375, 0.8056640625, 0.7470703125], [0.05206298828125, 0.291259765625, 0.5869140625]]], pad=[1, 0], output_shape=[1, 2, 8], pad_type="custom", strides=[2], dilations=[1], groups=1, name="conv_transpose_0_cast_fp16")
    %conv_transpose_0: (1, 2, 8, fp32)(Tensor) = cast(x=%conv_transpose_0_cast_fp16, dtype="fp32", name="cast_2")
  } -> (%conv_transpose_0)
}
main[CoreML3](%x: (1, 1, 4, fp32)(Tensor)) {
  block0() {
    %conv_transpose_0: (1, 2, 8, fp32)(Tensor) = conv_transpose(x=%x, weight=[[[0.04258209466934204, 0.805541455745697, 0.7470118403434753], [0.05205479636788368, 0.29121920466423035, 0.5870679020881653]]], pad=[1, 0], output_shape=[1, 2, 8], pad_type="custom", strides=[2], dilations=[1], groups=1, name="conv_transpose_0")
  } -> (%conv_transpose_0)
}

Up to the actual fp16 casts, these programs look identical to me.
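
One way to narrow this down further (a sketch I have not run exhaustively) would be to convert the same program for different compute units and check which backend produces the shifted fp16 output:

import copy
import coremltools as ct

# Convert the same program for different compute units to see whether the
# shift comes from a specific backend (CPU vs. GPU; there is no Neural Engine
# on an Intel Mac). Reuses prog and x from the test above.
for unit in (ct.ComputeUnit.CPU_ONLY, ct.ComputeUnit.CPU_AND_GPU, ct.ComputeUnit.ALL):
    model = ct.convert(copy.deepcopy(prog), compute_units=unit)
    print(unit, model.predict({"x": x})[prog.functions["main"].outputs[0].name])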

This leaves me believing that this is a bug in the runtime implementation of conv_transpose. As far as I can tell the runtime is not open source, so I am not able to troubleshoot further.
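
As a possible workaround until this is resolved, keeping the whole program in fp32 should avoid the issue; converting with compute_precision=ct.precision.FLOAT32 should be equivalent to removing the common::add_fp16_cast pass as done in the test above:

import copy
import coremltools as ct

# Workaround sketch: force fp32 compute precision so the fp16 conv_transpose
# kernel is never used. Reuses prog from the test above.
model_fp32 = ct.convert(
    copy.deepcopy(prog),
    compute_precision=ct.precision.FLOAT32,
)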

System environment (please complete the following information):

  • coremltools version: 8.0b2
  • OS (e.g. MacOS version or Linux type): MacOS 15.0 Beta (24A5327a), running on an Intel Mac

kasper0406 · Aug 27 '24 13:08