mypy icon indicating copy to clipboard operation
mypy copied to clipboard

[mypyc] Fix native tuple unboxing

Open ichard26 opened this issue 2 years ago • 3 comments

Currently, unboxing tuples first involve doing a tuple cast to do error checks in advance. This hack was added because the error handling code for unboxing tuples is busted and would cause crashes when the unbox failed. (It only checks if the first item is uninitialized. If the second item is uninitialized while the first is not, no exception is raised, leading to crashes when it's accessed later.)

This commit reworks the tuple unbox logic so the cast isn't needed anymore. Instead ...

  • Use emit_arg_check() to do a tuple type and length; and then
  • Modify the error handler for the item unboxes/casts to jump to an error block that uninitializes the whole tuple[^1] and sets an exception.

I had to add support for custom error handlers to native integer and float unboxes so a jump to the tuple unbox error block is possible on failure.

NOTE: this doesn't affect tuple->tuple coercions because those are optimized away: each tuple item are coerced individually instead. This only affects coercions where the source is either Any or a tuple of variable length.

Fixes mypyc/mypyc#451.

[^1]: While uninitializing only the first element would suffice, it’s simpler and probably safer to uninitialize the entire tuple.

ichard26 avatar Apr 09 '23 18:04 ichard26

Here's the old and new C code for this test file:

from typing import Any, Tuple
from mypy_extensions import i64, i32

class A: pass

def f(arg: Any) -> None:
    t: Tuple[str, str] = arg
    t2: Tuple[int, float, bool, None] = arg
    t3: Tuple[i64, i32] = arg
    t4: Tuple[A] = arg
New code

char CPyDef_f(PyObject *cpy_r_arg) {
    tuple_T2OO cpy_r_r0;
    tuple_T2OO cpy_r_t;
    tuple_T4IFCC cpy_r_r1;
    tuple_T4IFCC cpy_r_t2;
    tuple_T284 cpy_r_r2;
    int64_t cpy_r_r3;
    char cpy_r_r4;
    tuple_T284 cpy_r_t3;
    tuple_T1O cpy_r_r5;
    PyObject *cpy_r_r6;
    tuple_T1O cpy_r_t4;
    char cpy_r_r7;
CPyL0: ;
    if (!PyTuple_Check(cpy_r_arg) || PyTuple_Size(cpy_r_arg) != 2) {
        CPy_TypeError("tuple[str, str]", cpy_r_arg); cpy_r_r0 = tuple_undefined_T2OO;
    } else {
        PyObject *__tmp3 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        if (likely(PyUnicode_Check(__tmp3)))
            cpy_r_r0.f0 = __tmp3;
        else {
            goto __LL1;
        }
        PyObject *__tmp4 = PyTuple_GET_ITEM(cpy_r_arg, 1);
        if (likely(PyUnicode_Check(__tmp4)))
            cpy_r_r0.f1 = __tmp4;
        else {
            goto __LL1;
        }
        CPy_INCREF(cpy_r_r0.f0);
        CPy_INCREF(cpy_r_r0.f1);
        goto __LL2;
__LL1: ;
        CPy_TypeError("tuple[str, str]", cpy_r_arg); cpy_r_r0 = tuple_undefined_T2OO;
    }
__LL2: ;
    if (unlikely(cpy_r_r0.f0 == NULL)) {
        CPy_AddTraceback("temp/unbox.py", "f", 7, CPyStatic_globals);
        goto CPyL6;
    }
CPyL1: ;
    cpy_r_t = cpy_r_r0;
    CPy_DECREF(cpy_r_t.f0);
    CPy_DECREF(cpy_r_t.f1);
    if (!PyTuple_Check(cpy_r_arg) || PyTuple_Size(cpy_r_arg) != 4) {
        CPy_TypeError("tuple[int, float, bool, None]", cpy_r_arg); cpy_r_r1 = tuple_undefined_T4IFCC;
    } else {
        PyObject *__tmp7 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        if (likely(PyLong_Check(__tmp7)))
            cpy_r_r1.f0 = CPyTagged_BorrowFromObject(__tmp7);
        else {
            goto __LL5;
        }
        PyObject *__tmp8 = PyTuple_GET_ITEM(cpy_r_arg, 1);
        cpy_r_r1.f1 = PyFloat_AsDouble(__tmp8);
        if (cpy_r_r1.f1 == -1.0 && PyErr_Occurred()) {
            goto __LL5;
        }
        PyObject *__tmp9 = PyTuple_GET_ITEM(cpy_r_arg, 2);
        if (unlikely(!PyBool_Check(__tmp9))) {
            goto __LL5;
        } else
            cpy_r_r1.f2 = __tmp9 == Py_True;
        PyObject *__tmp10 = PyTuple_GET_ITEM(cpy_r_arg, 3);
        if (unlikely(__tmp10 != Py_None)) {
            goto __LL5;
        } else
            cpy_r_r1.f3 = 1;
        CPyTagged_INCREF(cpy_r_r1.f0);
        goto __LL6;
__LL5: ;
        if (!PyErr_Occurred()) {
            CPy_TypeError("tuple[int, float, bool, None]", cpy_r_arg); 
        }
        cpy_r_r1 = tuple_undefined_T4IFCC;
    }
__LL6: ;
    if (unlikely(cpy_r_r1.f0 == CPY_INT_TAG)) {
        CPy_AddTraceback("temp/unbox.py", "f", 8, CPyStatic_globals);
        goto CPyL6;
    }
CPyL2: ;
    cpy_r_t2 = cpy_r_r1;
    CPyTagged_DECREF(cpy_r_t2.f0);
    if (!PyTuple_Check(cpy_r_arg) || PyTuple_Size(cpy_r_arg) != 2) {
        CPy_TypeError("tuple[int64, int32]", cpy_r_arg); cpy_r_r2 = tuple_undefined_T284;
    } else {
        PyObject *__tmp13 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        cpy_r_r2.f0 = CPyLong_AsInt64(__tmp13);
        if (cpy_r_r2.f0 == -113 && PyErr_Occurred()) {
            goto __LL11;
        }
        PyObject *__tmp14 = PyTuple_GET_ITEM(cpy_r_arg, 1);
        cpy_r_r2.f1 = CPyLong_AsInt32(__tmp14);
        if (cpy_r_r2.f1 == -113 && PyErr_Occurred()) {
            goto __LL11;
        }
        goto __LL12;
__LL11: ;
        if (!PyErr_Occurred()) {
            CPy_TypeError("tuple[int64, int32]", cpy_r_arg); 
        }
        cpy_r_r2 = tuple_undefined_T284;
    }
__LL12: ;
    cpy_r_r3 = cpy_r_r2.f0;
    cpy_r_r4 = cpy_r_r3 == -113;
    if (unlikely(cpy_r_r4)) goto CPyL4;
CPyL3: ;
    cpy_r_t3 = cpy_r_r2;
    if (!PyTuple_Check(cpy_r_arg) || PyTuple_Size(cpy_r_arg) != 1) {
        CPy_TypeError("tuple[unbox.A]", cpy_r_arg); cpy_r_r5 = tuple_undefined_T1O;
    } else {
        PyObject *__tmp17 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        if (likely(Py_TYPE(__tmp17) == CPyType_A))
            cpy_r_r5.f0 = __tmp17;
        else {
            goto __LL15;
        }
        CPy_INCREF(cpy_r_r5.f0);
        goto __LL16;
__LL15: ;
        CPy_TypeError("tuple[unbox.A]", cpy_r_arg); cpy_r_r5 = tuple_undefined_T1O;
    }
__LL16: ;
    if (unlikely(cpy_r_r5.f0 == NULL)) {
        CPy_AddTraceback("temp/unbox.py", "f", 10, CPyStatic_globals);
        goto CPyL6;
    } else
        goto CPyL5;
CPyL4: ;
    cpy_r_r6 = PyErr_Occurred();
    if (unlikely(cpy_r_r6 != NULL)) {
        CPy_AddTraceback("temp/unbox.py", "f", 9, CPyStatic_globals);
        goto CPyL6;
    } else
        goto CPyL3;
CPyL5: ;
    cpy_r_t4 = cpy_r_r5;
    CPy_DECREF(cpy_r_t4.f0);
    return 1;
CPyL6: ;
    cpy_r_r7 = 2;
    return cpy_r_r7;
}
Old code

char CPyDef_f(PyObject *cpy_r_arg) {
    tuple_T2OO cpy_r_r0;
    tuple_T2OO cpy_r_t;
    tuple_T4IFCC cpy_r_r1;
    tuple_T4IFCC cpy_r_t2;
    tuple_T284 cpy_r_r2;
    int64_t cpy_r_r3;
    char cpy_r_r4;
    tuple_T284 cpy_r_t3;
    tuple_T1O cpy_r_r5;
    PyObject *cpy_r_r6;
    tuple_T1O cpy_r_t4;
    char cpy_r_r7;
CPyL0: ;
    PyObject *__tmp1;
    if (unlikely(!(PyTuple_Check(cpy_r_arg) && PyTuple_GET_SIZE(cpy_r_arg) == 2))) {
        __tmp1 = NULL;
        goto __LL2;
    }
    if (likely(PyUnicode_Check(PyTuple_GET_ITEM(cpy_r_arg, 0))))
        __tmp1 = PyTuple_GET_ITEM(cpy_r_arg, 0);
    else {
        __tmp1 = NULL;
    }
    if (__tmp1 == NULL) goto __LL2;
    if (likely(PyUnicode_Check(PyTuple_GET_ITEM(cpy_r_arg, 1))))
        __tmp1 = PyTuple_GET_ITEM(cpy_r_arg, 1);
    else {
        __tmp1 = NULL;
    }
    if (__tmp1 == NULL) goto __LL2;
    __tmp1 = cpy_r_arg;
__LL2: ;
    if (unlikely(__tmp1 == NULL)) {
        CPy_TypeError("tuple[str, str]", cpy_r_arg); cpy_r_r0 = tuple_undefined_T2OO;
    } else {
        PyObject *__tmp3 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        CPy_INCREF(__tmp3);
        PyObject *__tmp4;
        if (likely(PyUnicode_Check(__tmp3)))
            __tmp4 = __tmp3;
        else {
            CPy_TypeError("str", __tmp3); 
            __tmp4 = NULL;
        }
        cpy_r_r0.f0 = __tmp4;
        PyObject *__tmp5 = PyTuple_GET_ITEM(cpy_r_arg, 1);
        CPy_INCREF(__tmp5);
        PyObject *__tmp6;
        if (likely(PyUnicode_Check(__tmp5)))
            __tmp6 = __tmp5;
        else {
            CPy_TypeError("str", __tmp5); 
            __tmp6 = NULL;
        }
        cpy_r_r0.f1 = __tmp6;
    }
    if (unlikely(cpy_r_r0.f0 == NULL)) {
        CPy_AddTraceback("temp/unbox.py", "f", 7, CPyStatic_globals);
        goto CPyL6;
    }
CPyL1: ;
    cpy_r_t = cpy_r_r0;
    CPy_DECREF(cpy_r_t.f0);
    CPy_DECREF(cpy_r_t.f1);
    PyObject *__tmp7;
    if (unlikely(!(PyTuple_Check(cpy_r_arg) && PyTuple_GET_SIZE(cpy_r_arg) == 4))) {
        __tmp7 = NULL;
        goto __LL8;
    }
    if (likely(PyLong_Check(PyTuple_GET_ITEM(cpy_r_arg, 0))))
        __tmp7 = PyTuple_GET_ITEM(cpy_r_arg, 0);
    else {
        __tmp7 = NULL;
    }
    if (__tmp7 == NULL) goto __LL8;
    if (likely(CPyFloat_Check(PyTuple_GET_ITEM(cpy_r_arg, 1))))
        __tmp7 = PyTuple_GET_ITEM(cpy_r_arg, 1);
    else {
        __tmp7 = NULL;
    }
    if (__tmp7 == NULL) goto __LL8;
    if (likely(PyBool_Check(PyTuple_GET_ITEM(cpy_r_arg, 2))))
        __tmp7 = PyTuple_GET_ITEM(cpy_r_arg, 2);
    else {
        __tmp7 = NULL;
    }
    if (__tmp7 == NULL) goto __LL8;
    if (likely(PyTuple_GET_ITEM(cpy_r_arg, 3) == Py_None))
        __tmp7 = PyTuple_GET_ITEM(cpy_r_arg, 3);
    else {
        __tmp7 = NULL;
    }
    if (__tmp7 == NULL) goto __LL8;
    __tmp7 = cpy_r_arg;
__LL8: ;
    if (unlikely(__tmp7 == NULL)) {
        CPy_TypeError("tuple[int, float, bool, None]", cpy_r_arg); cpy_r_r1 = tuple_undefined_T4IFCC;
    } else {
        PyObject *__tmp9 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        CPyTagged __tmp10;
        if (likely(PyLong_Check(__tmp9)))
            __tmp10 = CPyTagged_FromObject(__tmp9);
        else {
            CPy_TypeError("int", __tmp9); __tmp10 = CPY_INT_TAG;
        }
        cpy_r_r1.f0 = __tmp10;
        PyObject *__tmp11 = PyTuple_GET_ITEM(cpy_r_arg, 1);
        double __tmp12;
        __tmp12 = PyFloat_AsDouble(__tmp11);
        if (__tmp12 == -1.0 && PyErr_Occurred()) {
            __tmp12 = -113.0;
        }
        cpy_r_r1.f1 = __tmp12;
        PyObject *__tmp13 = PyTuple_GET_ITEM(cpy_r_arg, 2);
        char __tmp14;
        if (unlikely(!PyBool_Check(__tmp13))) {
            CPy_TypeError("bool", __tmp13); __tmp14 = 2;
        } else
            __tmp14 = __tmp13 == Py_True;
        cpy_r_r1.f2 = __tmp14;
        PyObject *__tmp15 = PyTuple_GET_ITEM(cpy_r_arg, 3);
        char __tmp16;
        if (unlikely(__tmp15 != Py_None)) {
            CPy_TypeError("None", __tmp15); __tmp16 = 2;
        } else
            __tmp16 = 1;
        cpy_r_r1.f3 = __tmp16;
    }
    if (unlikely(cpy_r_r1.f0 == CPY_INT_TAG)) {
        CPy_AddTraceback("temp/unbox.py", "f", 8, CPyStatic_globals);
        goto CPyL6;
    }
CPyL2: ;
    cpy_r_t2 = cpy_r_r1;
    CPyTagged_DECREF(cpy_r_t2.f0);
    PyObject *__tmp17;
    if (unlikely(!(PyTuple_Check(cpy_r_arg) && PyTuple_GET_SIZE(cpy_r_arg) == 2))) {
        __tmp17 = NULL;
        goto __LL18;
    }
    if (likely(PyLong_Check(PyTuple_GET_ITEM(cpy_r_arg, 0))))
        __tmp17 = PyTuple_GET_ITEM(cpy_r_arg, 0);
    else {
        __tmp17 = NULL;
    }
    if (__tmp17 == NULL) goto __LL18;
    if (likely(PyLong_Check(PyTuple_GET_ITEM(cpy_r_arg, 1))))
        __tmp17 = PyTuple_GET_ITEM(cpy_r_arg, 1);
    else {
        __tmp17 = NULL;
    }
    if (__tmp17 == NULL) goto __LL18;
    __tmp17 = cpy_r_arg;
__LL18: ;
    if (unlikely(__tmp17 == NULL)) {
        CPy_TypeError("tuple[int64, int32]", cpy_r_arg); cpy_r_r2 = tuple_undefined_T284;
    } else {
        PyObject *__tmp19 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        int64_t __tmp20;
        __tmp20 = CPyLong_AsInt64(__tmp19);
        cpy_r_r2.f0 = __tmp20;
        PyObject *__tmp21 = PyTuple_GET_ITEM(cpy_r_arg, 1);
        int32_t __tmp22;
        __tmp22 = CPyLong_AsInt32(__tmp21);
        cpy_r_r2.f1 = __tmp22;
    }
    cpy_r_r3 = cpy_r_r2.f0;
    cpy_r_r4 = cpy_r_r3 == -113;
    if (unlikely(cpy_r_r4)) goto CPyL4;
CPyL3: ;
    cpy_r_t3 = cpy_r_r2;
    PyObject *__tmp23;
    if (unlikely(!(PyTuple_Check(cpy_r_arg) && PyTuple_GET_SIZE(cpy_r_arg) == 1))) {
        __tmp23 = NULL;
        goto __LL24;
    }
    if (likely(Py_TYPE(PyTuple_GET_ITEM(cpy_r_arg, 0)) == CPyType_A))
        __tmp23 = PyTuple_GET_ITEM(cpy_r_arg, 0);
    else {
        __tmp23 = NULL;
    }
    if (__tmp23 == NULL) goto __LL24;
    __tmp23 = cpy_r_arg;
__LL24: ;
    if (unlikely(__tmp23 == NULL)) {
        CPy_TypeError("tuple[unbox.A]", cpy_r_arg); cpy_r_r5 = tuple_undefined_T1O;
    } else {
        PyObject *__tmp25 = PyTuple_GET_ITEM(cpy_r_arg, 0);
        CPy_INCREF(__tmp25);
        PyObject *__tmp26;
        if (likely(Py_TYPE(__tmp25) == CPyType_A))
            __tmp26 = __tmp25;
        else {
            CPy_TypeError("unbox.A", __tmp25); 
            __tmp26 = NULL;
        }
        cpy_r_r5.f0 = __tmp26;
    }
    if (unlikely(cpy_r_r5.f0 == NULL)) {
        CPy_AddTraceback("temp/unbox.py", "f", 10, CPyStatic_globals);
        goto CPyL6;
    } else
        goto CPyL5;
CPyL4: ;
    cpy_r_r6 = PyErr_Occurred();
    if (unlikely(cpy_r_r6 != NULL)) {
        CPy_AddTraceback("temp/unbox.py", "f", 9, CPyStatic_globals);
        goto CPyL6;
    } else
        goto CPyL3;
CPyL5: ;
    cpy_r_t4 = cpy_r_r5;
    CPy_DECREF(cpy_r_t4.f0);
    return 1;
CPyL6: ;
    cpy_r_r7 = 2;
    return cpy_r_r7;
}

ichard26 avatar Apr 09 '23 19:04 ichard26

Actually, I may as well double check the refcount management of this code while I'm fixing it.

ichard26 avatar Apr 13 '23 02:04 ichard26

I created a simple benchmark and it somewhat surprisingly seems about 20% slower with the new code (Python 3.11 and Apple M1 Max):

Good catch! Adding unlikely(...) and using the inline length function seems to have brought performance back to master levels. Unfortunately there's no significant speed up from this PR. If I left the refcounting alone, then a small speed up can be observed (~5%), but incrementing refcounts per item makes it really challenging to deref properly when the unbox fails (since what needs to deref-ed depends on which item failed). We would (probably) have to add custom decref code into the error handler for each item. I don't think that's worth it.

ichard26 avatar Jun 02 '23 22:06 ichard26

I have a (not yet complete) patch that merges the unbox error branches within the unbox op that depends on this PR. @JukkaL I'd appreciate if this could be looked at soon (especially since this seems merge conflict prone). Thanks!

--- build/__native.c	2023-07-20 12:47:55.132023572 -0400
+++ check-build/__native.c	2023-07-20 12:47:19.404279527 -0400
@@ -67,31 +67,22 @@
     if (likely(PyLong_Check(cpy_r_arg)))
         cpy_r_r0 = CPyTagged_FromObject(cpy_r_arg);
     else {
-        CPy_TypeError("int", cpy_r_arg); cpy_r_r0 = CPY_INT_TAG;
-    }
-    if (unlikely(cpy_r_r0 == CPY_INT_TAG)) {
-        CPy_AddTraceback("temp/unbox_size.py", "main", 5, CPyStatic_globals);
+        CPy_TypeErrorTraceback("temp/unbox_size.py", "main", 5, CPyStatic_globals, "int", cpy_r_arg);
         goto CPyL4;
     }
     cpy_r_integer = cpy_r_r0;
     CPyTagged_DECREF(cpy_r_integer);
     if (unlikely(!PyBool_Check(cpy_r_arg))) {
-        CPy_TypeError("bool", cpy_r_arg); cpy_r_r1 = 2;
+        CPy_TypeErrorTraceback("temp/unbox_size.py", "main", 6, CPyStatic_globals, "bool", cpy_r_arg);
+        goto CPyL4;
     } else
         cpy_r_r1 = cpy_r_arg == Py_True;
-    if (unlikely(cpy_r_r1 == 2)) {
-        CPy_AddTraceback("temp/unbox_size.py", "main", 6, CPyStatic_globals);
-        goto CPyL4;
-    }
     cpy_r_boolean = cpy_r_r1;
     if (unlikely(cpy_r_arg != Py_None)) {
-        CPy_TypeError("None", cpy_r_arg); cpy_r_r2 = 2;
+        CPy_TypeErrorTraceback("temp/unbox_size.py", "main", 7, CPyStatic_globals, "None", cpy_r_arg);
+        goto CPyL4;
     } else
         cpy_r_r2 = 1;
-    if (unlikely(cpy_r_r2 == 2)) {
-        CPy_AddTraceback("temp/unbox_size.py", "main", 7, CPyStatic_globals);
-        goto CPyL4;
-    }
     cpy_r_none = cpy_r_r2;
     return 1;
 CPyL4: ;

ichard26 avatar Jul 20 '23 16:07 ichard26

@JukkaL, been a while since I've last been around. I hope you're well. Could you take a look at this?

ichard26 avatar Nov 03 '23 21:11 ichard26

Sorry for the slow response! It looks like the microbenchmark is still about 5% slower with this change (Applex M1 Max). I wonder if we can have something that is faster without too much extra complexity.

JukkaL avatar Feb 28 '24 17:02 JukkaL