HIP icon indicating copy to clipboard operation
HIP copied to clipboard

`libamdhip64.so.4` causes C++ vector assertion error

Open riaqn opened this issue 2 years ago • 11 comments

When I try to run a simple TF program, it aborts with the assertion failure below; a gdb backtrace is included for reference. I am using TF 2.7.0 and ROCm 4.5.0.

/usr/include/c++/11.1.0/bits/stl_vector.h:1045: std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::operator[](std::vector<_Tp, _Alloc>::size_type) [with _Tp = hsa_signal_s; _Alloc = std::allocator<hsa_signal_s>; std::vector<_Tp, _Alloc>::reference = hsa_signal_s&; std::vector<_Tp, _Alloc>::size_type = long unsigned int]: Assertion '__n < this->size()' failed.

Thread 1 "python" received signal SIGABRT, Aborted.
0x00007ffff7a61d22 in raise () from /usr/lib/libc.so.6
(gdb) bt
#0  0x00007ffff7a61d22 in raise () from /usr/lib/libc.so.6
#1  0x00007ffff7a4b862 in abort () from /usr/lib/libc.so.6
#2  0x00007ffff5db9a1a in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#3  0x00007ffff5fc41e0 in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#4  0x00007ffff5fc680a in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#5  0x00007ffff5fc6cfa in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#6  0x00007ffff5f8a8d0 in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#7  0x00007ffff5f52618 in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#8  0x00007ffff5e3955b in ?? () from /opt/rocm/hip/lib/libamdhip64.so.4
#9  0x00007ffff5e5d5d2 in hipMemcpyHtoDAsync () from /opt/rocm/hip/lib/libamdhip64.so.4
#10 0x00007fffe8edc768 in stream_executor::gpu::GpuDriver::AsynchronousMemcpyH2D(stream_executor::gpu::GpuContext*, void*, void const*, unsigned long, ihipStream_t*) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#11 0x00007fffe8c59644 in stream_executor::Stream::ThenMemcpy(stream_executor::DeviceMemoryBase*, void const*, unsigned long) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#12 0x00007fffd2866751 in tensorflow::GPUUtil::CopyCPUTensorToGPU(tensorflow::Tensor const*, tensorflow::DeviceContext const*, tensorflow::Device*, tensorflow::Tensor*, std::function<void (tensorflow::Status const&)>, bool) () from /usr/lib/python3.9/site-packages/tensorflow/python/../libtensorflow_framework.so.2
#13 0x00007fffd2868a23 in tensorflow::GPUDeviceContext::CopyCPUTensorToDevice(tensorflow::Tensor const*, tensorflow::Device*, tensorflow::Tensor*, std::function<void (tensorflow::Status const&)>, bool) const () from /usr/lib/python3.9/site-packages/tensorflow/python/../libtensorflow_framework.so.2
#14 0x00007fffd2984c19 in ?? () from /usr/lib/python3.9/site-packages/tensorflow/python/../libtensorflow_framework.so.2
#15 0x00007fffd2987a04 in tensorflow::CopyTensor::ViaDMA(absl::lts_20210324::string_view, tensorflow::DeviceContext*, tensorflow::DeviceContext*, tensorflow::Device*, tensorflow::Device*, tensorflow::AllocatorAttributes, tensorflow::AllocatorAttributes, tensorflow::Tensor const*, tensorflow::Tensor*, int, std::function<void (tensorflow::Status const&)>, bool) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/../libtensorflow_framework.so.2
#16 0x00007fffe41c94b8 in tensorflow::TensorHandle::CopyToDevice(tensorflow::EagerContext const&, tensorflow::Device*, tensorflow::Tensor*) const ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#17 0x00007fffdd9db369 in tensorflow::CopyToDeviceNode::Run() () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#18 0x00007fffe46b57d0 in tensorflow::EagerExecutor::SyncExecute(tensorflow::EagerNode*) () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#19 0x00007fffdd9dc91c in tensorflow::EagerCopyToDevice(tensorflow::TensorHandle*, tensorflow::EagerContext*, tensorflow::EagerExecutor*, tensorflow::Device*, bool, tensorflow::TensorHandle**) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#20 0x00007fffdd9e1d2c in ?? () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#21 0x00007fffdd9e2aac in ?? () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#22 0x00007fffdd9eb51c in ?? () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#23 0x00007fffdd9ec6e0 in tensorflow::EagerExecute(tensorflow::EagerOperation*, tensorflow::TensorHandle**, int*) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#24 0x00007fffdd78fa7b in tensorflow::EagerOperation::Execute(absl::lts_20210324::Span<tensorflow::AbstractTensorHandle*>, int*) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#25 0x00007fffe41f30f9 in tensorflow::CustomDeviceOpHandler::Execute(tensorflow::ImmediateExecutionOperation*, tensorflow::ImmediateExecutionTensorHandle**, int*) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#26 0x00007fffdd33e126 in TFE_Execute () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#27 0x00007fffdd29b733 in tensorflow::EagerConst(TFE_Context*, TFE_TensorHandle*, char const*, TF_Status*) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#28 0x00007fffdd29d3fc in tensorflow::ConvertToEagerTensorUncached(TFE_Context*, _object*, tensorflow::DataType, char const*) ()
   from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#29 0x00007fffdd29e688 in EagerTensor_init () from /usr/lib/python3.9/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#30 0x00007ffff7d220cb in _PyObject_MakeTpCall () from /usr/lib/libpython3.9.so.1.0
#31 0x00007ffff7d1e13b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#32 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#33 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#34 0x00007ffff7d19376 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#35 0x00007ffff7d296eb in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#36 0x00007ffff7d19376 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
--Type <RET> for more, q to quit, c to continue without paging--
#37 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#38 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#39 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#40 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#41 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#42 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#43 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#44 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#45 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#46 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#47 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#48 0x00007ffff7d38c49 in PyObject_Call () from /usr/lib/libpython3.9.so.1.0
#49 0x00007ffff7d1bc6b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#50 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#51 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#52 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#53 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#54 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#55 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#56 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#57 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#58 0x00007ffff7d38c49 in PyObject_Call () from /usr/lib/libpython3.9.so.1.0
#59 0x00007ffff7d1bc6b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#60 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#61 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#62 0x00007ffff7d38c49 in PyObject_Call () from /usr/lib/libpython3.9.so.1.0
#63 0x00007ffff7d1bc6b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#64 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#65 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#66 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#67 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#68 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#69 0x00007ffff7d1bc6b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#70 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#71 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#72 0x00007ffff7d19376 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#73 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#74 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#75 0x00007ffff7d1db89 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#76 0x00007ffff7d296eb in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#77 0x00007ffff7d19376 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#78 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#79 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#80 0x00007ffff7d3517e in ?? () from /usr/lib/libpython3.9.so.1.0
#81 0x00007ffff7d220cb in _PyObject_MakeTpCall () from /usr/lib/libpython3.9.so.1.0
#82 0x00007ffff7d1e2a5 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#83 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#84 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#85 0x00007ffff7d3517e in ?? () from /usr/lib/libpython3.9.so.1.0
#86 0x00007ffff7d2254b in ?? () from /usr/lib/libpython3.9.so.1.0
--Type <RET> for more, q to quit, c to continue without paging--
#87 0x00007ffff7d38d02 in PyObject_Call () from /usr/lib/libpython3.9.so.1.0
#88 0x00007ffff7d1bc6b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#89 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#90 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#91 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#92 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#93 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#94 0x00007ffff7d38c49 in PyObject_Call () from /usr/lib/libpython3.9.so.1.0
#95 0x00007ffff7d1bc6b in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#96 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#97 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#98 0x00007ffff7d385d4 in ?? () from /usr/lib/libpython3.9.so.1.0
#99 0x00007ffff7d1a18f in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#100 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#101 0x00007ffff7d17c41 in _PyEval_EvalCodeWithName () from /usr/lib/libpython3.9.so.1.0
#102 0x00007ffff7dcf633 in PyEval_EvalCode () from /usr/lib/libpython3.9.so.1.0
#103 0x00007ffff7dd664c in ?? () from /usr/lib/libpython3.9.so.1.0
#104 0x00007ffff7d2a198 in ?? () from /usr/lib/libpython3.9.so.1.0
#105 0x00007ffff7d19376 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#106 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#107 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#108 0x00007ffff7d19376 in _PyEval_EvalFrameDefault () from /usr/lib/libpython3.9.so.1.0
#109 0x00007ffff7d17fd9 in ?? () from /usr/lib/libpython3.9.so.1.0
#110 0x00007ffff7d2990e in _PyFunction_Vectorcall () from /usr/lib/libpython3.9.so.1.0
#111 0x00007ffff7dfa3c8 in ?? () from /usr/lib/libpython3.9.so.1.0
#112 0x00007ffff7df0cb4 in Py_RunMain () from /usr/lib/libpython3.9.so.1.0
#113 0x00007ffff7dc1ab9 in Py_BytesMain () from /usr/lib/libpython3.9.so.1.0
#114 0x00007ffff7a4cb25 in __libc_start_main () from /usr/lib/libc.so.6
#115 0x000055555555504e in _start ()

riaqn avatar Nov 25 '21 13:11 riaqn

Thank you for reporting the issue; it will be fixed. Basically, for some reason your build enabled _GLIBCXX_ASSERTIONS, which is disabled by default in our environment.

gandryey avatar Nov 25 '21 16:11 gandryey

@gandryey Thank you for the quick reply - is there some quick fix I can try?

riaqn avatar Nov 25 '21 17:11 riaqn

Sure, if you rebuild the runtime. In hsaCopy() and copyBufferRect() in rocblit.cpp, there are calls to hsa_amd_memory_async_copy(). Replace the argument &wait_events[0] with "(wait_events.size() > 0) ? &wait_events[0] : nullptr". Basically, if wait_events.size() is 0 the list is ignored by the callee, but the bounds check in operator[] can still trigger an abort.

gandryey avatar Nov 25 '21 17:11 gandryey

Thank you. Is there some cmake argument I can pass to disable this out-of-bounds check (since that would be easier)? Currently my build looks like this:

build() {
  mkdir build && cd build
  cmake -Wno-dev \
  -S "$srcdir/$_dirhipamd" \
  -DHIP_COMMON_DIR="$srcdir/$_dirhip" \
  -DAMD_OPENCL_PATH="$srcdir/$_diropencl" \
  -DROCCLR_PATH="$srcdir/$_dirrocclr" \
  -DHIP_PLATFORM=amd \
  -DCMAKE_INSTALL_PREFIX=/opt/rocm/hip

  make
}

riaqn avatar Nov 25 '21 17:11 riaqn

Try removing -Wno-dev and adding an explicit release build with -DCMAKE_BUILD_TYPE=Release. However, I don't really know what exactly triggered _GLIBCXX_ASSERTIONS in your build. Maybe it is some global compiler setting on your system, because even for a debug build our cmake doesn't enable that define. LLVM's cmake may declare that define for debug purposes.

gandryey avatar Nov 25 '21 18:11 gandryey

@gandryey thank you - I think patching the source code would just be easier. I will post the patch when I get time to do it.

riaqn avatar Nov 25 '21 20:11 riaqn

Actually instead of that condition you can use just wait_events.data(). That will produce a bit more optimal code.

gandryey avatar Nov 25 '21 22:11 gandryey

@gandryey Thank you - I tried your first suggestion anyway and it works. Here is the patch. But can you elaborate on the new method?

--- ROCclr-rocm-4.5.0/device/rocm/rocblit.cpp	2021-11-25 22:38:12.837707706 +0100
+++ ROCclr-rocm-4.5.0/device/rocm/rocblit.cpp	2021-11-25 22:40:00.699796341 +0100
@@ -460,7 +460,7 @@
               std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
               active.handle);
       hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
-          &srcMem, &offset, &dim, agent, direction, wait_events.size(), &wait_events[0], active);
+          &srcMem, &offset, &dim, agent, direction, wait_events.size(), (wait_events.size() > 0) ? &wait_events[0] : nullptr, active);
       if (status != HSA_STATUS_SUCCESS) {
         gpu().Barriers().ResetCurrentSignal();
         LogPrintfError("DMA buffer failed with code %d", status);
@@ -484,7 +484,7 @@
           hsa_status_t status = hsa_amd_memory_async_copy(
               (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
               (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
-              size[0], wait_events.size(), &wait_events[0], active);
+              size[0], wait_events.size(), (wait_events.size() > 0) ? &wait_events[0] : nullptr, active);
           if (status != HSA_STATUS_SUCCESS) {
             gpu().Barriers().ResetCurrentSignal();
             LogPrintfError("DMA buffer failed with code %d", status);
@@ -671,7 +671,7 @@
           active.handle);
 
   status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
-      size[0], wait_events.size(), &wait_events[0], active);
+      size[0], wait_events.size(), (wait_events.size() > 0) ? &wait_events[0] : nullptr, active);
   if (status == HSA_STATUS_SUCCESS) {
     gpu().addSystemScope();
   } else {

riaqn avatar Nov 25 '21 22:11 riaqn

With wait_events.data() the generated CPU code is simpler and will be inlined.

Both hsa_amd_memory_async_copy() and hsa_amd_memory_async_copy_rect() ignore the dep_signals argument if num_dep_signals is 0. From the header documentation:

> @param[in] dep_signals List of signals that must be waited on before the copy
> operation starts. The copy will start after every signal has been observed with
> the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy
> operation to be issued in future as that can result in a deadlock. **If @p num_dep_signals is 0, this
> argument is ignored.**

Source: https://github.com/RadeonOpenCompute/ROCR-Runtime/blob/fc99cf8516ef4bfc6311471b717838604a673b73/src/inc/hsa_ext_amd.h#L1121

gandryey avatar Nov 25 '21 22:11 gandryey

Thank you - that's more elaboration than I expected. :smile: Here is the new patch:

--- ROCclr-rocm-4.5.0/device/rocm/rocblit.cpp	2021-11-25 22:38:12.837707706 +0100
+++ ROCclr-rocm-4.5.0/device/rocm/rocblit.cpp	2021-11-25 22:40:00.699796341 +0100
@@ -460,7 +460,7 @@
               std::this_thread::get_id(), (wait_events.size() != 0) ? wait_events[0].handle : 0,
               active.handle);
       hsa_status_t status = hsa_amd_memory_async_copy_rect(&dstMem, &offset,
-          &srcMem, &offset, &dim, agent, direction, wait_events.size(), &wait_events[0], active);
+          &srcMem, &offset, &dim, agent, direction, wait_events.size(), wait_events.data(), active);
       if (status != HSA_STATUS_SUCCESS) {
         gpu().Barriers().ResetCurrentSignal();
         LogPrintfError("DMA buffer failed with code %d", status);
@@ -484,7 +484,7 @@
           hsa_status_t status = hsa_amd_memory_async_copy(
               (reinterpret_cast<address>(dst) + dstOffset), dstAgent,
               (reinterpret_cast<const_address>(src) + srcOffset), srcAgent,
-              size[0], wait_events.size(), &wait_events[0], active);
+              size[0], wait_events.size(), wait_events.data(), active);
           if (status != HSA_STATUS_SUCCESS) {
             gpu().Barriers().ResetCurrentSignal();
             LogPrintfError("DMA buffer failed with code %d", status);
@@ -671,7 +671,7 @@
           active.handle);
 
   status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
-      size[0], wait_events.size(), &wait_events[0], active);
+      size[0], wait_events.size(), wait_events.data(), active);
   if (status == HSA_STATUS_SUCCESS) {
     gpu().addSystemScope();
   } else {

riaqn avatar Nov 25 '21 22:11 riaqn

Looks fine.

gandryey avatar Nov 30 '21 20:11 gandryey