Vulkan-ValidationLayers icon indicating copy to clipboard operation
Vulkan-ValidationLayers copied to clipboard

Crash in spirv::EntryPoint::GetAccessibleIds

Open Axel-Reactor opened this issue 1 year ago • 12 comments

Environment:

  • OS: Windows 11
  • GPU and driver version: NVIDIA 560
  • SDK or header version if building from repo: 1.3.290
  • Options enabled (synchronization, best practices, etc.): Default options

Describe the Issue: I'm hitting a crash with a compute shader in spirv::EntryPoint::GetAccessibleIds

[vcruntime140.dll] _CxxThrowException 0x00007ff96eeb51d0
[VkLayer_khronos_validation.dll] robin_hood::detail::Table<1,80,std::basic_string<char,std::char_traits<char>,std::allocator<char> >,enum VkValidationFeatureDisableEXT,robin_hood::hash<std::basic_string<char,std::char_traits<char>,std::allocator<char> >,void>,std::equal_to<std::basic_string<char,std::char_traits<char>,std::allocator<char> > > >::throwOverflowError() 0x00007ff8ee51b54f
[Inlined] [VkLayer_khronos_validation.dll] robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >::insert_move(robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >::DataNode<robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >,1> &&) 0x00007ff8ee54f0f3
[VkLayer_khronos_validation.dll] robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >::rehashPowerOfTwo(unsigned long long,bool) 0x00007ff8ee54f0ee
[VkLayer_khronos_validation.dll] robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >::insertKeyPrepareEmptySpot<unsigned int const &>(const unsigned int &) 0x00007ff8ee54dcf4
[Inlined] [VkLayer_khronos_validation.dll] robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >::emplace(const unsigned int &) 0x00007ff8ee9851d5
[Inlined] [VkLayer_khronos_validation.dll] robin_hood::detail::Table<1,80,unsigned int,void,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> >::insert(const unsigned int &) 0x00007ff8ee9851c2
[VkLayer_khronos_validation.dll] spirv::EntryPoint::GetAccessibleIds(const spirv::Module &,spirv::EntryPoint &) shader_module.cpp:471
[VkLayer_khronos_validation.dll] spirv::EntryPoint::EntryPoint(const spirv::Module &,const spirv::Instruction &,const robin_hood::detail::Table<1,80,unsigned int,std::vector<std::shared_ptr<spirv::ImageAccess const >,std::allocator<std::shared_ptr<spirv::ImageAccess const > > >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,const robin_hood::detail::Table<1,80,unsigned int,std::vector<spirv::Instruction const *,std::allocator<spirv::Instruction const *> >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,const robin_hood::detail::Table<1,80,unsigned int,unsigned int,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,const robin_hood::detail::Table<1,80,unsigned int,spirv::Instruction const *,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &) shader_module.cpp:769
[Inlined] [VkLayer_khronos_validation.dll] std::_Construct_in_place(spirv::EntryPoint &,const spirv::Module &,const spirv::Instruction &,robin_hood::detail::Table<1,80,unsigned int,std::vector<std::shared_ptr<spirv::ImageAccess const >,std::allocator<std::shared_ptr<spirv::ImageAccess const > > >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,std::vector<spirv::Instruction const *,std::allocator<spirv::Instruction const *> >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,unsigned int,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,spirv::Instruction const *,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &) 0x00007ff8ee988c77
[Inlined] [VkLayer_khronos_validation.dll] std::_Ref_count_obj2<spirv::EntryPoint>::{ctor}(const spirv::Module &,const spirv::Instruction &,robin_hood::detail::Table<1,80,unsigned int,std::vector<std::shared_ptr<spirv::ImageAccess const >,std::allocator<std::shared_ptr<spirv::ImageAccess const > > >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,std::vector<spirv::Instruction const *,std::allocator<spirv::Instruction const *> >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,unsigned int,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,spirv::Instruction const *,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &) 0x00007ff8ee988c41
[Inlined] [VkLayer_khronos_validation.dll] std::make_shared(const spirv::Module &,const spirv::Instruction &,robin_hood::detail::Table<1,80,unsigned int,std::vector<std::shared_ptr<spirv::ImageAccess const >,std::allocator<std::shared_ptr<spirv::ImageAccess const > > >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,std::vector<spirv::Instruction const *,std::allocator<spirv::Instruction const *> >,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,unsigned int,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &,robin_hood::detail::Table<1,80,unsigned int,spirv::Instruction const *,robin_hood::hash<unsigned int,void>,std::equal_to<unsigned int> > &) 0x00007ff8ee988c2c
[VkLayer_khronos_validation.dll] spirv::Module::StaticData::StaticData(const spirv::Module &,spirv::StatelessData *) shader_module.cpp:1204
[VkLayer_khronos_validation.dll] spirv::Module::Module(unsigned long long,const unsigned int *,spirv::StatelessData *) shader_module.h:662
[Inlined] [VkLayer_khronos_validation.dll] std::_Construct_in_place(spirv::Module &,const unsigned long long &,const unsigned int *const &,spirv::StatelessData *&&) 0x00007ff8ee9b4c6e
[Inlined] [VkLayer_khronos_validation.dll] std::_Ref_count_obj2<spirv::Module>::{ctor}(const unsigned long long &,const unsigned int *const &,spirv::StatelessData *&&) 0x00007ff8ee9b4c5e
[Inlined] [VkLayer_khronos_validation.dll] std::make_shared(const unsigned long long &,const unsigned int *const &,spirv::StatelessData *&&) 0x00007ff8ee9b4c46
[VkLayer_khronos_validation.dll] ValidationStateTracker::PreCallRecordCreateShaderModule(VkDevice_T *,const VkShaderModuleCreateInfo *,const VkAllocationCallbacks *,VkShaderModule_T **,const RecordObject &,chassis::CreateShaderModule &) state_tracker.cpp:4756
[VkLayer_khronos_validation.dll] CoreChecks::PreCallRecordCreateShaderModule(VkDevice_T *,const VkShaderModuleCreateInfo *,const VkAllocationCallbacks *,VkShaderModule_T **,const RecordObject &,chassis::CreateShaderModule &) cc_spirv.cpp:2470
[VkLayer_khronos_validation.dll] vulkan_layer_chassis::CreateShaderModule(VkDevice_T *,const VkShaderModuleCreateInfo *,const VkAllocationCallbacks *,VkShaderModule_T **) chassis.cpp:998

image

I know this is probably not terribly helpful without the SPIR-V, but I'm not at liberty to provide that. Maybe someone can take a guess. Happy to provide more info if needed.

One thing I noticed is that the hash map is resizing (rehashPowerOfTwo). Maybe that's a hint. Although this is only a uint32 set, I don't see how that even could get corrupted.

Axel-Reactor avatar Sep 05 '24 16:09 Axel-Reactor

I tried editing the shader and there doesn't seem to be a clear cause and effect. If I remove different pieces of code it passes. It seems like there is a certain threshold of imageStore/imageLoad calls that causes it to fail.

Axel-Reactor avatar Sep 05 '24 17:09 Axel-Reactor

I compiled the validation layers in debug, and it's calling throwOverflowError in robin hood here:

        // we don't retry, fail if overflowing
        // don't need to check max num elements
        if (0 == mMaxNumElementsAllowed && !try_increase_info()) {
            throwOverflowError();
        }

try_increase_info fails because mInfoInc is 2

    bool try_increase_info() {
        ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements
                                   << ", maxNumElementsAllowed="
                                   << calcMaxNumElementsAllowed(mMask + 1))
        if (mInfoInc <= 2) {
            // need to be > 2 so that shift works (otherwise undefined behavior!)
            return false;
        }

I have no idea, is this a bug in the hash map implementation?

Axel-Reactor avatar Sep 05 '24 18:09 Axel-Reactor

This actually goes away if I replace the robin hood set with STL in this case, which is extremely upsetting:

image

Axel-Reactor avatar Sep 05 '24 18:09 Axel-Reactor

thanks for looking into this

  1. can you confirm the SPIR-V is fully valid (spirv-val throws no errors)
  2. when it crashes, what is the size of result_ids?

spencer-lunarg avatar Sep 05 '24 18:09 spencer-lunarg

  1. spirv-val --scalar-block-layout --target-env vulkan1.3 C:\Users\...\shader.spv returns no errors
  2. I don't remember the exact count, but it was ~700 entries, so nothing crazy (not crazy w.r.t. hash map size, it's a pretty big shader)

Axel-Reactor avatar Sep 05 '24 18:09 Axel-Reactor

if possible could you try

         // Try to add to the output set
-        if (!result_ids.insert(worklist_id).second) {
-            continue;  // If we already saw this id, we don't want to walk it again.
+        if (result_ids.contains(worklist_id)) {
+            continue;
+        } else {
+            result_ids.insert(worklist_id);
         }

Without knowing any of the internals of how robin hood works, my only thought is that there may be an issue when we keep trying to insert duplicate entries.

spencer-lunarg avatar Sep 05 '24 18:09 spencer-lunarg

Same issue, still crashes on the insert: image State of the hash map: image

It consistently crashes with 689 entries

Axel-Reactor avatar Sep 05 '24 19:09 Axel-Reactor

I'm very confident I have thrown large shaders with over 700 entries for this. I assume you are on Windows 11?

The best thing I can do without the SPIR-V is make sure that a large enough shader does not crash at 689 entries.

spencer-lunarg avatar Sep 05 '24 19:09 spencer-lunarg

Yes, Windows 11, but I don't see how that's relevant? Let me check whether this happens with stripped SPIR-V; I can probably give that to you.

Axel-Reactor avatar Sep 05 '24 19:09 Axel-Reactor

Alright, here is the obfuscated SPIR-V; it crashes in the same way for me: raygen_rs-0x694a1322c182da48.zip

Axel-Reactor avatar Sep 05 '24 19:09 Axel-Reactor

So, quick update: I was able to reproduce the crash... I found that removing the 10,000 line OpSource fixed it, so I now think this is not an issue with the hashmap, but with how we might be storing the OpSource for such a large shader.

edit - actually just going spirv-dis and then right away going spirv-as fixes it ...

if I go spirv-dis --raw-id and then spirv-as --preserve-numeric-ids it will still crash as normal

spencer-lunarg avatar Sep 08 '24 17:09 spencer-lunarg

more update, wrote a test capturing the IDs

#include <array>
TEST_F(VkPositiveLayerTest, RobinHood) {
    vvl::unordered_set<uint32_t> result_ids;

    std::array<uint32_t, 704> ids = { /* dumped out */ };
    for (auto id : ids) {
        if (!result_ids.insert(id).second) {
        }
    }
}

and it works fine, then I tried going

-    vvl::unordered_set<uint32_t> worklist;
+    std::unordered_set<uint32_t> worklist;

and it worked... something is going on having 2 robin hood uint32_t hashes going together in the same scope, trying to figure out why this is the case

spencer-lunarg avatar Sep 08 '24 21:09 spencer-lunarg

Note that we are still running into this periodically. It seems pretty important to replace the robin hood hash map if it's buggy, that could affect all sorts of other areas with hard to find bugs?

Axel-Reactor avatar Mar 07 '25 02:03 Axel-Reactor

@Axel-Reactor thanks for bringing this up... we are getting the 309 SDK ready this week, I will raise this internally next week, I agree we need to find a fix (either replacing it or some patch or something)

spencer-lunarg avatar Mar 07 '25 02:03 spencer-lunarg

Per @artem-lunarg's request to look at this, here is the full number set that causes an overflow:

TEST_F(VkPositiveLayerTest, RobinHood) {
    vvl::unordered_set<uint32_t> result_ids;

    std::array<uint32_t, 779> ids = {
        5,     25645, 1020,  23505, 7795,  23919, 23495, 24125, 22942, 22762, 24121, 23233, 24170, 22697, 533,   13925, 23123,
        23401, 23102, 22785, 390,   7795,  23759, 24219, 23291, 11538, 24069, 24368, 22490, 22505, 24374, 24515, 24191, 23175,
        23767, 23559, 25671, 24084, 23898, 22531, 1177,  23069, 22595, 23398, 22961, 23593, 22773, 24494, 25688, 23083, 24217,
        22614, 24350, 23626, 24200, 22859, 22606, 23610, 1114,  22777, 23322, 22525, 1119,  1118,  24429, 22643, 1126,  25952,
        24518, 22774, 22508, 24257, 22849, 23336, 1152,  22903, 23005, 388,   24534, 23159, 23016, 25627, 658,   24414, 24453,
        23554, 23641, 23830, 23611, 22719, 23673, 11568, 24180, 1166,  26035, 24320, 24417, 22759, 22669, 23986, 25655, 22516,
        23969, 22886, 22978, 23690, 23394, 23137, 22514, 22624, 15210, 24462, 22869, 23917, 23011, 22676, 24211, 24102, 24194,
        22854, 24163, 23614, 23682, 22826, 23154, 24504, 23441, 22934, 22561, 24072, 23709, 22746, 22458, 24281, 23884, 23373,
        24112, 22783, 22671, 24490, 23590, 23920, 23656, 23430, 23105, 1079,  7795,  25800, 23858, 24377, 23035, 22807, 24354,
        23995, 25720, 23448, 23265, 24412, 25636, 23066, 24427, 22809, 22539, 1017,  23166, 23357, 948,   7795,  21425, 21515,
        21547, 1075,  25751, 25763, 25725, 21404, 21476, 21502, 21532, 25733, 21457, 21554, 21422, 947,   25743, 21524, 24315,
        22923, 25620, 23565, 23936, 22443, 23851, 23312, 24338, 23279, 409,   11846, 11826, 23923, 23161, 23196, 22916, 25929,
        23896, 22625, 1398,  1346,  30128, 30144, 30131, 25792, 21522, 23266, 23694, 23679, 24460, 23496, 24004, 23804, 1333,
        30086, 30080, 30090, 1320,  29981, 29964, 29987, 29979, 25729, 23393, 23647, 24396, 23803, 22756, 23653, 23041, 23290,
        22967, 23529, 22186, 22945, 22987, 23277, 21427, 24040, 22560, 23510, 23099, 23651, 22633, 22682, 23282, 24231, 24528,
        23511, 24188, 23263, 21495, 25631, 22713, 21557, 24011, 25628, 22754, 24434, 25677, 22659, 24409, 25711, 22709, 24106,
        22722, 23209, 23813, 1792,  22494, 23633, 5222,  23338, 24251, 21467, 23205, 23489, 23329, 23826, 1137,  1136,  25971,
        22454, 1142,  23872, 11830, 1192,  7795,  26547, 27508, 26504, 1367,  27656, 27321, 27139, 27269, 27157, 28055, 27340,
        26900, 27136, 26399, 26471, 27289, 27910, 910,   7795,  20332, 20349, 20343, 20353, 28092, 26358, 28067, 27700, 26459,
        27859, 28002, 30254, 28094, 20359, 27021, 27150, 26652, 26522, 30230, 26323, 26901, 20442, 26508, 1360,  30214, 30197,
        27524, 26771, 27064, 26563, 27768, 30243, 27265, 1346,  26743, 26801, 27718, 26985, 27361, 27539, 27172, 26927, 26632,
        26917, 26909, 27130, 27358, 26675, 27051, 27190, 26496, 26738, 20431, 26951, 26309, 27672, 27364, 28034, 27471, 27626,
        1359,  27993, 20322, 26806, 27036, 27038, 27726, 26537, 27169, 27271, 26561, 26730, 27624, 26862, 27601, 27421, 26493,
        27389, 1320,  27821, 27599, 26979, 20325, 26408, 26355, 27680, 26873, 20395, 26532, 27161, 27407, 27887, 26773, 27897,
        26923, 26526, 26456, 20301, 27114, 26435, 26813, 26944, 1183,  26120, 26174, 26208, 26197, 26132, 26328, 26311, 27499,
        30223, 26686, 26458, 26339, 26319, 27659, 26947, 20281, 26984, 27400, 27981, 27199, 26635, 27969, 27764, 26778, 26450,
        27444, 26237, 27846, 26433, 26438, 27246, 26453, 26626, 27184, 1380,  30424, 30430, 30414, 27087, 30236, 27120, 20368,
        26837, 27255, 27055, 26370, 27964, 27378, 27608, 27058, 27100, 27477, 26181, 26391, 27986, 26717, 27464, 26850, 27992,
        26299, 27379, 26853, 27753, 27684, 696,   695,   16311, 16305, 27134, 27669, 28037, 26624, 26162, 26484, 26839, 27693,
        27733, 27554, 26834, 28107, 28073, 27517, 26518, 409,   26447, 27760, 16307, 26767, 27835, 27185, 26534, 20429, 27077,
        26487, 26207, 27142, 26789, 27089, 26920, 27881, 26376, 27330, 27504, 1333,  20319, 30209, 28098, 20306, 26402, 26151,
        27584, 30220, 26634, 26292, 26101, 26764, 20384, 28019, 26436, 798,   797,   18341, 18655, 18329, 18595, 18267, 18618,
        18602, 18501, 18657, 18731, 18628, 18354, 18583, 18301, 18408, 18748, 18770, 18379, 18554, 18558, 18392, 18527, 18525,
        18670, 28075, 23099, 28051, 27145, 27125, 26441, 27409, 27118, 27344, 27032, 27596, 26793, 27124, 26426, 27808, 26529,
        18383, 27627, 27563, 26582, 18391, 18316, 26940, 27982, 27301, 1792,  27072, 27978, 5222,  27802, 26185, 16306, 18447,
        27052, 18297, 27153, 18581, 28022, 27759, 18283, 26160, 27155, 16314, 18266, 27397, 22480, 27849, 23582, 22940, 23111,
        21451, 24529, 23006, 26067, 959,   7795,  21685, 21651, 21662, 21743, 21768, 21730, 21747, 956,   21731, 21780, 21762,
        957,   21647, 21638, 21692, 21640, 21773, 21669, 21775, 21668, 27137, 18625, 967,   21789, 21804, 21814, 1358,  27830,
        24151, 27106, 1317,  1320,  1316,  22728, 20285, 27355, 1376,  30383, 30385, 30379, 30332, 30389, 30317, 30325, 30320,
        30309, 30339, 30315, 23617, 23589, 25626, 23301, 26878, 23309, 22457, 26325, 26974, 21405, 23094, 24408, 24526, 18635,
        26555, 23327, 27724, 27789, 26254, 27558, 23325, 26007, 23674, 23409, 28039, 20408, 21430, 24014, 25643, 1379,  30398,
        23971, 23863, 1190,  23358, 18575, 23778, 26003, 26934, 26543, 27309, 26382, 22444, 24049, 24521, 24118, 809,   7795,
        18926, 743,   18932, 18961, 18900, 16680, 806,   18903, 18876, 18794, 18874, 23878, 30341, 27533, 22830, 692,   691,
        26004, 27807, 26236, 1332,  26448, 22644, 18459, 22789, 11824, 11564, 23997, 26965, 27773, 21722, 1130,  27022, 25787,
        27017, 24078, 23139, 18475, 27734, 20379, 23810, 23780, 24433, 25652, 24177, 23229, 26084, 27885, 25797, 1203,  28234,
        28139, 28132, 1320,  28198, 28327, 28318, 28267, 28304, 28181, 28331, 1201,  28341, 28230, 28300};
    for (auto id : ids) {
        if (!result_ids.insert(id).second) {
        }
    }
}

spencer-lunarg avatar Mar 19 '25 17:03 spencer-lunarg