sonic-swss
sonic-swss copied to clipboard
Support ASIC/SDK health event
What I did
Support ASIC/SDK health event
- Initialization
- Fetch capabilities and expose to STATE_DB
- Register the event handler and categories for each severity when supported
- Handle suppress ASIC/SDK health event categories
- Handle ASIC/SDK health event reported by SAI redis in the callback context
- Decode it
- Log message
- Send event
- Eliminate old events of each severity according to users' configuration
Signed-off-by: Stephen Sun [email protected]
Why I did it
How I verified it
Unit test.
Details if related
@prabhataravind to review once the PR is ready
/azpw run
/AzurePipelines run
Azure Pipelines successfully started running 1 pipeline(s).
Many covered lines were reported as not covered. Retrying for now.
/azpw run
/AzurePipelines run
Azure Pipelines successfully started running 1 pipeline(s).
@kperumalbfn for viz
Looks like the coverage report is not accurate. Retriggered.
/azpw run
/AzurePipelines run
Azure Pipelines successfully started running 1 pipeline(s).
Looks like there is an issue in coverage report. Many covered lines were reported as uncovered.
(gdb) bt
#0 SwitchOrch::doCfgSuppressAsicSdkHealthEventTableTask (this=0x5555561431c0, consumer=...) at ../../orchagent/switchorch.cpp:948
#1 0x0000555555a0c691 in SwitchOrch::doTask (this=0x5555561431c0, consumer=...) at ../../orchagent/switchorch.cpp:1008
#2 0x000055555586d712 in Orch::doTask (this=0x5555561431c0) at ../../orchagent/orch.cpp:541
#3 0x000055555583d508 in switchorch_test::SwitchOrchTest_SwitchOrchTestSuppressCategories_Test::TestBody (this=<optimized out>) at switchorch_ut.cpp:158
#4 0x0000555555ceb1a7 in void testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void>(testing::Test*, void (testing::Test::*)(), char const*) ()
#5 0x0000555555ce143e in testing::Test::Run() ()
#6 0x0000555555ce1595 in testing::TestInfo::Run() ()
#7 0x0000555555ce1a29 in testing::TestSuite::Run() ()
#8 0x0000555555ce2072 in testing::internal::UnitTestImpl::RunAllTests() ()
#9 0x0000555555ceb717 in bool testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool>(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) ()
#10 0x0000555555ce1658 in testing::UnitTest::Run() ()
#11 0x00005555556d7050 in main ()
The build failures were caused by UT failures that I didn't see locally. Maybe they are related to the bookworm docker. Will fix it.
The committers listed above are authorized under a signed CLA.
- :white_check_mark: login: stephenxs / name: Stephen Sun (f70164a4673af9f1423487242272f3e6b6bd1f25, 0ab03d8a9fbb1d8b3f9ab0663b9a85e6c7b13e17, 884d1d4cf49d6279eaf41f045cd0fb0122d56a5c, 2bfb0d7a0bf21ce1c4d0fcb46dbcbe43ee436e37)
The build failures were caused by UT failures that I didn't see locally. Maybe they are related to the bookworm docker. Will fix it.
Fixed. It was caused by a failure to load the Lua script in the slave docker.
Hi @prsunny, many covered lines were identified as uncovered. Who can help to check this? Thanks.
orchagent/switchorch.cpp | 34.2% | 148-150,161-162,205-206,228,231,878-879,883,885,887-888,895-896,909,913,915,917-918,920,922-924,926,928,930-932,938,940,942-944,947,949-951,953-954,956,958,960-963,966,968-969,973,975,978,980,983,985,989,992,994,1014,1016,1073,1080-1086,1088,1090,1092,1094-1095,1097-1098,1101-1104,1106,1108,1110-1111,1113,1117,1120,1122,1124,1128-1129,1132-1134,1136,1138,1140,1142,1144,1259,1261-1262,1264
Thread 1 "tests" hit Breakpoint 2, SwitchOrch::doCfgSuppressAsicSdkHealthEventTableTask (this=0x55555a8ad3e0, consumer=...) at ../../orchagent/switchorch.cpp:915
915 SWSS_LOG_ENTER();
(gdb) n
917 auto &map = consumer.m_toSync;
(gdb)
918 auto it = map.begin();
(gdb)
920 while (it != map.end())
(gdb)
922 auto keyOpFieldsValues = it->second;
(gdb)
923 auto key = kfvKey(keyOpFieldsValues);
(gdb)
924 auto op = kfvOp(keyOpFieldsValues);
(gdb)
926 SWSS_LOG_INFO("KEY: %s, OP: %s", key.c_str(), op.c_str());
(gdb)
928 if (key.empty())
(gdb)
938 saiSeverity = switch_asic_sdk_health_event_severity_to_switch_attribute_map.at(key);
(gdb)
947 if (op == SET_COMMAND)
(gdb)
949 bool categoriesConfigured = false;
(gdb)
950 bool continueMainLoop = false;
(gdb)
951 for (const auto &cit : kfvFieldsValues(keyOpFieldsValues))
(gdb)
953 auto fieldName = fvField(cit);
(gdb)
954 auto fieldValue = fvValue(cit);
(gdb)
956 SWSS_LOG_INFO("FIELD: %s, VALUE: %s", fieldName.c_str(), fieldValue.c_str());
(gdb)
958 if (m_supportedAsicSdkHealthEventAttributes.find(saiSeverity) == m_supportedAsicSdkHealthEventAttributes.end())
(gdb)
966 if (fieldName == "categories")
(gdb)
968 registerAsicSdkHealthEventCategories(saiSeverity, key, fieldValue);
(gdb)
Thread 1 "tests" hit Breakpoint 1, SwitchOrch::registerAsicSdkHealthEventCategories (this=0x55555a8ad3e0, saiSeverity=SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY, severityString="warning",
suppressed_category_list="software,cpu_hw,invalid_category", isInitializing=false) at ../../orchagent/switchorch.cpp:878
878 auto &&categories = tokenize(suppressed_category_list, ',');
(gdb)
879 for (auto category : categories)
(gdb)
883 interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category));
(gdb)
879 for (auto category : categories)
(gdb)
883 interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category));
(gdb)
879 for (auto category : categories)
(gdb)
883 interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category));
(gdb)
Thread 1 "tests" hit Breakpoint 3, SwitchOrch::onSwitchAsicSdkHealthEvent (this=0x55555a8ad3e0, switch_id=141733920768, severity=SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL, timestamp=...,
category=SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, data=..., description=...) at ../../orchagent/switchorch.cpp:1080
1080 std::vector<swss::FieldValueTuple> values;
(gdb) n
1081 const string &severity_str = switch_asic_sdk_health_event_severity_reverse_map.at(severity);
(gdb)
1082 const string &category_str = switch_asic_sdk_health_event_category_reverse_map.at(category);
(gdb)
1083 string description_str;
(gdb)
1084 const std::time_t &t = (std::time_t)timestamp.tv_sec;
(gdb)
1085 stringstream time_ss;
(gdb)
1086 time_ss << std::put_time(std::localtime(&t), "%Y-%m-%d %H:%M:%S");
(gdb)
1088 switch (data.data_type)
(gdb)
1092 vector<uint8_t> description_with_terminator(description.list, description.list + description.count);
(gdb)
1094 description_with_terminator.push_back(0);
(gdb)
1095 description_str = string(reinterpret_cast<char*>(description_with_terminator.data()));
(gdb)
525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
(gdb)
1104 description_str.end()))
(gdb)
1103 }),
(gdb)
1104 description_str.end()))
(gdb)
1097 if (description_str.end() !=
(gdb)
1092 vector<uint8_t> description_with_terminator(description.list, description.list + description.count);
(gdb)
1117 { "sai_timestamp", time_ss.str() },
(gdb)
1120 { "description", description_str }};
(gdb)
1122 if (0 == gMyAsicName.size())
(gdb)
1128 SWSS_LOG_NOTICE("[%s] ASIC/SDK health event occurred at %s, asic %s, category %s: %s", severity_str.c_str(), time_ss.str().c_str(), gMyAsicName.c_str(), category_str.c_str(), description_str.c_str());
(gdb)
1129 params["asic_name"] = gMyAsicName;
(gdb)
525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
(gdb)
1132 values.emplace_back("severity", severity_str);
(gdb)
1133 values.emplace_back("category", category_str);
(gdb)
1134 values.emplace_back("description", description_str);
(gdb)
1136 m_asicSdkHealthEventTable->set(time_ss.str(),values);
(gdb)
525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
(gdb)
1136 m_asicSdkHealthEventTable->set(time_ss.str(),values);
(gdb)
525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
(gdb)
1138 event_publish(g_events_handle, "asic-sdk-health-event", &params);
(gdb)
525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
(gdb)
1140 if (severity == SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL)
(gdb)
1142 m_fatalEventCount++;
(gdb)
1120 { "description", description_str }};
(gdb)
1085 stringstream time_ss;
(gdb)
1083 string description_str;
(gdb)
1080 std::vector<swss::FieldValueTuple> values;
(gdb)
1144 }
(gdb)
/azp run
Azure Pipelines successfully started running 1 pipeline(s).
Hi @prsunny, many covered lines were identified as uncovered. Who can help to check this? Thanks.
orchagent/switchorch.cpp | 34.2% | 148-150,161-162,205-206,228,231,878-879,883,885,887-888,895-896,909,913,915,917-918,920,922-924,926,928,930-932,938,940,942-944,947,949-951,953-954,956,958,960-963,966,968-969,973,975,978,980,983,985,989,992,994,1014,1016,1073,1080-1086,1088,1090,1092,1094-1095,1097-1098,1101-1104,1106,1108,1110-1111,1113,1117,1120,1122,1124,1128-1129,1132-1134,1136,1138,1140,1142,1144,1259,1261-1262,1264 Thread 1 "tests" hit Breakpoint 2, SwitchOrch::doCfgSuppressAsicSdkHealthEventTableTask (this=0x55555a8ad3e0, consumer=...) at ../../orchagent/switchorch.cpp:915 915 SWSS_LOG_ENTER(); (gdb) n 917 auto &map = consumer.m_toSync; (gdb) 918 auto it = map.begin(); (gdb) 920 while (it != map.end()) (gdb) 922 auto keyOpFieldsValues = it->second; (gdb) 923 auto key = kfvKey(keyOpFieldsValues); (gdb) 924 auto op = kfvOp(keyOpFieldsValues); (gdb) 926 SWSS_LOG_INFO("KEY: %s, OP: %s", key.c_str(), op.c_str()); (gdb) 928 if (key.empty()) (gdb) 938 saiSeverity = switch_asic_sdk_health_event_severity_to_switch_attribute_map.at(key); (gdb) 947 if (op == SET_COMMAND) (gdb) 949 bool categoriesConfigured = false; (gdb) 950 bool continueMainLoop = false; (gdb) 951 for (const auto &cit : kfvFieldsValues(keyOpFieldsValues)) (gdb) 953 auto fieldName = fvField(cit); (gdb) 954 auto fieldValue = fvValue(cit); (gdb) 956 SWSS_LOG_INFO("FIELD: %s, VALUE: %s", fieldName.c_str(), fieldValue.c_str()); (gdb) 958 if (m_supportedAsicSdkHealthEventAttributes.find(saiSeverity) == m_supportedAsicSdkHealthEventAttributes.end()) (gdb) 966 if (fieldName == "categories") (gdb) 968 registerAsicSdkHealthEventCategories(saiSeverity, key, fieldValue); (gdb) Thread 1 "tests" hit Breakpoint 1, SwitchOrch::registerAsicSdkHealthEventCategories (this=0x55555a8ad3e0, saiSeverity=SAI_SWITCH_ATTR_REG_WARNING_SWITCH_ASIC_SDK_HEALTH_CATEGORY, severityString="warning", suppressed_category_list="software,cpu_hw,invalid_category", isInitializing=false) at ../../orchagent/switchorch.cpp:878 878 auto 
&&categories = tokenize(suppressed_category_list, ','); (gdb) 879 for (auto category : categories) (gdb) 883 interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category)); (gdb) 879 for (auto category : categories) (gdb) 883 interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category)); (gdb) 879 for (auto category : categories) (gdb) 883 interested_categories_set.erase(switch_asic_sdk_health_event_category_map.at(category)); (gdb) Thread 1 "tests" hit Breakpoint 3, SwitchOrch::onSwitchAsicSdkHealthEvent (this=0x55555a8ad3e0, switch_id=141733920768, severity=SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL, timestamp=..., category=SAI_SWITCH_ASIC_SDK_HEALTH_CATEGORY_FW, data=..., description=...) at ../../orchagent/switchorch.cpp:1080 1080 std::vector<swss::FieldValueTuple> values; (gdb) n 1081 const string &severity_str = switch_asic_sdk_health_event_severity_reverse_map.at(severity); (gdb) 1082 const string &category_str = switch_asic_sdk_health_event_category_reverse_map.at(category); (gdb) 1083 string description_str; (gdb) 1084 const std::time_t &t = (std::time_t)timestamp.tv_sec; (gdb) 1085 stringstream time_ss; (gdb) 1086 time_ss << std::put_time(std::localtime(&t), "%Y-%m-%d %H:%M:%S"); (gdb) 1088 switch (data.data_type) (gdb) 1092 vector<uint8_t> description_with_terminator(description.list, description.list + description.count); (gdb) 1094 description_with_terminator.push_back(0); (gdb) 1095 description_str = string(reinterpret_cast<char*>(description_with_terminator.data())); (gdb) 525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc()) (gdb) 1104 description_str.end())) (gdb) 1103 }), (gdb) 1104 description_str.end())) (gdb) 1097 if (description_str.end() != (gdb) 1092 vector<uint8_t> description_with_terminator(description.list, description.list + description.count); (gdb) 1117 { "sai_timestamp", time_ss.str() }, (gdb) 1120 { "description", description_str }}; (gdb) 1122 if (0 == gMyAsicName.size()) 
(gdb) 1128 SWSS_LOG_NOTICE("[%s] ASIC/SDK health event occurred at %s, asic %s, category %s: %s", severity_str.c_str(), time_ss.str().c_str(), gMyAsicName.c_str(), category_str.c_str(), description_str.c_str()); (gdb) 1129 params["asic_name"] = gMyAsicName; (gdb) 525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc()) (gdb) 1132 values.emplace_back("severity", severity_str); (gdb) 1133 values.emplace_back("category", category_str); (gdb) 1134 values.emplace_back("description", description_str); (gdb) 1136 m_asicSdkHealthEventTable->set(time_ss.str(),values); (gdb) 525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc()) (gdb) 1136 m_asicSdkHealthEventTable->set(time_ss.str(),values); (gdb) 525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc()) (gdb) 1138 event_publish(g_events_handle, "asic-sdk-health-event", &params); (gdb) 525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc()) (gdb) 1140 if (severity == SAI_SWITCH_ASIC_SDK_HEALTH_SEVERITY_FATAL) (gdb) 1142 m_fatalEventCount++; (gdb) 1120 { "description", description_str }}; (gdb) 1085 stringstream time_ss; (gdb) 1083 string description_str; (gdb) 1080 std::vector<swss::FieldValueTuple> values; (gdb) 1144 } (gdb)
I see coverage works for other PRs. Let's check the latest result.
/apzw run
/azp run
Azure Pipelines successfully started running 1 pipeline(s).