Add Monitoring Metrics to NebulaGraph
The idea was provided by @AntiTopQuark, thank you.
The description below is in English; if you wish to read it in Chinese, please scroll to the end of this issue.
Requirement Description
In database systems, statistical metrics and monitoring functions are crucial for troubleshooting, especially when addressing performance issues. This requirement aims to add monitoring metrics to NebulaGraph, specifically to count the number of executions and measure the execution time of the Limit operator.
Implementation Approach
1. Learn from the Implementation of Other Metrics
First, consult NebulaGraph's official documentation (monitoring metrics) to find existing metrics similar to the ones required here, such as num_sort_executors; their implementation serves as a reference for the new metrics.
2. Add New Code for Feature Implementation
Based on the reference information above, we can follow these steps for the code implementation of the new metric:
2.1 Define and Declare Metrics
First, declare the new metrics in src/graph/stats/GraphStats.h, then define and register them in src/graph/stats/GraphStats.cpp, as shown below:
# git diff src/graph/stats/GraphStats.h src/graph/stats/GraphStats.cpp
diff --git a/src/graph/stats/GraphStats.cpp b/src/graph/stats/GraphStats.cpp
index f03cd4688..1d7c66806 100644
--- a/src/graph/stats/GraphStats.cpp
+++ b/src/graph/stats/GraphStats.cpp
@@ -32,6 +32,8 @@ stats::CounterId kOptimizerLatencyUs;
stats::CounterId kNumAggregateExecutors;
stats::CounterId kNumSortExecutors;
+stats::CounterId kNumLimitExecutors;
+stats::CounterId kLimitExecutorsLatencyUs;
stats::CounterId kNumIndexScanExecutors;
stats::CounterId kNumOpenedSessions;
@@ -65,7 +67,9 @@ void initGraphStats() {
kNumSortExecutors = stats::StatsManager::registerStats("num_sort_executors", "rate, sum");
kNumIndexScanExecutors =
stats::StatsManager::registerStats("num_indexscan_executors", "rate, sum");
-
+ kNumLimitExecutors = stats::StatsManager::registerStats("num_Limit_executors", "rate, sum");
+ kLimitExecutorsLatencyUs = stats::StatsManager::registerHisto(
+ "limit_executors_latency_us", 1000, 0, 2000, "avg, p75, p95, p99, p999");
kNumOpenedSessions = stats::StatsManager::registerStats("num_opened_sessions", "rate, sum");
kNumAuthFailedSessions =
stats::StatsManager::registerStats("num_auth_failed_sessions", "rate, sum");
diff --git a/src/graph/stats/GraphStats.h b/src/graph/stats/GraphStats.h
index a1facbbf5..e91fd355c 100644
--- a/src/graph/stats/GraphStats.h
+++ b/src/graph/stats/GraphStats.h
@@ -33,6 +33,8 @@ extern stats::CounterId kOptimizerLatencyUs;
// Executor
extern stats::CounterId kNumAggregateExecutors;
extern stats::CounterId kNumSortExecutors;
+extern stats::CounterId kNumLimitExecutors;
+extern stats::CounterId kLimitExecutorsLatencyUs;
extern stats::CounterId kNumIndexScanExecutors;
// Server client traffic
2.2 Metric Logic Implementation
Next, implement the metric logic in Executor.cpp and LimitExecutor.cpp: increment the executor counter whenever a Limit executor is created, and record how long each execution of the Limit operator takes:
# git diff src/graph/executor/query/LimitExecutor.cpp src/graph/executor/Executor.cpp
error: cannot run less: No such file or directory
diff --git a/src/graph/executor/Executor.cpp b/src/graph/executor/Executor.cpp
index 5450722f0..b6ae9f242 100644
--- a/src/graph/executor/Executor.cpp
+++ b/src/graph/executor/Executor.cpp
@@ -209,6 +209,11 @@ Executor *Executor::makeExecutor(QueryContext *qctx, const PlanNode *node) {
return pool->makeAndAdd<FulltextIndexScanExecutor>(node, qctx);
}
case PlanNode::Kind::kLimit: {
+ stats::StatsManager::addValue(kNumLimitExecutors);
+ if (FLAGS_enable_space_level_metrics && spaceName != "") {
+ stats::StatsManager::addValue(
+ stats::StatsManager::counterWithLabels(kNumLimitExecutors, {{"space", spaceName}}));
+ }
return pool->makeAndAdd<LimitExecutor>(node, qctx);
}
case PlanNode::Kind::kSample: {
diff --git a/src/graph/executor/query/LimitExecutor.cpp b/src/graph/executor/query/LimitExecutor.cpp
index 236d8fe3c..7f79fcdaf 100644
--- a/src/graph/executor/query/LimitExecutor.cpp
+++ b/src/graph/executor/query/LimitExecutor.cpp
@@ -5,11 +5,13 @@
#include "graph/executor/query/LimitExecutor.h"
#include "graph/planner/plan/Query.h"
-
+#include "graph/stats/GraphStats.h"
namespace nebula {
namespace graph {
folly::Future<Status> LimitExecutor::execute() {
+ auto start_ts = std::chrono::steady_clock::now();
+ auto &spaceName = qctx()->rctx() ? qctx()->rctx()->session()->spaceName() : "";
SCOPED_TIMER(&execTime_);
auto* limit = asNode<Limit>(node());
@@ -31,6 +33,12 @@ folly::Future<Status> LimitExecutor::execute() {
builder.value(result.valuePtr());
iter->select(offset, count);
builder.iter(std::move(result).iter());
+ auto diff = std::chrono::steady_clock::now() - start_ts;
+ stats::StatsManager::addValue(kLimitExecutorsLatencyUs, std::chrono::duration_cast<std::chrono::milliseconds>(diff).count());
+ if (FLAGS_enable_space_level_metrics && spaceName != "") {
+ stats::StatsManager::addValue(
+ stats::StatsManager::histoWithLabels(kLimitExecutorsLatencyUs, {{"space", spaceName}}));
+ }
return finish(builder.build());
} else {
DataSet ds;
@@ -42,6 +50,12 @@ folly::Future<Status> LimitExecutor::execute() {
ds.rows.emplace_back(std::move(row));
}
}
+ auto diff = std::chrono::steady_clock::now() - start_ts;
+ stats::StatsManager::addValue(kLimitExecutorsLatencyUs, std::chrono::duration_cast<std::chrono::milliseconds>(diff).count());
+ if (FLAGS_enable_space_level_metrics && spaceName != "") {
+ stats::StatsManager::addValue(
+ stats::StatsManager::histoWithLabels(kLimitExecutorsLatencyUs, {{"space", spaceName}}));
+ }
return finish(builder.value(Value(std::move(ds))).iter(Iterator::Kind::kProp).build());
}
}
3. Verify Results
Finally, using the method provided by NebulaGraph's official documentation for querying metrics, we can retrieve the results of the newly implemented metric. By executing several queries containing the Limit operator, we can further verify the accuracy of the metric.
中文版描述
为 NebulaGraph 新增监控统计项
需求描述
在数据库系统中,系统的统计项和监控功能对于排查问题至关重要,尤其是在解决性能问题时。本次需求旨在为 NebulaGraph 新增监控统计项,具体为统计 Limit 算子的执行次数及其执行时间。
实现思路
1. 学习和借鉴其他统计项的实现
首先,通过查阅 NebulaGraph 的官方文档(监控统计项),我们可以找到与我们需求相似的统计项实现,例如 num_sort_executors,作为我们实现新统计项的参考。
2. 新增代码实现功能
基于上述参考信息,我们可以按照以下步骤进行新统计项的代码实现:
2.1 定义和声明统计项
首先,在 src/graph/stats/GraphStats.h 和 src/graph/stats/GraphStats.cpp 文件中定义并声明新的统计项,如下所示:
# git diff src/graph/stats/GraphStats.h src/graph/stats/GraphStats.cpp
diff --git a/src/graph/stats/GraphStats.cpp b/src/graph/stats/GraphStats.cpp
index f03cd4688..1d7c66806 100644
--- a/src/graph/stats/GraphStats.cpp
+++ b/src/graph/stats/GraphStats.cpp
@@ -32,6 +32,8 @@ stats::CounterId kOptimizerLatencyUs;
stats::CounterId kNumAggregateExecutors;
stats::CounterId kNumSortExecutors;
+stats::CounterId kNumLimitExecutors;
+stats::CounterId kLimitExecutorsLatencyUs;
stats::CounterId kNumIndexScanExecutors;
stats::CounterId kNumOpenedSessions;
@@ -65,7 +67,9 @@ void initGraphStats() {
kNumSortExecutors = stats::StatsManager::registerStats("num_sort_executors", "rate, sum");
kNumIndexScanExecutors =
stats::StatsManager::registerStats("num_indexscan_executors", "rate, sum");
-
+ kNumLimitExecutors = stats::StatsManager::registerStats("num_Limit_executors", "rate, sum");
+ kLimitExecutorsLatencyUs = stats::StatsManager::registerHisto(
+ "limit_executors_latency_us", 1000, 0, 2000, "avg, p75, p95, p99, p999");
kNumOpenedSessions = stats::StatsManager::registerStats("num_opened_sessions", "rate, sum");
kNumAuthFailedSessions =
stats::StatsManager::registerStats("num_auth_failed_sessions", "rate, sum");
diff --git a/src/graph/stats/GraphStats.h b/src/graph/stats/GraphStats.h
index a1facbbf5..e91fd355c 100644
--- a/src/graph/stats/GraphStats.h
+++ b/src/graph/stats/GraphStats.h
@@ -33,6 +33,8 @@ extern stats::CounterId kOptimizerLatencyUs;
// Executor
extern stats::CounterId kNumAggregateExecutors;
extern stats::CounterId kNumSortExecutors;
+extern stats::CounterId kNumLimitExecutors;
+extern stats::CounterId kLimitExecutorsLatencyUs;
extern stats::CounterId kNumIndexScanExecutors;
// Server client traffic
2.2 统计项逻辑实现
接着,在 LimitExecutor.cpp 和 Executor.cpp 中实现统计逻辑,具体包括在执行 Limit 算子时增加执行次数的统计,并计算执行时间:
# git diff src/graph/executor/query/LimitExecutor.cpp src/graph/executor/Executor.cpp
error: cannot run less: No such file or directory
diff --git a/src/graph/executor/Executor.cpp b/src/graph/executor/Executor.cpp
index 5450722f0..b6ae9f242 100644
--- a/src/graph/executor/Executor.cpp
+++ b/src/graph/executor/Executor.cpp
@@ -209,6 +209,11 @@ Executor *Executor::makeExecutor(QueryContext *qctx, const PlanNode *node) {
return pool->makeAndAdd<FulltextIndexScanExecutor>(node, qctx);
}
case PlanNode::Kind::kLimit: {
+ stats::StatsManager::addValue(kNumLimitExecutors);
+ if (FLAGS_enable_space_level_metrics && spaceName != "") {
+ stats::StatsManager::addValue(
+ stats::StatsManager::counterWithLabels(kNumLimitExecutors, {{"space", spaceName}}));
+ }
return pool->makeAndAdd<LimitExecutor>(node, qctx);
}
case PlanNode::Kind::kSample: {
diff --git a/src/graph/executor/query/LimitExecutor.cpp b/src/graph/executor/query/LimitExecutor.cpp
index 236d8fe3c..7f79fcdaf 100644
--- a/src/graph/executor/query/LimitExecutor.cpp
+++ b/src/graph/executor/query/LimitExecutor.cpp
@@ -5,11 +5,13 @@
#include "graph/executor/query/LimitExecutor.h"
#include "graph/planner/plan/Query.h"
-
+#include "graph/stats/GraphStats.h"
namespace nebula {
namespace graph {
folly::Future<Status> LimitExecutor::execute() {
+ auto start_ts = std::chrono::steady_clock::now();
+ auto &spaceName = qctx()->rctx() ? qctx()->rctx()->session()->spaceName() : "";
SCOPED_TIMER(&execTime_);
auto* limit = asNode<Limit>(node());
@@ -31,6 +33,12 @@ folly::Future<Status> LimitExecutor::execute() {
builder.value(result.valuePtr());
iter->select(offset, count);
builder.iter(std::move(result).iter());
+ auto diff = std::chrono::steady_clock::now() - start_ts;
+ stats::StatsManager::addValue(kLimitExecutorsLatencyUs, std::chrono::duration_cast<std::chrono::milliseconds>(diff).count());
+ if (FLAGS_enable_space_level_metrics && spaceName != "") {
+ stats::StatsManager::addValue(
+ stats::StatsManager::histoWithLabels(kLimitExecutorsLatencyUs, {{"space", spaceName}}));
+ }
return finish(builder.build());
} else {
DataSet ds;
@@ -42,6 +50,12 @@ folly::Future<Status> LimitExecutor::execute() {
ds.rows.emplace_back(std::move(row));
}
}
+ auto diff = std::chrono::steady_clock::now() - start_ts;
+ stats::StatsManager::addValue(kLimitExecutorsLatencyUs, std::chrono::duration_cast<std::chrono::milliseconds>(diff).count());
+ if (FLAGS_enable_space_level_metrics && spaceName != "") {
+ stats::StatsManager::addValue(
+ stats::StatsManager::histoWithLabels(kLimitExecutorsLatencyUs, {{"space", spaceName}}));
+ }
return finish(builder.value(Value(std::move(ds))).iter(Iterator::Kind::kProp).build());
}
}
3. 验证结果
最后,按照 NebulaGraph 官方提供的统计项查询方法,我们可以获取新实现的统计项结果。通过执行几条包含 Limit 算子的查询语句,进一步验证统计项的准确性。