split up large rule files into smaller files when exceeding 4MiB
Description
We have a lot of SLOs, and a large number of them are for a single system class, which results in a massive rules file for these specific SLOs. Unfortunately, this breaks in Kubernetes when our ConfigMap(s) exceed 4MiB and causes our custom SlothSLOGenerationFailure alert to fire, which is defined with the following expression:
expr: "sum(rate(kooper_controller_processed_event_duration_seconds_count{job=\"prometheus/sloth-kube-prometheus\",success=\"false\"}[30m])) > 0"
Ideally, we should be able to detect if a rule file is larger than 4MiB and, if so, split it up into multiple files (ConfigMaps).
Seems to be hardcoded in the prometheus-operator: https://github.com/prometheus-operator/prometheus-operator/blob/370a2ea18a48000e2ea4bc05acb093502915f5c9/pkg/operator/rules.go#L55-L59
https://github.com/prometheus-operator/prometheus-operator/blob/370a2ea18a48000e2ea4bc05acb093502915f5c9/pkg/operator/rules.go#L192-L196
This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 15 days.
bumping to remove stale label
Hey @cxdy! 😄 Are you using the K8s output? with or without the k8s controller?
Hey, yeah we're using the k8s controller and yes we're using k8s output from Sloth when generating.
The next version of Sloth will come with a new feature: the concept of "K8s transformer plugins". These give you the ability to customize how your k8s objects are formatted as unstructured.Unstructured k8s objects. The concept is similar to SLO and SLI plugins: a Go file that is a plugin receives the Sloth SLO result and returns the k8s objects.
This can be used for many purposes, like generating CRs that are not for prometheus operator, setting special annotations, customizing the namespace... or you could create plugins for this specific corner case.
The feature is already on main, and I've made a test for this use case in case you want to try it :)
I'm sorry, but I haven't documented it yet.
K8s transformer plugin
package plugin
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"github.com/slok/sloth/pkg/common/model"
k8sutils "github.com/slok/sloth/pkg/common/utils/k8s"
plugink8stransformv1 "github.com/slok/sloth/pkg/prometheus/plugin/k8stransform/v1"
)
// Plugin identity and the size budget used when splitting rule groups.
const (
	// PluginVersion is the plugin API version string declared by this plugin.
	// NOTE(review): presumably read by the Sloth plugin loader — confirm.
	PluginVersion = "prometheus/k8stransform/v1"

	// PluginID is the identifier of this plugin; it is the value passed to
	// the --k8s-transform-plugin-id flag to select it.
	PluginID = "sloth.dev/k8stransform/prom-operator-prometheus-rule-big-split/v1"

	// maxBytes is the upper bound, in bytes, on the summed approximate size
	// of the rule groups packed into a single generated object: 0.4 MiB
	// (419430.4 bytes). It is an untyped floating-point constant, so sizes
	// are converted to float64 before being compared against it.
	maxBytes = 0.4 * 1024 * 1024
)
// NewPlugin builds the K8s transformer plugin and hands it back as a
// plugink8stransformv1.Plugin. The plugin has no configuration, so the
// returned error is always nil.
func NewPlugin() (plugink8stransformv1.Plugin, error) {
	var impl plugin
	return impl, nil
}
// plugin is the K8s transformer plugin implementation returned by NewPlugin.
// It has no fields, so it carries no state between calls.
type plugin struct{}
// TransformK8sObjects turns the Prometheus rule groups of every SLO in
// sloResult into one or more PrometheusRule objects, splitting them so that
// the summed approximate size of the groups in each object stays within
// maxBytes.
//
// Non-empty rule groups are gathered per SLO in a fixed order (SLI error
// recording rules, metadata recording rules, alert rules, then extra rules)
// and packed sequentially: a new object is started whenever adding the next
// group would push the current one over maxBytes. Objects are named by
// TransformK8sObjectsSplitted using their index; when a single object is
// enough, its name is reset to kmeta.Name.
//
// An error is returned when a group cannot be converted/sized, or when one
// group on its own is larger than maxBytes.
func (p plugin) TransformK8sObjects(ctx context.Context, kmeta model.K8sMeta, sloResult model.PromSLOGroupResult) (*plugink8stransformv1.K8sObjects, error) {
	sized := []groupAndSize{}

	// collect converts and measures one rule group and queues it for packing.
	// Groups without rules are silently ignored.
	collect := func(rg model.PromRuleGroup) error {
		if len(rg.Rules) == 0 {
			return nil
		}

		gs, err := p.unstructuredGroupWithSize(rg)
		if err != nil {
			return err
		}

		sized = append(sized, *gs)
		return nil
	}

	for _, slo := range sloResult.SLOResults {
		rules := slo.PrometheusRules

		if err := collect(rules.SLIErrorRecRules); err != nil {
			return nil, err
		}
		if err := collect(rules.MetadataRecRules); err != nil {
			return nil, err
		}
		if err := collect(rules.AlertRules); err != nil {
			return nil, err
		}
		for _, extra := range rules.ExtraRules {
			if err := collect(extra); err != nil {
				return nil, err
			}
		}
	}

	// Pack the measured groups, in order, into as many objects as required.
	packed := []*unstructured.Unstructured{}
	batch := []any{}
	batchSize := 0

	for i := range sized {
		entry := sized[i]

		// A group that does not fit even in an empty object can't be placed anywhere.
		if float64(entry.size) > maxBytes {
			return nil, fmt.Errorf("single rule group exceeds maximum size")
		}

		// The group doesn't fit in the current batch: close the batch and start a new one.
		if float64(batchSize+entry.size) > maxBytes {
			packed = append(packed, p.TransformK8sObjectsSplitted(ctx, kmeta, len(packed), batch))
			batch, batchSize = []any{}, 0
		}

		batch = append(batch, entry.group)
		batchSize += entry.size
	}

	// Whatever is left over becomes the final object.
	if len(batch) != 0 {
		packed = append(packed, p.TransformK8sObjectsSplitted(ctx, kmeta, len(packed), batch))
	}

	// No split was needed, so drop the index suffix from the only object's name.
	if len(packed) == 1 {
		packed[0].SetName(kmeta.Name)
	}

	return &plugink8stransformv1.K8sObjects{Items: packed}, nil
}
// TransformK8sObjectsSplitted builds a single monitoring.coreos.com/v1
// PrometheusRule object whose spec.groups is ruleGroups.
//
// The namespace, labels and annotations are taken from kmeta. The name is
// kmeta.Name followed by "-" and index zero-padded to three digits
// (e.g. "<name>-000").
func (p plugin) TransformK8sObjectsSplitted(ctx context.Context, kmeta model.K8sMeta, index int, ruleGroups []any) *unstructured.Unstructured {
	spec := map[string]any{
		"groups": ruleGroups,
	}

	obj := new(unstructured.Unstructured)
	obj.Object = map[string]any{
		"spec": spec,
	}

	// Type information.
	obj.SetKind("PrometheusRule")
	obj.SetAPIVersion("monitoring.coreos.com/v1")

	// Object metadata.
	obj.SetName(fmt.Sprintf("%s-%03d", kmeta.Name, index))
	obj.SetNamespace(kmeta.Namespace)
	obj.SetAnnotations(kmeta.Annotations)
	obj.SetLabels(kmeta.Labels)

	return obj
}
// groupAndSize pairs a rule group in unstructured form with its approximate
// serialized size, so groups can be packed into objects by size.
type groupAndSize struct {
// group is the rule group as returned by
// k8sutils.PromRuleGroupToUnstructuredPromOperator.
group map[string]any
// size is the length of the group's YAML serialization.
size int
}
// unstructuredGroupWithSize converts rg into its unstructured form and
// measures it, returning both together.
//
// The size is the length of the group's YAML serialization, which serves as
// an approximation of how much the group contributes to the final object.
// It returns an error when rg has no rules or when the YAML serialization
// fails.
func (p plugin) unstructuredGroupWithSize(rg model.PromRuleGroup) (*groupAndSize, error) {
	// A group without rules is rejected rather than measured.
	if len(rg.Rules) == 0 {
		return nil, fmt.Errorf("empty rule group")
	}

	unstructuredGroup := k8sutils.PromRuleGroupToUnstructuredPromOperator(rg)

	// Marshal once; the length of the output is the approximate size.
	rendered, err := k8sutils.UnstructuredToYAMLString(unstructuredGroup)
	if err != nil {
		return nil, err
	}

	result := groupAndSize{
		group: unstructuredGroup,
		size:  len(rendered),
	}
	return &result, nil
}
Using the plugin
You can use it with:
`-p ./PATH/TO/PLUGIN --k8s-transform-plugin-id="sloth.dev/k8stransform/prom-operator-prometheus-rule-big-split/v1"`
Test
I ran a test with an SLO service spec, big_slo.yaml, that generates a k8s rules output, result.yaml, of 8MB split into multiple k8s objects, giving me files of about 500KB each:
$ yq -s '.metadata.name' ./result.yaml
$ du -h ./*
508K ./big-slo-service-000.yml
508K ./big-slo-service-001.yml
508K ./big-slo-service-002.yml
508K ./big-slo-service-003.yml
508K ./big-slo-service-004.yml
508K ./big-slo-service-005.yml
508K ./big-slo-service-006.yml
508K ./big-slo-service-007.yml
508K ./big-slo-service-008.yml
508K ./big-slo-service-009.yml
508K ./big-slo-service-010.yml
508K ./big-slo-service-011.yml
508K ./big-slo-service-012.yml
508K ./big-slo-service-013.yml
508K ./big-slo-service-014.yml
508K ./big-slo-service-015.yml
508K ./big-slo-service-016.yml
508K ./big-slo-service-017.yml
768K ./big_slo.yaml
8.3M ./result.yaml
If you could try this feature, that would be amazing 🙇. It works both with and without the k8s controller.
Thanks!