autotune
autotune copied to clipboard
Intermittently no recommendations are observed for a job even though data is available
Describe the bug No recommendations are observed for a "job" intermittently, even though data is available and recommendations for the same experiment were generated earlier.
How to reproduce it On NERC, create an experiment using the following JSON:
[{
"version": "v2.0",
"experiment_name": "monitor_gpu_ttm",
"cluster_name": "default",
"performance_profile": "resource-optimization-local-monitoring",
"mode": "monitor",
"target_cluster": "local",
"datasource": "prometheus-1",
"kubernetes_objects": [
{
"type": "statefulset",
"name": "training-ttm",
"namespace": "unpartitioned-namespace",
"containers": [
{
"container_image_name": "kruizehub/ttm:v1",
"container_name": "training-container"
}
]
}
],
"trial_settings": {
"measurement_duration": "15min"
},
"recommendation_settings": {
"threshold": "0.1"
}
}]
curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm [ { "cluster_name": "default", "experiment_type": "container", "kubernetes_objects": [ { "type": "job", "name": "training-ttm", "namespace": "unpartitioned-namespace", "containers": [ { "container_image_name": "kruizehub/ttm:v1", "container_name": "training-container", "recommendations": { "version": "1.0", "notifications": { "120001": { "type": "info", "message": "There is not enough data available to generate a recommendation.", "code": 120001 } }, "data": {} } } ] } ], "version": "v2.0", "experiment_name": "monitor_gpu_ttm" } ]
Expected behavior A few minutes earlier, recommendations for the same job were observed:
curl -X POST http://kruize-openshift-tuning.apps.nerc-ocp-test-2.nerc.mghpcc.org/generateRecommendations?experiment_name=monitor_gpu_ttm
[
{
"cluster_name": "default",
"experiment_type": "container",
"kubernetes_objects": [
{
"type": "job",
"name": "training-ttm",
"namespace": "unpartitioned-namespace",
"containers": [
{
"container_image_name": "kruizehub/ttm:v1",
"container_name": "training-container",
"recommendations": {
"version": "1.0",
"notifications": {
"111000": {
"type": "info",
"message": "Recommendations Are Available",
"code": 111000
}
},
"data": {
"2024-10-01T20:11:00.000Z": {
"notifications": {
"224001": {
"type": "error",
"message": "Amount field is missing in the Memory Section",
"code": 224001
},
"524002": {
"type": "critical",
"message": "Memory Limit Not Set",
"code": 524002
},
"524001": {
"type": "critical",
"message": "Memory Request Not Set",
"code": 524001
},
"223001": {
"type": "error",
"message": "Amount field is missing in the CPU Section",
"code": 223001
},
"111101": {
"type": "info",
"message": "Short Term Recommendations Available",
"code": 111101
},
"523001": {
"type": "critical",
"message": "CPU Request Not Set",
"code": 523001
},
"423001": {
"type": "warning",
"message": "CPU Limit Not Set",
"code": 423001
}
},
"monitoring_end_time": "2024-10-01T20:11:00.000Z",
"current": {},
"recommendation_terms": {
"short_term": {
"duration_in_hours": 24.0,
"notifications": {
"112101": {
"type": "info",
"message": "Cost Recommendations Available",
"code": 112101
},
"112102": {
"type": "info",
"message": "Performance Recommendations Available",
"code": 112102
}
},
"monitoring_start_time": "2024-09-30T20:11:00.000Z",
"recommendation_engines": {
"cost": {
"pods_count": 1,
"confidence_level": 0.0,
"config": {
"requests": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
},
"limits": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"nvidia.com/mig-7g.40gb": {
"amount": 1.0,
"format": "cores"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
}
},
"variation": {
"requests": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
},
"limits": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
}
},
"notifications": {}
},
"performance": {
"pods_count": 1,
"confidence_level": 0.0,
"config": {
"requests": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
},
"limits": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"nvidia.com/mig-7g.40gb": {
"amount": 1.0,
"format": "cores"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
}
},
"variation": {
"requests": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
},
"limits": {
"memory": {
"amount": 7.6173901824E9,
"format": "bytes"
},
"cpu": {
"amount": 10.487843658239633,
"format": "cores"
}
}
},
"notifications": {}
}
},
"plots": {
"datapoints": 4,
"plots_data": {
"2024-10-01T08:11:00.000Z": {},
"2024-10-01T20:11:00.000Z": {
"cpuUsage": {
"min": 0.0,
"q1": 0.0,
"median": 10.487843658239633,
"q3": 10.487843658239633,
"max": 10.487843658239633,
"format": "cores"
},
"memoryUsage": {
"min": 1.30064384E9,
"q1": 1.49932032E9,
"median": 6.347825152E9,
"q3": 6.347825152E9,
"max": 6.347825152E9,
"format": "bytes"
}
},
"2024-10-01T14:11:00.000Z": {},
"2024-10-01T02:11:00.000Z": {}
}
}
},
"medium_term": {
"duration_in_hours": 168.0,
"notifications": {
"120001": {
"type": "info",
"message": "There is not enough data available to generate a recommendation.",
"code": 120001
}
}
},
"long_term": {
"duration_in_hours": 360.0,
"notifications": {
"120001": {
"type": "info",
"message": "There is not enough data available to generate a recommendation.",
"code": 120001
}
}
}
}
}
}
}
}
]
}
],
"version": "v2.0",
"experiment_name": "monitor_gpu_ttm"
}
]
Relevant logs No errors were observed.
Environment:
- openshift
- 4.16
- RHEL