dlrover
dlrover copied to clipboard
llama2 failed
执行 examples/pytorch/llama2/elastic_job.yaml 下面的 demo,运行报错了:`kubectl -n dlrover apply -f examples/pytorch/llama2/elastic_job.yaml`
[root@master dlrover]# kubectl -n dlrover logs elasticjob-fine-tuning-llama2-dlrover-master [2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] port = 50001 [2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] node_num = 1 [2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] job_name = fine-tuning-llama2 [2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] namespace = dlrover [2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] platform = pyk8s [2024-03-19 08:38:52,474] [INFO] [factory.py:34:new_job_args] New pyk8s JobParameters [2024-03-19 08:38:52,808] [INFO] [kubernetes.py:136:init] Load the incluster config. [2024-03-19 08:39:07,849] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403) Reason: Forbidden HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:10 GMT', 'Content-Length': '428'}) HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
[2024-03-19 08:39:27,892] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403) Reason: Forbidden HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:30 GMT', 'Content-Length': '428'}) HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
[2024-03-19 08:39:47,929] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403) Reason: Forbidden HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:51 GMT', 'Content-Length': '428'}) HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
Traceback (most recent call last):
File "/usr/local/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 64, in <module>
[root@master dlrover]# kubectl -n dlrover get deployment
NAME                         READY   UP-TO-DATE   AVAILABLE   AGE
dlrover-controller-manager   1/1     1            1           25m
[root@master dlrover]#
[root@master dlrover]# kubectl -n dlrover get crd elasticjobs.elastic.iml.github.io
NAME                                CREATED AT
elasticjobs.elastic.iml.github.io   2024-03-19T08:24:29Z
[root@master dlrover]#
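You should run `kubectl -n dlrover apply -f dlrover/go/operator/config/manifests/bases/default-role.yaml` to grant the DLRover master permission to access the CRDs. The 403 errors in the log above show that the master's service account (`system:serviceaccount:dlrover:default`) is not allowed to `get` the `elasticjobs` resource in the `elastic.iml.github.io` API group in the `dlrover` namespace.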
This issue has been automatically marked as stale because it has not had recent activity.
This issue is being automatically closed due to inactivity.