pai
pai copied to clipboard
I submitted a job, and the job has not run.
Error log of the openpai/openpai-runtime container on the pai-worker node:
root@pai-worker1:/usr/local# docker logs 6e354aae0a40
+ CHILD_PROCESS=UNKNOWN
+ trap exit_handler EXIT
+ PAI_WORK_DIR=/usr/local/pai
+ PAI_CONFIG_DIR=/usr/local/pai-config
+ PAI_INIT_DIR=/usr/local/pai/init.d
+ PAI_RUNTIME_DIR=/usr/local/pai/runtime.d
+ PAI_LOG_DIR=/usr/local/pai/logs/2e4061a5-244c-45af-8ede-b4b9f6604774
+ PAI_SECRET_DIR=/usr/local/pai/secrets
+ PAI_USER_EXTENSION_SECRET_DIR=/usr/local/pai/user-extension-secrets
+ PAI_TOKEN_SECRET_DIR=/usr/local/pai/token-secrets
+ chmod a+rw /usr/local/pai/logs/2e4061a5-244c-45af-8ede-b4b9f6604774
+ find /usr/local/pai/logs/2e4061a5-244c-45af-8ede-b4b9f6604774 -maxdepth 1 -type f '!' -name init.log
+ LOG_FILES=
+ '[[' '!' -z ]]
+ find /usr/local/pai -maxdepth 1 -mindepth 1 '!' -name logs -exec rm -rf '{}' ';'
rm: can't remove '/usr/local/pai/user-extension-secrets/..data': Read-only file system
rm: can't remove '/usr/local/pai/user-extension-secrets/userExtensionSecrets.yaml': Read-only file system
rm: can't remove '/usr/local/pai/user-extension-secrets/..2021_11_18_08_54_30.144092891/userExtensionSecrets.yaml': Read-only file system
+ mv ./__init__.py ./common ./init ./init.d ./package_cache ./plugins ./requirements.txt ./runtime ./runtime.d /usr/local/pai
+ cd /usr/local/pai
+ '[[' true '=' true ]]
+ CHILD_PROCESS=FRAMEWORK_BARRIER
+ echo 'frameworkbarrier start'
frameworkbarrier start
+ '[[' -f /var/run/secrets/kubernetes.io/serviceaccount/token ]]
+ unset KUBE_APISERVER_ADDRESS
+ /usr/local/pai/init.d/frameworkbarrier
+ tee /usr/local/pai/logs/2e4061a5-244c-45af-8ede-b4b9f6604774/barrier.log
I1118 08:54:33.933533 14 barrier.go:211] Initializing frameworkbarrier
I1118 08:54:33.933705 14 barrier.go:214] With Config:
kubeApiServerAddress: ""
kubeConfigFilePath: ""
frameworkNamespace: default
frameworkName: 3a3cada7aa94eff914b372e17e17c8e1
barrierCheckIntervalSec: 10
barrierCheckTimeoutSec: 600
W1118 08:54:33.933736 14 client_config.go:549] Neither --kubeconfig nor --master was specified. Using the inClusterConfig. This might not work.
I1118 08:54:33.934851 14 barrier.go:227] Running frameworkbarrier
W1118 08:54:33.945092 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:54:43.947234 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:54:53.947552 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:55:03.947109 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:55:13.947432 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
webport log
The cause should be this:
W1118 08:54:33.945092 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:54:43.947234 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:54:53.947552 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:55:03.947109 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
W1118 08:55:13.947432 14 barrier.go:253] Failed to get Framework object from ApiServer: Unauthorized
In your pod, K8s api server is not authorized. Could you please provide your pod definition? Maybe there's something wrong with the service account.
@hzy46 Thanks for your response.
Maybe the worker has no ROLES?
This is the pod definition. (Note: this pod will be removed when the job is stopped.)
root@pai-master:~# kubectl describe pod 7aac99ce92941ccecdae4b381cfd18f7-taskrole-0
Name: 7aac99ce92941ccecdae4b381cfd18f7-taskrole-0
Namespace: default
Priority: -1947889210
Priority Class Name: 7aac99ce92941ccecdae4b381cfd18f7-priority
Node: pai-worker1/192.168.50.191
Start Time: Wed, 01 Dec 2021 10:54:51 +0800
Labels: FC_FRAMEWORK_NAME=7aac99ce92941ccecdae4b381cfd18f7
FC_TASKROLE_NAME=taskrole
FC_TASK_INDEX=0
type=kube-launcher-task
userName=admin
virtualCluster=default
Annotations: FC_CONFIGMAP_NAME: 7aac99ce92941ccecdae4b381cfd18f7-attempt
FC_CONFIGMAP_UID: 87036c7c-a5e2-4247-9aa1-3e6e93ac9ca7
FC_FRAMEWORK_ATTEMPT_ID: 0
FC_FRAMEWORK_ATTEMPT_INSTANCE_UID: 0_87036c7c-a5e2-4247-9aa1-3e6e93ac9ca7
FC_FRAMEWORK_NAME: 7aac99ce92941ccecdae4b381cfd18f7
FC_FRAMEWORK_NAMESPACE: default
FC_FRAMEWORK_UID: e6ed13d6-fa38-4878-ae6d-432f36d704e4
FC_POD_NAME: 7aac99ce92941ccecdae4b381cfd18f7-taskrole-0
FC_TASKROLE_NAME: taskrole
FC_TASKROLE_UID: ddcb0ed1-5251-11ec-86eb-cacde7d0c152
FC_TASK_ATTEMPT_ID: 0
FC_TASK_INDEX: 0
FC_TASK_UID: ddcb10c7-5251-11ec-86eb-cacde7d0c152
container.apparmor.security.beta.kubernetes.io/app: unconfined
hivedscheduler.microsoft.com/pod-bind-info:
node: pai-worker1
leafCellIsolation:
- 0
cellChain: gpu-machine-NODE-POOL
affinityGroupBindInfo:
- podPlacements:
- physicalNode: pai-worker1
physicalLeafCellIndices:
- 0
preassignedCellTypes:
- gpu-machine-NODE
hivedscheduler.microsoft.com/pod-leaf-cell-isolation: 0
hivedscheduler.microsoft.com/pod-scheduling-spec:
virtualCluster: default
priority: 10
pinnedCellId: null
leafCellType: null
leafCellNumber: 1
affinityGroup:
name: admin~Resnet18_1gpu_22/default
members:
- podNumber: 1
leafCellNumber: 1
rest-server/port-scheduling-spec: {"schedulePortStart":15000,"schedulePortEnd":40000,"ports":{"ssh":{"count":1},"http":{"count":1}}}
Status: Pending
IP: 192.168.50.191
Controlled By: ConfigMap/7aac99ce92941ccecdae4b381cfd18f7-attempt
Init Containers:
init:
Container ID: docker://575cb0186b714a2d64682fc16cbe65a275e3f70e41948cca046f952e9c1af5da
Image: openpai/openpai-runtime:v1.8.0
Image ID: docker-pullable://openpai/openpai-runtime@sha256:dcc0b622249c0538c95e794111ab38823a801600dee9ef44cbbe9d28a2623388
Port: <none>
Host Port: <none>
State: Running
Started: Wed, 01 Dec 2021 10:54:55 +0800
Ready: False
Restart Count: 0
Environment:
FC_FRAMEWORK_NAMESPACE: default
FC_FRAMEWORK_NAME: 7aac99ce92941ccecdae4b381cfd18f7
FC_TASKROLE_NAME: taskrole
FC_TASK_INDEX: 0
FC_CONFIGMAP_NAME: 7aac99ce92941ccecdae4b381cfd18f7-attempt
FC_POD_NAME: 7aac99ce92941ccecdae4b381cfd18f7-taskrole-0
FC_FRAMEWORK_UID: e6ed13d6-fa38-4878-ae6d-432f36d704e4
FC_FRAMEWORK_ATTEMPT_ID: 0
FC_FRAMEWORK_ATTEMPT_INSTANCE_UID: 0_87036c7c-a5e2-4247-9aa1-3e6e93ac9ca7
FC_CONFIGMAP_UID: 87036c7c-a5e2-4247-9aa1-3e6e93ac9ca7
FC_TASKROLE_UID: ddcb0ed1-5251-11ec-86eb-cacde7d0c152
FC_TASK_UID: ddcb10c7-5251-11ec-86eb-cacde7d0c152
FC_TASK_ATTEMPT_ID: 0
FC_POD_UID: (v1:metadata.uid)
FC_TASK_ATTEMPT_INSTANCE_UID: 0_$(FC_POD_UID)
USER_CMD: branch_name=pai-for-edu
wget https://raw.githubusercontent.com/microsoft/pai/pai-for-edu/contrib/edu-examples/pytorch_cifar10/src/cifar.py
wget https://raw.githubusercontent.com/microsoft/pai/pai-for-edu/contrib/edu-examples/pytorch_cifar10/src/init.sh
bash init.sh
python cifar.py --gpuid 0 --arch ResNet18 --epoch 200
KUBE_APISERVER_ADDRESS: https://10.192.0.1:443
REST_SERVER_URI: http://192.168.50.190:9186
GANG_ALLOCATION: true
PAI_FRAMEWORK_NAME: admin~Resnet18_1gpu_22
PAI_JOB_NAME: admin~Resnet18_1gpu_22
PAI_USER_NAME: admin
PAI_DEFAULT_FS_URI:
PAI_TASK_ROLE_COUNT: 1
PAI_TASK_ROLE_LIST: taskrole
PAI_VIRTUAL_CLUSTER: default
PAI_TASK_ROLE_TASK_COUNT_taskrole: 1
PAI_RESOURCE_taskrole: 1,3,8192,0
PAI_MIN_FAILED_TASK_COUNT_taskrole: 1
PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole: -1
PAI_USERNAME: admin
PAI_TASKS_NUM: 1
PAI_JOB_TASK_COUNT: 1
PAI_TASK_ROLES_NUM: 1
PAI_JOB_TASK_ROLE_COUNT: 1
PAI_JOB_TASK_ROLE_LIST: taskrole
PAI_CURRENT_TASK_ROLE_NAME: taskrole
PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX: (v1:metadata.annotations['FC_TASK_INDEX'])
Mounts:
/usr/local/pai from pai-vol (rw)
/usr/local/pai-config from job-exit-spec (rw)
/usr/local/pai/logs from host-log (rw,path="admin/7aac99ce92941ccecdae4b381cfd18f7/taskrole")
/usr/local/pai/user-extension-secrets from user-extension-secrets (rw)
/var/run/secrets/kubernetes.io/serviceaccount from runtime-account-token-bchmp (ro)
Containers:
app:
Container ID:
Image: openpai/standard:python_3.6-pytorch_1.2.0-gpu
Image ID:
Port: <none>
Host Port: <none>
Command:
/usr/local/pai/runtime
State: Waiting
Reason: PodInitializing
Ready: False
Restart Count: 0
Limits:
cpu: 3
github.com/fuse: 1
hivedscheduler.microsoft.com/pod-scheduling-enable: 1
memory: 8Gi
Requests:
cpu: 3
github.com/fuse: 1
hivedscheduler.microsoft.com/pod-scheduling-enable: 1
memory: 8Gi
Environment:
FC_FRAMEWORK_NAMESPACE: default
FC_FRAMEWORK_NAME: 7aac99ce92941ccecdae4b381cfd18f7
FC_TASKROLE_NAME: taskrole
FC_TASK_INDEX: 0
FC_CONFIGMAP_NAME: 7aac99ce92941ccecdae4b381cfd18f7-attempt
FC_POD_NAME: 7aac99ce92941ccecdae4b381cfd18f7-taskrole-0
FC_FRAMEWORK_UID: e6ed13d6-fa38-4878-ae6d-432f36d704e4
FC_FRAMEWORK_ATTEMPT_ID: 0
FC_FRAMEWORK_ATTEMPT_INSTANCE_UID: 0_87036c7c-a5e2-4247-9aa1-3e6e93ac9ca7
FC_CONFIGMAP_UID: 87036c7c-a5e2-4247-9aa1-3e6e93ac9ca7
FC_TASKROLE_UID: ddcb0ed1-5251-11ec-86eb-cacde7d0c152
FC_TASK_UID: ddcb10c7-5251-11ec-86eb-cacde7d0c152
FC_TASK_ATTEMPT_ID: 0
FC_POD_UID: (v1:metadata.uid)
FC_TASK_ATTEMPT_INSTANCE_UID: 0_$(FC_POD_UID)
PAI_FRAMEWORK_NAME: admin~Resnet18_1gpu_22
PAI_JOB_NAME: admin~Resnet18_1gpu_22
PAI_USER_NAME: admin
PAI_DEFAULT_FS_URI:
PAI_TASK_ROLE_COUNT: 1
PAI_TASK_ROLE_LIST: taskrole
PAI_VIRTUAL_CLUSTER: default
PAI_TASK_ROLE_TASK_COUNT_taskrole: 1
PAI_RESOURCE_taskrole: 1,3,8192,0
PAI_MIN_FAILED_TASK_COUNT_taskrole: 1
PAI_MIN_SUCCEEDED_TASK_COUNT_taskrole: -1
PAI_USERNAME: admin
PAI_TASKS_NUM: 1
PAI_JOB_TASK_COUNT: 1
PAI_TASK_ROLES_NUM: 1
PAI_JOB_TASK_ROLE_COUNT: 1
PAI_JOB_TASK_ROLE_LIST: taskrole
PAI_CURRENT_TASK_ROLE_NAME: taskrole
PAI_CURRENT_TASK_ROLE_CURRENT_TASK_INDEX: (v1:metadata.annotations['FC_TASK_INDEX'])
PAI_TASK_INDEX: (v1:metadata.annotations['FC_TASK_INDEX'])
NVIDIA_VISIBLE_DEVICES: (v1:metadata.annotations['hivedscheduler.microsoft.com/pod-leaf-cell-isolation'])
HIVED_VISIBLE_DEVICES: (v1:metadata.annotations['hivedscheduler.microsoft.com/pod-leaf-cell-isolation'])
Mounts:
/dev/shm from dshm (rw)
/usr/local/pai from pai-vol (rw)
/usr/local/pai/logs from host-log (rw,path="admin/7aac99ce92941ccecdae4b381cfd18f7/taskrole")
/usr/local/pai/ssh-secret from job-ssh-secret-volume (ro)
/usr/local/pai/user-extension-secrets from user-extension-secrets (rw)
/var/run/secrets/kubernetes.io/serviceaccount from runtime-account-token-bchmp (ro)
Conditions:
Type Status
Initialized False
Ready False
ContainersReady False
PodScheduled True
Volumes:
dshm:
Type: EmptyDir (a temporary directory that shares a pod's lifetime)
Medium: Memory
SizeLimit: 512Mi
pai-vol:
Type: EmptyDir (a temporary directory that shares a pod's lifetime)
Medium:
SizeLimit: <unset>
host-log:
Type: HostPath (bare host directory volume)
Path: /var/log/pai
HostPathType:
job-ssh-secret-volume:
Type: Secret (a volume populated by a Secret)
SecretName: job-ssh-secret
Optional: false
job-exit-spec:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: runtime-exit-spec-configuration
Optional: false
user-extension-secrets:
Type: Secret (a volume populated by a Secret)
SecretName: 7aac99ce92941ccecdae4b381cfd18f7-usercred
Optional: false
runtime-account-token-bchmp:
Type: Secret (a volume populated by a Secret)
SecretName: runtime-account-token-bchmp
Optional: false
QoS Class: Guaranteed
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute for 300s
node.kubernetes.io/unreachable:NoExecute for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedScheduling 2m16s hivedscheduler-ds-default Code: 400, Message: hivedscheduler: User Error: [982566fb-e5dd-400f-bfc4-ef9ac1562939(default/7aac99ce92941ccecdae4b381cfd18f7-taskrole-0)]: filterRoutine: Failed: Pod does not exist, completed or has not been informed to the scheduler
Normal Scheduled 55s hivedscheduler-ds-default Successfully assigned default/7aac99ce92941ccecdae4b381cfd18f7-taskrole-0 to pai-worker1
Normal Pulling 53s kubelet, pai-worker1 Pulling image "openpai/openpai-runtime:v1.8.0"
Normal Pulled 51s kubelet, pai-worker1 Successfully pulled image "openpai/openpai-runtime:v1.8.0"
Normal Created 51s kubelet, pai-worker1 Created container init
Normal Started 51s kubelet, pai-worker1 Started container init
No, I mean service account. The pod should use the service account runtime-account
to get the framework objects.
Please show me the results of the following command:
kubectl get sa runtime-account -o yaml
kubectl get clusterrole runtime-framework-role -o yaml
kubectl get clusterrolebinding runtime-framework-role-binding -o yaml
Reference: https://github.com/microsoft/pai/blob/7cf1ae351dd15d7228d2675212c1036c0aea8745/src/openpai-runtime/deploy/rbac.yaml
@hzy46
Hi, here are my results. I cannot find any errors.
root@pai-master:/home/xubaishuai# kubectl get sa runtime-account -o yaml
apiVersion: v1
kind: ServiceAccount
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"name":"runtime-account","namespace":"default"}}
creationTimestamp: "2021-10-11T09:32:15Z"
name: runtime-account
namespace: default
resourceVersion: "272755"
selfLink: /api/v1/namespaces/default/serviceaccounts/runtime-account
uid: e5d6431e-982e-4a50-a128-50a6c095add3
secrets:
- name: runtime-account-token-bchmp
root@pai-master:/home/xubaishuai# kubectl get clusterrole runtime-framework-role -o yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"rbac.authorization.k8s.io/v1","kind":"ClusterRole","metadata":{"annotations":{},"name":"runtime-framework-role"},"rules":[{"apiGroups":["frameworkcontroller.microsoft.com"],"resources":["frameworks"],"verbs":["get","watch","list"]}]}
creationTimestamp: "2021-10-11T09:32:15Z"
name: runtime-framework-role
resourceVersion: "272753"
selfLink: /apis/rbac.authorization.k8s.io/v1/clusterroles/runtime-framework-role
uid: 5d245afd-3a5f-45c2-b96f-da56d3443dfc
rules:
- apiGroups:
- frameworkcontroller.microsoft.com
resources:
- frameworks
verbs:
- get
- watch
- list
root@pai-master:/home/xubaishuai# kubectl get clusterrolebinding runtime-framework-role-binding -o yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"rbac.authorization.k8s.io/v1","kind":"ClusterRoleBinding","metadata":{"annotations":{},"name":"runtime-framework-role-binding"},"roleRef":{"apiGroup":"rbac.authorization.k8s.io","kind":"ClusterRole","name":"runtime-framework-role"},"subjects":[{"kind":"ServiceAccount","name":"runtime-account","namespace":"default"}]}
creationTimestamp: "2021-10-11T09:32:15Z"
name: runtime-framework-role-binding
resourceVersion: "272757"
selfLink: /apis/rbac.authorization.k8s.io/v1/clusterrolebindings/runtime-framework-role-binding
uid: 1d9301ff-5493-431b-8b1b-64e96ee43d6e
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: runtime-framework-role
subjects:
- kind: ServiceAccount
name: runtime-account
namespace: default