devices icon indicating copy to clipboard operation
devices copied to clipboard

Improve the logic of finding candidate pod in Allocate RPC

Open shinytang6 opened this issue 3 years ago • 1 comment

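Currently, in the device plugin's Allocate RPC, we find the candidate pod by matching the container in the request against the pending pods on the node. If a pod contains multiple GPU containers, this lookup is unreliable: as the code below shows, only the device count of the first container request is compared, and the first GPU container found with the same device count causes its pod to be chosen as the candidate.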

func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
	var reqCount uint
	for _, req := range reqs.ContainerRequests {
		reqCount += uint(len(req.DevicesIDs))
	}

	responses := pluginapi.AllocateResponse{}

	firstContainerReq := reqs.ContainerRequests[0]
	firstContainerReqDeviceCount := uint(len(firstContainerReq.DevicesIDs))

	availablePods := podSlice{}
	pendingPods, err := m.kubeInteractor.GetPendingPodsOnNode()
	if err != nil {
		return nil, err
	}
	for _, pod := range pendingPods {
		current := pod
		if IsGPURequiredPod(&current) && !IsGPUAssignedPod(&current) && !IsShouldDeletePod(&current) {
			availablePods = append(availablePods, &current)
		}
	}

	sort.Sort(availablePods)

	var candidatePod *v1.Pod
	for _, pod := range availablePods {
		for i, c := range pod.Spec.Containers {
			if !IsGPURequiredContainer(&c) {
				continue
			}

			if GetGPUResourceOfContainer(&pod.Spec.Containers[i]) == firstContainerReqDeviceCount {
				klog.Infof("Got candidate Pod %s(%s), the device count is: %d", pod.UID, c.Name, firstContainerReqDeviceCount)
				candidatePod = pod
				goto Allocate
			}
		}
	}

        ....

shinytang6 avatar Jun 11 '22 01:06 shinytang6

/cc @wpeng102 @william-wang

Thor-wl avatar Jun 11 '22 04:06 Thor-wl