Kubernetes device-plugin source code analysis

1. Basic information

The code lives in ./pkg/kubelet/cm/devicemanager/; from the kubelet's point of view the package entry point is manager.go.

2. DeviceManager

The DeviceManager reports device-allocation progress and results to the kubelet through the state of ManagerImpl's podDevices member, records already-allocated devices in allocatedDevices, and derives the current free-device list as healthyDevices minus allocatedDevices.
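For example, the free list for a given resource is just a set difference over these two maps; a minimal standalone sketch (not code from the source) using the same sets package:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	healthyDevices := map[string]sets.String{
		"nvidia.com/gpu": sets.NewString("gpu-0", "gpu-1", "gpu-2"),
	}
	allocatedDevices := map[string]sets.String{
		"nvidia.com/gpu": sets.NewString("gpu-0"),
	}
	// free devices = healthy devices minus allocated devices, per resource name
	free := healthyDevices["nvidia.com/gpu"].Difference(allocatedDevices["nvidia.com/gpu"])
	fmt.Println(free.List()) // [gpu-1 gpu-2]
}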
types.go defines an interface named Manager; manager.go implements it with the ManagerImpl struct shown below:

// ManagerImpl is the structure in charge of managing Device Plugins.
type ManagerImpl struct {
	socketname string
	socketdir  string

	endpoints map[string]endpointInfo // Key is ResourceName
	mutex     sync.Mutex

	server *grpc.Server
	wg     sync.WaitGroup

	// activePods is a method for listing active pods on the node
	// so the amount of pluginResources requested by existing pods
	// could be counted when updating allocated devices
	// i.e. returns the list of pods currently active on this node
	activePods ActivePodsFunc

	// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
	// We use it to determine when we can purge inactive pods from checkpointed state.
	// i.e. the readiness of kubelet configuration sources, used to decide when inactive pods can be purged from the checkpointed state
	sourcesReady config.SourcesReady

	// callback is used for updating devices' states in one time call.
	// e.g. a new device is advertised, two old devices are deleted and a running device fails.
	// i.e. refreshes the full device state for a given resourceName
	callback monitorCallback

	// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
	healthyDevices map[string]sets.String

	// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
	unhealthyDevices map[string]sets.String

	// allocatedDevices contains allocated deviceIds, keyed by resourceName.
	// i.e. the devices that have already been handed out
	allocatedDevices map[string]sets.String

	// podDevices contains pod to allocated device mapping.
	// type podDevices map[string]containerDevices; records which pod was allocated which devices
	podDevices        podDevices
	checkpointManager checkpointmanager.CheckpointManager
}

ManagerImpl in manager.go implements that interface and, on top of it, defines several data members that hold the device manager's state. Field by field:

type ManagerImpl struct {
	socketname string	// file name of the kubelet socket (kubelet.sock)
	socketdir  string	// directory that contains kubelet.sock
	// type endpointInfo struct {
	// 	// defined in ./endpoint.go; an endpoint can run, stop, allocate, call PreStartContainer, etc.
	// 	e    endpoint
	// 	// defined in the device plugin API; currently holds a single option
	// 	opts *pluginapi.DevicePluginOptions
	// }
	endpoints map[string]endpointInfo	// endpoint info for every registered resource, keyed by resource name
	mutex     sync.Mutex	// protects the shared state below; endpoints run in their own goroutines, so some code is a critical section
	server *grpc.Server	// the gRPC server serving the registration service
	wg     sync.WaitGroup // waits for goroutines on shutdown
	// returns the list of active pods on the node
	activePods ActivePodsFunc
	// readiness of the kubelet configuration sources (e.g. apiserver update readiness); used to decide when
	// inactive pods can be purged from the checkpointed state
	sourcesReady config.SourcesReady
	// refreshes the full device state for a given resource
	callback monitorCallback
	// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
	healthyDevices map[string]sets.String
	// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
	unhealthyDevices map[string]sets.String
	// allocatedDevices contains allocated deviceIds, keyed by resourceName.
	allocatedDevices map[string]sets.String
	// podDevices contains pod to allocated device mapping.
	// type podDevices map[string]containerDevices; records which pod was allocated which devices
	podDevices        podDevices
	// persists allocation state as checkpoints
	checkpointManager checkpointmanager.CheckpointManager
}

First, let's look at the function that creates a ManagerImpl instance:

func newManagerImpl(socketPath string) (*ManagerImpl, error) {
	... // validate socketPath and split it into dir and file
	// create the manager
	manager := &ManagerImpl{
		endpoints: make(map[string]endpointInfo),

		socketname:       file,
		socketdir:        dir,
		healthyDevices:   make(map[string]sets.String),
		unhealthyDevices: make(map[string]sets.String),
		allocatedDevices: make(map[string]sets.String),
		podDevices:       make(podDevices),
	}
	// the callback clears a resource's healthyDevices/unhealthyDevices entries, rebuilds them from the latest device list, and writes a checkpoint
	manager.callback = manager.genericDeviceUpdateCallback 
	// The following structs are populated with real implementations in manager.Start()
	// Before that, initializes them to perform no-op operations.
	manager.activePods = func() []*v1.Pod { return []*v1.Pod{} }
	manager.sourcesReady = &sourcesReadyStub{} // stub: AddSource is a no-op and AllReady always returns true
	checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
	}
	manager.checkpointManager = checkpointManager
	return manager, nil
}

Pay particular attention to manager.callback: it is invoked by every endpoint, hence the mutex. From the logic we can see that each update wipes the previous device list for that resource and rebuilds it.

func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.mutex.Lock()
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	for _, dev := range devices {
		if dev.Health == pluginapi.Healthy {
			m.healthyDevices[resourceName].Insert(dev.ID)
		} else {
			m.unhealthyDevices[resourceName].Insert(dev.ID)
		}
	}
	m.mutex.Unlock()
	m.writeCheckpoint()
}

Manager also has a Start function; as the name suggests, the kubelet calls it to initialize the device manager. Its main job is to create the kubelet.sock path and serve the registration gRPC service on it. The method takes the active-pods function and the sources-ready indicator as parameters.
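A condensed sketch of what Start does, paraphrased from the source (error handling and some calls are elided, and details differ between Kubernetes versions, so treat this as an outline rather than the literal implementation):

func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
	m.activePods = activePods
	m.sourcesReady = sourcesReady

	// Restore the allocation state persisted by writeCheckpoint() in a previous run.
	if err := m.readCheckpoint(); err != nil {
		klog.Warningf("Continuing after failing to read checkpoint file: %v", err)
	}

	// Recreate the socket directory; stale sockets left by old plugins are removed here as well.
	socketPath := filepath.Join(m.socketdir, m.socketname)
	os.MkdirAll(m.socketdir, 0755)

	// Listen on kubelet.sock and expose the Registration service that
	// device plugins call with their Register() request.
	s, err := net.Listen("unix", socketPath)
	if err != nil {
		return err
	}
	m.wg.Add(1)
	m.server = grpc.NewServer([]grpc.ServerOption{}...)
	pluginapi.RegisterRegistrationServer(m.server, m)
	go func() {
		defer m.wg.Done()
		m.server.Serve(s)
	}()
	return nil
}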

3. The Register() function

From the gRPC definitions in pluginapi we know that the interaction between a plugin and the device manager starts with Register, so ManagerImpl must implement a Register function as well:

// This method is called by a device plugin over gRPC to register itself with the manager.
// 1. check whether the API version is supported
// 2. check whether the resourceName is valid
// 3. create an endpoint for this resource and keep calling ListAndWatch to monitor the health of its devices
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
	// check whether the plugin's API version is supported
	klog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
	metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
	var versionCompatible bool
	for _, v := range pluginapi.SupportedVersions {
		if r.Version == v {
			versionCompatible = true
			break
		}
	}
	if !versionCompatible {
		errorString := fmt.Sprintf(errUnsupportedVersion, r.Version, pluginapi.SupportedVersions)
		klog.Infof("Bad registration request from device plugin with resource name %q: %s", r.ResourceName, errorString)
		return &pluginapi.Empty{}, fmt.Errorf(errorString)
	}
	// check whether the resourceName is a valid extended resource name
	if !v1helper.IsExtendedResourceName(v1.ResourceName(r.ResourceName)) {
		errorString := fmt.Sprintf(errInvalidResourceName, r.ResourceName)
		klog.Infof("Bad registration request from device plugin: %s", errorString)
		return &pluginapi.Empty{}, fmt.Errorf(errorString)
	}

	// TODO: for now, always accepts newest device plugin. Later may consider to
	// add some policies here, e.g., verify whether an old device plugin with the
	// same resource name is still alive to determine whether we want to accept
	// the new registration.
	// i.e. logic for handing over from an old plugin of the same resource may be added later
	// add an endpoint for this resource
	go m.addEndpoint(r)

	return &pluginapi.Empty{}, nil
}

Here go m.addEndpoint(r) handles each resource in its own goroutine, and m.runEndpoint(r.ResourceName, new) is started in yet another goroutine:

// 1. newEndpointImpl creates a new endpoint instance and returns a connection to the plugin
// 2. m.registerEndpoint stores it under m.endpoints[resourceName]
// 3. go m.runEndpoint starts the endpoint: its run method receives the latest device information from the plugin
//    via the ListAndWatch gRPC and keeps the device state of this resource up to date
func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
	// connect to the endpoint (unix socket) exposed by the device plugin and return a client
	new, err := newEndpointImpl(filepath.Join(m.socketdir, r.Endpoint), r.ResourceName, m.callback)
	if err != nil {
		klog.Errorf("Failed to dial device plugin with request %v: %v", r, err)
		return
	}
	// store the endpoint under m.endpoints[resourceName]
	m.registerEndpoint(r.ResourceName, r.Options, new)
	go func() {
		m.runEndpoint(r.ResourceName, new)
	}()
}

Next, look at m.runEndpoint(r.ResourceName, new):

func (m *ManagerImpl) runEndpoint(resourceName string, e endpoint) {
	// call the plugin's ListAndWatch and keep watching for updates
	e.run()
	// only closes the gRPC connection to the plugin
	e.stop()

	m.mutex.Lock()
	defer m.mutex.Unlock()
	// if the endpoint fell out of run() and stopped, mark the resource as unhealthy
	if old, ok := m.endpoints[resourceName]; ok && old.e == e {
		// mark every device of this resource as unhealthy; the resource itself is kept
		m.markResourceUnhealthy(resourceName)
	}

	klog.V(2).Infof("Endpoint (%s, %v) became unhealthy", resourceName, e)
}

Here e.run() actually calls the ListAndWatch of the plugin backing this resource and keeps watching for updates. Whenever the device set changes it calls m.callback to refresh the device manager's lists, m.healthyDevices[resourceName] and m.unhealthyDevices[resourceName], until an error makes it return; e.stop() is then called and every device of the resource is marked unhealthy.

func (e *endpointImpl) run() {
	stream, err := e.client.ListAndWatch(context.Background(), &pluginapi.Empty{})
	if err != nil {
		klog.Errorf(errListAndWatch, e.resourceName, err)

		return
	}
	// keep receiving device updates from the plugin and refresh the device lists via e.callback
	// (which is actually the manager's genericDeviceUpdateCallback); note that the callback is a critical
	// section shared by all endpoints, so with many resources there may be some waiting, and every update
	// wipes the previous device set for this resource before rebuilding it
	for {
		response, err := stream.Recv()
		if err != nil {
			klog.Errorf(errListAndWatch, e.resourceName, err)
			return
		}

		devs := response.Devices
		klog.V(2).Infof("State pushed for device plugin %s", e.resourceName)

		var newDevs []pluginapi.Device
		for _, d := range devs {
			newDevs = append(newDevs, *d)
		}

		e.callback(e.resourceName, newDevs)
	}
}

e.callback(e.resourceName, newDevs) is in fact a call to func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device):

// clears the healthyDevices/unhealthyDevices entries for the resource, rebuilds them from the latest device list, and writes a checkpoint
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.mutex.Lock()
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	for _, dev := range devices {
		if dev.Health == pluginapi.Healthy {
			m.healthyDevices[resourceName].Insert(dev.ID)
		} else {
			m.unhealthyDevices[resourceName].Insert(dev.ID)
		}
	}
	m.mutex.Unlock()
	m.writeCheckpoint()
}

From the flow above we can see that the manager has a one-to-many relationship with endpoints, and each endpoint has a one-to-one relationship with a plugin. Every resource is bound to an endpoint running in its own goroutine, and each time a plugin's state changes, the callback (the manager's critical section) is invoked to update the manager's bookkeeping. This makes good use of Go's concurrency and ensures that every plugin's updates are handled promptly.
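For reference, the other side of this protocol is simply a gRPC server implemented by the device plugin. A minimal ListAndWatch sketch (illustrative only, not part of the kubelet code; the import path of the v1beta1 device plugin API depends on the Kubernetes version, and myPlugin is a made-up type):

package gpuplugin

import (
	"time"

	pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

type myPlugin struct {
	devices []*pluginapi.Device // e.g. {ID: "gpu-0", Health: pluginapi.Healthy}
}

// ListAndWatch streams the current device set once, then again whenever it changes.
// Every message the kubelet receives ends up in genericDeviceUpdateCallback above.
func (p *myPlugin) ListAndWatch(_ *pluginapi.Empty, stream pluginapi.DevicePlugin_ListAndWatchServer) error {
	for {
		if err := stream.Send(&pluginapi.ListAndWatchResponse{Devices: p.devices}); err != nil {
			return err
		}
		// A real plugin would block until device health actually changes;
		// a fixed sleep keeps the sketch short.
		time.Sleep(10 * time.Second)
	}
}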
This concludes the walkthrough of Register(); the flow chart:
Register() (flow chart image not reproduced here)

4. The Allocate() function

The Allocate() method allocates devices for a pod that has been admitted to this node. As the analysis below shows, the information the scheduler passes down is very simple, essentially just the resource limits: the scheduler only declares how many devices of which type are needed, while the decision of which concrete devices the pod actually gets is made by the kubelet's device manager.
The method first declares devicesToReuse, a map of devices that can be reused. Why is this map needed? Kubernetes pods may define init containers; there can be several of them, they run before the regular containers, and each init container must finish before the next one starts. When allocating devices for a container or init container, devices in devicesToReuse are used first, presumably to avoid wasting resources. However, the logic also implies that if devicesToReuse ends up holding more devices than the regular containers need, the surplus devices cannot be reused by other pods until this pod terminates. This remains to be verified.
So in the code below the manager first allocates devices to each init container and adds them to devicesToReuse so the next init container can reuse them; once the init containers are done, those reusable devices are handed over to the regular containers. Finally the node info is updated.

// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
// i.e. called to allocate a set of devices from the registered plugins for an admitted pod
// PodAdmitAttributes is the context for a pod admission decision (the pod approved for device allocation):
// type PodAdmitAttributes struct {
// 	// the pod to evaluate for admission
// 	Pod *v1.Pod
// 	// all pods bound to the kubelet excluding the pod being evaluated
// 	OtherPods []*v1.Pod
// }
// takes the nodeInfo and the PodAdmitAttributes, and decides how to allocate devices from the node state and the pod's containers
func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
	pod := attrs.Pod
	devicesToReuse := make(map[string]sets.String) // devices that can be reused
	// this loop allocates devices for the init containers; they run before the regular containers, one after another
	for _, container := range pod.Spec.InitContainers {
		// allocateContainerResources talks to the plugin to allocate devices for this init container
		if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
			return err
		}
		// add the devices allocated to this init container to the reusable set;
		// init containers run strictly one at a time and each must succeed before the next starts,
		// so these devices are passed along between init containers and several of them may end up using the same device
		m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
	}
	// this loop allocates devices for the regular containers
	for _, container := range pod.Spec.Containers {
		if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
			return err
		}
		m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
	}

	m.mutex.Lock()
	defer m.mutex.Unlock()

	// quick return if no pluginResources requested
	if _, podRequireDevicePluginResource := m.podDevices[string(pod.UID)]; !podRequireDevicePluginResource {
		return nil
	}

	m.sanitizeNodeAllocatable(node)
	return nil
}

Now let's look at a few key functions. First, m.allocateContainerResources(pod, &container, devicesToReuse): it talks to the plugin and allocates devices for one container or init container of the pod. The end result is an update of the manager's podDevices member, which tracks, for every pod and container on the node, how many devices of each resource were allocated and how they are to be injected (env, mounts, and so on).
Finally a checkpoint is written to persist the current state.

// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
// i.e. it issues one Allocate rpc per new device resource requirement, processes the AllocateResponses,
// and on success updates the cached containerDevices, inserting the result into m.podDevices: which pod got
// which devices and how the plugin wants them injected (mounts, env, etc.).
// The inputs are the pod to allocate for, one of its containers, and the devicesToReuse map.
func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container, devicesToReuse map[string]sets.String) error {
	podUID := string(pod.UID)
	contName := container.Name
	allocatedDevicesUpdated := false
	// Extended resources are not allowed to be overcommitted.
	// Since device plugin advertises extended resources,
	// therefore Requests must be equal to Limits and iterating
	// over the Limits should be sufficient.
	// Limits is the field of the pod spec that requests extended resources, e.g.:
	// resources:
	//   limits:
	//     nvidia.com/gpu: 1
	// Extended resources must not be overcommitted; since the device plugin advertises them,
	// Requests must equal Limits, so iterating over Limits is sufficient.
	// Hence k below is the extended resource name and v the requested amount.
	// For each requested resource:
	for k, v := range container.Resources.Limits {
		resource := string(k)
		needed := int(v.Value())
		klog.V(3).Infof("needs %d %s", needed, resource)
		// skip resources that are not managed by a registered device plugin
		if !m.isDevicePluginResource(resource) {
			continue
		}
		// Updates allocatedDevices to garbage collect any stranded resources
		// before doing the device plugin allocation.
		// update allocatedDevices to garbage collect any stranded resources before asking the plugin for new
		// devices, i.e. first release the devices held by pods whose resources can be reclaimed
		if !allocatedDevicesUpdated {
			// free the devices allocated to pods in a terminal state (garbage collection):
			// removing those pods from m.podDevices also makes their devices allocatable again
			m.updateAllocatedDevices(m.activePods())
			allocatedDevicesUpdated = true
		}
		// returns the list of devices to allocate to this container: on a container restart the original devices are
		// mounted again, otherwise reusable devices are taken first and then the first few of the currently free
		// devices, which are also added to the allocated set
		allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])
		if err != nil {
			return err
		}
		if allocDevices == nil || len(allocDevices) <= 0 {
			continue
		}

		startRPCTime := time.Now()
		// Manager.Allocate involves RPC calls to device plugin, which
		// could be heavy-weight. Therefore we want to perform this operation outside
		// mutex lock. Note if Allocate call fails, we may leave container resources
		// partially allocated for the failed container. We rely on updateAllocatedDevices()
		// to garbage collect these resources later. Another side effect is that if
		// we have X resource A and Y resource B in total, and two containers, container1
		// and container2 both require X resource A and Y resource B. Both allocation
		// requests may fail if we serve them in mixed order.
		// TODO: may revisit this part later if we see inefficient resource allocation
		// in real use as the result of this. Should also consider to parallize device
		// plugin Allocate grpc calls if it becomes common that a container may require
		// resources from multiple device plugins.
		m.mutex.Lock()
		// each endpoint holds the address of, and a client for, its device plugin
		eI, ok := m.endpoints[resource]
		m.mutex.Unlock()
		// if there is no endpoint for this resource, reset allocatedDevices from podDevices
		if !ok {
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return fmt.Errorf("Unknown Device Plugin %s", resource)
		}

		devs := allocDevices.UnsortedList()
		// TODO: refactor this part of code to just append a ContainerAllocationRequest
		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
		klog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
		// remote gRPC call asking the plugin to allocate these devices
		resp, err := eI.e.allocate(devs)
		metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
		if err != nil {
			// In case of allocation failure, we want to restore m.allocatedDevices
			// to the actual allocated state from m.podDevices.
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return err
		}

		if len(resp.ContainerResponses) == 0 {
			return fmt.Errorf("No containers return in allocation response %v", resp)
		}

		// Update internal cached podDevices state.
		m.mutex.Lock()
		// type podDevices map[string]containerDevices; records which devices were allocated to which container of which pod
		m.podDevices.insert(podUID, contName, resource, allocDevices, resp.ContainerResponses[0])
		m.mutex.Unlock()
	}

	// Checkpoints device to container allocation information.
	return m.writeCheckpoint()
}

updateAllocatedDevices(activePods []*v1.Pod) removes every pod in a terminal state from m.podDevices and reclaims its devices; in the code this shows up as regenerating the allocated-device list with m.podDevices.devices().

// updateAllocatedDevices gets a list of active pods and then frees any Devices that are bound to
// terminated pods. Returns error on failure.
// i.e. frees the devices allocated to terminated pods (garbage collection): deletes those pods from m.podDevices
// and then regenerates m.allocatedDevices from m.podDevices.devices()
func (m *ManagerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
	if !m.sourcesReady.AllReady() {
		return
	}
	m.mutex.Lock()
	defer m.mutex.Unlock()
	activePodUids := sets.NewString()
	for _, pod := range activePods {
		activePodUids.Insert(string(pod.UID))
	}
	allocatedPodUids := m.podDevices.pods()
	// podsToBeRemoved = pods with allocated devices minus active pods
	podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
	if len(podsToBeRemoved) <= 0 {
		return
	}
	klog.V(3).Infof("pods to be removed: %v", podsToBeRemoved.List())
	// delete the pods to be removed from m.podDevices
	m.podDevices.delete(podsToBeRemoved.List())
	// Regenerated allocatedDevices after we update pod allocation information.
	// devices() returns, keyed by resourceName, the devices of every tracked pod, i.e. everything currently allocated
	m.allocatedDevices = m.podDevices.devices()
}

m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource]) produces the list of devices to request from the plugin. If the reusable devices already cover the demand, or nothing is needed, no new devices are requested; otherwise the gRPC Allocate call is made.
The allocation policy is: first check whether the container already has devices assigned (return nil if they suffice), then take devices from reusableDevices, and finally take the first `needed` entries of healthyDevices minus the devices in use (m.allocatedDevices[resource]).

// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
// i.e. returns the list of devices to allocate to this container: on a container restart the original devices are
// mounted again, otherwise reusable devices are taken first and then the first few of the currently free devices,
// which are also added to the allocated set
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	needed := required
	// Gets list of devices that have already been allocated.
	// This can happen if a container restarts for example.
	// on a container restart, keep mounting the original devices
	devices := m.podDevices.containerDevices(podUID, contName, resource)
	if devices != nil {
		klog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List())
		needed = needed - devices.Len()
		// A pod's resource is not expected to change once admitted by the API server,
		// so just fail loudly here. We can revisit this part if this no longer holds.
		if needed != 0 {
			return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", podUID, contName, resource, devices.Len(), required)
		}
	}
	if needed == 0 {
		// No change, no work.
		return nil, nil
	}
	klog.V(3).Infof("Needs to allocate %d %q for pod %q container %q", needed, resource, podUID, contName)
	// Needs to allocate additional devices.
	if _, ok := m.healthyDevices[resource]; !ok {
		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
	}
	devices = sets.NewString()

	// Allocates from reusableDevices list first.
	// use devices from reusableDevices first
	for device := range reusableDevices {
		devices.Insert(device)
		needed--
		if needed == 0 {
			return devices, nil
		}
	}
	// Needs to allocate additional devices.
	if m.allocatedDevices[resource] == nil {
		m.allocatedDevices[resource] = sets.NewString()
	}
	// Gets Devices in use.
	devicesInUse := m.allocatedDevices[resource]
	// Gets a list of available devices.
	// the available set is always healthy devices minus allocated devices, so devices just released by terminated pods can be used again
	available := m.healthyDevices[resource].Difference(devicesInUse)
	if int(available.Len()) < needed {
		return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
	}
	// the devices handed out are simply the first `needed` of the available set; a custom allocation policy for an extended resource could be implemented by changing the line below
	allocated := available.UnsortedList()[:needed]
	// Updates m.allocatedDevices with allocated devices to prevent them
	// from being allocated to other pods/containers, given that we are
	// not holding lock during the rpc call.
	// add them to the allocated set
	for _, device := range allocated {
		m.allocatedDevices[resource].Insert(device)
		devices.Insert(device)
	}
	// return the device list
	return devices, nil
}

Next is m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse).
It keeps adding the devices allocated to init containers into the reusable set.

// Populates allocatedResources with the device resources allocated to the specified <podUID, contName>.
// i.e. adds the devices already given to this container into the passed-in map.
// type podDevices map[string]containerDevices holds the allocated devices of every pod; each entry is a containerDevices,
// which maps every resource name allocated to that container to its set of device IDs
func (pdev podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
	// does this pod exist?
	containers, exists := pdev[podUID]
	if !exists {
		return
	}
	resources, exists := containers[contName]
	if !exists {
		return
	}
	// add all devices allocated to this container into allocatedResources, per resource
	for resource, devices := range resources {
		allocatedResources[resource] = allocatedResources[resource].Union(devices.deviceIds)
	}
}

Then comes m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse), which removes the devices allocated to this container in this round from the reusable list. If the init containers produced more devices in devicesToReuse than the regular containers need, the surplus stays in the allocated state, so other pods cannot use those devices until this pod terminates. For example, if an init container requests 4 GPUs and the containers request 2 in total, the remaining 2 GPUs are still considered allocated to this pod: the pod effectively occupies 4 GPUs at runtime while only using 2, and the other 2 sit idle, unavailable to other pods, until the pod terminates (a small sketch after the function below replays this example).

// Removes the device resources allocated to the specified <podUID, contName> from allocatedResources.
// i.e. removes the devices given to this container from the passed-in map (devicesToReuse in Allocate)
func (pdev podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
	containers, exists := pdev[podUID]
	if !exists {
		return
	}
	resources, exists := containers[contName]
	if !exists {
		return
	}
	for resource, devices := range resources {
		// keep the difference: the devices this container did not consume
		allocatedResources[resource] = allocatedResources[resource].Difference(devices.deviceIds)
	}
}
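To make this reuse accounting concrete, here is a small standalone sketch (not code from the source) that replays the 4-GPU/2-GPU example above with the same sets package:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	// Devices handed to the init container, then published for reuse via addContainerAllocatedResources.
	devicesToReuse := sets.NewString("gpu-0", "gpu-1", "gpu-2", "gpu-3")

	// The regular container only needs two; devicesToAllocate takes them from the reusable set first.
	containerDevices := sets.NewString("gpu-0", "gpu-1")

	// removeContainerAllocatedResources: drop what the container actually consumed.
	devicesToReuse = devicesToReuse.Difference(containerDevices)

	// gpu-2 and gpu-3 stay allocated to the pod but unused until it terminates.
	fmt.Println(devicesToReuse.List()) // [gpu-2 gpu-3]
}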

Finally, m.sanitizeNodeAllocatable(node) pushes the latest allocation state into the node info.

// sanitizeNodeAllocatable scans through allocatedDevices in the device manager
// and if necessary, updates allocatableResource in nodeInfo to at least equal to
// the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
// i.e. updates the node's allocatable amounts for device plugin resources
func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
	var newAllocatableResource *schedulercache.Resource
	allocatableResource := node.AllocatableResource()
	if allocatableResource.ScalarResources == nil {
		allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
	}
	// needed makes this an incremental update from the scheduler's point of view, making the most of devices the
	// plugins have already allocated: if the node's recorded allocatable amount already covers the allocated count,
	// keep it; otherwise raise it to that count.
	for resource, devices := range m.allocatedDevices {
		needed := devices.Len()
		quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)]
		if ok && int(quant) >= needed {
			continue
		}
		// Needs to update nodeInfo.AllocatableResource to make sure
		// NodeInfo.allocatableResource at least equal to the capacity already allocated.
		if newAllocatableResource == nil {
			newAllocatableResource = allocatableResource.Clone()
		}
		newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed)
	}
	if newAllocatableResource != nil {
		node.SetAllocatableResource(newAllocatableResource)
	}
}

This concludes the analysis of Allocate(); the flow chart:
Allocate() (flow chart image not reproduced here)

5. Resource scheduling between the kubelet and the scheduler

Above we analyzed how Kubernetes registers device plugins, allocates devices through them and maintains the device state. How does the kubelet obtain this information, and how does the scheduler learn about it and schedule accordingly?
The kubelet learns the device state of the current node via func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string).
It returns three values: the total number of devices on the node, the number of healthy devices, and the deleted resources, called capacity, allocatable and deletedResources respectively. A deleted resource is one whose endpoint is no longer healthy, so it must be reported to the master for handling. Node.Status.Capacity and Node.Status.Allocatable record the node's total resources, including those available for allocation.
The call chain is roughly: the kubelet owns a ContainerManager; ContainerManager exposes an interface method, and containerManagerImpl implements device plugin resource management via devicemanager.ManagerImpl (see https://www.kubernetes.org.cn/4391.html for the call chain).

// GetCapacity is expected to be called when Kubelet updates its node status.
// The first returned variable contains the registered device plugin resource capacity.
// The second returned variable contains the registered device plugin resource allocatable.
// The third returned variable contains previously registered resources that are no longer active.
// Kubelet uses this information to update resource capacity/allocatable in its node status.
// After the call, device plugin can remove the inactive resources from its internal list as the
// change is already reflected in Kubelet node status.
// Note in the special case after Kubelet restarts, device plugin resource capacities can
// temporarily drop to zero till corresponding device plugins re-register. This is OK because
// cm.UpdatePluginResource() run during predicate Admit guarantees we adjust nodeinfo
// capacity for already allocated pods so that they can continue to run. However, new pods
// requiring device plugin resources will not be scheduled till device plugin re-registers.
// i.e. from the kubelet's point of view: returns the node's total device count, its healthy device count and the
// resources to delete, called capacity, allocatable and deletedResources respectively
func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
	needsUpdateCheckpoint := false
	var capacity = v1.ResourceList{}
	var allocatable = v1.ResourceList{}
	deletedResources := sets.NewString()
	m.mutex.Lock()
	// allocatable equals the number of healthyDevices
	for resourceName, devices := range m.healthyDevices {
		eI, ok := m.endpoints[resourceName]
		// if the endpoint is missing or its grace period has expired, delete the resource
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			// The resources contained in endpoints and (un)healthyDevices
			// should always be consistent. Otherwise, we run with the risk
			// of failing to garbage collect non-existing resources or devices.
			if !ok {
				klog.Errorf("unexpected: healthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.healthyDevices, resourceName)
			deletedResources.Insert(resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			allocatable[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
		}
	}
	// the node's total capacity equals healthyDevices plus unhealthyDevices
	for resourceName, devices := range m.unhealthyDevices {
		eI, ok := m.endpoints[resourceName]
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			if !ok {
				klog.Errorf("unexpected: unhealthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.unhealthyDevices, resourceName)
			deletedResources.Insert(resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacityCount := capacity[v1.ResourceName(resourceName)]
			unhealthyCount := *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			capacityCount.Add(unhealthyCount)
			capacity[v1.ResourceName(resourceName)] = capacityCount
		}
	}
	m.mutex.Unlock()
	if needsUpdateCheckpoint {
		m.writeCheckpoint()
	}
	// return the capacity, the allocatable count and the deleted resources
	return capacity, allocatable, deletedResources.UnsortedList()
}

How does the scheduler know how many devices on a node can still be allocated? Recall the sanitizeNodeAllocatable(node *schedulercache.NodeInfo) step in Allocate(): the scheduler reads the node's available devices from schedulercache.NodeInfo.ScalarResources[v1.ResourceName(resource)], which that function maintains.
To summarize: the kubelet obtains the node's full resource picture through GetCapacity(), and the scheduler gets the allocatable amounts from NodeInfo and schedules on that basis.
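As a rough illustration of how the three values returned by GetCapacity() can end up in the node status that the scheduler eventually reads, here is a simplified, hypothetical sketch of the kubelet-side consumption (function and variable names are made up; the real kubelet code differs):

// updateNodeDevicePluginResources merges device plugin capacity/allocatable into
// node.Status, which the kubelet then posts to the apiserver for the scheduler to read.
func updateNodeDevicePluginResources(node *v1.Node, dm devicemanager.Manager) {
	capacity, allocatable, removed := dm.GetCapacity()

	for name, quantity := range capacity {
		node.Status.Capacity[name] = quantity
	}
	for name, quantity := range allocatable {
		node.Status.Allocatable[name] = quantity
	}
	// Resources whose endpoint has gone away are reported as 0 so the scheduler
	// stops placing new pods that request them onto this node.
	for _, name := range removed {
		zero := *resource.NewQuantity(0, resource.DecimalSI)
		node.Status.Capacity[v1.ResourceName(name)] = zero
		node.Status.Allocatable[v1.ResourceName(name)] = zero
	}
}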
